The following changes since commit 281f327487c9c9b1599f93c589a408bbf4a651b8:

  Merge remote-tracking branch 'remotes/vivier/tags/m68k-for-2.12-pull-request' into staging (2017-12-22 00:11:36 +0000)

are available in the git repository at:

  git://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to 1a63a907507fbbcfaee3f622907ec244b7eabda8:

  block: Keep nodes drained between reopen_queue/multiple (2017-12-22 15:05:32 +0100)

----------------------------------------------------------------
Block layer patches
----------------------------------------------------------------
Doug Gale (1):
      nvme: Add tracing

Edgar Kaziakhmedov (1):
      qcow2: get rid of qcow2_backing_read1 routine

Fam Zheng (2):
      block: Open backing image in force share mode for size probe
      block: Remove unused bdrv_requests_pending

John Snow (1):
      iotests: fix 197 for vpc

Kevin Wolf (27):
      block: Formats don't need CONSISTENT_READ with NO_IO
      block: Make bdrv_drain_invoke() recursive
      block: Call .drain_begin only once in bdrv_drain_all_begin()
      test-bdrv-drain: Test BlockDriver callbacks for drain
      block: bdrv_drain_recurse(): Remove unused begin parameter
      block: Don't wait for requests in bdrv_drain*_end()
      block: Unify order in drain functions
      block: Don't acquire AioContext in hmp_qemu_io()
      block: Document that x-blockdev-change breaks quorum children list
      block: Assert drain_all is only called from main AioContext
      block: Make bdrv_drain() driver callbacks non-recursive
      test-bdrv-drain: Test callback for bdrv_drain
      test-bdrv-drain: Test bs->quiesce_counter
      blockjob: Pause job on draining any job BDS
      test-bdrv-drain: Test drain vs. block jobs
      block: Don't block_job_pause_all() in bdrv_drain_all()
      block: Nested drain_end must still call callbacks
      test-bdrv-drain: Test nested drain sections
      block: Don't notify parents in drain call chain
      block: Add bdrv_subtree_drained_begin/end()
      test-bdrv-drain: Tests for bdrv_subtree_drain
      test-bdrv-drain: Test behaviour in coroutine context
      test-bdrv-drain: Recursive draining with multiple parents
      block: Allow graph changes in subtree drained section
      test-bdrv-drain: Test graph changes in drained section
      commit: Simplify reopen of base
      block: Keep nodes drained between reopen_queue/multiple

Thomas Huth (3):
      block: Remove the obsolete -drive boot=on|off parameter
      block: Remove the deprecated -hdachs option
      block: Mention -drive cyls/heads/secs/trans/serial/addr in deprecation chapter

 qapi/block-core.json | 4 +
 block/qcow2.h | 3 -
 include/block/block.h | 15 +-
 include/block/block_int.h | 6 +-
 block.c | 75 ++++-
 block/commit.c | 8 +-
 block/io.c | 164 +++++++---
 block/qcow2.c | 51 +--
 block/replication.c | 6 +
 blockdev.c | 11 -
 blockjob.c | 22 +-
 hmp.c | 6 -
 hw/block/nvme.c | 349 +++++++++++++++++----
 qemu-io-cmds.c | 3 +
 tests/test-bdrv-drain.c | 651 +++++++++++++++++++++++++++++++++++++++
 vl.c | 86 +-----
 hw/block/trace-events | 93 ++++++
 qemu-doc.texi | 29 +-
 qemu-options.hx | 19 +-
 tests/Makefile.include | 2 +
 tests/qemu-iotests/197 | 4 +
 tests/qemu-iotests/common.filter | 3 +-
 22 files changed, 1294 insertions(+), 316 deletions(-)
 create mode 100644 tests/test-bdrv-drain.c

The following changes since commit 4c8c1cc544dbd5e2564868e61c5037258e393832:

  Merge remote-tracking branch 'remotes/vivier/tags/m68k-for-2.10-pull-request' into staging (2017-06-22 19:01:58 +0100)

are available in the git repository at:

  git://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to 1512008812410ca4054506a7c44343088abdd977:

  Merge remote-tracking branch 'mreitz/tags/pull-block-2017-06-23' into queue-block (2017-06-23 14:09:12 +0200)

----------------------------------------------------------------
Block layer patches
----------------------------------------------------------------
Alberto Garcia (9):
      throttle: Update throttle-groups.c documentation
      qcow2: Remove unused Error variable in do_perform_cow()
      qcow2: Use unsigned int for both members of Qcow2COWRegion
      qcow2: Make perform_cow() call do_perform_cow() twice
      qcow2: Split do_perform_cow() into _read(), _encrypt() and _write()
      qcow2: Allow reading both COW regions with only one request
      qcow2: Pass a QEMUIOVector to do_perform_cow_{read,write}()
      qcow2: Merge the writing of the COW regions with the guest data
      qcow2: Use offset_into_cluster() and offset_to_l2_index()

Kevin Wolf (37):
      commit: Fix completion with extra reference
      qemu-iotests: Allow starting new qemu after cleanup
      qemu-iotests: Test exiting qemu with running job
      doc: Document generic -blockdev options
      doc: Document driver-specific -blockdev options
      qed: Use bottom half to resume waiting requests
      qed: Make qed_read_table() synchronous
      qed: Remove callback from qed_read_table()
      qed: Remove callback from qed_read_l2_table()
      qed: Remove callback from qed_find_cluster()
      qed: Make qed_read_backing_file() synchronous
      qed: Make qed_copy_from_backing_file() synchronous
      qed: Remove callback from qed_copy_from_backing_file()
      qed: Make qed_write_header() synchronous
      qed: Remove callback from qed_write_header()
      qed: Make qed_write_table() synchronous
      qed: Remove GenericCB
      qed: Remove callback from qed_write_table()
      qed: Make qed_aio_read_data() synchronous
      qed: Make qed_aio_write_main() synchronous
      qed: Inline qed_commit_l2_update()
      qed: Add return value to qed_aio_write_l1_update()
      qed: Add return value to qed_aio_write_l2_update()
      qed: Add return value to qed_aio_write_main()
      qed: Add return value to qed_aio_write_cow()
      qed: Add return value to qed_aio_write_inplace/alloc()
      qed: Add return value to qed_aio_read/write_data()
      qed: Remove ret argument from qed_aio_next_io()
      qed: Remove recursion in qed_aio_next_io()
      qed: Implement .bdrv_co_readv/writev
      qed: Use CoQueue for serialising allocations
      qed: Simplify request handling
      qed: Use a coroutine for need_check_timer
      qed: Add coroutine_fn to I/O path functions
      qed: Use bdrv_co_* for coroutine_fns
      block: Remove bdrv_aio_readv/writev/flush()
      Merge remote-tracking branch 'mreitz/tags/pull-block-2017-06-23' into queue-block

Manos Pitsidianakis (1):
      block: change variable names in BlockDriverState

Max Reitz (3):
      blkdebug: Catch bs->exact_filename overflow
      blkverify: Catch bs->exact_filename overflow
      block: Do not strcmp() with NULL uri->scheme

Stefan Hajnoczi (10):
      block: count bdrv_co_rw_vmstate() requests
      block: use BDRV_POLL_WHILE() in bdrv_rw_vmstate()
      migration: avoid recursive AioContext locking in save_vmstate()
      migration: use bdrv_drain_all_begin/end() instead bdrv_drain_all()
      virtio-pci: use ioeventfd even when KVM is disabled
      migration: hold AioContext lock for loadvm qemu_fclose()
      qemu-iotests: 068: extract _qemu() function
      qemu-iotests: 068: use -drive/-device instead of -hda
      qemu-iotests: 068: test iothread mode
      qemu-img: don't shadow opts variable in img_dd()

Stephen Bates (1):
      nvme: Add support for Read Data and Write Data in CMBs.

sochin.jiang (1):
      fix: avoid an infinite loop or a dangling pointer problem in img_commit

 block/Makefile.objs | 2 +-
 block/blkdebug.c | 46 +--
 block/blkreplay.c | 8 +-
 block/blkverify.c | 12 +-
 block/block-backend.c | 22 +-
 block/commit.c | 7 +
 block/file-posix.c | 34 +-
 block/io.c | 240 ++-----------
 block/iscsi.c | 20 +-
 block/mirror.c | 8 +-
 block/nbd-client.c | 8 +-
 block/nbd-client.h | 4 +-
 block/nbd.c | 6 +-
 block/nfs.c | 2 +-
 block/qcow2-cluster.c | 201 ++++++++---
 block/qcow2.c | 94 +++--
 block/qcow2.h | 11 +-
 block/qed-cluster.c | 124 +++----
 block/qed-gencb.c | 33 --
 block/qed-table.c | 261 +++++---------
 block/qed.c | 779 ++++++++++++++++-------------------------
 block/qed.h | 54 +--
 block/raw-format.c | 8 +-
 block/rbd.c | 4 +-
 block/sheepdog.c | 12 +-
 block/ssh.c | 2 +-
 block/throttle-groups.c | 2 +-
 block/trace-events | 3 -
 blockjob.c | 4 +-
 hw/block/nvme.c | 83 +++--
 hw/block/nvme.h | 1 +
 hw/virtio/virtio-pci.c | 2 +-
 include/block/block.h | 16 +-
 include/block/block_int.h | 6 +-
 include/block/blockjob.h | 18 +
 include/sysemu/block-backend.h | 20 +-
 migration/savevm.c | 32 +-
 qemu-img.c | 29 +-
 qemu-io-cmds.c | 46 +--
 qemu-options.hx | 221 ++++++++++--
 tests/qemu-iotests/068 | 37 +-
 tests/qemu-iotests/068.out | 11 +-
 tests/qemu-iotests/185 | 206 +++++++++++
 tests/qemu-iotests/185.out | 59 ++++
 tests/qemu-iotests/common.qemu | 3 +
 tests/qemu-iotests/group | 1 +
 46 files changed, 1477 insertions(+), 1325 deletions(-)
 delete mode 100644 block/qed-gencb.c
 create mode 100755 tests/qemu-iotests/185
 create mode 100644 tests/qemu-iotests/185.out
diff view generated by jsdifflib
Commit 1f4ad7d fixed 'qemu-img info' for raw images that are currently
in use as a mirror target. It is not enough for image formats, though,
as these still unconditionally request BLK_PERM_CONSISTENT_READ.

As this permission is geared towards whether the guest-visible data is
consistent, and has no impact on whether the metadata is sane, and
'qemu-img info' does not read guest-visible data (except for the raw
format), it makes sense to not require BLK_PERM_CONSISTENT_READ if there
is not going to be any guest I/O performed, regardless of image format.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

All functions that are marked coroutine_fn can directly call the
bdrv_co_* version of functions instead of going through the wrapper.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qed.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/block/qed.c b/block/qed.c
16
diff --git a/block.c b/block.c
12
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
13
--- a/block/qed.c
18
--- a/block.c
14
+++ b/block/qed.c
19
+++ b/block.c
15
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qed_write_header(BDRVQEDState *s)
20
@@ -XXX,XX +XXX,XX @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
16
};
21
assert(role == &child_backing || role == &child_file);
17
qemu_iovec_init_external(&qiov, &iov, 1);
22
18
23
if (!backing) {
19
- ret = bdrv_preadv(s->bs->file, 0, &qiov);
24
+ int flags = bdrv_reopen_get_flags(reopen_queue, bs);
20
+ ret = bdrv_co_preadv(s->bs->file, 0, qiov.size, &qiov, 0);
25
+
21
if (ret < 0) {
26
/* Apart from the modifications below, the same permissions are
22
goto out;
27
* forwarded and left alone as for filters */
23
}
28
bdrv_filter_default_perms(bs, c, role, reopen_queue, perm, shared,
24
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qed_write_header(BDRVQEDState *s)
29
@@ -XXX,XX +XXX,XX @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
25
/* Update header */
30
26
qed_header_cpu_to_le(&s->header, (QEDHeader *) buf);
31
/* bs->file always needs to be consistent because of the metadata. We
27
32
* can never allow other users to resize or write to it. */
28
- ret = bdrv_pwritev(s->bs->file, 0, &qiov);
33
- perm |= BLK_PERM_CONSISTENT_READ;
29
+ ret = bdrv_co_pwritev(s->bs->file, 0, qiov.size, &qiov, 0);
34
+ if (!(flags & BDRV_O_NO_IO)) {
30
if (ret < 0) {
35
+ perm |= BLK_PERM_CONSISTENT_READ;
31
goto out;
36
+ }
32
}
37
shared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
33
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
38
} else {
34
qemu_iovec_concat(*backing_qiov, qiov, 0, size);
39
/* We want consistent read from backing files if the parent needs it.
35
36
BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
37
- ret = bdrv_preadv(s->bs->backing, pos, *backing_qiov);
38
+ ret = bdrv_co_preadv(s->bs->backing, pos, size, *backing_qiov, 0);
39
if (ret < 0) {
40
return ret;
41
}
42
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qed_copy_from_backing_file(BDRVQEDState *s,
43
}
44
45
BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
46
- ret = bdrv_pwritev(s->bs->file, offset, &qiov);
47
+ ret = bdrv_co_pwritev(s->bs->file, offset, qiov.size, &qiov, 0);
48
if (ret < 0) {
49
goto out;
50
}
51
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qed_aio_write_main(QEDAIOCB *acb)
52
trace_qed_aio_write_main(s, acb, 0, offset, acb->cur_qiov.size);
53
54
BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
55
- ret = bdrv_pwritev(s->bs->file, offset, &acb->cur_qiov);
56
+ ret = bdrv_co_pwritev(s->bs->file, offset, acb->cur_qiov.size,
57
+ &acb->cur_qiov, 0);
58
if (ret < 0) {
59
return ret;
60
}
61
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qed_aio_write_main(QEDAIOCB *acb)
62
* region. The solution is to flush after writing a new data
63
* cluster and before updating the L2 table.
64
*/
65
- ret = bdrv_flush(s->bs->file->bs);
66
+ ret = bdrv_co_flush(s->bs->file->bs);
67
if (ret < 0) {
68
return ret;
69
}
70
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qed_aio_read_data(void *opaque, int ret,
71
}
72
73
BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
74
- ret = bdrv_preadv(bs->file, offset, &acb->cur_qiov);
75
+ ret = bdrv_co_preadv(bs->file, offset, acb->cur_qiov.size,
76
+ &acb->cur_qiov, 0);
77
if (ret < 0) {
78
return ret;
79
}
80
--
40
--
81
1.8.3.1
41
2.13.6
82
42
83
43
diff view generated by jsdifflib
1
From: John Snow <jsnow@redhat.com>

VPC has some difficulty creating geometries of particular size.
However, we can indeed force it to use a literal one, so let's
do that for the sake of test 197, which is testing some specific
offsets.

Signed-off-by: John Snow <jsnow@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Lukáš Doktor <ldoktor@redhat.com>
---
 tests/qemu-iotests/197 | 4 ++++
 tests/qemu-iotests/common.filter | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

From: Max Reitz <mreitz@redhat.com>

The bs->exact_filename field may not be sufficient to store the full
blkverify node filename. In this case, we should not generate a filename
at all instead of an unusable one.

Cc: qemu-stable@nongnu.org
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170613172006.19685-3-mreitz@redhat.com
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/blkverify.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)
17
17
18
diff --git a/block/blkverify.c b/block/blkverify.c
18
diff --git a/tests/qemu-iotests/197 b/tests/qemu-iotests/197
19
index XXXXXXX..XXXXXXX 100755
20
--- a/tests/qemu-iotests/197
21
+++ b/tests/qemu-iotests/197
22
@@ -XXX,XX +XXX,XX @@ echo '=== Copy-on-read ==='
23
echo
24
25
# Prep the images
26
+# VPC rounds image sizes to a specific geometry, force a specific size.
27
+if [ "$IMGFMT" = "vpc" ]; then
28
+ IMGOPTS=$(_optstr_add "$IMGOPTS" "force_size")
29
+fi
30
_make_test_img 4G
31
$QEMU_IO -c "write -P 55 3G 1k" "$TEST_IMG" | _filter_qemu_io
32
IMGPROTO=file IMGFMT=qcow2 IMGOPTS= TEST_IMG_FILE="$TEST_WRAP" \
33
diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter
19
index XXXXXXX..XXXXXXX 100644
34
index XXXXXXX..XXXXXXX 100644
20
--- a/block/blkverify.c
35
--- a/tests/qemu-iotests/common.filter
21
+++ b/block/blkverify.c
36
+++ b/tests/qemu-iotests/common.filter
22
@@ -XXX,XX +XXX,XX @@ static void blkverify_refresh_filename(BlockDriverState *bs, QDict *options)
37
@@ -XXX,XX +XXX,XX @@ _filter_img_create()
23
if (bs->file->bs->exact_filename[0]
38
-e "s# log_size=[0-9]\\+##g" \
24
&& s->test_file->bs->exact_filename[0])
39
-e "s# refcount_bits=[0-9]\\+##g" \
25
{
40
-e "s# key-secret=[a-zA-Z0-9]\\+##g" \
26
- snprintf(bs->exact_filename, sizeof(bs->exact_filename),
41
- -e "s# iter-time=[0-9]\\+##g"
27
- "blkverify:%s:%s",
42
+ -e "s# iter-time=[0-9]\\+##g" \
28
- bs->file->bs->exact_filename,
43
+ -e "s# force_size=\\(on\\|off\\)##g"
29
- s->test_file->bs->exact_filename);
30
+ int ret = snprintf(bs->exact_filename, sizeof(bs->exact_filename),
31
+ "blkverify:%s:%s",
32
+ bs->file->bs->exact_filename,
33
+ s->test_file->bs->exact_filename);
34
+ if (ret >= sizeof(bs->exact_filename)) {
35
+ /* An overflow makes the filename unusable, so do not report any */
36
+ bs->exact_filename[0] = 0;
37
+ }
38
}
39
}
44
}
40
45
46
_filter_img_info()
41
--
47
--
42
1.8.3.1
48
2.13.6
43
49
44
50
diff view generated by jsdifflib
1
This change separates bdrv_drain_invoke(), which calls the BlockDriver
drain callbacks, from bdrv_drain_recurse(). Instead, the function
performs its own recursion now.

One reason for this is that bdrv_drain_recurse() can be called multiple
times by bdrv_drain_all_begin(), but the callbacks may only be called
once. The separation is necessary to fix this bug.

The other reason is that we intend to go to a model where we call all
driver callbacks first, and only then start polling. This is not fully
achieved yet with this patch, as bdrv_drain_invoke() contains a
BDRV_POLL_WHILE() loop for the block driver callbacks, which can still
call callbacks for any unrelated event. It's a step in this direction
anyway.

Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qed.c | 32 ++++++++++++--------------------
 1 file changed, 12 insertions(+), 20 deletions(-)
6
22
7
diff --git a/block/qed.c b/block/qed.c
23
diff --git a/block/io.c b/block/io.c
8
index XXXXXXX..XXXXXXX 100644
24
index XXXXXXX..XXXXXXX 100644
9
--- a/block/qed.c
25
--- a/block/io.c
10
+++ b/block/qed.c
26
+++ b/block/io.c
11
@@ -XXX,XX +XXX,XX @@ int qed_write_header_sync(BDRVQEDState *s)
27
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
12
* This function only updates known header fields in-place and does not affect
28
bdrv_wakeup(bs);
13
* extra data after the QED header.
29
}
14
*/
30
15
-static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb,
31
+/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
16
- void *opaque)
32
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
17
+static int qed_write_header(BDRVQEDState *s)
18
{
33
{
19
/* We must write full sectors for O_DIRECT but cannot necessarily generate
34
+ BdrvChild *child, *tmp;
20
* the data following the header if an unrecognized compat feature is
35
BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};
21
@@ -XXX,XX +XXX,XX @@ static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb,
36
22
ret = 0;
37
if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
23
out:
38
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
24
qemu_vfree(buf);
39
data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
25
- cb(opaque, ret);
40
bdrv_coroutine_enter(bs, data.co);
26
+ return ret;
41
BDRV_POLL_WHILE(bs, !data.done);
42
+
43
+ QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
44
+ bdrv_drain_invoke(child->bs, begin);
45
+ }
27
}
46
}
28
47
29
static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
48
static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
30
@@ -XXX,XX +XXX,XX @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
49
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
50
BdrvChild *child, *tmp;
51
bool waited;
52
53
- /* Ensure any pending metadata writes are submitted to bs->file. */
54
- bdrv_drain_invoke(bs, begin);
55
-
56
/* Wait for drained requests to finish */
57
waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
58
59
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
60
bdrv_parent_drained_begin(bs);
31
}
61
}
62
63
+ bdrv_drain_invoke(bs, true);
64
bdrv_drain_recurse(bs, true);
32
}
65
}
33
66
34
-static void qed_finish_clear_need_check(void *opaque, int ret)
67
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
35
-{
36
- /* Do nothing */
37
-}
38
-
39
-static void qed_flush_after_clear_need_check(void *opaque, int ret)
40
-{
41
- BDRVQEDState *s = opaque;
42
-
43
- bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s);
44
-
45
- /* No need to wait until flush completes */
46
- qed_unplug_allocating_write_reqs(s);
47
-}
48
-
49
static void qed_clear_need_check(void *opaque, int ret)
50
{
51
BDRVQEDState *s = opaque;
52
@@ -XXX,XX +XXX,XX @@ static void qed_clear_need_check(void *opaque, int ret)
53
}
68
}
54
69
55
s->header.features &= ~QED_F_NEED_CHECK;
70
bdrv_parent_drained_end(bs);
56
- qed_write_header(s, qed_flush_after_clear_need_check, s);
71
+ bdrv_drain_invoke(bs, false);
57
+ ret = qed_write_header(s);
72
bdrv_drain_recurse(bs, false);
58
+ (void) ret;
73
aio_enable_external(bdrv_get_aio_context(bs));
59
+
60
+ qed_unplug_allocating_write_reqs(s);
61
+
62
+ ret = bdrv_flush(s->bs);
63
+ (void) ret;
64
}
74
}
65
75
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
66
static void qed_need_check_timer_cb(void *opaque)
76
aio_context_acquire(aio_context);
67
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
77
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
68
{
78
if (aio_context == bdrv_get_aio_context(bs)) {
69
BDRVQEDState *s = acb_to_s(acb);
79
+ /* FIXME Calling this multiple times is wrong */
70
BlockCompletionFunc *cb;
80
+ bdrv_drain_invoke(bs, true);
71
+ int ret;
81
waited |= bdrv_drain_recurse(bs, true);
72
82
}
73
/* Cancel timer when the first allocating request comes in */
83
}
74
if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) {
84
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
75
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
85
aio_context_acquire(aio_context);
76
86
aio_enable_external(aio_context);
77
if (qed_should_set_need_check(s)) {
87
bdrv_parent_drained_end(bs);
78
s->header.features |= QED_F_NEED_CHECK;
88
+ bdrv_drain_invoke(bs, false);
79
- qed_write_header(s, cb, acb);
89
bdrv_drain_recurse(bs, false);
80
+ ret = qed_write_header(s);
90
aio_context_release(aio_context);
81
+ cb(acb, ret);
82
} else {
83
cb(acb, 0);
84
}
91
}
85
--
92
--
86
1.8.3.1
93
2.13.6
87
94
88
95
diff view generated by jsdifflib
1
bdrv_drain_all_begin() used to call the .bdrv_co_drain_begin() driver
callback inside its polling loop. This means that how many times it got
called for each node depended on how long it had to poll the event loop.

This is obviously not right and results in nodes that stay drained even
after bdrv_drain_all_end(), which calls .bdrv_co_drain_begin() once per
node.

Fix bdrv_drain_all_begin() to call the callback only once, too.

Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

Now that we stay in coroutine context for the whole request when doing
reads or writes, we can add coroutine_fn annotations to many functions
that can do I/O or yield directly.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qed-cluster.c | 5 +++--
 block/qed.c | 44 ++++++++++++++++++--------------------
 block/qed.h | 5 +++--
 3 files changed, 30 insertions(+), 24 deletions(-)
12
17
13
diff --git a/block/qed-cluster.c b/block/qed-cluster.c
18
diff --git a/block/io.c b/block/io.c
14
index XXXXXXX..XXXXXXX 100644
19
index XXXXXXX..XXXXXXX 100644
15
--- a/block/qed-cluster.c
20
--- a/block/io.c
16
+++ b/block/qed-cluster.c
21
+++ b/block/io.c
17
@@ -XXX,XX +XXX,XX @@ static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s,
22
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
18
* On failure QED_CLUSTER_L2 or QED_CLUSTER_L1 is returned for missing L2 or L1
23
aio_context_acquire(aio_context);
19
* table offset, respectively. len is number of contiguous unallocated bytes.
24
bdrv_parent_drained_begin(bs);
20
*/
25
aio_disable_external(aio_context);
21
-int qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
26
+ bdrv_drain_invoke(bs, true);
22
- size_t *len, uint64_t *img_offset)
27
aio_context_release(aio_context);
23
+int coroutine_fn qed_find_cluster(BDRVQEDState *s, QEDRequest *request,
28
24
+ uint64_t pos, size_t *len,
29
if (!g_slist_find(aio_ctxs, aio_context)) {
25
+ uint64_t *img_offset)
30
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
26
{
31
aio_context_acquire(aio_context);
27
uint64_t l2_offset;
32
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
28
uint64_t offset = 0;
33
if (aio_context == bdrv_get_aio_context(bs)) {
29
diff --git a/block/qed.c b/block/qed.c
34
- /* FIXME Calling this multiple times is wrong */
30
index XXXXXXX..XXXXXXX 100644
35
- bdrv_drain_invoke(bs, true);
31
--- a/block/qed.c
36
waited |= bdrv_drain_recurse(bs, true);
32
+++ b/block/qed.c
37
}
33
@@ -XXX,XX +XXX,XX @@ int qed_write_header_sync(BDRVQEDState *s)
38
}
34
* This function only updates known header fields in-place and does not affect
35
* extra data after the QED header.
36
*/
37
-static int qed_write_header(BDRVQEDState *s)
38
+static int coroutine_fn qed_write_header(BDRVQEDState *s)
39
{
40
/* We must write full sectors for O_DIRECT but cannot necessarily generate
41
* the data following the header if an unrecognized compat feature is
42
@@ -XXX,XX +XXX,XX @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
43
qemu_co_enter_next(&s->allocating_write_reqs);
44
}
45
46
-static void qed_need_check_timer_entry(void *opaque)
47
+static void coroutine_fn qed_need_check_timer_entry(void *opaque)
48
{
49
BDRVQEDState *s = opaque;
50
int ret;
51
@@ -XXX,XX +XXX,XX @@ static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
52
* This function reads qiov->size bytes starting at pos from the backing file.
53
* If there is no backing file then zeroes are read.
54
*/
55
-static int qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
56
- QEMUIOVector *qiov,
57
- QEMUIOVector **backing_qiov)
58
+static int coroutine_fn qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
59
+ QEMUIOVector *qiov,
60
+ QEMUIOVector **backing_qiov)
61
{
62
uint64_t backing_length = 0;
63
size_t size;
64
@@ -XXX,XX +XXX,XX @@ static int qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
65
* @len: Number of bytes
66
* @offset: Byte offset in image file
67
*/
68
-static int qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
69
- uint64_t len, uint64_t offset)
70
+static int coroutine_fn qed_copy_from_backing_file(BDRVQEDState *s,
71
+ uint64_t pos, uint64_t len,
72
+ uint64_t offset)
73
{
74
QEMUIOVector qiov;
75
QEMUIOVector *backing_qiov = NULL;
76
@@ -XXX,XX +XXX,XX @@ out:
77
* The cluster offset may be an allocated byte offset in the image file, the
78
* zero cluster marker, or the unallocated cluster marker.
79
*/
80
-static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
81
- unsigned int n, uint64_t cluster)
82
+static void coroutine_fn qed_update_l2_table(BDRVQEDState *s, QEDTable *table,
83
+ int index, unsigned int n,
84
+ uint64_t cluster)
85
{
86
int i;
87
for (i = index; i < index + n; i++) {
88
@@ -XXX,XX +XXX,XX @@ static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
89
}
90
}
91
92
-static void qed_aio_complete(QEDAIOCB *acb)
93
+static void coroutine_fn qed_aio_complete(QEDAIOCB *acb)
94
{
95
BDRVQEDState *s = acb_to_s(acb);
96
97
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete(QEDAIOCB *acb)
98
/**
99
* Update L1 table with new L2 table offset and write it out
100
*/
101
-static int qed_aio_write_l1_update(QEDAIOCB *acb)
102
+static int coroutine_fn qed_aio_write_l1_update(QEDAIOCB *acb)
103
{
104
BDRVQEDState *s = acb_to_s(acb);
105
CachedL2Table *l2_table = acb->request.l2_table;
106
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_l1_update(QEDAIOCB *acb)
107
/**
108
* Update L2 table with new cluster offsets and write them out
109
*/
110
-static int qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
111
+static int coroutine_fn qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
112
{
113
BDRVQEDState *s = acb_to_s(acb);
114
bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
115
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
116
/**
117
* Write data to the image file
118
*/
119
-static int qed_aio_write_main(QEDAIOCB *acb)
120
+static int coroutine_fn qed_aio_write_main(QEDAIOCB *acb)
121
{
122
BDRVQEDState *s = acb_to_s(acb);
123
uint64_t offset = acb->cur_cluster +
124
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_main(QEDAIOCB *acb)
125
/**
126
* Populate untouched regions of new data cluster
127
*/
128
-static int qed_aio_write_cow(QEDAIOCB *acb)
129
+static int coroutine_fn qed_aio_write_cow(QEDAIOCB *acb)
130
{
131
BDRVQEDState *s = acb_to_s(acb);
132
uint64_t start, len, offset;
133
@@ -XXX,XX +XXX,XX @@ static bool qed_should_set_need_check(BDRVQEDState *s)
134
*
135
* This path is taken when writing to previously unallocated clusters.
136
*/
137
-static int qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
138
+static int coroutine_fn qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
139
{
140
BDRVQEDState *s = acb_to_s(acb);
141
int ret;
142
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
143
*
144
* This path is taken when writing to already allocated clusters.
145
*/
146
-static int qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
147
+static int coroutine_fn qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset,
148
+ size_t len)
149
{
150
/* Allocate buffer for zero writes */
151
if (acb->flags & QED_AIOCB_ZERO) {
152
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
153
* @offset: Cluster offset in bytes
154
* @len: Length in bytes
155
*/
156
-static int qed_aio_write_data(void *opaque, int ret,
157
- uint64_t offset, size_t len)
158
+static int coroutine_fn qed_aio_write_data(void *opaque, int ret,
159
+ uint64_t offset, size_t len)
160
{
161
QEDAIOCB *acb = opaque;
162
163
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_data(void *opaque, int ret,
164
* @offset: Cluster offset in bytes
165
* @len: Length in bytes
166
*/
167
-static int qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len)
168
+static int coroutine_fn qed_aio_read_data(void *opaque, int ret,
169
+ uint64_t offset, size_t len)
170
{
171
QEDAIOCB *acb = opaque;
172
BDRVQEDState *s = acb_to_s(acb);
173
@@ -XXX,XX +XXX,XX @@ static int qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len)
174
/**
175
* Begin next I/O or complete the request
176
*/
177
-static int qed_aio_next_io(QEDAIOCB *acb)
178
+static int coroutine_fn qed_aio_next_io(QEDAIOCB *acb)
179
{
180
BDRVQEDState *s = acb_to_s(acb);
181
uint64_t offset;
182
diff --git a/block/qed.h b/block/qed.h
183
index XXXXXXX..XXXXXXX 100644
184
--- a/block/qed.h
185
+++ b/block/qed.h
186
@@ -XXX,XX +XXX,XX @@ int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
187
/**
188
* Cluster functions
189
*/
190
-int qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
191
- size_t *len, uint64_t *img_offset);
192
+int coroutine_fn qed_find_cluster(BDRVQEDState *s, QEDRequest *request,
193
+ uint64_t pos, size_t *len,
194
+ uint64_t *img_offset);
195
196
/**
197
* Consistency check
198
--
39
--
199
1.8.3.1
40
2.13.6
200
41
201
42
diff view generated by jsdifflib
1
This adds a test case that the BlockDriver callbacks for drain are
called in bdrv_drained_all_begin/end(), and that both of them are called
exactly once.

When qemu is exited, all running jobs should be cancelled successfully.
This adds a test for this for all types of block jobs that currently
exist in qemu.
4
4
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
6
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
6
Reviewed-by: Eric Blake <eblake@redhat.com>
7
Reviewed-by: Eric Blake <eblake@redhat.com>
7
---
8
---
8
tests/qemu-iotests/185 | 206 +++++++++++++++++++++++++++++++++++++++++++++
9
tests/test-bdrv-drain.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++
9
tests/qemu-iotests/185.out | 59 +++++++++++++
10
tests/Makefile.include | 2 +
10
tests/qemu-iotests/group | 1 +
11
2 files changed, 139 insertions(+)
11
3 files changed, 266 insertions(+)
12
create mode 100644 tests/test-bdrv-drain.c
12
create mode 100755 tests/qemu-iotests/185
13
create mode 100644 tests/qemu-iotests/185.out
14
13
15
diff --git a/tests/qemu-iotests/185 b/tests/qemu-iotests/185
14
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
16
new file mode 100755
17
index XXXXXXX..XXXXXXX
18
--- /dev/null
19
+++ b/tests/qemu-iotests/185
20
@@ -XXX,XX +XXX,XX @@
21
+#!/bin/bash
22
+#
23
+# Test exiting qemu while jobs are still running
24
+#
25
+# Copyright (C) 2017 Red Hat, Inc.
26
+#
27
+# This program is free software; you can redistribute it and/or modify
28
+# it under the terms of the GNU General Public License as published by
29
+# the Free Software Foundation; either version 2 of the License, or
30
+# (at your option) any later version.
31
+#
32
+# This program is distributed in the hope that it will be useful,
33
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
34
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
35
+# GNU General Public License for more details.
36
+#
37
+# You should have received a copy of the GNU General Public License
38
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
39
+#
40
+
41
+# creator
42
+owner=kwolf@redhat.com
43
+
44
+seq=`basename $0`
45
+echo "QA output created by $seq"
46
+
47
+here=`pwd`
48
+status=1 # failure is the default!
49
+
50
+MIG_SOCKET="${TEST_DIR}/migrate"
51
+
52
+_cleanup()
53
+{
54
+ rm -f "${TEST_IMG}.mid"
55
+ rm -f "${TEST_IMG}.copy"
56
+ _cleanup_test_img
57
+ _cleanup_qemu
58
+}
59
+trap "_cleanup; exit \$status" 0 1 2 3 15
60
+
61
+# get standard environment, filters and checks
62
+. ./common.rc
63
+. ./common.filter
64
+. ./common.qemu
65
+
66
+_supported_fmt qcow2
67
+_supported_proto file
68
+_supported_os Linux
69
+
70
+size=64M
71
+TEST_IMG="${TEST_IMG}.base" _make_test_img $size
72
+
73
+echo
74
+echo === Starting VM ===
75
+echo
76
+
77
+qemu_comm_method="qmp"
78
+
79
+_launch_qemu \
80
+ -drive file="${TEST_IMG}.base",cache=$CACHEMODE,driver=$IMGFMT,id=disk
81
+h=$QEMU_HANDLE
82
+_send_qemu_cmd $h "{ 'execute': 'qmp_capabilities' }" 'return'
83
+
84
+echo
85
+echo === Creating backing chain ===
86
+echo
87
+
88
+_send_qemu_cmd $h \
89
+ "{ 'execute': 'blockdev-snapshot-sync',
90
+ 'arguments': { 'device': 'disk',
91
+ 'snapshot-file': '$TEST_IMG.mid',
92
+ 'format': '$IMGFMT',
93
+ 'mode': 'absolute-paths' } }" \
94
+ "return"
95
+
96
+_send_qemu_cmd $h \
97
+ "{ 'execute': 'human-monitor-command',
98
+ 'arguments': { 'command-line':
99
+ 'qemu-io disk \"write 0 4M\"' } }" \
100
+ "return"
101
+
102
+_send_qemu_cmd $h \
103
+ "{ 'execute': 'blockdev-snapshot-sync',
104
+ 'arguments': { 'device': 'disk',
105
+ 'snapshot-file': '$TEST_IMG',
106
+ 'format': '$IMGFMT',
107
+ 'mode': 'absolute-paths' } }" \
108
+ "return"
109
+
110
+echo
111
+echo === Start commit job and exit qemu ===
112
+echo
113
+
114
+# Note that the reference output intentionally includes the 'offset' field in
115
+# BLOCK_JOB_CANCELLED events for all of the following block jobs. They are
116
+# predictable and any change in the offsets would hint at a bug in the job
117
+# throttling code.
118
+#
119
+# In order to achieve these predictable offsets, all of the following tests
120
+# use speed=65536. Each job will perform exactly one iteration before it has
121
+# to sleep at least for a second, which is plenty of time for the 'quit' QMP
122
+# command to be received (after receiving the command, the rest runs
123
+# synchronously, so jobs can arbitrarily continue or complete).
124
+#
125
+# The buffer size for commit and streaming is 512k (waiting for 8 seconds after
126
+# the first request), for active commit and mirror it's large enough to cover
127
+# the full 4M, and for backup it's the qcow2 cluster size, which we know is
128
+# 64k. As all of these are at least as large as the speed, we are sure that the
129
+# offset doesn't advance after the first iteration before qemu exits.
130
+
131
+_send_qemu_cmd $h \
132
+ "{ 'execute': 'block-commit',
133
+ 'arguments': { 'device': 'disk',
134
+ 'base':'$TEST_IMG.base',
135
+ 'top': '$TEST_IMG.mid',
136
+ 'speed': 65536 } }" \
137
+ "return"
138
+
139
+_send_qemu_cmd $h "{ 'execute': 'quit' }" "return"
140
+wait=1 _cleanup_qemu
141
+
142
+echo
143
+echo === Start active commit job and exit qemu ===
144
+echo
145
+
146
+_launch_qemu \
147
+ -drive file="${TEST_IMG}",cache=$CACHEMODE,driver=$IMGFMT,id=disk
148
+h=$QEMU_HANDLE
149
+_send_qemu_cmd $h "{ 'execute': 'qmp_capabilities' }" 'return'
150
+
151
+_send_qemu_cmd $h \
152
+ "{ 'execute': 'block-commit',
153
+ 'arguments': { 'device': 'disk',
154
+ 'base':'$TEST_IMG.base',
155
+ 'speed': 65536 } }" \
156
+ "return"
157
+
158
+_send_qemu_cmd $h "{ 'execute': 'quit' }" "return"
159
+wait=1 _cleanup_qemu
160
+
161
+echo
162
+echo === Start mirror job and exit qemu ===
163
+echo
164
+
165
+_launch_qemu \
166
+ -drive file="${TEST_IMG}",cache=$CACHEMODE,driver=$IMGFMT,id=disk
167
+h=$QEMU_HANDLE
168
+_send_qemu_cmd $h "{ 'execute': 'qmp_capabilities' }" 'return'
169
+
170
+_send_qemu_cmd $h \
171
+ "{ 'execute': 'drive-mirror',
172
+ 'arguments': { 'device': 'disk',
173
+ 'target': '$TEST_IMG.copy',
174
+ 'format': '$IMGFMT',
175
+ 'sync': 'full',
176
+ 'speed': 65536 } }" \
177
+ "return"
178
+
179
+_send_qemu_cmd $h "{ 'execute': 'quit' }" "return"
180
+wait=1 _cleanup_qemu
181
+
182
+echo
183
+echo === Start backup job and exit qemu ===
184
+echo
185
+
186
+_launch_qemu \
187
+ -drive file="${TEST_IMG}",cache=$CACHEMODE,driver=$IMGFMT,id=disk
188
+h=$QEMU_HANDLE
189
+_send_qemu_cmd $h "{ 'execute': 'qmp_capabilities' }" 'return'
190
+
191
+_send_qemu_cmd $h \
192
+ "{ 'execute': 'drive-backup',
193
+ 'arguments': { 'device': 'disk',
194
+ 'target': '$TEST_IMG.copy',
195
+ 'format': '$IMGFMT',
196
+ 'sync': 'full',
197
+ 'speed': 65536 } }" \
198
+ "return"
199
+
200
+_send_qemu_cmd $h "{ 'execute': 'quit' }" "return"
201
+wait=1 _cleanup_qemu
202
+
203
+echo
204
+echo === Start streaming job and exit qemu ===
205
+echo
206
+
207
+_launch_qemu \
208
+ -drive file="${TEST_IMG}",cache=$CACHEMODE,driver=$IMGFMT,id=disk
209
+h=$QEMU_HANDLE
210
+_send_qemu_cmd $h "{ 'execute': 'qmp_capabilities' }" 'return'
211
+
212
+_send_qemu_cmd $h \
213
+ "{ 'execute': 'block-stream',
214
+ 'arguments': { 'device': 'disk',
215
+ 'speed': 65536 } }" \
216
+ "return"
217
+
218
+_send_qemu_cmd $h "{ 'execute': 'quit' }" "return"
219
+wait=1 _cleanup_qemu
220
+
221
+_check_test_img
222
+
223
+# success, all done
224
+echo "*** done"
225
+rm -f $seq.full
226
+status=0
227
diff --git a/tests/qemu-iotests/185.out b/tests/qemu-iotests/185.out
228
new file mode 100644
15
new file mode 100644
229
index XXXXXXX..XXXXXXX
16
index XXXXXXX..XXXXXXX
230
--- /dev/null
17
--- /dev/null
231
+++ b/tests/qemu-iotests/185.out
18
+++ b/tests/test-bdrv-drain.c
232
@@ -XXX,XX +XXX,XX @@
19
@@ -XXX,XX +XXX,XX @@
233
+QA output created by 185
20
+/*
234
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864
21
+ * Block node draining tests
22
+ *
23
+ * Copyright (c) 2017 Kevin Wolf <kwolf@redhat.com>
24
+ *
25
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
26
+ * of this software and associated documentation files (the "Software"), to deal
27
+ * in the Software without restriction, including without limitation the rights
28
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
29
+ * copies of the Software, and to permit persons to whom the Software is
30
+ * furnished to do so, subject to the following conditions:
31
+ *
32
+ * The above copyright notice and this permission notice shall be included in
33
+ * all copies or substantial portions of the Software.
34
+ *
35
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
36
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
37
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
38
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
39
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
40
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
41
+ * THE SOFTWARE.
42
+ */
235
+
43
+
236
+=== Starting VM ===
44
+#include "qemu/osdep.h"
45
+#include "block/block.h"
46
+#include "sysemu/block-backend.h"
47
+#include "qapi/error.h"
237
+
48
+
238
+{"return": {}}
49
+typedef struct BDRVTestState {
50
+ int drain_count;
51
+} BDRVTestState;
239
+
52
+
240
+=== Creating backing chain ===
53
+static void coroutine_fn bdrv_test_co_drain_begin(BlockDriverState *bs)
54
+{
55
+ BDRVTestState *s = bs->opaque;
56
+ s->drain_count++;
57
+}
241
+
58
+
242
+Formatting 'TEST_DIR/t.qcow2.mid', fmt=qcow2 size=67108864 backing_file=TEST_DIR/t.qcow2.base backing_fmt=qcow2 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16
59
+static void coroutine_fn bdrv_test_co_drain_end(BlockDriverState *bs)
243
+{"return": {}}
60
+{
244
+wrote 4194304/4194304 bytes at offset 0
61
+ BDRVTestState *s = bs->opaque;
245
+4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
62
+ s->drain_count--;
246
+{"return": ""}
63
+}
247
+Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 size=67108864 backing_file=TEST_DIR/t.qcow2.mid backing_fmt=qcow2 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16
248
+{"return": {}}
249
+
64
+
250
+=== Start commit job and exit qemu ===
65
+static void bdrv_test_close(BlockDriverState *bs)
66
+{
67
+ BDRVTestState *s = bs->opaque;
68
+ g_assert_cmpint(s->drain_count, >, 0);
69
+}
251
+
70
+
252
+{"return": {}}
71
+static int coroutine_fn bdrv_test_co_preadv(BlockDriverState *bs,
253
+{"return": {}}
72
+ uint64_t offset, uint64_t bytes,
254
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}}
73
+ QEMUIOVector *qiov, int flags)
255
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 67108864, "offset": 524288, "speed": 65536, "type": "commit"}}
74
+{
75
+ /* We want this request to stay until the polling loop in drain waits for
76
+ * it to complete. We need to sleep a while as bdrv_drain_invoke() comes
77
+ * first and polls its result, too, but it shouldn't accidentally complete
78
+ * this request yet. */
79
+ qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
256
+
80
+
257
+=== Start active commit job and exit qemu ===
81
+ return 0;
82
+}
258
+
83
+
259
+{"return": {}}
84
+static BlockDriver bdrv_test = {
260
+{"return": {}}
85
+ .format_name = "test",
261
+{"return": {}}
86
+ .instance_size = sizeof(BDRVTestState),
262
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}}
263
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 4194304, "offset": 4194304, "speed": 65536, "type": "commit"}}
264
+
87
+
265
+=== Start mirror job and exit qemu ===
88
+ .bdrv_close = bdrv_test_close,
89
+ .bdrv_co_preadv = bdrv_test_co_preadv,
266
+
90
+
267
+{"return": {}}
91
+ .bdrv_co_drain_begin = bdrv_test_co_drain_begin,
268
+Formatting 'TEST_DIR/t.qcow2.copy', fmt=qcow2 size=67108864 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16
92
+ .bdrv_co_drain_end = bdrv_test_co_drain_end,
269
+{"return": {}}
93
+};
270
+{"return": {}}
271
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}}
272
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 4194304, "offset": 4194304, "speed": 65536, "type": "mirror"}}
273
+
94
+
274
+=== Start backup job and exit qemu ===
95
+static void aio_ret_cb(void *opaque, int ret)
96
+{
97
+ int *aio_ret = opaque;
98
+ *aio_ret = ret;
99
+}
275
+
100
+
276
+{"return": {}}
101
+static void test_drv_cb_drain_all(void)
277
+Formatting 'TEST_DIR/t.qcow2.copy', fmt=qcow2 size=67108864 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16
102
+{
278
+{"return": {}}
103
+ BlockBackend *blk;
279
+{"return": {}}
104
+ BlockDriverState *bs;
280
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}}
105
+ BDRVTestState *s;
281
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 67108864, "offset": 65536, "speed": 65536, "type": "backup"}}
106
+ BlockAIOCB *acb;
107
+ int aio_ret;
282
+
108
+
283
+=== Start streaming job and exit qemu ===
109
+ QEMUIOVector qiov;
110
+ struct iovec iov = {
111
+ .iov_base = NULL,
112
+ .iov_len = 0,
113
+ };
114
+ qemu_iovec_init_external(&qiov, &iov, 1);
284
+
115
+
285
+{"return": {}}
116
+ blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
286
+{"return": {}}
117
+ bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
287
+{"return": {}}
118
+ &error_abort);
288
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}}
119
+ s = bs->opaque;
289
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 67108864, "offset": 524288, "speed": 65536, "type": "stream"}}
120
+ blk_insert_bs(blk, bs, &error_abort);
290
+No errors were found on the image.
121
+
291
+*** done
122
+ /* Simple bdrv_drain_all_begin/end pair, check that CBs are called */
292
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
123
+ g_assert_cmpint(s->drain_count, ==, 0);
124
+ bdrv_drain_all_begin();
125
+ g_assert_cmpint(s->drain_count, ==, 1);
126
+ bdrv_drain_all_end();
127
+ g_assert_cmpint(s->drain_count, ==, 0);
128
+
129
+ /* Now do the same while a request is pending */
130
+ aio_ret = -EINPROGRESS;
131
+ acb = blk_aio_preadv(blk, 0, &qiov, 0, aio_ret_cb, &aio_ret);
132
+ g_assert(acb != NULL);
133
+ g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
134
+
135
+ g_assert_cmpint(s->drain_count, ==, 0);
136
+ bdrv_drain_all_begin();
137
+ g_assert_cmpint(aio_ret, ==, 0);
138
+ g_assert_cmpint(s->drain_count, ==, 1);
139
+ bdrv_drain_all_end();
140
+ g_assert_cmpint(s->drain_count, ==, 0);
141
+
142
+ bdrv_unref(bs);
143
+ blk_unref(blk);
144
+}
145
+
146
+int main(int argc, char **argv)
147
+{
148
+ bdrv_init();
149
+ qemu_init_main_loop(&error_abort);
150
+
151
+ g_test_init(&argc, &argv, NULL);
152
+
153
+ g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
154
+
155
+ return g_test_run();
156
+}
157
diff --git a/tests/Makefile.include b/tests/Makefile.include
293
index XXXXXXX..XXXXXXX 100644
158
index XXXXXXX..XXXXXXX 100644
294
--- a/tests/qemu-iotests/group
159
--- a/tests/Makefile.include
295
+++ b/tests/qemu-iotests/group
160
+++ b/tests/Makefile.include
296
@@ -XXX,XX +XXX,XX @@
161
@@ -XXX,XX +XXX,XX @@ gcov-files-test-thread-pool-y = thread-pool.c
297
181 rw auto migration
162
gcov-files-test-hbitmap-y = util/hbitmap.c
298
182 rw auto quick
163
check-unit-y += tests/test-hbitmap$(EXESUF)
299
183 rw auto migration
164
gcov-files-test-hbitmap-y = blockjob.c
300
+185 rw auto
165
+check-unit-y += tests/test-bdrv-drain$(EXESUF)
166
check-unit-y += tests/test-blockjob$(EXESUF)
167
check-unit-y += tests/test-blockjob-txn$(EXESUF)
168
check-unit-y += tests/test-x86-cpuid$(EXESUF)
169
@@ -XXX,XX +XXX,XX @@ tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
170
tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
171
tests/test-aio-multithread$(EXESUF): tests/test-aio-multithread.o $(test-block-obj-y)
172
tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
173
+tests/test-bdrv-drain$(EXESUF): tests/test-bdrv-drain.o $(test-block-obj-y) $(test-util-obj-y)
174
tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y)
175
tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y)
176
tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(test-block-obj-y)
301
--
177
--
302
1.8.3.1
178
2.13.6
303
179
304
180
diff view generated by jsdifflib
1
Now that the bdrv_drain_invoke() calls are pulled up to the callers of
bdrv_drain_recurse(), the 'begin' parameter isn't needed any more.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

This fixes the last place where we degraded from AIO to actual blocking
synchronous I/O requests. Putting it into a coroutine means that instead
of blocking, the coroutine simply yields while doing I/O.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qed.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)
10
9
11
diff --git a/block/qed.c b/block/qed.c
10
diff --git a/block/io.c b/block/io.c
12
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
13
--- a/block/qed.c
12
--- a/block/io.c
14
+++ b/block/qed.c
13
+++ b/block/io.c
15
@@ -XXX,XX +XXX,XX @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
14
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
16
qemu_co_enter_next(&s->allocating_write_reqs);
15
}
17
}
16
}
18
17
19
-static void qed_clear_need_check(void *opaque, int ret)
18
-static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
20
+static void qed_need_check_timer_entry(void *opaque)
19
+static bool bdrv_drain_recurse(BlockDriverState *bs)
21
{
20
{
22
BDRVQEDState *s = opaque;
21
BdrvChild *child, *tmp;
23
+ int ret;
22
bool waited;
24
23
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
25
- if (ret) {
24
*/
26
+ /* The timer should only fire when allocating writes have drained */
25
bdrv_ref(bs);
27
+ assert(!s->allocating_acb);
26
}
28
+
27
- waited |= bdrv_drain_recurse(bs, begin);
29
+ trace_qed_need_check_timer_cb(s);
28
+ waited |= bdrv_drain_recurse(bs);
30
+
29
if (in_main_loop) {
31
+ qed_acquire(s);
30
bdrv_unref(bs);
32
+ qed_plug_allocating_write_reqs(s);
31
}
33
+
32
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
34
+ /* Ensure writes are on disk before clearing flag */
35
+ ret = bdrv_co_flush(s->bs->file->bs);
36
+ qed_release(s);
37
+ if (ret < 0) {
38
qed_unplug_allocating_write_reqs(s);
39
return;
40
}
33
}
41
@@ -XXX,XX +XXX,XX @@ static void qed_clear_need_check(void *opaque, int ret)
34
42
35
bdrv_drain_invoke(bs, true);
43
qed_unplug_allocating_write_reqs(s);
36
- bdrv_drain_recurse(bs, true);
44
37
+ bdrv_drain_recurse(bs);
45
- ret = bdrv_flush(s->bs);
46
+ ret = bdrv_co_flush(s->bs);
47
(void) ret;
48
}
38
}
49
39
50
static void qed_need_check_timer_cb(void *opaque)
40
void bdrv_drained_end(BlockDriverState *bs)
51
{
41
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
52
- BDRVQEDState *s = opaque;
42
53
-
43
bdrv_parent_drained_end(bs);
54
- /* The timer should only fire when allocating writes have drained */
44
bdrv_drain_invoke(bs, false);
55
- assert(!s->allocating_acb);
45
- bdrv_drain_recurse(bs, false);
56
-
46
+ bdrv_drain_recurse(bs);
57
- trace_qed_need_check_timer_cb(s);
47
aio_enable_external(bdrv_get_aio_context(bs));
58
-
59
- qed_acquire(s);
60
- qed_plug_allocating_write_reqs(s);
61
-
62
- /* Ensure writes are on disk before clearing flag */
63
- bdrv_aio_flush(s->bs->file->bs, qed_clear_need_check, s);
64
- qed_release(s);
65
+ Coroutine *co = qemu_coroutine_create(qed_need_check_timer_entry, opaque);
66
+ qemu_coroutine_enter(co);
67
}
48
}
68
49
69
void qed_acquire(BDRVQEDState *s)
50
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
51
aio_context_acquire(aio_context);
52
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
53
if (aio_context == bdrv_get_aio_context(bs)) {
54
- waited |= bdrv_drain_recurse(bs, true);
55
+ waited |= bdrv_drain_recurse(bs);
56
}
57
}
58
aio_context_release(aio_context);
59
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
60
aio_enable_external(aio_context);
61
bdrv_parent_drained_end(bs);
62
bdrv_drain_invoke(bs, false);
63
- bdrv_drain_recurse(bs, false);
64
+ bdrv_drain_recurse(bs);
65
aio_context_release(aio_context);
66
}
67
70
--
68
--
71
1.8.3.1
69
2.13.6
72
70
73
71
diff view generated by jsdifflib
1
The device is drained, so there is no point in waiting for requests at
the end of the drained section. Remove the bdrv_drain_recurse() calls
there.

The bdrv_drain_recurse() calls were introduced in commit 481cad48e5e
in order to call the .bdrv_co_drain_end() driver callback. This is now
done by a separate bdrv_drain_invoke() call.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 2 --
 1 file changed, 2 deletions(-)

Now that we're running in coroutine context, the ad-hoc serialisation
code (which drops a request that has to wait out of coroutine context)
can be replaced by a CoQueue.

This means that when we resume a serialised request, it is running in
coroutine context again and its I/O isn't blocking any more.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qed.c | 49 +++++++++++++++++--------------------------------
 block/qed.h | 3 ++-
 2 files changed, 19 insertions(+), 33 deletions(-)
14
15
15
diff --git a/block/qed.c b/block/qed.c
16
diff --git a/block/io.c b/block/io.c
16
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
17
--- a/block/qed.c
18
--- a/block/io.c
18
+++ b/block/qed.c
19
+++ b/block/io.c
19
@@ -XXX,XX +XXX,XX @@ static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
20
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
20
21
21
static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
22
bdrv_parent_drained_end(bs);
22
{
23
bdrv_drain_invoke(bs, false);
23
- QEDAIOCB *acb;
24
- bdrv_drain_recurse(bs);
24
-
25
aio_enable_external(bdrv_get_aio_context(bs));
25
assert(s->allocating_write_reqs_plugged);
26
27
s->allocating_write_reqs_plugged = false;
28
-
29
- acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
30
- if (acb) {
31
- qed_aio_start_io(acb);
32
- }
33
+ qemu_co_enter_next(&s->allocating_write_reqs);
34
}
26
}
35
27
36
static void qed_clear_need_check(void *opaque, int ret)
28
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
37
@@ -XXX,XX +XXX,XX @@ static void qed_need_check_timer_cb(void *opaque)
29
aio_enable_external(aio_context);
38
BDRVQEDState *s = opaque;
30
bdrv_parent_drained_end(bs);
39
31
bdrv_drain_invoke(bs, false);
40
/* The timer should only fire when allocating writes have drained */
32
- bdrv_drain_recurse(bs);
41
- assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs));
33
aio_context_release(aio_context);
42
+ assert(!s->allocating_acb);
43
44
trace_qed_need_check_timer_cb(s);
45
46
@@ -XXX,XX +XXX,XX @@ static int bdrv_qed_do_open(BlockDriverState *bs, QDict *options, int flags,
47
int ret;
48
49
s->bs = bs;
50
- QSIMPLEQ_INIT(&s->allocating_write_reqs);
51
+ qemu_co_queue_init(&s->allocating_write_reqs);
52
53
ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header));
54
if (ret < 0) {
55
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete_bh(void *opaque)
56
qed_release(s);
57
}
58
59
-static void qed_resume_alloc_bh(void *opaque)
60
-{
61
- qed_aio_start_io(opaque);
62
-}
63
-
64
static void qed_aio_complete(QEDAIOCB *acb, int ret)
65
{
66
BDRVQEDState *s = acb_to_s(acb);
67
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
68
* next request in the queue. This ensures that we don't cycle through
69
* requests multiple times but rather finish one at a time completely.
70
*/
71
- if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
72
- QEDAIOCB *next_acb;
73
- QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next);
74
- next_acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
75
- if (next_acb) {
76
- aio_bh_schedule_oneshot(bdrv_get_aio_context(acb->common.bs),
77
- qed_resume_alloc_bh, next_acb);
78
+ if (acb == s->allocating_acb) {
79
+ s->allocating_acb = NULL;
80
+ if (!qemu_co_queue_empty(&s->allocating_write_reqs)) {
81
+ qemu_co_enter_next(&s->allocating_write_reqs);
82
} else if (s->header.features & QED_F_NEED_CHECK) {
83
qed_start_need_check_timer(s);
84
}
85
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
86
int ret;
87
88
/* Cancel timer when the first allocating request comes in */
89
- if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) {
90
+ if (s->allocating_acb == NULL) {
91
qed_cancel_need_check_timer(s);
92
}
34
}
93
35
94
/* Freeze this request if another allocating write is in progress */
95
- if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
96
- QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next);
97
- }
98
- if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) ||
99
- s->allocating_write_reqs_plugged) {
100
- return -EINPROGRESS; /* wait for existing request to finish */
101
+ if (s->allocating_acb != acb || s->allocating_write_reqs_plugged) {
102
+ if (s->allocating_acb != NULL) {
103
+ qemu_co_queue_wait(&s->allocating_write_reqs, NULL);
104
+ assert(s->allocating_acb == NULL);
105
+ }
106
+ s->allocating_acb = acb;
107
+ return -EAGAIN; /* start over with looking up table entries */
108
}
109
110
acb->cur_nclusters = qed_bytes_to_clusters(s,
111
@@ -XXX,XX +XXX,XX @@ static void qed_aio_next_io(QEDAIOCB *acb)
112
ret = qed_aio_read_data(acb, ret, offset, len);
113
}
114
115
- if (ret < 0) {
116
- if (ret != -EINPROGRESS) {
117
- qed_aio_complete(acb, ret);
118
- }
119
+ if (ret < 0 && ret != -EAGAIN) {
120
+ qed_aio_complete(acb, ret);
121
return;
122
}
123
}
124
diff --git a/block/qed.h b/block/qed.h
125
index XXXXXXX..XXXXXXX 100644
126
--- a/block/qed.h
127
+++ b/block/qed.h
128
@@ -XXX,XX +XXX,XX @@ typedef struct {
129
uint32_t l2_mask;
130
131
/* Allocating write request queue */
132
- QSIMPLEQ_HEAD(, QEDAIOCB) allocating_write_reqs;
133
+ QEDAIOCB *allocating_acb;
134
+ CoQueue allocating_write_reqs;
135
bool allocating_write_reqs_plugged;
136
137
/* Periodic flush and clear need check flag */
138
--
36
--
139
1.8.3.1
37
2.13.6
140
38
141
39
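A condensed sketch of the CoQueue-based serialisation introduced by the qed patch above (the helper names are invented for illustration; the fields and the qemu_co_queue_wait()/qemu_co_enter_next() primitives are the ones the hunks actually use, error handling is omitted, and the snippet is not meant to compile outside the QEMU tree):

    typedef struct BDRVQEDState BDRVQEDState;   /* trimmed to the relevant fields */
    struct BDRVQEDState {
        QEDAIOCB *allocating_acb;        /* request currently allocating clusters */
        CoQueue allocating_write_reqs;   /* coroutines waiting for their turn */
        bool allocating_write_reqs_plugged;
    };

    /* Allocating write path: wait for the current owner, then retry the lookup */
    static int coroutine_fn qed_wait_for_alloc(BDRVQEDState *s, QEDAIOCB *acb)
    {
        if (s->allocating_acb != acb || s->allocating_write_reqs_plugged) {
            if (s->allocating_acb != NULL) {
                qemu_co_queue_wait(&s->allocating_write_reqs, NULL);
            }
            s->allocating_acb = acb;
            return -EAGAIN;      /* start over with looking up table entries */
        }
        return 0;
    }

    /* Completion path: wake the next waiter, still in coroutine context */
    static void qed_hand_over_alloc(BDRVQEDState *s, QEDAIOCB *acb)
    {
        if (acb == s->allocating_acb) {
            s->allocating_acb = NULL;
            if (!qemu_co_queue_empty(&s->allocating_write_reqs)) {
                qemu_co_enter_next(&s->allocating_write_reqs);
            }
        }
    }

Because the woken request is itself a coroutine, no bottom half is needed to get back into coroutine context any more, which is what lets qed_resume_alloc_bh() go away.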
1
Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
1
Drain requests are propagated to child nodes, parent nodes and directly
2
just return an error code and let the caller handle it.
2
to the AioContext. The order in which this happened was different
3
between all combinations of drain/drain_all and begin/end.
3
4
4
While refactoring qed_aio_write_alloc() to accommodate the change,
5
The correct order is to keep children only drained when their parents
5
qed_aio_write_zero_cluster() ended up with a single line, so I chose to
6
are also drained. This means that at the start of a drained section, the
6
inline that line and remove the function completely.
7
AioContext needs to be drained first, the parents second and only then
8
the children. The correct order for the end of a drained section is the
9
opposite.
10
11
This patch changes the three other functions to follow the example of
12
bdrv_drained_begin(), which is the only one that got it right.
7
13
8
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
14
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
15
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
10
---
16
---
11
block/qed.c | 58 +++++++++++++++++++++-------------------------------------
17
block/io.c | 12 ++++++++----
12
1 file changed, 21 insertions(+), 37 deletions(-)
18
1 file changed, 8 insertions(+), 4 deletions(-)
13
19
14
diff --git a/block/qed.c b/block/qed.c
20
diff --git a/block/io.c b/block/io.c
15
index XXXXXXX..XXXXXXX 100644
21
index XXXXXXX..XXXXXXX 100644
16
--- a/block/qed.c
22
--- a/block/io.c
17
+++ b/block/qed.c
23
+++ b/block/io.c
18
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_main(QEDAIOCB *acb)
24
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
19
/**
25
return;
20
* Populate untouched regions of new data cluster
21
*/
22
-static void qed_aio_write_cow(void *opaque, int ret)
23
+static int qed_aio_write_cow(QEDAIOCB *acb)
24
{
25
- QEDAIOCB *acb = opaque;
26
BDRVQEDState *s = acb_to_s(acb);
27
uint64_t start, len, offset;
28
+ int ret;
29
30
/* Populate front untouched region of new data cluster */
31
start = qed_start_of_cluster(s, acb->cur_pos);
32
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_cow(void *opaque, int ret)
33
34
trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
35
ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster);
36
- if (ret) {
37
- qed_aio_complete(acb, ret);
38
- return;
39
+ if (ret < 0) {
40
+ return ret;
41
}
26
}
42
27
43
/* Populate back untouched region of new data cluster */
28
+ /* Stop things in parent-to-child order */
44
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_cow(void *opaque, int ret)
29
if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
45
30
aio_disable_external(bdrv_get_aio_context(bs));
46
trace_qed_aio_write_postfill(s, acb, start, len, offset);
31
bdrv_parent_drained_begin(bs);
47
ret = qed_copy_from_backing_file(s, start, len, offset);
32
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
48
- if (ret) {
33
return;
49
- qed_aio_complete(acb, ret);
50
- return;
51
- }
52
-
53
- ret = qed_aio_write_main(acb);
54
if (ret < 0) {
55
- qed_aio_complete(acb, ret);
56
- return;
57
+ return ret;
58
}
34
}
59
- qed_aio_next_io(acb, 0);
35
60
+
36
- bdrv_parent_drained_end(bs);
61
+ return qed_aio_write_main(acb);
37
+ /* Re-enable things in child-to-parent order */
38
bdrv_drain_invoke(bs, false);
39
+ bdrv_parent_drained_end(bs);
40
aio_enable_external(bdrv_get_aio_context(bs));
62
}
41
}
63
42
64
/**
43
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
65
@@ -XXX,XX +XXX,XX @@ static bool qed_should_set_need_check(BDRVQEDState *s)
44
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
66
return !(s->header.features & QED_F_NEED_CHECK);
45
AioContext *aio_context = bdrv_get_aio_context(bs);
67
}
46
68
47
+ /* Stop things in parent-to-child order */
69
-static void qed_aio_write_zero_cluster(void *opaque, int ret)
48
aio_context_acquire(aio_context);
70
-{
49
- bdrv_parent_drained_begin(bs);
71
- QEDAIOCB *acb = opaque;
50
aio_disable_external(aio_context);
72
-
51
+ bdrv_parent_drained_begin(bs);
73
- if (ret) {
52
bdrv_drain_invoke(bs, true);
74
- qed_aio_complete(acb, ret);
53
aio_context_release(aio_context);
75
- return;
54
76
- }
55
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
77
-
56
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
78
- ret = qed_aio_write_l2_update(acb, 1);
57
AioContext *aio_context = bdrv_get_aio_context(bs);
79
- if (ret < 0) {
58
80
- qed_aio_complete(acb, ret);
59
+ /* Re-enable things in child-to-parent order */
81
- return;
60
aio_context_acquire(aio_context);
82
- }
61
- aio_enable_external(aio_context);
83
- qed_aio_next_io(acb, 0);
62
- bdrv_parent_drained_end(bs);
84
-}
63
bdrv_drain_invoke(bs, false);
85
-
64
+ bdrv_parent_drained_end(bs);
86
/**
65
+ aio_enable_external(aio_context);
87
* Write new data cluster
66
aio_context_release(aio_context);
88
*
89
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_zero_cluster(void *opaque, int ret)
90
static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
91
{
92
BDRVQEDState *s = acb_to_s(acb);
93
- BlockCompletionFunc *cb;
94
int ret;
95
96
/* Cancel timer when the first allocating request comes in */
97
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
98
qed_aio_start_io(acb);
99
return;
100
}
101
-
102
- cb = qed_aio_write_zero_cluster;
103
} else {
104
- cb = qed_aio_write_cow;
105
acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
106
}
67
}
107
68
108
if (qed_should_set_need_check(s)) {
109
s->header.features |= QED_F_NEED_CHECK;
110
ret = qed_write_header(s);
111
- cb(acb, ret);
112
+ if (ret < 0) {
113
+ qed_aio_complete(acb, ret);
114
+ return;
115
+ }
116
+ }
117
+
118
+ if (acb->flags & QED_AIOCB_ZERO) {
119
+ ret = qed_aio_write_l2_update(acb, 1);
120
} else {
121
- cb(acb, 0);
122
+ ret = qed_aio_write_cow(acb);
123
}
124
+ if (ret < 0) {
125
+ qed_aio_complete(acb, ret);
126
+ return;
127
+ }
128
+ qed_aio_next_io(acb, 0);
129
}
130
131
/**
132
--
69
--
133
1.8.3.1
70
2.13.6
134
71
135
72
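The ordering rule from the "Drain requests are propagated..." patch above, pulled out of the interleaved hunks (the wrapper functions are hypothetical; the call sequences are the ones block/io.c uses after the patch):

    /* Begin a drained section: stop things in parent-to-child order */
    static void drained_begin_order(BlockDriverState *bs)
    {
        aio_disable_external(bdrv_get_aio_context(bs));
        bdrv_parent_drained_begin(bs);
        bdrv_drain_invoke(bs, true);
    }

    /* End a drained section: re-enable things in child-to-parent order */
    static void drained_end_order(BlockDriverState *bs)
    {
        bdrv_drain_invoke(bs, false);
        bdrv_parent_drained_end(bs);
        aio_enable_external(bdrv_get_aio_context(bs));
    }

The same parent-to-child/child-to-parent symmetry is applied to the per-node loops in bdrv_drain_all_begin() and bdrv_drain_all_end().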
1
Now that we process a request in the same coroutine from beginning to
1
Commit 15afd94a047 added code to acquire and release the AioContext in
2
end and don't drop out of it any more, we can look like a proper
2
qemuio_command(). This means that the lock is taken twice now in the
3
coroutine-based driver and simply call qed_aio_next_io() and get a
3
call path from hmp_qemu_io(). This causes BDRV_POLL_WHILE() to hang for
4
return value from it instead of spawning an additional coroutine that
4
any requests issued to nodes in a non-mainloop AioContext.
5
reenters the parent when it's done.
5
6
Dropping the first locking from hmp_qemu_io() fixes the problem.
6
7
7
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
8
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
8
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
9
---
10
---
10
block/qed.c | 101 +++++++++++++-----------------------------------------------
11
hmp.c | 6 ------
11
block/qed.h | 3 +-
12
1 file changed, 6 deletions(-)
12
2 files changed, 22 insertions(+), 82 deletions(-)
13
13
14
diff --git a/block/qed.c b/block/qed.c
14
diff --git a/hmp.c b/hmp.c
15
index XXXXXXX..XXXXXXX 100644
15
index XXXXXXX..XXXXXXX 100644
16
--- a/block/qed.c
16
--- a/hmp.c
17
+++ b/block/qed.c
17
+++ b/hmp.c
18
@@ -XXX,XX +XXX,XX @@
18
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
19
#include "qapi/qmp/qerror.h"
20
#include "sysemu/block-backend.h"
21
22
-static const AIOCBInfo qed_aiocb_info = {
23
- .aiocb_size = sizeof(QEDAIOCB),
24
-};
25
-
26
static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
27
const char *filename)
28
{
19
{
29
@@ -XXX,XX +XXX,XX @@ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
20
BlockBackend *blk;
30
return l2_table;
21
BlockBackend *local_blk = NULL;
31
}
22
- AioContext *aio_context;
32
23
const char* device = qdict_get_str(qdict, "device");
33
-static void qed_aio_next_io(QEDAIOCB *acb);
24
const char* command = qdict_get_str(qdict, "command");
34
-
25
Error *err = NULL;
35
-static void qed_aio_start_io(QEDAIOCB *acb)
26
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
36
-{
37
- qed_aio_next_io(acb);
38
-}
39
-
40
static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
41
{
42
assert(!s->allocating_write_reqs_plugged);
43
@@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs,
44
45
static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
46
{
47
- return acb->common.bs->opaque;
48
+ return acb->bs->opaque;
49
}
50
51
/**
52
@@ -XXX,XX +XXX,XX @@ static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
53
}
54
}
55
56
-static void qed_aio_complete_bh(void *opaque)
57
-{
58
- QEDAIOCB *acb = opaque;
59
- BDRVQEDState *s = acb_to_s(acb);
60
- BlockCompletionFunc *cb = acb->common.cb;
61
- void *user_opaque = acb->common.opaque;
62
- int ret = acb->bh_ret;
63
-
64
- qemu_aio_unref(acb);
65
-
66
- /* Invoke callback */
67
- qed_acquire(s);
68
- cb(user_opaque, ret);
69
- qed_release(s);
70
-}
71
-
72
-static void qed_aio_complete(QEDAIOCB *acb, int ret)
73
+static void qed_aio_complete(QEDAIOCB *acb)
74
{
75
BDRVQEDState *s = acb_to_s(acb);
76
77
- trace_qed_aio_complete(s, acb, ret);
78
-
79
/* Free resources */
80
qemu_iovec_destroy(&acb->cur_qiov);
81
qed_unref_l2_cache_entry(acb->request.l2_table);
82
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
83
acb->qiov->iov[0].iov_base = NULL;
84
}
85
86
- /* Arrange for a bh to invoke the completion function */
87
- acb->bh_ret = ret;
88
- aio_bh_schedule_oneshot(bdrv_get_aio_context(acb->common.bs),
89
- qed_aio_complete_bh, acb);
90
-
91
/* Start next allocating write request waiting behind this one. Note that
92
* requests enqueue themselves when they first hit an unallocated cluster
93
* but they wait until the entire request is finished before waking up the
94
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
95
struct iovec *iov = acb->qiov->iov;
96
97
if (!iov->iov_base) {
98
- iov->iov_base = qemu_try_blockalign(acb->common.bs, iov->iov_len);
99
+ iov->iov_base = qemu_try_blockalign(acb->bs, iov->iov_len);
100
if (iov->iov_base == NULL) {
101
return -ENOMEM;
102
}
103
@@ -XXX,XX +XXX,XX @@ static int qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len)
104
{
105
QEDAIOCB *acb = opaque;
106
BDRVQEDState *s = acb_to_s(acb);
107
- BlockDriverState *bs = acb->common.bs;
108
+ BlockDriverState *bs = acb->bs;
109
110
/* Adjust offset into cluster */
111
offset += qed_offset_into_cluster(s, acb->cur_pos);
112
@@ -XXX,XX +XXX,XX @@ static int qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len)
113
/**
114
* Begin next I/O or complete the request
115
*/
116
-static void qed_aio_next_io(QEDAIOCB *acb)
117
+static int qed_aio_next_io(QEDAIOCB *acb)
118
{
119
BDRVQEDState *s = acb_to_s(acb);
120
uint64_t offset;
121
@@ -XXX,XX +XXX,XX @@ static void qed_aio_next_io(QEDAIOCB *acb)
122
123
/* Complete request */
124
if (acb->cur_pos >= acb->end_pos) {
125
- qed_aio_complete(acb, 0);
126
- return;
127
+ ret = 0;
128
+ break;
129
}
130
131
/* Find next cluster and start I/O */
132
len = acb->end_pos - acb->cur_pos;
133
ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset);
134
if (ret < 0) {
135
- qed_aio_complete(acb, ret);
136
- return;
137
+ break;
138
}
139
140
if (acb->flags & QED_AIOCB_WRITE) {
141
@@ -XXX,XX +XXX,XX @@ static void qed_aio_next_io(QEDAIOCB *acb)
142
}
143
144
if (ret < 0 && ret != -EAGAIN) {
145
- qed_aio_complete(acb, ret);
146
- return;
147
+ break;
148
}
27
}
149
}
28
}
150
-}
29
151
30
- aio_context = blk_get_aio_context(blk);
152
-typedef struct QEDRequestCo {
31
- aio_context_acquire(aio_context);
153
- Coroutine *co;
154
- bool done;
155
- int ret;
156
-} QEDRequestCo;
157
-
32
-
158
-static void qed_co_request_cb(void *opaque, int ret)
33
/*
159
-{
34
* Notably absent: Proper permission management. This is sad, but it seems
160
- QEDRequestCo *co = opaque;
35
* almost impossible to achieve without changing the semantics and thereby
36
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
37
*/
38
qemuio_command(blk, command);
39
40
- aio_context_release(aio_context);
161
-
41
-
162
- co->done = true;
42
fail:
163
- co->ret = ret;
43
blk_unref(local_blk);
164
- qemu_coroutine_enter_if_inactive(co->co);
44
hmp_handle_error(mon, &err);
165
+ trace_qed_aio_complete(s, acb, ret);
166
+ qed_aio_complete(acb);
167
+ return ret;
168
}
169
170
static int coroutine_fn qed_co_request(BlockDriverState *bs, int64_t sector_num,
171
QEMUIOVector *qiov, int nb_sectors,
172
int flags)
173
{
174
- QEDRequestCo co = {
175
- .co = qemu_coroutine_self(),
176
- .done = false,
177
+ QEDAIOCB acb = {
178
+ .bs = bs,
179
+ .cur_pos = (uint64_t) sector_num * BDRV_SECTOR_SIZE,
180
+ .end_pos = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE,
181
+ .qiov = qiov,
182
+ .flags = flags,
183
};
184
- QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, qed_co_request_cb, &co);
185
-
186
- trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors, &co, flags);
187
+ qemu_iovec_init(&acb.cur_qiov, qiov->niov);
188
189
- acb->flags = flags;
190
- acb->qiov = qiov;
191
- acb->qiov_offset = 0;
192
- acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
193
- acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE;
194
- acb->backing_qiov = NULL;
195
- acb->request.l2_table = NULL;
196
- qemu_iovec_init(&acb->cur_qiov, qiov->niov);
197
+ trace_qed_aio_setup(bs->opaque, &acb, sector_num, nb_sectors, NULL, flags);
198
199
/* Start request */
200
- qed_aio_start_io(acb);
201
-
202
- if (!co.done) {
203
- qemu_coroutine_yield();
204
- }
205
-
206
- return co.ret;
207
+ return qed_aio_next_io(&acb);
208
}
209
210
static int coroutine_fn bdrv_qed_co_readv(BlockDriverState *bs,
211
diff --git a/block/qed.h b/block/qed.h
212
index XXXXXXX..XXXXXXX 100644
213
--- a/block/qed.h
214
+++ b/block/qed.h
215
@@ -XXX,XX +XXX,XX @@ enum {
216
};
217
218
typedef struct QEDAIOCB {
219
- BlockAIOCB common;
220
- int bh_ret; /* final return status for completion bh */
221
+ BlockDriverState *bs;
222
QSIMPLEQ_ENTRY(QEDAIOCB) next; /* next request */
223
int flags; /* QED_AIOCB_* bits ORed together */
224
uint64_t end_pos; /* request end on block device, in bytes */
225
--
45
--
226
1.8.3.1
46
2.13.6
227
47
228
48
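The net effect of the qed patch above is that a request is now issued and completed entirely within the calling coroutine, with the ACB on the stack and the return value propagated directly; a condensed sketch (function name hypothetical, initialisers as in the hunk):

    static int coroutine_fn qed_co_request_sketch(BlockDriverState *bs,
                                                  int64_t sector_num,
                                                  QEMUIOVector *qiov,
                                                  int nb_sectors, int flags)
    {
        QEDAIOCB acb = {
            .bs      = bs,
            .cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE,
            .end_pos = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE,
            .qiov    = qiov,
            .flags   = flags,
        };
        qemu_iovec_init(&acb.cur_qiov, qiov->niov);

        /* Runs the whole read/write loop in this coroutine; no completion
         * callback, no extra coroutine, no bottom half. */
        return qed_aio_next_io(&acb);
    }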
1
From: Alberto Garcia <berto@igalia.com>
1
From: Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com>
2
2
3
Qcow2COWRegion has two attributes:
3
Since bdrv_co_preadv does all necessary checks including
4
reading after the end of the backing file, avoid duplication
5
of verification before bdrv_co_preadv call.
4
6
5
- The offset of the COW region from the start of the first cluster
7
Signed-off-by: Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com>
6
touched by the I/O request. Since it's always going to be positive
8
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
7
and the maximum request size is at most INT_MAX, we can use a
8
regular unsigned int to store this offset.
9
10
- The size of the COW region in bytes. This is guaranteed to be >= 0,
11
so we should use an unsigned type instead.
12
13
In x86_64 this reduces the size of Qcow2COWRegion from 16 to 8 bytes.
14
It will also help keep some assertions simpler now that we know that
15
there are no negative numbers.
16
17
The prototype of do_perform_cow() is also updated to reflect these
18
changes.
19
20
Signed-off-by: Alberto Garcia <berto@igalia.com>
21
Reviewed-by: Eric Blake <eblake@redhat.com>
9
Reviewed-by: Eric Blake <eblake@redhat.com>
22
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
23
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
10
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
24
---
11
---
25
block/qcow2-cluster.c | 4 ++--
12
block/qcow2.h | 3 ---
26
block/qcow2.h | 4 ++--
13
block/qcow2.c | 51 ++++++++-------------------------------------------
27
2 files changed, 4 insertions(+), 4 deletions(-)
14
2 files changed, 8 insertions(+), 46 deletions(-)
28
15
29
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
30
index XXXXXXX..XXXXXXX 100644
31
--- a/block/qcow2-cluster.c
32
+++ b/block/qcow2-cluster.c
33
@@ -XXX,XX +XXX,XX @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
34
static int coroutine_fn do_perform_cow(BlockDriverState *bs,
35
uint64_t src_cluster_offset,
36
uint64_t cluster_offset,
37
- int offset_in_cluster,
38
- int bytes)
39
+ unsigned offset_in_cluster,
40
+ unsigned bytes)
41
{
42
BDRVQcow2State *s = bs->opaque;
43
QEMUIOVector qiov;
44
diff --git a/block/qcow2.h b/block/qcow2.h
16
diff --git a/block/qcow2.h b/block/qcow2.h
45
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
46
--- a/block/qcow2.h
18
--- a/block/qcow2.h
47
+++ b/block/qcow2.h
19
+++ b/block/qcow2.h
48
@@ -XXX,XX +XXX,XX @@ typedef struct Qcow2COWRegion {
20
@@ -XXX,XX +XXX,XX @@ uint32_t offset_to_reftable_index(BDRVQcow2State *s, uint64_t offset)
49
* Offset of the COW region in bytes from the start of the first cluster
21
}
50
* touched by the request.
22
51
*/
23
/* qcow2.c functions */
52
- uint64_t offset;
24
-int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
53
+ unsigned offset;
25
- int64_t sector_num, int nb_sectors);
54
26
-
55
/** Number of bytes to copy */
27
int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,
56
- int nb_bytes;
28
int refcount_order, bool generous_increase,
57
+ unsigned nb_bytes;
29
uint64_t *refblock_count);
58
} Qcow2COWRegion;
30
diff --git a/block/qcow2.c b/block/qcow2.c
59
31
index XXXXXXX..XXXXXXX 100644
60
/**
32
--- a/block/qcow2.c
33
+++ b/block/qcow2.c
34
@@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
35
return status;
36
}
37
38
-/* handle reading after the end of the backing file */
39
-int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
40
- int64_t offset, int bytes)
41
-{
42
- uint64_t bs_size = bs->total_sectors * BDRV_SECTOR_SIZE;
43
- int n1;
44
-
45
- if ((offset + bytes) <= bs_size) {
46
- return bytes;
47
- }
48
-
49
- if (offset >= bs_size) {
50
- n1 = 0;
51
- } else {
52
- n1 = bs_size - offset;
53
- }
54
-
55
- qemu_iovec_memset(qiov, n1, 0, bytes - n1);
56
-
57
- return n1;
58
-}
59
-
60
static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
61
uint64_t bytes, QEMUIOVector *qiov,
62
int flags)
63
{
64
BDRVQcow2State *s = bs->opaque;
65
- int offset_in_cluster, n1;
66
+ int offset_in_cluster;
67
int ret;
68
unsigned int cur_bytes; /* number of bytes in current iteration */
69
uint64_t cluster_offset = 0;
70
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
71
case QCOW2_CLUSTER_UNALLOCATED:
72
73
if (bs->backing) {
74
- /* read from the base image */
75
- n1 = qcow2_backing_read1(bs->backing->bs, &hd_qiov,
76
- offset, cur_bytes);
77
- if (n1 > 0) {
78
- QEMUIOVector local_qiov;
79
-
80
- qemu_iovec_init(&local_qiov, hd_qiov.niov);
81
- qemu_iovec_concat(&local_qiov, &hd_qiov, 0, n1);
82
-
83
- BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
84
- qemu_co_mutex_unlock(&s->lock);
85
- ret = bdrv_co_preadv(bs->backing, offset, n1,
86
- &local_qiov, 0);
87
- qemu_co_mutex_lock(&s->lock);
88
-
89
- qemu_iovec_destroy(&local_qiov);
90
-
91
- if (ret < 0) {
92
- goto fail;
93
- }
94
+ BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
95
+ qemu_co_mutex_unlock(&s->lock);
96
+ ret = bdrv_co_preadv(bs->backing, offset, cur_bytes,
97
+ &hd_qiov, 0);
98
+ qemu_co_mutex_lock(&s->lock);
99
+ if (ret < 0) {
100
+ goto fail;
101
}
102
} else {
103
/* Note: in this case, no need to wait */
61
--
104
--
62
1.8.3.1
105
2.13.6
63
106
64
107
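For reference, the qcow2 read path that remains after the patch above drops qcow2_backing_read1() is just the plain backing-file read below; it relies on bdrv_co_preadv() zero-filling the part of the request that lies beyond the end of the backing file, as the commit message explains (fragment from the QCOW2_CLUSTER_UNALLOCATED case, with s->lock held by the caller):

    if (bs->backing) {
        BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
        qemu_co_mutex_unlock(&s->lock);
        ret = bdrv_co_preadv(bs->backing, offset, cur_bytes, &hd_qiov, 0);
        qemu_co_mutex_lock(&s->lock);
        if (ret < 0) {
            goto fail;
        }
    }
    /* (without a backing file the buffer is simply zero-filled instead) */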
1
This adds documentation for the -blockdev options that apply to all
1
Removing a quorum child node with x-blockdev-change results in a quorum
2
nodes independent of the block driver used.
2
driver state that cannot be recreated with create options because it
3
would require a list with gaps. This causes trouble in at least
4
.bdrv_refresh_filename().
3
5
4
All options that are shared by -blockdev and -drive are now explained in
6
Document this problem so that we won't accidentally mark the command
5
the section for -blockdev. The documentation of -drive mentions that all
7
stable without having addressed it.
6
-blockdev options are accepted as well.
7
8
8
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Reviewed-by: Eric Blake <eblake@redhat.com>
10
Reviewed-by: Alberto Garcia <berto@igalia.com>
10
Reviewed-by: Max Reitz <mreitz@redhat.com>
11
---
11
---
12
qemu-options.hx | 108 +++++++++++++++++++++++++++++++++++++++++---------------
12
qapi/block-core.json | 4 ++++
13
1 file changed, 79 insertions(+), 29 deletions(-)
13
1 file changed, 4 insertions(+)
14
14
15
diff --git a/qemu-options.hx b/qemu-options.hx
15
diff --git a/qapi/block-core.json b/qapi/block-core.json
16
index XXXXXXX..XXXXXXX 100644
16
index XXXXXXX..XXXXXXX 100644
17
--- a/qemu-options.hx
17
--- a/qapi/block-core.json
18
+++ b/qemu-options.hx
18
+++ b/qapi/block-core.json
19
@@ -XXX,XX +XXX,XX @@ DEF("blockdev", HAS_ARG, QEMU_OPTION_blockdev,
19
@@ -XXX,XX +XXX,XX @@
20
" [,read-only=on|off][,detect-zeroes=on|off|unmap]\n"
20
# does not support all kinds of operations, all kinds of children, nor
21
" [,driver specific parameters...]\n"
21
# all block drivers.
22
" configure a block backend\n", QEMU_ARCH_ALL)
22
#
23
+STEXI
23
+# FIXME Removing children from a quorum node means introducing gaps in the
24
+@item -blockdev @var{option}[,@var{option}[,@var{option}[,...]]]
24
+# child indices. This cannot be represented in the 'children' list of
25
+@findex -blockdev
25
+# BlockdevOptionsQuorum, as returned by .bdrv_refresh_filename().
26
+
26
+#
27
+Define a new block driver node.
27
# Warning: The data in a new quorum child MUST be consistent with that of
28
+
28
# the rest of the array.
29
+@table @option
29
#
30
+@item Valid options for any block driver node:
31
+
32
+@table @code
33
+@item driver
34
+Specifies the block driver to use for the given node.
35
+@item node-name
36
+This defines the name of the block driver node by which it will be referenced
37
+later. The name must be unique, i.e. it must not match the name of a different
38
+block driver node, or (if you use @option{-drive} as well) the ID of a drive.
39
+
40
+If no node name is specified, it is automatically generated. The generated node
41
+name is not intended to be predictable and changes between QEMU invocations.
42
+For the top level, an explicit node name must be specified.
43
+@item read-only
44
+Open the node read-only. Guest write attempts will fail.
45
+@item cache.direct
46
+The host page cache can be avoided with @option{cache.direct=on}. This will
47
+attempt to do disk IO directly to the guest's memory. QEMU may still perform an
48
+internal copy of the data.
49
+@item cache.no-flush
50
+In case you don't care about data integrity over host failures, you can use
51
+@option{cache.no-flush=on}. This option tells QEMU that it never needs to write
52
+any data to the disk but can instead keep things in cache. If anything goes
53
+wrong, like your host losing power, the disk storage getting disconnected
54
+accidentally, etc. your image will most probably be rendered unusable.
55
+@item discard=@var{discard}
56
+@var{discard} is one of "ignore" (or "off") or "unmap" (or "on") and controls
57
+whether @code{discard} (also known as @code{trim} or @code{unmap}) requests are
58
+ignored or passed to the filesystem. Some machine types may not support
59
+discard requests.
60
+@item detect-zeroes=@var{detect-zeroes}
61
+@var{detect-zeroes} is "off", "on" or "unmap" and enables the automatic
62
+conversion of plain zero writes by the OS to driver specific optimized
63
+zero write commands. You may even choose "unmap" if @var{discard} is set
64
+to "unmap" to allow a zero write to be converted to an @code{unmap} operation.
65
+@end table
66
+
67
+@end table
68
+
69
+ETEXI
70
71
DEF("drive", HAS_ARG, QEMU_OPTION_drive,
72
"-drive [file=file][,if=type][,bus=n][,unit=m][,media=d][,index=i]\n"
73
@@ -XXX,XX +XXX,XX @@ STEXI
74
@item -drive @var{option}[,@var{option}[,@var{option}[,...]]]
75
@findex -drive
76
77
-Define a new drive. Valid options are:
78
+Define a new drive. This includes creating a block driver node (the backend) as
79
+well as a guest device, and is mostly a shortcut for defining the corresponding
80
+@option{-blockdev} and @option{-device} options.
81
+
82
+@option{-drive} accepts all options that are accepted by @option{-blockdev}. In
83
+addition, it knows the following options:
84
85
@table @option
86
@item file=@var{file}
87
@@ -XXX,XX +XXX,XX @@ These options have the same definition as they have in @option{-hdachs}.
88
@var{snapshot} is "on" or "off" and controls snapshot mode for the given drive
89
(see @option{-snapshot}).
90
@item cache=@var{cache}
91
-@var{cache} is "none", "writeback", "unsafe", "directsync" or "writethrough" and controls how the host cache is used to access block data.
92
+@var{cache} is "none", "writeback", "unsafe", "directsync" or "writethrough"
93
+and controls how the host cache is used to access block data. This is a
94
+shortcut that sets the @option{cache.direct} and @option{cache.no-flush}
95
+options (as in @option{-blockdev}), and additionally @option{cache.writeback},
96
+which provides a default for the @option{write-cache} option of block guest
97
+devices (as in @option{-device}). The modes correspond to the following
98
+settings:
99
+
100
+@c Our texi2pod.pl script doesn't support @multitable, so fall back to using
101
+@c plain ASCII art (well, UTF-8 art really). This looks okay both in the manpage
102
+@c and the HTML output.
103
+@example
104
+@ │ cache.writeback cache.direct cache.no-flush
105
+─────────────┼─────────────────────────────────────────────────
106
+writeback │ on off off
107
+none │ on on off
108
+writethrough │ off off off
109
+directsync │ off on off
110
+unsafe │ on off on
111
+@end example
112
+
113
+The default mode is @option{cache=writeback}.
114
+
115
@item aio=@var{aio}
116
@var{aio} is "threads", or "native" and selects between pthread based disk I/O and native Linux AIO.
117
-@item discard=@var{discard}
118
-@var{discard} is one of "ignore" (or "off") or "unmap" (or "on") and controls whether @dfn{discard} (also known as @dfn{trim} or @dfn{unmap}) requests are ignored or passed to the filesystem. Some machine types may not support discard requests.
119
@item format=@var{format}
120
Specify which disk @var{format} will be used rather than detecting
121
the format. Can be used to specify format=raw to avoid interpreting
122
@@ -XXX,XX +XXX,XX @@ Specify which @var{action} to take on write and read errors. Valid actions are:
123
"report" (report the error to the guest), "enospc" (pause QEMU only if the
124
host disk is full; report the error to the guest otherwise).
125
The default setting is @option{werror=enospc} and @option{rerror=report}.
126
-@item readonly
127
-Open drive @option{file} as read-only. Guest write attempts will fail.
128
@item copy-on-read=@var{copy-on-read}
129
@var{copy-on-read} is "on" or "off" and enables whether to copy read backing
130
file sectors into the image file.
131
-@item detect-zeroes=@var{detect-zeroes}
132
-@var{detect-zeroes} is "off", "on" or "unmap" and enables the automatic
133
-conversion of plain zero writes by the OS to driver specific optimized
134
-zero write commands. You may even choose "unmap" if @var{discard} is set
135
-to "unmap" to allow a zero write to be converted to an UNMAP operation.
136
@item bps=@var{b},bps_rd=@var{r},bps_wr=@var{w}
137
Specify bandwidth throttling limits in bytes per second, either for all request
138
types or for reads or writes only. Small values can lead to timeouts or hangs
139
@@ -XXX,XX +XXX,XX @@ prevent guests from circumventing throttling limits by using many small disks
140
instead of a single larger disk.
141
@end table
142
143
-By default, the @option{cache=writeback} mode is used. It will report data
144
+By default, the @option{cache.writeback=on} mode is used. It will report data
145
writes as completed as soon as the data is present in the host page cache.
146
This is safe as long as your guest OS makes sure to correctly flush disk caches
147
where needed. If your guest OS does not handle volatile disk write caches
148
correctly and your host crashes or loses power, then the guest may experience
149
data corruption.
150
151
-For such guests, you should consider using @option{cache=writethrough}. This
152
+For such guests, you should consider using @option{cache.writeback=off}. This
153
means that the host page cache will be used to read and write data, but write
154
notification will be sent to the guest only after QEMU has made sure to flush
155
each write to the disk. Be aware that this has a major impact on performance.
156
157
-The host page cache can be avoided entirely with @option{cache=none}. This will
158
-attempt to do disk IO directly to the guest's memory. QEMU may still perform
159
-an internal copy of the data. Note that this is considered a writeback mode and
160
-the guest OS must handle the disk write cache correctly in order to avoid data
161
-corruption on host crashes.
162
-
163
-The host page cache can be avoided while only sending write notifications to
164
-the guest when the data has been flushed to the disk using
165
-@option{cache=directsync}.
166
-
167
-In case you don't care about data integrity over host failures, use
168
-@option{cache=unsafe}. This option tells QEMU that it never needs to write any
169
-data to the disk but can instead keep things in cache. If anything goes wrong,
170
-like your host losing power, the disk storage getting disconnected accidentally,
171
-etc. your image will most probably be rendered unusable. When using
172
-the @option{-snapshot} option, unsafe caching is always used.
173
+When using the @option{-snapshot} option, unsafe caching is always used.
174
175
Copy-on-read avoids accessing the same backing file sectors repeatedly and is
176
useful when the backing file is over a slow network. By default copy-on-read
177
--
30
--
178
1.8.3.1
31
2.13.6
179
32
180
33
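As a quick illustration of the options documented in the qemu-options.hx patch above, a -blockdev based setup could look like the following (file name, node names and the virtio-blk frontend are examples only, not part of the patch):

    qemu-system-x86_64 \
        -blockdev driver=file,node-name=disk0-file,filename=test.qcow2 \
        -blockdev driver=qcow2,node-name=disk0,file=disk0-file,read-only=off,cache.direct=on \
        -device virtio-blk-pci,drive=disk0

Here node-name gives each layer an explicit name, file= on the qcow2 node refers to the file node underneath it, and cache.direct/read-only behave as described above.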
1
From: Stephen Bates <sbates@raithlin.com>
1
From: Doug Gale <doug16k@gmail.com>
2
2
3
Add the ability for the NVMe model to support both the RDS and WDS
3
Add trace output for commands, errors, and undefined behavior.
4
modes in the Controller Memory Buffer.
4
Add guest error log output for undefined behavior.
5
Report invalid undefined accesses to MMIO.
6
Annotate unlikely error checks with unlikely.
5
7
6
Although not currently supported in the upstreamed Linux kernel, a fork
8
Signed-off-by: Doug Gale <doug16k@gmail.com>
7
with support exists [1] and user-space test programs that build on
9
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
8
this also exist [2].
10
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
9
10
Useful for testing CMB functionality in preparation for real CMB
11
enabled NVMe devices (coming soon).
12
13
[1] https://github.com/sbates130272/linux-p2pmem
14
[2] https://github.com/sbates130272/p2pmem-test
15
16
Signed-off-by: Stephen Bates <sbates@raithlin.com>
17
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
18
Reviewed-by: Keith Busch <keith.busch@intel.com>
19
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
11
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
20
---
12
---
21
hw/block/nvme.c | 83 +++++++++++++++++++++++++++++++++++++++------------------
13
hw/block/nvme.c | 349 ++++++++++++++++++++++++++++++++++++++++++--------
22
hw/block/nvme.h | 1 +
14
hw/block/trace-events | 93 ++++++++++++++
23
2 files changed, 58 insertions(+), 26 deletions(-)
15
2 files changed, 390 insertions(+), 52 deletions(-)
24
16
25
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
17
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
26
index XXXXXXX..XXXXXXX 100644
18
index XXXXXXX..XXXXXXX 100644
27
--- a/hw/block/nvme.c
19
--- a/hw/block/nvme.c
28
+++ b/hw/block/nvme.c
20
+++ b/hw/block/nvme.c
29
@@ -XXX,XX +XXX,XX @@
21
@@ -XXX,XX +XXX,XX @@
30
* cmb_size_mb=<cmb_size_mb[optional]>
22
#include "qapi/visitor.h"
31
*
23
#include "sysemu/block-backend.h"
32
* Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
24
33
- * offset 0 in BAR2 and supports SQS only for now.
25
+#include "qemu/log.h"
34
+ * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
26
+#include "trace.h"
35
*/
27
#include "nvme.h"
36
28
37
#include "qemu/osdep.h"
29
+#define NVME_GUEST_ERR(trace, fmt, ...) \
30
+ do { \
31
+ (trace_##trace)(__VA_ARGS__); \
32
+ qemu_log_mask(LOG_GUEST_ERROR, #trace \
33
+ " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
34
+ } while (0)
35
+
36
static void nvme_process_sq(void *opaque);
37
38
static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
38
@@ -XXX,XX +XXX,XX @@ static void nvme_isr_notify(NvmeCtrl *n, NvmeCQueue *cq)
39
@@ -XXX,XX +XXX,XX @@ static void nvme_isr_notify(NvmeCtrl *n, NvmeCQueue *cq)
39
}
40
}
41
42
-static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2,
43
- uint32_t len, NvmeCtrl *n)
44
+static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
45
+ uint64_t prp2, uint32_t len, NvmeCtrl *n)
46
{
40
{
47
hwaddr trans_len = n->page_size - (prp1 % n->page_size);
41
if (cq->irq_enabled) {
42
if (msix_enabled(&(n->parent_obj))) {
43
+ trace_nvme_irq_msix(cq->vector);
44
msix_notify(&(n->parent_obj), cq->vector);
45
} else {
46
+ trace_nvme_irq_pin();
47
pci_irq_pulse(&n->parent_obj);
48
}
49
+ } else {
50
+ trace_nvme_irq_masked();
51
}
52
}
53
54
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
48
trans_len = MIN(len, trans_len);
55
trans_len = MIN(len, trans_len);
49
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2,
56
int num_prps = (len >> n->page_bits) + 1;
50
57
51
if (!prp1) {
58
- if (!prp1) {
52
return NVME_INVALID_FIELD | NVME_DNR;
59
+ if (unlikely(!prp1)) {
53
+ } else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
60
+ trace_nvme_err_invalid_prp();
54
+ prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
61
return NVME_INVALID_FIELD | NVME_DNR;
55
+ qsg->nsg = 0;
62
} else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
56
+ qemu_iovec_init(iov, num_prps);
63
prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
57
+ qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], trans_len);
64
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
58
+ } else {
65
}
59
+ pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
60
+ qemu_sglist_add(qsg, prp1, trans_len);
61
}
62
-
63
- pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
64
- qemu_sglist_add(qsg, prp1, trans_len);
65
len -= trans_len;
66
len -= trans_len;
66
if (len) {
67
if (len) {
67
if (!prp2) {
68
- if (!prp2) {
68
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2,
69
+ if (unlikely(!prp2)) {
69
70
+ trace_nvme_err_invalid_prp2_missing();
70
nents = (len + n->page_size - 1) >> n->page_bits;
71
goto unmap;
71
prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
72
}
72
- pci_dma_read(&n->parent_obj, prp2, (void *)prp_list, prp_trans);
73
if (len > n->page_size) {
73
+ nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
74
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
74
while (len != 0) {
75
uint64_t prp_ent = le64_to_cpu(prp_list[i]);
75
uint64_t prp_ent = le64_to_cpu(prp_list[i]);
76
76
77
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2,
77
if (i == n->max_prp_ents - 1 && len > n->page_size) {
78
i = 0;
78
- if (!prp_ent || prp_ent & (n->page_size - 1)) {
79
nents = (len + n->page_size - 1) >> n->page_bits;
79
+ if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
80
prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
80
+ trace_nvme_err_invalid_prplist_ent(prp_ent);
81
- pci_dma_read(&n->parent_obj, prp_ent, (void *)prp_list,
81
goto unmap;
82
+ nvme_addr_read(n, prp_ent, (void *)prp_list,
82
}
83
prp_trans);
83
84
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
84
prp_ent = le64_to_cpu(prp_list[i]);
85
prp_ent = le64_to_cpu(prp_list[i]);
85
}
86
}
86
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2,
87
88
- if (!prp_ent || prp_ent & (n->page_size - 1)) {
89
+ if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
90
+ trace_nvme_err_invalid_prplist_ent(prp_ent);
91
goto unmap;
87
}
92
}
88
93
89
trans_len = MIN(len, n->page_size);
94
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
90
- qemu_sglist_add(qsg, prp_ent, trans_len);
91
+ if (qsg->nsg){
92
+ qemu_sglist_add(qsg, prp_ent, trans_len);
93
+ } else {
94
+ qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - n->ctrl_mem.addr], trans_len);
95
+ }
96
len -= trans_len;
97
i++;
95
i++;
98
}
96
}
99
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2,
97
} else {
100
if (prp2 & (n->page_size - 1)) {
98
- if (prp2 & (n->page_size - 1)) {
99
+ if (unlikely(prp2 & (n->page_size - 1))) {
100
+ trace_nvme_err_invalid_prp2_align(prp2);
101
goto unmap;
101
goto unmap;
102
}
102
}
103
- qemu_sglist_add(qsg, prp2, len);
103
if (qsg->nsg) {
104
+ if (qsg->nsg) {
104
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
105
+ qemu_sglist_add(qsg, prp2, len);
105
QEMUIOVector iov;
106
+ } else {
106
uint16_t status = NVME_SUCCESS;
107
+ qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - n->ctrl_mem.addr], trans_len);
107
108
+ }
108
+ trace_nvme_dma_read(prp1, prp2);
109
}
109
+
110
}
110
if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
111
return NVME_INVALID_FIELD | NVME_DNR;
112
}
113
if (qsg.nsg > 0) {
114
- if (dma_buf_read(ptr, len, &qsg)) {
115
+ if (unlikely(dma_buf_read(ptr, len, &qsg))) {
116
+ trace_nvme_err_invalid_dma();
117
status = NVME_INVALID_FIELD | NVME_DNR;
118
}
119
qemu_sglist_destroy(&qsg);
120
} else {
121
- if (qemu_iovec_to_buf(&iov, 0, ptr, len) != len) {
122
+ if (unlikely(qemu_iovec_to_buf(&iov, 0, ptr, len) != len)) {
123
+ trace_nvme_err_invalid_dma();
124
status = NVME_INVALID_FIELD | NVME_DNR;
125
}
126
qemu_iovec_destroy(&iov);
127
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
128
uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS);
129
uint32_t aio_nlb = nlb << (data_shift - BDRV_SECTOR_BITS);
130
131
- if (slba + nlb > ns->id_ns.nsze) {
132
+ if (unlikely(slba + nlb > ns->id_ns.nsze)) {
133
+ trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
134
return NVME_LBA_RANGE | NVME_DNR;
135
}
136
137
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
138
int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
139
enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
140
141
- if ((slba + nlb) > ns->id_ns.nsze) {
142
+ trace_nvme_rw(is_write ? "write" : "read", nlb, data_size, slba);
143
+
144
+ if (unlikely((slba + nlb) > ns->id_ns.nsze)) {
145
block_acct_invalid(blk_get_stats(n->conf.blk), acct);
146
+ trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
147
return NVME_LBA_RANGE | NVME_DNR;
148
}
149
150
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
151
NvmeNamespace *ns;
152
uint32_t nsid = le32_to_cpu(cmd->nsid);
153
154
- if (nsid == 0 || nsid > n->num_namespaces) {
155
+ if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
156
+ trace_nvme_err_invalid_ns(nsid, n->num_namespaces);
157
return NVME_INVALID_NSID | NVME_DNR;
158
}
159
160
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
161
case NVME_CMD_READ:
162
return nvme_rw(n, ns, cmd, req);
163
default:
164
+ trace_nvme_err_invalid_opc(cmd->opcode);
165
return NVME_INVALID_OPCODE | NVME_DNR;
166
}
167
}
168
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
169
NvmeCQueue *cq;
170
uint16_t qid = le16_to_cpu(c->qid);
171
172
- if (!qid || nvme_check_sqid(n, qid)) {
173
+ if (unlikely(!qid || nvme_check_sqid(n, qid))) {
174
+ trace_nvme_err_invalid_del_sq(qid);
175
return NVME_INVALID_QID | NVME_DNR;
176
}
177
178
+ trace_nvme_del_sq(qid);
179
+
180
sq = n->sq[qid];
181
while (!QTAILQ_EMPTY(&sq->out_req_list)) {
182
req = QTAILQ_FIRST(&sq->out_req_list);
183
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
184
uint16_t qflags = le16_to_cpu(c->sq_flags);
185
uint64_t prp1 = le64_to_cpu(c->prp1);
186
187
- if (!cqid || nvme_check_cqid(n, cqid)) {
188
+ trace_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
189
+
190
+ if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
191
+ trace_nvme_err_invalid_create_sq_cqid(cqid);
192
return NVME_INVALID_CQID | NVME_DNR;
193
}
194
- if (!sqid || !nvme_check_sqid(n, sqid)) {
195
+ if (unlikely(!sqid || !nvme_check_sqid(n, sqid))) {
196
+ trace_nvme_err_invalid_create_sq_sqid(sqid);
197
return NVME_INVALID_QID | NVME_DNR;
198
}
199
- if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
200
+ if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
201
+ trace_nvme_err_invalid_create_sq_size(qsize);
202
return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
203
}
204
- if (!prp1 || prp1 & (n->page_size - 1)) {
205
+ if (unlikely(!prp1 || prp1 & (n->page_size - 1))) {
206
+ trace_nvme_err_invalid_create_sq_addr(prp1);
207
return NVME_INVALID_FIELD | NVME_DNR;
208
}
209
- if (!(NVME_SQ_FLAGS_PC(qflags))) {
210
+ if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
211
+ trace_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
212
return NVME_INVALID_FIELD | NVME_DNR;
213
}
214
sq = g_malloc0(sizeof(*sq));
215
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
216
NvmeCQueue *cq;
217
uint16_t qid = le16_to_cpu(c->qid);
218
219
- if (!qid || nvme_check_cqid(n, qid)) {
220
+ if (unlikely(!qid || nvme_check_cqid(n, qid))) {
221
+ trace_nvme_err_invalid_del_cq_cqid(qid);
222
return NVME_INVALID_CQID | NVME_DNR;
223
}
224
225
cq = n->cq[qid];
226
- if (!QTAILQ_EMPTY(&cq->sq_list)) {
227
+ if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
228
+ trace_nvme_err_invalid_del_cq_notempty(qid);
229
return NVME_INVALID_QUEUE_DEL;
230
}
231
+ trace_nvme_del_cq(qid);
232
nvme_free_cq(cq, n);
111
return NVME_SUCCESS;
233
return NVME_SUCCESS;
112
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
234
}
113
uint64_t prp1, uint64_t prp2)
235
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
236
uint16_t qflags = le16_to_cpu(c->cq_flags);
237
uint64_t prp1 = le64_to_cpu(c->prp1);
238
239
- if (!cqid || !nvme_check_cqid(n, cqid)) {
240
+ trace_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
241
+ NVME_CQ_FLAGS_IEN(qflags) != 0);
242
+
243
+ if (unlikely(!cqid || !nvme_check_cqid(n, cqid))) {
244
+ trace_nvme_err_invalid_create_cq_cqid(cqid);
245
return NVME_INVALID_CQID | NVME_DNR;
246
}
247
- if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
248
+ if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
249
+ trace_nvme_err_invalid_create_cq_size(qsize);
250
return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
251
}
252
- if (!prp1) {
253
+ if (unlikely(!prp1)) {
254
+ trace_nvme_err_invalid_create_cq_addr(prp1);
255
return NVME_INVALID_FIELD | NVME_DNR;
256
}
257
- if (vector > n->num_queues) {
258
+ if (unlikely(vector > n->num_queues)) {
259
+ trace_nvme_err_invalid_create_cq_vector(vector);
260
return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
261
}
262
- if (!(NVME_CQ_FLAGS_PC(qflags))) {
263
+ if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
264
+ trace_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
265
return NVME_INVALID_FIELD | NVME_DNR;
266
}
267
268
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c)
269
uint64_t prp1 = le64_to_cpu(c->prp1);
270
uint64_t prp2 = le64_to_cpu(c->prp2);
271
272
+ trace_nvme_identify_ctrl();
273
+
274
return nvme_dma_read_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
275
prp1, prp2);
276
}
277
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c)
278
uint64_t prp1 = le64_to_cpu(c->prp1);
279
uint64_t prp2 = le64_to_cpu(c->prp2);
280
281
- if (nsid == 0 || nsid > n->num_namespaces) {
282
+ trace_nvme_identify_ns(nsid);
283
+
284
+ if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
285
+ trace_nvme_err_invalid_ns(nsid, n->num_namespaces);
286
return NVME_INVALID_NSID | NVME_DNR;
287
}
288
289
ns = &n->namespaces[nsid - 1];
290
+
291
return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
292
prp1, prp2);
293
}
294
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c)
295
uint16_t ret;
296
int i, j = 0;
297
298
+ trace_nvme_identify_nslist(min_nsid);
299
+
300
list = g_malloc0(data_len);
301
for (i = 0; i < n->num_namespaces; i++) {
302
if (i < min_nsid) {
303
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
304
case 0x02:
305
return nvme_identify_nslist(n, c);
306
default:
307
+ trace_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
308
return NVME_INVALID_FIELD | NVME_DNR;
309
}
310
}
311
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
312
switch (dw10) {
313
case NVME_VOLATILE_WRITE_CACHE:
314
result = blk_enable_write_cache(n->conf.blk);
315
+ trace_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
316
break;
317
case NVME_NUMBER_OF_QUEUES:
318
result = cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
319
+ trace_nvme_getfeat_numq(result);
320
break;
321
default:
322
+ trace_nvme_err_invalid_getfeat(dw10);
323
return NVME_INVALID_FIELD | NVME_DNR;
324
}
325
326
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
327
blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
328
break;
329
case NVME_NUMBER_OF_QUEUES:
330
+ trace_nvme_setfeat_numq((dw11 & 0xFFFF) + 1,
331
+ ((dw11 >> 16) & 0xFFFF) + 1,
332
+ n->num_queues - 1, n->num_queues - 1);
333
req->cqe.result =
334
cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
335
break;
336
default:
337
+ trace_nvme_err_invalid_setfeat(dw10);
338
return NVME_INVALID_FIELD | NVME_DNR;
339
}
340
return NVME_SUCCESS;
341
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
342
case NVME_ADM_CMD_GET_FEATURES:
343
return nvme_get_feature(n, cmd, req);
344
default:
345
+ trace_nvme_err_invalid_admin_opc(cmd->opcode);
346
return NVME_INVALID_OPCODE | NVME_DNR;
347
}
348
}
349
@@ -XXX,XX +XXX,XX @@ static int nvme_start_ctrl(NvmeCtrl *n)
350
uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
351
uint32_t page_size = 1 << page_bits;
352
353
- if (n->cq[0] || n->sq[0] || !n->bar.asq || !n->bar.acq ||
354
- n->bar.asq & (page_size - 1) || n->bar.acq & (page_size - 1) ||
355
- NVME_CC_MPS(n->bar.cc) < NVME_CAP_MPSMIN(n->bar.cap) ||
356
- NVME_CC_MPS(n->bar.cc) > NVME_CAP_MPSMAX(n->bar.cap) ||
357
- NVME_CC_IOCQES(n->bar.cc) < NVME_CTRL_CQES_MIN(n->id_ctrl.cqes) ||
358
- NVME_CC_IOCQES(n->bar.cc) > NVME_CTRL_CQES_MAX(n->id_ctrl.cqes) ||
359
- NVME_CC_IOSQES(n->bar.cc) < NVME_CTRL_SQES_MIN(n->id_ctrl.sqes) ||
360
- NVME_CC_IOSQES(n->bar.cc) > NVME_CTRL_SQES_MAX(n->id_ctrl.sqes) ||
361
- !NVME_AQA_ASQS(n->bar.aqa) || !NVME_AQA_ACQS(n->bar.aqa)) {
362
+ if (unlikely(n->cq[0])) {
363
+ trace_nvme_err_startfail_cq();
364
+ return -1;
365
+ }
366
+ if (unlikely(n->sq[0])) {
367
+ trace_nvme_err_startfail_sq();
368
+ return -1;
369
+ }
370
+ if (unlikely(!n->bar.asq)) {
371
+ trace_nvme_err_startfail_nbarasq();
372
+ return -1;
373
+ }
374
+ if (unlikely(!n->bar.acq)) {
375
+ trace_nvme_err_startfail_nbaracq();
376
+ return -1;
377
+ }
378
+ if (unlikely(n->bar.asq & (page_size - 1))) {
379
+ trace_nvme_err_startfail_asq_misaligned(n->bar.asq);
380
+ return -1;
381
+ }
382
+ if (unlikely(n->bar.acq & (page_size - 1))) {
383
+ trace_nvme_err_startfail_acq_misaligned(n->bar.acq);
384
+ return -1;
385
+ }
386
+ if (unlikely(NVME_CC_MPS(n->bar.cc) <
387
+ NVME_CAP_MPSMIN(n->bar.cap))) {
388
+ trace_nvme_err_startfail_page_too_small(
389
+ NVME_CC_MPS(n->bar.cc),
390
+ NVME_CAP_MPSMIN(n->bar.cap));
391
+ return -1;
392
+ }
393
+ if (unlikely(NVME_CC_MPS(n->bar.cc) >
394
+ NVME_CAP_MPSMAX(n->bar.cap))) {
395
+ trace_nvme_err_startfail_page_too_large(
396
+ NVME_CC_MPS(n->bar.cc),
397
+ NVME_CAP_MPSMAX(n->bar.cap));
398
+ return -1;
399
+ }
400
+ if (unlikely(NVME_CC_IOCQES(n->bar.cc) <
401
+ NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
402
+ trace_nvme_err_startfail_cqent_too_small(
403
+ NVME_CC_IOCQES(n->bar.cc),
404
+ NVME_CTRL_CQES_MIN(n->bar.cap));
405
+ return -1;
406
+ }
407
+ if (unlikely(NVME_CC_IOCQES(n->bar.cc) >
408
+ NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
409
+ trace_nvme_err_startfail_cqent_too_large(
410
+ NVME_CC_IOCQES(n->bar.cc),
411
+ NVME_CTRL_CQES_MAX(n->bar.cap));
412
+ return -1;
413
+ }
414
+ if (unlikely(NVME_CC_IOSQES(n->bar.cc) <
415
+ NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
416
+ trace_nvme_err_startfail_sqent_too_small(
417
+ NVME_CC_IOSQES(n->bar.cc),
418
+ NVME_CTRL_SQES_MIN(n->bar.cap));
419
+ return -1;
420
+ }
421
+ if (unlikely(NVME_CC_IOSQES(n->bar.cc) >
422
+ NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
423
+ trace_nvme_err_startfail_sqent_too_large(
424
+ NVME_CC_IOSQES(n->bar.cc),
425
+ NVME_CTRL_SQES_MAX(n->bar.cap));
426
+ return -1;
427
+ }
428
+ if (unlikely(!NVME_AQA_ASQS(n->bar.aqa))) {
429
+ trace_nvme_err_startfail_asqent_sz_zero();
430
+ return -1;
431
+ }
432
+ if (unlikely(!NVME_AQA_ACQS(n->bar.aqa))) {
433
+ trace_nvme_err_startfail_acqent_sz_zero();
434
return -1;
435
}
436
437
@@ -XXX,XX +XXX,XX @@ static int nvme_start_ctrl(NvmeCtrl *n)
438
static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
439
unsigned size)
114
{
440
{
115
QEMUSGList qsg;
441
+ if (unlikely(offset & (sizeof(uint32_t) - 1))) {
116
+ QEMUIOVector iov;
442
+ NVME_GUEST_ERR(nvme_ub_mmiowr_misaligned32,
117
+ uint16_t status = NVME_SUCCESS;
443
+ "MMIO write not 32-bit aligned,"
118
444
+ " offset=0x%"PRIx64"", offset);
119
- if (nvme_map_prp(&qsg, prp1, prp2, len, n)) {
445
+ /* should be ignored, fall through for now */
120
+ if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
446
+ }
121
return NVME_INVALID_FIELD | NVME_DNR;
447
+
122
}
448
+ if (unlikely(size < sizeof(uint32_t))) {
123
- if (dma_buf_read(ptr, len, &qsg)) {
449
+ NVME_GUEST_ERR(nvme_ub_mmiowr_toosmall,
124
+ if (qsg.nsg > 0) {
450
+ "MMIO write smaller than 32-bits,"
125
+ if (dma_buf_read(ptr, len, &qsg)) {
451
+ " offset=0x%"PRIx64", size=%u",
126
+ status = NVME_INVALID_FIELD | NVME_DNR;
452
+ offset, size);
453
+ /* should be ignored, fall through for now */
454
+ }
455
+
456
switch (offset) {
457
- case 0xc:
458
+ case 0xc: /* INTMS */
459
+ if (unlikely(msix_enabled(&(n->parent_obj)))) {
460
+ NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix,
461
+ "undefined access to interrupt mask set"
462
+ " when MSI-X is enabled");
463
+ /* should be ignored, fall through for now */
127
+ }
464
+ }
128
qemu_sglist_destroy(&qsg);
465
n->bar.intms |= data & 0xffffffff;
129
- return NVME_INVALID_FIELD | NVME_DNR;
466
n->bar.intmc = n->bar.intms;
467
+ trace_nvme_mmio_intm_set(data & 0xffffffff,
468
+ n->bar.intmc);
469
break;
470
- case 0x10:
471
+ case 0x10: /* INTMC */
472
+ if (unlikely(msix_enabled(&(n->parent_obj)))) {
473
+ NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix,
474
+ "undefined access to interrupt mask clr"
475
+ " when MSI-X is enabled");
476
+ /* should be ignored, fall through for now */
477
+ }
478
n->bar.intms &= ~(data & 0xffffffff);
479
n->bar.intmc = n->bar.intms;
480
+ trace_nvme_mmio_intm_clr(data & 0xffffffff,
481
+ n->bar.intmc);
482
break;
483
- case 0x14:
484
+ case 0x14: /* CC */
485
+ trace_nvme_mmio_cfg(data & 0xffffffff);
486
/* Windows first sends data, then sends enable bit */
487
if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) &&
488
!NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc))
489
@@ -XXX,XX +XXX,XX @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
490
491
if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
492
n->bar.cc = data;
493
- if (nvme_start_ctrl(n)) {
494
+ if (unlikely(nvme_start_ctrl(n))) {
495
+ trace_nvme_err_startfail();
496
n->bar.csts = NVME_CSTS_FAILED;
497
} else {
498
+ trace_nvme_mmio_start_success();
499
n->bar.csts = NVME_CSTS_READY;
500
}
501
} else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
502
+ trace_nvme_mmio_stopped();
503
nvme_clear_ctrl(n);
504
n->bar.csts &= ~NVME_CSTS_READY;
505
}
506
if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
507
- nvme_clear_ctrl(n);
508
- n->bar.cc = data;
509
- n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
510
+ trace_nvme_mmio_shutdown_set();
511
+ nvme_clear_ctrl(n);
512
+ n->bar.cc = data;
513
+ n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
514
} else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
515
- n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
516
- n->bar.cc = data;
517
+ trace_nvme_mmio_shutdown_cleared();
518
+ n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
519
+ n->bar.cc = data;
520
+ }
521
+ break;
522
+ case 0x1C: /* CSTS */
523
+ if (data & (1 << 4)) {
524
+ NVME_GUEST_ERR(nvme_ub_mmiowr_ssreset_w1c_unsupported,
525
+ "attempted to W1C CSTS.NSSRO"
526
+ " but CAP.NSSRS is zero (not supported)");
527
+ } else if (data != 0) {
528
+ NVME_GUEST_ERR(nvme_ub_mmiowr_ro_csts,
529
+ "attempted to set a read only bit"
530
+ " of controller status");
531
+ }
532
+ break;
533
+ case 0x20: /* NSSR */
534
+ if (data == 0x4E564D65) {
535
+ trace_nvme_ub_mmiowr_ssreset_unsupported();
536
+ } else {
537
+ /* The spec says that writes of other values have no effect */
538
+ return;
539
}
540
break;
541
- case 0x24:
542
+ case 0x24: /* AQA */
543
n->bar.aqa = data & 0xffffffff;
544
+ trace_nvme_mmio_aqattr(data & 0xffffffff);
545
break;
546
- case 0x28:
547
+ case 0x28: /* ASQ */
548
n->bar.asq = data;
549
+ trace_nvme_mmio_asqaddr(data);
550
break;
551
- case 0x2c:
552
+ case 0x2c: /* ASQ hi */
553
n->bar.asq |= data << 32;
554
+ trace_nvme_mmio_asqaddr_hi(data, n->bar.asq);
555
break;
556
- case 0x30:
557
+ case 0x30: /* ACQ */
558
+ trace_nvme_mmio_acqaddr(data);
559
n->bar.acq = data;
560
break;
561
- case 0x34:
562
+ case 0x34: /* ACQ hi */
563
n->bar.acq |= data << 32;
564
+ trace_nvme_mmio_acqaddr_hi(data, n->bar.acq);
565
break;
566
+ case 0x38: /* CMBLOC */
567
+ NVME_GUEST_ERR(nvme_ub_mmiowr_cmbloc_reserved,
568
+ "invalid write to reserved CMBLOC"
569
+ " when CMBSZ is zero, ignored");
570
+ return;
571
+ case 0x3C: /* CMBSZ */
572
+ NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
573
+ "invalid write to read only CMBSZ, ignored");
574
+ return;
575
default:
576
+ NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
577
+ "invalid MMIO write,"
578
+ " offset=0x%"PRIx64", data=%"PRIx64"",
579
+ offset, data);
580
break;
581
}
582
}
583
@@ -XXX,XX +XXX,XX @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
584
uint8_t *ptr = (uint8_t *)&n->bar;
585
uint64_t val = 0;
586
587
+ if (unlikely(addr & (sizeof(uint32_t) - 1))) {
588
+ NVME_GUEST_ERR(nvme_ub_mmiord_misaligned32,
589
+ "MMIO read not 32-bit aligned,"
590
+ " offset=0x%"PRIx64"", addr);
591
+ /* should RAZ, fall through for now */
592
+ } else if (unlikely(size < sizeof(uint32_t))) {
593
+ NVME_GUEST_ERR(nvme_ub_mmiord_toosmall,
594
+ "MMIO read smaller than 32-bits,"
595
+ " offset=0x%"PRIx64"", addr);
596
+ /* should RAZ, fall through for now */
597
+ }
598
+
599
if (addr < sizeof(n->bar)) {
600
memcpy(&val, ptr + addr, size);
130
+ } else {
601
+ } else {
131
+ if (qemu_iovec_to_buf(&iov, 0, ptr, len) != len) {
602
+ NVME_GUEST_ERR(nvme_ub_mmiord_invalid_ofs,
132
+ status = NVME_INVALID_FIELD | NVME_DNR;
603
+ "MMIO read beyond last register,"
133
+ }
604
+ " offset=0x%"PRIx64", returning 0", addr);
134
+ qemu_iovec_destroy(&iov);
605
}
135
}
606
+
136
- qemu_sglist_destroy(&qsg);
607
return val;
137
- return NVME_SUCCESS;
608
}
138
+ return status;
609
139
}
610
@@ -XXX,XX +XXX,XX @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
140
611
{
141
static void nvme_post_cqes(void *opaque)
612
uint32_t qid;
142
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
613
143
return NVME_LBA_RANGE | NVME_DNR;
614
- if (addr & ((1 << 2) - 1)) {
144
}
615
+ if (unlikely(addr & ((1 << 2) - 1))) {
145
616
+ NVME_GUEST_ERR(nvme_ub_db_wr_misaligned,
146
- if (nvme_map_prp(&req->qsg, prp1, prp2, data_size, n)) {
617
+ "doorbell write not 32-bit aligned,"
147
+ if (nvme_map_prp(&req->qsg, &req->iov, prp1, prp2, data_size, n)) {
618
+ " offset=0x%"PRIx64", ignoring", addr);
148
block_acct_invalid(blk_get_stats(n->conf.blk), acct);
619
return;
149
return NVME_INVALID_FIELD | NVME_DNR;
620
}
150
}
621
151
622
if (((addr - 0x1000) >> 2) & 1) {
152
- assert((nlb << data_shift) == req->qsg.size);
623
+ /* Completion queue doorbell write */
153
-
624
+
154
- req->has_sg = true;
625
uint16_t new_head = val & 0xffff;
155
dma_acct_start(n->conf.blk, &req->acct, &req->qsg, acct);
626
int start_sqs;
156
- req->aiocb = is_write ?
627
NvmeCQueue *cq;
157
- dma_blk_write(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
628
158
- nvme_rw_cb, req) :
629
qid = (addr - (0x1000 + (1 << 2))) >> 3;
159
- dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
630
- if (nvme_check_cqid(n, qid)) {
160
- nvme_rw_cb, req);
631
+ if (unlikely(nvme_check_cqid(n, qid))) {
161
+ if (req->qsg.nsg > 0) {
632
+ NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cq,
162
+ req->has_sg = true;
633
+ "completion queue doorbell write"
163
+ req->aiocb = is_write ?
634
+ " for nonexistent queue,"
164
+ dma_blk_write(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
635
+ " sqid=%"PRIu32", ignoring", qid);
165
+ nvme_rw_cb, req) :
636
return;
166
+ dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
637
}
167
+ nvme_rw_cb, req);
638
168
+ } else {
639
cq = n->cq[qid];
169
+ req->has_sg = false;
640
- if (new_head >= cq->size) {
170
+ req->aiocb = is_write ?
641
+ if (unlikely(new_head >= cq->size)) {
171
+ blk_aio_pwritev(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
642
+ NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cqhead,
172
+ req) :
643
+ "completion queue doorbell write value"
173
+ blk_aio_preadv(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
644
+ " beyond queue size, sqid=%"PRIu32","
174
+ req);
645
+ " new_head=%"PRIu16", ignoring",
175
+ }
646
+ qid, new_head);
176
647
return;
177
return NVME_NO_COMPLETE;
648
}
178
}
649
179
@@ -XXX,XX +XXX,XX @@ static int nvme_init(PCIDevice *pci_dev)
650
@@ -XXX,XX +XXX,XX @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
180
NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
651
nvme_isr_notify(n, cq);
181
NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
652
}
182
NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
653
} else {
183
- NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 0);
654
+ /* Submission queue doorbell write */
184
- NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 0);
655
+
185
+ NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
656
uint16_t new_tail = val & 0xffff;
186
+ NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
657
NvmeSQueue *sq;
187
NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
658
188
NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->cmb_size_mb);
659
qid = (addr - 0x1000) >> 3;
189
660
- if (nvme_check_sqid(n, qid)) {
190
+ n->cmbloc = n->bar.cmbloc;
661
+ if (unlikely(nvme_check_sqid(n, qid))) {
191
+ n->cmbsz = n->bar.cmbsz;
662
+ NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sq,
192
+
663
+ "submission queue doorbell write"
193
n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
664
+ " for nonexistent queue,"
194
memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
665
+ " sqid=%"PRIu32", ignoring", qid);
195
"nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
666
return;
196
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
667
}
668
669
sq = n->sq[qid];
670
- if (new_tail >= sq->size) {
671
+ if (unlikely(new_tail >= sq->size)) {
672
+ NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sqtail,
673
+ "submission queue doorbell write value"
674
+ " beyond queue size, sqid=%"PRIu32","
675
+ " new_tail=%"PRIu16", ignoring",
676
+ qid, new_tail);
677
return;
678
}
679
680
diff --git a/hw/block/trace-events b/hw/block/trace-events
197
index XXXXXXX..XXXXXXX 100644
681
index XXXXXXX..XXXXXXX 100644
198
--- a/hw/block/nvme.h
682
--- a/hw/block/trace-events
199
+++ b/hw/block/nvme.h
683
+++ b/hw/block/trace-events
200
@@ -XXX,XX +XXX,XX @@ typedef struct NvmeRequest {
684
@@ -XXX,XX +XXX,XX @@ virtio_blk_submit_multireq(void *vdev, void *mrb, int start, int num_reqs, uint6
201
NvmeCqe cqe;
685
hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p LCHS %d %d %d"
202
BlockAcctCookie acct;
686
hd_geometry_guess(void *blk, uint32_t cyls, uint32_t heads, uint32_t secs, int trans) "blk %p CHS %u %u %u trans %d"
203
QEMUSGList qsg;
687
204
+ QEMUIOVector iov;
688
+# hw/block/nvme.c
205
QTAILQ_ENTRY(NvmeRequest)entry;
689
+# nvme traces for successful events
206
} NvmeRequest;
690
+nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u"
207
691
+nvme_irq_pin(void) "pulsing IRQ pin"
692
+nvme_irq_masked(void) "IRQ is masked"
693
+nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
694
+nvme_rw(char const *verb, uint32_t blk_count, uint64_t byte_count, uint64_t lba) "%s %"PRIu32" blocks (%"PRIu64" bytes) from LBA %"PRIu64""
695
+nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
696
+nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
697
+nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
698
+nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16""
699
+nvme_identify_ctrl(void) "identify controller"
700
+nvme_identify_ns(uint16_t ns) "identify namespace, nsid=%"PRIu16""
701
+nvme_identify_nslist(uint16_t ns) "identify namespace list, nsid=%"PRIu16""
702
+nvme_getfeat_vwcache(char const* result) "get feature volatile write cache, result=%s"
703
+nvme_getfeat_numq(int result) "get feature number of queues, result=%d"
704
+nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d"
705
+nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64""
706
+nvme_mmio_intm_clr(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask clr, data=0x%"PRIx64", new_mask=0x%"PRIx64""
707
+nvme_mmio_cfg(uint64_t data) "wrote MMIO, config controller config=0x%"PRIx64""
708
+nvme_mmio_aqattr(uint64_t data) "wrote MMIO, admin queue attributes=0x%"PRIx64""
709
+nvme_mmio_asqaddr(uint64_t data) "wrote MMIO, admin submission queue address=0x%"PRIx64""
710
+nvme_mmio_acqaddr(uint64_t data) "wrote MMIO, admin completion queue address=0x%"PRIx64""
711
+nvme_mmio_asqaddr_hi(uint64_t data, uint64_t new_addr) "wrote MMIO, admin submission queue high half=0x%"PRIx64", new_address=0x%"PRIx64""
712
+nvme_mmio_acqaddr_hi(uint64_t data, uint64_t new_addr) "wrote MMIO, admin completion queue high half=0x%"PRIx64", new_address=0x%"PRIx64""
713
+nvme_mmio_start_success(void) "setting controller enable bit succeeded"
714
+nvme_mmio_stopped(void) "cleared controller enable bit"
715
+nvme_mmio_shutdown_set(void) "shutdown bit set"
716
+nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
717
+
718
+# nvme traces for error conditions
719
+nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
720
+nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
721
+nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""
722
+nvme_err_invalid_prp2_missing(void) "PRP2 is null and more data to be transferred"
723
+nvme_err_invalid_field(void) "invalid field"
724
+nvme_err_invalid_prp(void) "invalid PRP"
725
+nvme_err_invalid_sgl(void) "invalid SGL"
726
+nvme_err_invalid_ns(uint32_t ns, uint32_t limit) "invalid namespace %u not within 1-%u"
727
+nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8""
728
+nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8""
729
+nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64""
730
+nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16""
731
+nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16""
732
+nvme_err_invalid_create_sq_sqid(uint16_t sqid) "failed creating submission queue, invalid sqid=%"PRIu16""
733
+nvme_err_invalid_create_sq_size(uint16_t qsize) "failed creating submission queue, invalid qsize=%"PRIu16""
734
+nvme_err_invalid_create_sq_addr(uint64_t addr) "failed creating submission queue, addr=0x%"PRIx64""
735
+nvme_err_invalid_create_sq_qflags(uint16_t qflags) "failed creating submission queue, qflags=%"PRIu16""
736
+nvme_err_invalid_del_cq_cqid(uint16_t cqid) "failed deleting completion queue, cqid=%"PRIu16""
737
+nvme_err_invalid_del_cq_notempty(uint16_t cqid) "failed deleting completion queue, it is not empty, cqid=%"PRIu16""
738
+nvme_err_invalid_create_cq_cqid(uint16_t cqid) "failed creating completion queue, cqid=%"PRIu16""
739
+nvme_err_invalid_create_cq_size(uint16_t size) "failed creating completion queue, size=%"PRIu16""
740
+nvme_err_invalid_create_cq_addr(uint64_t addr) "failed creating completion queue, addr=0x%"PRIx64""
741
+nvme_err_invalid_create_cq_vector(uint16_t vector) "failed creating completion queue, vector=%"PRIu16""
742
+nvme_err_invalid_create_cq_qflags(uint16_t qflags) "failed creating completion queue, qflags=%"PRIu16""
743
+nvme_err_invalid_identify_cns(uint16_t cns) "identify, invalid cns=0x%"PRIx16""
744
+nvme_err_invalid_getfeat(int dw10) "invalid get features, dw10=0x%"PRIx32""
745
+nvme_err_invalid_setfeat(uint32_t dw10) "invalid set features, dw10=0x%"PRIx32""
746
+nvme_err_startfail_cq(void) "nvme_start_ctrl failed because there are non-admin completion queues"
747
+nvme_err_startfail_sq(void) "nvme_start_ctrl failed because there are non-admin submission queues"
748
+nvme_err_startfail_nbarasq(void) "nvme_start_ctrl failed because the admin submission queue address is null"
749
+nvme_err_startfail_nbaracq(void) "nvme_start_ctrl failed because the admin completion queue address is null"
750
+nvme_err_startfail_asq_misaligned(uint64_t addr) "nvme_start_ctrl failed because the admin submission queue address is misaligned: 0x%"PRIx64""
751
+nvme_err_startfail_acq_misaligned(uint64_t addr) "nvme_start_ctrl failed because the admin completion queue address is misaligned: 0x%"PRIx64""
752
+nvme_err_startfail_page_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the page size is too small: log2size=%u, min=%u"
753
+nvme_err_startfail_page_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the page size is too large: log2size=%u, max=%u"
754
+nvme_err_startfail_cqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too small: log2size=%u, min=%u"
755
+nvme_err_startfail_cqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too large: log2size=%u, max=%u"
756
+nvme_err_startfail_sqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too small: log2size=%u, min=%u"
757
+nvme_err_startfail_sqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too large: log2size=%u, max=%u"
758
+nvme_err_startfail_asqent_sz_zero(void) "nvme_start_ctrl failed because the admin submission queue size is zero"
759
+nvme_err_startfail_acqent_sz_zero(void) "nvme_start_ctrl failed because the admin completion queue size is zero"
760
+nvme_err_startfail(void) "setting controller enable bit failed"
761
+
762
+# Traces for undefined behavior
763
+nvme_ub_mmiowr_misaligned32(uint64_t offset) "MMIO write not 32-bit aligned, offset=0x%"PRIx64""
764
+nvme_ub_mmiowr_toosmall(uint64_t offset, unsigned size) "MMIO write smaller than 32 bits, offset=0x%"PRIx64", size=%u"
765
+nvme_ub_mmiowr_intmask_with_msix(void) "undefined access to interrupt mask set when MSI-X is enabled"
766
+nvme_ub_mmiowr_ro_csts(void) "attempted to set a read only bit of controller status"
767
+nvme_ub_mmiowr_ssreset_w1c_unsupported(void) "attempted to W1C CSTS.NSSRO but CAP.NSSRS is zero (not supported)"
768
+nvme_ub_mmiowr_ssreset_unsupported(void) "attempted NVM subsystem reset but CAP.NSSRS is zero (not supported)"
769
+nvme_ub_mmiowr_cmbloc_reserved(void) "invalid write to reserved CMBLOC when CMBSZ is zero, ignored"
770
+nvme_ub_mmiowr_cmbsz_readonly(void) "invalid write to read only CMBSZ, ignored"
771
+nvme_ub_mmiowr_invalid(uint64_t offset, uint64_t data) "invalid MMIO write, offset=0x%"PRIx64", data=0x%"PRIx64""
772
+nvme_ub_mmiord_misaligned32(uint64_t offset) "MMIO read not 32-bit aligned, offset=0x%"PRIx64""
773
+nvme_ub_mmiord_toosmall(uint64_t offset) "MMIO read smaller than 32-bits, offset=0x%"PRIx64""
774
+nvme_ub_mmiord_invalid_ofs(uint64_t offset) "MMIO read beyond last register, offset=0x%"PRIx64", returning 0"
775
+nvme_ub_db_wr_misaligned(uint64_t offset) "doorbell write not 32-bit aligned, offset=0x%"PRIx64", ignoring"
776
+nvme_ub_db_wr_invalid_cq(uint32_t qid) "completion queue doorbell write for nonexistent queue, cqid=%"PRIu32", ignoring"
777
+nvme_ub_db_wr_invalid_cqhead(uint32_t qid, uint16_t new_head) "completion queue doorbell write value beyond queue size, cqid=%"PRIu32", new_head=%"PRIu16", ignoring"
778
+nvme_ub_db_wr_invalid_sq(uint32_t qid) "submission queue doorbell write for nonexistent queue, sqid=%"PRIu32", ignoring"
779
+nvme_ub_db_wr_invalid_sqtail(uint32_t qid, uint16_t new_tail) "submission queue doorbell write value beyond queue size, sqid=%"PRIu32", new_tail=%"PRIu16", ignoring"
780
+
781
# hw/block/xen_disk.c
782
xen_disk_alloc(char *name) "%s"
783
xen_disk_init(char *name) "%s"
208
--
784
--
209
1.8.3.1
785
2.13.6
210
786
211
787
diff view generated by jsdifflib
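
The NVME_GUEST_ERR() calls in the hunks above flag guest-triggered undefined behaviour; each one names a trace event from the "Traces for undefined behavior" list added to trace-events. The macro's definition is not part of this excerpt (it is added elsewhere in the patch, most likely in hw/block/nvme.h); the stand-in below only illustrates a plausible shape, a trace-style line plus a guest-error log line, and is not QEMU's actual implementation:

    #include <stdio.h>

    /* Simplified stand-in for NVME_GUEST_ERR(): emit a "trace" line and a
     * guest-error line for the same event.  In QEMU the first half would be
     * a generated trace_*() call and the second a qemu_log_mask() call with
     * LOG_GUEST_ERROR. */
    #define GUEST_ERR(event, fmt, ...)                                        \
        do {                                                                  \
            fprintf(stderr, "trace: " #event " " fmt "\n", ##__VA_ARGS__);    \
            fprintf(stderr, "guest error: " #event ": " fmt "\n",             \
                    ##__VA_ARGS__);                                           \
        } while (0)

    int main(void)
    {
        unsigned long long offset = 0x3;   /* hypothetical misaligned offset */

        GUEST_ERR(mmiowr_misaligned32,
                  "MMIO write not 32-bit aligned, offset=0x%llx", offset);
        return 0;
    }
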
1
From: Stefan Hajnoczi <stefanha@redhat.com>
1
From: Fam Zheng <famz@redhat.com>
2
2
3
blk/bdrv_drain_all() only takes effect for a single instant and then
3
Management tools create overlays of running guests with qemu-img:
4
resumes block jobs, guest devices, and other external clients like the
5
NBD server. This can be handy when performing a synchronous drain
6
before terminating the program, for example.
7
4
8
Monitor commands usually need to quiesce I/O across an entire code
5
$ qemu-img create -b /image/in/use.qcow2 -f qcow2 /overlay/image.qcow2
9
region so blk/bdrv_drain_all() is not suitable. They must use
10
bdrv_drain_all_begin/end() to mark the region. This prevents new I/O
11
requests from slipping in or worse - block jobs completing and modifying
12
the graph.
13
6
14
I audited other blk/bdrv_drain_all() callers but did not find anything
7
but this doesn't work anymore due to image locking:
15
that needs a similar fix. This patch fixes the savevm/loadvm commands.
16
Although I haven't encountered a real-world issue, this makes the code
17
safer.
18
8
19
Suggested-by: Kevin Wolf <kwolf@redhat.com>
9
qemu-img: /overlay/image.qcow2: Failed to get shared "write" lock
20
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Is another process using the image?
11
Could not open backing image to determine size.
12
Use the force share option to allow this use case again.
13
14
Cc: qemu-stable@nongnu.org
15
Signed-off-by: Fam Zheng <famz@redhat.com>
21
Reviewed-by: Eric Blake <eblake@redhat.com>
16
Reviewed-by: Eric Blake <eblake@redhat.com>
22
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
17
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
23
---
18
---
24
migration/savevm.c | 18 +++++++++++++++---
19
block.c | 3 ++-
25
1 file changed, 15 insertions(+), 3 deletions(-)
20
1 file changed, 2 insertions(+), 1 deletion(-)
26
21
27
diff --git a/migration/savevm.c b/migration/savevm.c
22
diff --git a/block.c b/block.c
28
index XXXXXXX..XXXXXXX 100644
23
index XXXXXXX..XXXXXXX 100644
29
--- a/migration/savevm.c
24
--- a/block.c
30
+++ b/migration/savevm.c
25
+++ b/block.c
31
@@ -XXX,XX +XXX,XX @@ int save_snapshot(const char *name, Error **errp)
26
@@ -XXX,XX +XXX,XX @@ void bdrv_img_create(const char *filename, const char *fmt,
32
}
27
back_flags = flags;
33
vm_stop(RUN_STATE_SAVE_VM);
28
back_flags &= ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
34
29
35
+ bdrv_drain_all_begin();
30
+ backing_options = qdict_new();
36
+
31
if (backing_fmt) {
37
aio_context_acquire(aio_context);
32
- backing_options = qdict_new();
38
33
qdict_put_str(backing_options, "driver", backing_fmt);
39
memset(sn, 0, sizeof(*sn));
34
}
40
@@ -XXX,XX +XXX,XX @@ int save_snapshot(const char *name, Error **errp)
35
+ qdict_put_bool(backing_options, BDRV_OPT_FORCE_SHARE, true);
41
if (aio_context) {
36
42
aio_context_release(aio_context);
37
bs = bdrv_open(full_backing, NULL, backing_options, back_flags,
43
}
38
&local_err);
44
+
45
+ bdrv_drain_all_end();
46
+
47
if (saved_vm_running) {
48
vm_start();
49
}
50
@@ -XXX,XX +XXX,XX @@ int load_snapshot(const char *name, Error **errp)
51
}
52
53
/* Flush all IO requests so they don't interfere with the new state. */
54
- bdrv_drain_all();
55
+ bdrv_drain_all_begin();
56
57
ret = bdrv_all_goto_snapshot(name, &bs);
58
if (ret < 0) {
59
error_setg(errp, "Error %d while activating snapshot '%s' on '%s'",
60
ret, name, bdrv_get_device_name(bs));
61
- return ret;
62
+ goto err_drain;
63
}
64
65
/* restore the VM state */
66
f = qemu_fopen_bdrv(bs_vm_state, 0);
67
if (!f) {
68
error_setg(errp, "Could not open VM state file");
69
- return -EINVAL;
70
+ ret = -EINVAL;
71
+ goto err_drain;
72
}
73
74
qemu_system_reset(SHUTDOWN_CAUSE_NONE);
75
@@ -XXX,XX +XXX,XX @@ int load_snapshot(const char *name, Error **errp)
76
ret = qemu_loadvm_state(f);
77
aio_context_release(aio_context);
78
79
+ bdrv_drain_all_end();
80
+
81
migration_incoming_state_destroy();
82
if (ret < 0) {
83
error_setg(errp, "Error %d while loading VM state", ret);
84
@@ -XXX,XX +XXX,XX @@ int load_snapshot(const char *name, Error **errp)
85
}
86
87
return 0;
88
+
89
+err_drain:
90
+ bdrv_drain_all_end();
91
+ return ret;
92
}
93
94
void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
95
--
39
--
96
1.8.3.1
40
2.13.6
97
41
98
42
diff view generated by jsdifflib
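
The savevm/loadvm hunks above bracket the whole operation with bdrv_drain_all_begin() and bdrv_drain_all_end(), and route the error paths through the new err_drain label so the end call is never skipped. A minimal sketch of that pattern, assuming the QEMU tree for the drain functions; prepare_state() and write_state() are hypothetical placeholders:

    /* Sketch only: mirrors the structure the patch gives save_snapshot()
     * and load_snapshot().  Builds only inside the QEMU tree. */
    static int prepare_state(void);     /* hypothetical step 1 */
    static int write_state(void);       /* hypothetical step 2 */

    static int run_with_block_layer_quiesced(void)
    {
        int ret;

        bdrv_drain_all_begin();   /* stop new requests, wait for in-flight I/O */

        ret = prepare_state();
        if (ret < 0) {
            goto out;             /* error paths must still reach _end() */
        }
        ret = write_state();

    out:
        bdrv_drain_all_end();     /* resume jobs, devices and external clients */
        return ret;
    }
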
1
From: Alberto Garcia <berto@igalia.com>
1
From: Thomas Huth <thuth@redhat.com>
2
2
3
Instead of passing a single buffer pointer to do_perform_cow_write(),
3
It's not working anymore since QEMU v1.3.0 - time to remove it now.
4
pass a QEMUIOVector. This will allow us to merge the write requests
5
for the COW regions and the actual data into a single one.
6
4
7
Although do_perform_cow_read() does not strictly need to change its
5
Signed-off-by: Thomas Huth <thuth@redhat.com>
8
API, we're doing it here as well for consistency.
6
Reviewed-by: John Snow <jsnow@redhat.com>
9
7
Reviewed-by: Markus Armbruster <armbru@redhat.com>
10
Signed-off-by: Alberto Garcia <berto@igalia.com>
11
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
8
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
13
---
9
---
14
block/qcow2-cluster.c | 51 ++++++++++++++++++++++++---------------------------
10
blockdev.c | 11 -----------
15
1 file changed, 24 insertions(+), 27 deletions(-)
11
qemu-doc.texi | 6 ------
12
2 files changed, 17 deletions(-)
16
13
17
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
14
diff --git a/blockdev.c b/blockdev.c
18
index XXXXXXX..XXXXXXX 100644
15
index XXXXXXX..XXXXXXX 100644
19
--- a/block/qcow2-cluster.c
16
--- a/blockdev.c
20
+++ b/block/qcow2-cluster.c
17
+++ b/blockdev.c
21
@@ -XXX,XX +XXX,XX @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
18
@@ -XXX,XX +XXX,XX @@ QemuOptsList qemu_legacy_drive_opts = {
22
static int coroutine_fn do_perform_cow_read(BlockDriverState *bs,
19
.type = QEMU_OPT_STRING,
23
uint64_t src_cluster_offset,
20
.help = "chs translation (auto, lba, none)",
24
unsigned offset_in_cluster,
21
},{
25
- uint8_t *buffer,
22
- .name = "boot",
26
- unsigned bytes)
23
- .type = QEMU_OPT_BOOL,
27
+ QEMUIOVector *qiov)
24
- .help = "(deprecated, ignored)",
28
{
25
- },{
29
- QEMUIOVector qiov;
26
.name = "addr",
30
- struct iovec iov = { .iov_base = buffer, .iov_len = bytes };
27
.type = QEMU_OPT_STRING,
31
int ret;
28
.help = "pci address (virtio only)",
32
29
@@ -XXX,XX +XXX,XX @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type)
33
- if (bytes == 0) {
34
+ if (qiov->size == 0) {
35
return 0;
36
}
37
38
- qemu_iovec_init_external(&qiov, &iov, 1);
39
-
40
BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
41
42
if (!bs->drv) {
43
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn do_perform_cow_read(BlockDriverState *bs,
44
* which can lead to deadlock when block layer copy-on-read is enabled.
45
*/
46
ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster,
47
- bytes, &qiov, 0);
48
+ qiov->size, qiov, 0);
49
if (ret < 0) {
50
return ret;
51
}
52
@@ -XXX,XX +XXX,XX @@ static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs,
53
static int coroutine_fn do_perform_cow_write(BlockDriverState *bs,
54
uint64_t cluster_offset,
55
unsigned offset_in_cluster,
56
- uint8_t *buffer,
57
- unsigned bytes)
58
+ QEMUIOVector *qiov)
59
{
60
- QEMUIOVector qiov;
61
- struct iovec iov = { .iov_base = buffer, .iov_len = bytes };
62
int ret;
63
64
- if (bytes == 0) {
65
+ if (qiov->size == 0) {
66
return 0;
67
}
68
69
- qemu_iovec_init_external(&qiov, &iov, 1);
70
-
71
ret = qcow2_pre_write_overlap_check(bs, 0,
72
- cluster_offset + offset_in_cluster, bytes);
73
+ cluster_offset + offset_in_cluster, qiov->size);
74
if (ret < 0) {
75
return ret;
76
}
77
78
BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
79
ret = bdrv_co_pwritev(bs->file, cluster_offset + offset_in_cluster,
80
- bytes, &qiov, 0);
81
+ qiov->size, qiov, 0);
82
if (ret < 0) {
83
return ret;
84
}
85
@@ -XXX,XX +XXX,XX @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
86
unsigned data_bytes = end->offset - (start->offset + start->nb_bytes);
87
bool merge_reads;
88
uint8_t *start_buffer, *end_buffer;
89
+ QEMUIOVector qiov;
90
int ret;
91
92
assert(start->nb_bytes <= UINT_MAX - end->nb_bytes);
93
@@ -XXX,XX +XXX,XX @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
94
/* The part of the buffer where the end region is located */
95
end_buffer = start_buffer + buffer_size - end->nb_bytes;
96
97
+ qemu_iovec_init(&qiov, 1);
98
+
99
qemu_co_mutex_unlock(&s->lock);
100
/* First we read the existing data from both COW regions. We
101
* either read the whole region in one go, or the start and end
102
* regions separately. */
103
if (merge_reads) {
104
- ret = do_perform_cow_read(bs, m->offset, start->offset,
105
- start_buffer, buffer_size);
106
+ qemu_iovec_add(&qiov, start_buffer, buffer_size);
107
+ ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
108
} else {
109
- ret = do_perform_cow_read(bs, m->offset, start->offset,
110
- start_buffer, start->nb_bytes);
111
+ qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
112
+ ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
113
if (ret < 0) {
114
goto fail;
115
}
116
117
- ret = do_perform_cow_read(bs, m->offset, end->offset,
118
- end_buffer, end->nb_bytes);
119
+ qemu_iovec_reset(&qiov);
120
+ qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
121
+ ret = do_perform_cow_read(bs, m->offset, end->offset, &qiov);
122
}
123
if (ret < 0) {
124
goto fail;
125
@@ -XXX,XX +XXX,XX @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
126
}
127
128
/* And now we can write everything */
129
- ret = do_perform_cow_write(bs, m->alloc_offset, start->offset,
130
- start_buffer, start->nb_bytes);
131
+ qemu_iovec_reset(&qiov);
132
+ qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
133
+ ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
134
if (ret < 0) {
135
goto fail;
30
goto fail;
136
}
31
}
137
32
138
- ret = do_perform_cow_write(bs, m->alloc_offset, end->offset,
33
- /* Deprecated option boot=[on|off] */
139
- end_buffer, end->nb_bytes);
34
- if (qemu_opt_get(legacy_opts, "boot") != NULL) {
140
+ qemu_iovec_reset(&qiov);
35
- fprintf(stderr, "qemu-kvm: boot=on|off is deprecated and will be "
141
+ qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
36
- "ignored. Future versions will reject this parameter. Please "
142
+ ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov);
37
- "update your scripts.\n");
143
fail:
38
- }
144
qemu_co_mutex_lock(&s->lock);
39
-
145
40
/* Other deprecated options */
146
@@ -XXX,XX +XXX,XX @@ fail:
41
if (!qtest_enabled()) {
147
}
42
for (i = 0; i < ARRAY_SIZE(deprecated); i++) {
148
43
diff --git a/qemu-doc.texi b/qemu-doc.texi
149
qemu_vfree(start_buffer);
44
index XXXXXXX..XXXXXXX 100644
150
+ qemu_iovec_destroy(&qiov);
45
--- a/qemu-doc.texi
151
return ret;
46
+++ b/qemu-doc.texi
152
}
47
@@ -XXX,XX +XXX,XX @@ deprecated.
153
48
49
@section System emulator command line arguments
50
51
-@subsection -drive boot=on|off (since 1.3.0)
52
-
53
-The ``boot=on|off'' option to the ``-drive'' argument is
54
-ignored. Applications should use the ``bootindex=N'' parameter
55
-to set an absolute ordering between devices instead.
56
-
57
@subsection -tdf (since 1.3.0)
58
59
The ``-tdf'' argument is ignored. The behaviour implemented
154
--
60
--
155
1.8.3.1
61
2.13.6
156
62
157
63
diff view generated by jsdifflib
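
The qcow2 patch above replaces the bare buffer arguments of do_perform_cow_read()/do_perform_cow_write() with a QEMUIOVector, so the COW buffers and the guest data can later be chained into a single request. The iovec calls are exactly the ones visible in the hunks; a sketch of that lifecycle (QEMU tree only, with buf1/buf2/len1/len2 as hypothetical buffers):

    /* Sketch: one QEMUIOVector reused for consecutive requests, as
     * perform_cow() now does. */
    static void qiov_lifecycle_sketch(uint8_t *buf1, size_t len1,
                                      uint8_t *buf2, size_t len2)
    {
        QEMUIOVector qiov;

        qemu_iovec_init(&qiov, 1);          /* room for one element, grows on demand */

        qemu_iovec_add(&qiov, buf1, len1);  /* describe the first region */
        /* ... issue a request with &qiov, e.g. do_perform_cow_read() ... */

        qemu_iovec_reset(&qiov);            /* drop the entries, keep the qiov */
        qemu_iovec_add(&qiov, buf2, len2);  /* reuse it for the next region */
        /* ... issue the next request ... */

        qemu_iovec_destroy(&qiov);          /* free the element array */
    }
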
1
This documents the driver-specific options for the raw, qcow2 and file
1
From: Thomas Huth <thuth@redhat.com>
2
block drivers for the man page. For everything else, we refer to the
2
3
QAPI documentation.
3
It's been marked as deprecated since QEMU v2.10.0, and so far nobody
4
4
complained that we should keep it, so let's remove this legacy option
5
now to simplify the code quite a bit.
6
7
Signed-off-by: Thomas Huth <thuth@redhat.com>
8
Reviewed-by: John Snow <jsnow@redhat.com>
9
Reviewed-by: Markus Armbruster <armbru@redhat.com>
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
10
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
6
Reviewed-by: Eric Blake <eblake@redhat.com>
7
Reviewed-by: Max Reitz <mreitz@redhat.com>
8
---
11
---
9
qemu-options.hx | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
12
vl.c | 86 ++-------------------------------------------------------
10
1 file changed, 114 insertions(+), 1 deletion(-)
13
qemu-doc.texi | 8 ------
11
14
qemu-options.hx | 19 ++-----------
15
3 files changed, 4 insertions(+), 109 deletions(-)
16
17
diff --git a/vl.c b/vl.c
18
index XXXXXXX..XXXXXXX 100644
19
--- a/vl.c
20
+++ b/vl.c
21
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
22
const char *boot_order = NULL;
23
const char *boot_once = NULL;
24
DisplayState *ds;
25
- int cyls, heads, secs, translation;
26
QemuOpts *opts, *machine_opts;
27
- QemuOpts *hda_opts = NULL, *icount_opts = NULL, *accel_opts = NULL;
28
+ QemuOpts *icount_opts = NULL, *accel_opts = NULL;
29
QemuOptsList *olist;
30
int optind;
31
const char *optarg;
32
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
33
34
cpu_model = NULL;
35
snapshot = 0;
36
- cyls = heads = secs = 0;
37
- translation = BIOS_ATA_TRANSLATION_AUTO;
38
39
nb_nics = 0;
40
41
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
42
if (optind >= argc)
43
break;
44
if (argv[optind][0] != '-') {
45
- hda_opts = drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);
46
+ drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);
47
} else {
48
const QEMUOption *popt;
49
50
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
51
cpu_model = optarg;
52
break;
53
case QEMU_OPTION_hda:
54
- {
55
- char buf[256];
56
- if (cyls == 0)
57
- snprintf(buf, sizeof(buf), "%s", HD_OPTS);
58
- else
59
- snprintf(buf, sizeof(buf),
60
- "%s,cyls=%d,heads=%d,secs=%d%s",
61
- HD_OPTS , cyls, heads, secs,
62
- translation == BIOS_ATA_TRANSLATION_LBA ?
63
- ",trans=lba" :
64
- translation == BIOS_ATA_TRANSLATION_NONE ?
65
- ",trans=none" : "");
66
- drive_add(IF_DEFAULT, 0, optarg, buf);
67
- break;
68
- }
69
case QEMU_OPTION_hdb:
70
case QEMU_OPTION_hdc:
71
case QEMU_OPTION_hdd:
72
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
73
case QEMU_OPTION_snapshot:
74
snapshot = 1;
75
break;
76
- case QEMU_OPTION_hdachs:
77
- {
78
- const char *p;
79
- p = optarg;
80
- cyls = strtol(p, (char **)&p, 0);
81
- if (cyls < 1 || cyls > 16383)
82
- goto chs_fail;
83
- if (*p != ',')
84
- goto chs_fail;
85
- p++;
86
- heads = strtol(p, (char **)&p, 0);
87
- if (heads < 1 || heads > 16)
88
- goto chs_fail;
89
- if (*p != ',')
90
- goto chs_fail;
91
- p++;
92
- secs = strtol(p, (char **)&p, 0);
93
- if (secs < 1 || secs > 63)
94
- goto chs_fail;
95
- if (*p == ',') {
96
- p++;
97
- if (!strcmp(p, "large")) {
98
- translation = BIOS_ATA_TRANSLATION_LARGE;
99
- } else if (!strcmp(p, "rechs")) {
100
- translation = BIOS_ATA_TRANSLATION_RECHS;
101
- } else if (!strcmp(p, "none")) {
102
- translation = BIOS_ATA_TRANSLATION_NONE;
103
- } else if (!strcmp(p, "lba")) {
104
- translation = BIOS_ATA_TRANSLATION_LBA;
105
- } else if (!strcmp(p, "auto")) {
106
- translation = BIOS_ATA_TRANSLATION_AUTO;
107
- } else {
108
- goto chs_fail;
109
- }
110
- } else if (*p != '\0') {
111
- chs_fail:
112
- error_report("invalid physical CHS format");
113
- exit(1);
114
- }
115
- if (hda_opts != NULL) {
116
- qemu_opt_set_number(hda_opts, "cyls", cyls,
117
- &error_abort);
118
- qemu_opt_set_number(hda_opts, "heads", heads,
119
- &error_abort);
120
- qemu_opt_set_number(hda_opts, "secs", secs,
121
- &error_abort);
122
- if (translation == BIOS_ATA_TRANSLATION_LARGE) {
123
- qemu_opt_set(hda_opts, "trans", "large",
124
- &error_abort);
125
- } else if (translation == BIOS_ATA_TRANSLATION_RECHS) {
126
- qemu_opt_set(hda_opts, "trans", "rechs",
127
- &error_abort);
128
- } else if (translation == BIOS_ATA_TRANSLATION_LBA) {
129
- qemu_opt_set(hda_opts, "trans", "lba",
130
- &error_abort);
131
- } else if (translation == BIOS_ATA_TRANSLATION_NONE) {
132
- qemu_opt_set(hda_opts, "trans", "none",
133
- &error_abort);
134
- }
135
- }
136
- }
137
- error_report("'-hdachs' is deprecated, please use '-device"
138
- " ide-hd,cyls=c,heads=h,secs=s,...' instead");
139
- break;
140
case QEMU_OPTION_numa:
141
opts = qemu_opts_parse_noisily(qemu_find_opts("numa"),
142
optarg, true);
143
diff --git a/qemu-doc.texi b/qemu-doc.texi
144
index XXXXXXX..XXXXXXX 100644
145
--- a/qemu-doc.texi
146
+++ b/qemu-doc.texi
147
@@ -XXX,XX +XXX,XX @@ The ``--net dump'' argument is now replaced with the
148
``-object filter-dump'' argument which works in combination
149
with the modern ``-netdev`` backends instead.
150
151
-@subsection -hdachs (since 2.10.0)
152
-
153
-The ``-hdachs'' argument is now a synonym for setting
154
-the ``cyls'', ``heads'', ``secs'', and ``trans'' properties
155
-on the ``ide-hd'' device using the ``-device'' argument.
156
-The new syntax allows different settings to be provided
157
-per disk.
158
-
159
@subsection -usbdevice (since 2.10.0)
160
161
The ``-usbdevice DEV'' argument is now a synonym for setting
12
diff --git a/qemu-options.hx b/qemu-options.hx
162
diff --git a/qemu-options.hx b/qemu-options.hx
13
index XXXXXXX..XXXXXXX 100644
163
index XXXXXXX..XXXXXXX 100644
14
--- a/qemu-options.hx
164
--- a/qemu-options.hx
15
+++ b/qemu-options.hx
165
+++ b/qemu-options.hx
16
@@ -XXX,XX +XXX,XX @@ STEXI
166
@@ -XXX,XX +XXX,XX @@ of available connectors of a given interface type.
17
@item -blockdev @var{option}[,@var{option}[,@var{option}[,...]]]
167
@item media=@var{media}
18
@findex -blockdev
168
This option defines the type of the media: disk or cdrom.
19
169
@item cyls=@var{c},heads=@var{h},secs=@var{s}[,trans=@var{t}]
20
-Define a new block driver node.
170
-These options have the same definition as they have in @option{-hdachs}.
21
+Define a new block driver node. Some of the options apply to all block drivers,
171
-These parameters are deprecated, use the corresponding parameters
22
+other options are only accepted for a specific block driver. See below for a
172
+Force disk physical geometry and the optional BIOS translation (trans=none or
23
+list of generic options and options for the most common block drivers.
173
+lba). These parameters are deprecated, use the corresponding parameters
24
+
174
of @code{-device} instead.
25
+Options that expect a reference to another node (e.g. @code{file}) can be
175
@item snapshot=@var{snapshot}
26
+given in two ways. Either you specify the node name of an already existing node
176
@var{snapshot} is "on" or "off" and controls snapshot mode for the given drive
27
+(file=@var{node-name}), or you define a new node inline, adding options
177
@@ -XXX,XX +XXX,XX @@ the raw disk image you use is not written back. You can however force
28
+for the referenced node after a dot (file.filename=@var{path},file.aio=native).
178
the write back by pressing @key{C-a s} (@pxref{disk_images}).
29
+
30
+A block driver node created with @option{-blockdev} can be used for a guest
31
+device by specifying its node name for the @code{drive} property in a
32
+@option{-device} argument that defines a block device.
33
34
@table @option
35
@item Valid options for any block driver node:
36
@@ -XXX,XX +XXX,XX @@ zero write commands. You may even choose "unmap" if @var{discard} is set
37
to "unmap" to allow a zero write to be converted to an @code{unmap} operation.
38
@end table
39
40
+@item Driver-specific options for @code{file}
41
+
42
+This is the protocol-level block driver for accessing regular files.
43
+
44
+@table @code
45
+@item filename
46
+The path to the image file in the local filesystem
47
+@item aio
48
+Specifies the AIO backend (threads/native, default: threads)
49
+@end table
50
+Example:
51
+@example
52
+-blockdev driver=file,node-name=disk,filename=disk.img
53
+@end example
54
+
55
+@item Driver-specific options for @code{raw}
56
+
57
+This is the image format block driver for raw images. It is usually
58
+stacked on top of a protocol level block driver such as @code{file}.
59
+
60
+@table @code
61
+@item file
62
+Reference to or definition of the data source block driver node
63
+(e.g. a @code{file} driver node)
64
+@end table
65
+Example 1:
66
+@example
67
+-blockdev driver=file,node-name=disk_file,filename=disk.img
68
+-blockdev driver=raw,node-name=disk,file=disk_file
69
+@end example
70
+Example 2:
71
+@example
72
+-blockdev driver=raw,node-name=disk,file.driver=file,file.filename=disk.img
73
+@end example
74
+
75
+@item Driver-specific options for @code{qcow2}
76
+
77
+This is the image format block driver for qcow2 images. It is usually
78
+stacked on top of a protocol level block driver such as @code{file}.
79
+
80
+@table @code
81
+@item file
82
+Reference to or definition of the data source block driver node
83
+(e.g. a @code{file} driver node)
84
+
85
+@item backing
86
+Reference to or definition of the backing file block device (default is taken
87
+from the image file). It is allowed to pass an empty string here in order to
88
+disable the default backing file.
89
+
90
+@item lazy-refcounts
91
+Whether to enable the lazy refcounts feature (on/off; default is taken from the
92
+image file)
93
+
94
+@item cache-size
95
+The maximum total size of the L2 table and refcount block caches in bytes
96
+(default: 1048576 bytes or 8 clusters, whichever is larger)
97
+
98
+@item l2-cache-size
99
+The maximum size of the L2 table cache in bytes
100
+(default: 4/5 of the total cache size)
101
+
102
+@item refcount-cache-size
103
+The maximum size of the refcount block cache in bytes
104
+(default: 1/5 of the total cache size)
105
+
106
+@item cache-clean-interval
107
+Clean unused entries in the L2 and refcount caches. The interval is in seconds.
108
+The default value is 0 and it disables this feature.
109
+
110
+@item pass-discard-request
111
+Whether discard requests to the qcow2 device should be forwarded to the data
112
+source (on/off; default: on if discard=unmap is specified, off otherwise)
113
+
114
+@item pass-discard-snapshot
115
+Whether discard requests for the data source should be issued when a snapshot
116
+operation (e.g. deleting a snapshot) frees clusters in the qcow2 file (on/off;
117
+default: on)
118
+
119
+@item pass-discard-other
120
+Whether discard requests for the data source should be issued on other
121
+occasions where a cluster gets freed (on/off; default: off)
122
+
123
+@item overlap-check
124
+Which overlap checks to perform for writes to the image
125
+(none/constant/cached/all; default: cached). For details or finer
126
+granularity control refer to the QAPI documentation of @code{blockdev-add}.
127
+@end table
128
+
129
+Example 1:
130
+@example
131
+-blockdev driver=file,node-name=my_file,filename=/tmp/disk.qcow2
132
+-blockdev driver=qcow2,node-name=hda,file=my_file,overlap-check=none,cache-size=16777216
133
+@end example
134
+Example 2:
135
+@example
136
+-blockdev driver=qcow2,node-name=disk,file.driver=http,file.filename=http://example.com/image.qcow2
137
+@end example
138
+
139
+@item Driver-specific options for other drivers
140
+Please refer to the QAPI documentation of the @code{blockdev-add} QMP command.
141
+
142
@end table
143
144
ETEXI
179
ETEXI
180
181
-DEF("hdachs", HAS_ARG, QEMU_OPTION_hdachs, \
182
- "-hdachs c,h,s[,t]\n" \
183
- " force hard disk 0 physical geometry and the optional BIOS\n" \
184
- " translation (t=none or lba) (usually QEMU can guess them)\n",
185
- QEMU_ARCH_ALL)
186
-STEXI
187
-@item -hdachs @var{c},@var{h},@var{s},[,@var{t}]
188
-@findex -hdachs
189
-Force hard disk 0 physical geometry (1 <= @var{c} <= 16383, 1 <=
190
-@var{h} <= 16, 1 <= @var{s} <= 63) and optionally force the BIOS
191
-translation mode (@var{t}=none, lba or auto). Usually QEMU can guess
192
-all those parameters. This option is deprecated, please use
193
-@code{-device ide-hd,cyls=c,heads=h,secs=s,...} instead.
194
-ETEXI
195
-
196
DEF("fsdev", HAS_ARG, QEMU_OPTION_fsdev,
197
"-fsdev fsdriver,id=id[,path=path,][security_model={mapped-xattr|mapped-file|passthrough|none}]\n"
198
" [,writeout=immediate][,readonly][,socket=socket|sock_fd=sock_fd][,fmode=fmode][,dmode=dmode]\n"
145
--
199
--
146
1.8.3.1
200
2.13.6
147
201
148
202
diff view generated by jsdifflib
1
From: Alberto Garcia <berto@igalia.com>
1
From: Thomas Huth <thuth@redhat.com>
2
2
3
We already have functions for doing these calculations, so let's use
3
Looks like we forgot to announce the deprecation of these options in
4
them instead of doing everything by hand. This makes the code a bit
4
the corresponding chapter of the qemu-doc text, so let's do that now.
5
more readable.
6
5
7
Signed-off-by: Alberto Garcia <berto@igalia.com>
6
Signed-off-by: Thomas Huth <thuth@redhat.com>
7
Reviewed-by: John Snow <jsnow@redhat.com>
8
Reviewed-by: Markus Armbruster <armbru@redhat.com>
8
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
---
10
---
10
block/qcow2-cluster.c | 4 ++--
11
qemu-doc.texi | 15 +++++++++++++++
11
block/qcow2.c | 2 +-
12
1 file changed, 15 insertions(+)
12
2 files changed, 3 insertions(+), 3 deletions(-)
13
13
14
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
14
diff --git a/qemu-doc.texi b/qemu-doc.texi
15
index XXXXXXX..XXXXXXX 100644
15
index XXXXXXX..XXXXXXX 100644
16
--- a/block/qcow2-cluster.c
16
--- a/qemu-doc.texi
17
+++ b/block/qcow2-cluster.c
17
+++ b/qemu-doc.texi
18
@@ -XXX,XX +XXX,XX @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
18
@@ -XXX,XX +XXX,XX @@ longer be directly supported in QEMU.
19
19
The ``-drive if=scsi'' argument is replaced by the the
20
/* find the cluster offset for the given disk offset */
20
``-device BUS-TYPE'' argument combined with ``-drive if=none''.
21
21
22
- l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
22
+@subsection -drive cyls=...,heads=...,secs=...,trans=... (since 2.10.0)
23
+ l2_index = offset_to_l2_index(s, offset);
23
+
24
*cluster_offset = be64_to_cpu(l2_table[l2_index]);
24
+The drive geometry arguments are replaced by the geometry arguments
25
25
+that can be specified with the ``-device'' parameter.
26
nb_clusters = size_to_clusters(s, bytes_needed);
26
+
27
@@ -XXX,XX +XXX,XX @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
27
+@subsection -drive serial=... (since 2.10.0)
28
28
+
29
/* find the cluster offset for the given disk offset */
29
+The drive serial argument is replaced by the serial argument
30
30
+that can be specified with the ``-device'' parameter.
31
- l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
31
+
32
+ l2_index = offset_to_l2_index(s, offset);
32
+@subsection -drive addr=... (since 2.10.0)
33
33
+
34
*new_l2_table = l2_table;
34
+The drive addr argument is replaced by the addr argument
35
*new_l2_index = l2_index;
35
+that can be specified with the ``-device'' parameter.
36
diff --git a/block/qcow2.c b/block/qcow2.c
36
+
37
index XXXXXXX..XXXXXXX 100644
37
@subsection -net dump (since 2.10.0)
38
--- a/block/qcow2.c
38
39
+++ b/block/qcow2.c
39
The ``--net dump'' argument is now replaced with the
40
@@ -XXX,XX +XXX,XX @@ static int validate_table_offset(BlockDriverState *bs, uint64_t offset,
41
}
42
43
/* Tables must be cluster aligned */
44
- if (offset & (s->cluster_size - 1)) {
45
+ if (offset_into_cluster(s, offset) != 0) {
46
return -EINVAL;
47
}
48
49
--
40
--
50
1.8.3.1
41
2.13.6
51
42
52
43
diff view generated by jsdifflib
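
The two helpers used above are not defined in this diff, but the expressions they replace show what they compute; assuming the definitions match those expressions (the real ones live in block/qcow2.h), they amount to:

    /* Reconstructed from the expressions removed above; shown for reference only. */
    static inline int64_t offset_into_cluster(BDRVQcow2State *s, int64_t offset)
    {
        return offset & (s->cluster_size - 1);      /* byte offset within a cluster */
    }

    static inline int offset_to_l2_index(BDRVQcow2State *s, int64_t offset)
    {
        return (offset >> s->cluster_bits) & (s->l2_size - 1);   /* slot in the L2 table */
    }
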
1
From: Fam Zheng <famz@redhat.com>
2
3
Signed-off-by: Fam Zheng <famz@redhat.com>
1
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
4
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2
Reviewed-by: Eric Blake <eblake@redhat.com>
3
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
4
---
5
---
5
block/qed-cluster.c | 94 ++++++++++++++++++-----------------------------------
6
include/block/block_int.h | 1 -
6
block/qed-table.c | 15 +++------
7
block/io.c | 18 ------------------
7
block/qed.h | 3 +-
8
2 files changed, 19 deletions(-)
8
3 files changed, 36 insertions(+), 76 deletions(-)
9
9
10
diff --git a/block/qed-cluster.c b/block/qed-cluster.c
10
diff --git a/include/block/block_int.h b/include/block/block_int.h
11
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
12
--- a/block/qed-cluster.c
12
--- a/include/block/block_int.h
13
+++ b/block/qed-cluster.c
13
+++ b/include/block/block_int.h
14
@@ -XXX,XX +XXX,XX @@ static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s,
14
@@ -XXX,XX +XXX,XX @@ bool blk_dev_is_tray_open(BlockBackend *blk);
15
return i - index;
15
bool blk_dev_is_medium_locked(BlockBackend *blk);
16
17
void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes);
18
-bool bdrv_requests_pending(BlockDriverState *bs);
19
20
void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
21
void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in);
22
diff --git a/block/io.c b/block/io.c
23
index XXXXXXX..XXXXXXX 100644
24
--- a/block/io.c
25
+++ b/block/io.c
26
@@ -XXX,XX +XXX,XX @@ void bdrv_disable_copy_on_read(BlockDriverState *bs)
27
assert(old >= 1);
16
}
28
}
17
29
18
-typedef struct {
30
-/* Check if any requests are in-flight (including throttled requests) */
19
- BDRVQEDState *s;
31
-bool bdrv_requests_pending(BlockDriverState *bs)
20
- uint64_t pos;
32
-{
21
- size_t len;
33
- BdrvChild *child;
22
-
34
-
23
- QEDRequest *request;
35
- if (atomic_read(&bs->in_flight)) {
24
-
36
- return true;
25
- /* User callback */
26
- QEDFindClusterFunc *cb;
27
- void *opaque;
28
-} QEDFindClusterCB;
29
-
30
-static void qed_find_cluster_cb(void *opaque, int ret)
31
-{
32
- QEDFindClusterCB *find_cluster_cb = opaque;
33
- BDRVQEDState *s = find_cluster_cb->s;
34
- QEDRequest *request = find_cluster_cb->request;
35
- uint64_t offset = 0;
36
- size_t len = 0;
37
- unsigned int index;
38
- unsigned int n;
39
-
40
- qed_acquire(s);
41
- if (ret) {
42
- goto out;
43
- }
37
- }
44
-
38
-
45
- index = qed_l2_index(s, find_cluster_cb->pos);
39
- QLIST_FOREACH(child, &bs->children, next) {
46
- n = qed_bytes_to_clusters(s,
40
- if (bdrv_requests_pending(child->bs)) {
47
- qed_offset_into_cluster(s, find_cluster_cb->pos) +
41
- return true;
48
- find_cluster_cb->len);
42
- }
49
- n = qed_count_contiguous_clusters(s, request->l2_table->table,
50
- index, n, &offset);
51
-
52
- if (qed_offset_is_unalloc_cluster(offset)) {
53
- ret = QED_CLUSTER_L2;
54
- } else if (qed_offset_is_zero_cluster(offset)) {
55
- ret = QED_CLUSTER_ZERO;
56
- } else if (qed_check_cluster_offset(s, offset)) {
57
- ret = QED_CLUSTER_FOUND;
58
- } else {
59
- ret = -EINVAL;
60
- }
43
- }
61
-
44
-
62
- len = MIN(find_cluster_cb->len, n * s->header.cluster_size -
45
- return false;
63
- qed_offset_into_cluster(s, find_cluster_cb->pos));
64
-
65
-out:
66
- find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len);
67
- qed_release(s);
68
- g_free(find_cluster_cb);
69
-}
46
-}
70
-
47
-
71
/**
48
typedef struct {
72
* Find the offset of a data cluster
49
Coroutine *co;
73
*
50
BlockDriverState *bs;
74
@@ -XXX,XX +XXX,XX @@ out:
75
void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
76
size_t len, QEDFindClusterFunc *cb, void *opaque)
77
{
78
- QEDFindClusterCB *find_cluster_cb;
79
uint64_t l2_offset;
80
+ uint64_t offset = 0;
81
+ unsigned int index;
82
+ unsigned int n;
83
+ int ret;
84
85
/* Limit length to L2 boundary. Requests are broken up at the L2 boundary
86
* so that a request acts on one L2 table at a time.
87
@@ -XXX,XX +XXX,XX @@ void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
88
return;
89
}
90
91
- find_cluster_cb = g_malloc(sizeof(*find_cluster_cb));
92
- find_cluster_cb->s = s;
93
- find_cluster_cb->pos = pos;
94
- find_cluster_cb->len = len;
95
- find_cluster_cb->cb = cb;
96
- find_cluster_cb->opaque = opaque;
97
- find_cluster_cb->request = request;
98
+ ret = qed_read_l2_table(s, request, l2_offset);
99
+ qed_acquire(s);
100
+ if (ret) {
101
+ goto out;
102
+ }
103
+
104
+ index = qed_l2_index(s, pos);
105
+ n = qed_bytes_to_clusters(s,
106
+ qed_offset_into_cluster(s, pos) + len);
107
+ n = qed_count_contiguous_clusters(s, request->l2_table->table,
108
+ index, n, &offset);
109
+
110
+ if (qed_offset_is_unalloc_cluster(offset)) {
111
+ ret = QED_CLUSTER_L2;
112
+ } else if (qed_offset_is_zero_cluster(offset)) {
113
+ ret = QED_CLUSTER_ZERO;
114
+ } else if (qed_check_cluster_offset(s, offset)) {
115
+ ret = QED_CLUSTER_FOUND;
116
+ } else {
117
+ ret = -EINVAL;
118
+ }
119
+
120
+ len = MIN(len,
121
+ n * s->header.cluster_size - qed_offset_into_cluster(s, pos));
122
123
- qed_read_l2_table(s, request, l2_offset,
124
- qed_find_cluster_cb, find_cluster_cb);
125
+out:
126
+ cb(opaque, ret, offset, len);
127
+ qed_release(s);
128
}
129
diff --git a/block/qed-table.c b/block/qed-table.c
130
index XXXXXXX..XXXXXXX 100644
131
--- a/block/qed-table.c
132
+++ b/block/qed-table.c
133
@@ -XXX,XX +XXX,XX @@ int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
134
return ret;
135
}
136
137
-void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
138
- BlockCompletionFunc *cb, void *opaque)
139
+int qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset)
140
{
141
int ret;
142
143
@@ -XXX,XX +XXX,XX @@ void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
144
/* Check for cached L2 entry */
145
request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset);
146
if (request->l2_table) {
147
- cb(opaque, 0);
148
- return;
149
+ return 0;
150
}
151
152
request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
153
@@ -XXX,XX +XXX,XX @@ void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
154
}
155
qed_release(s);
156
157
- cb(opaque, ret);
158
+ return ret;
159
}
160
161
int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset)
162
{
163
- int ret = -EINPROGRESS;
164
-
165
- qed_read_l2_table(s, request, offset, qed_sync_cb, &ret);
166
- BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);
167
-
168
- return ret;
169
+ return qed_read_l2_table(s, request, offset);
170
}
171
172
void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
173
diff --git a/block/qed.h b/block/qed.h
174
index XXXXXXX..XXXXXXX 100644
175
--- a/block/qed.h
176
+++ b/block/qed.h
177
@@ -XXX,XX +XXX,XX @@ int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
178
unsigned int n);
179
int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
180
uint64_t offset);
181
-void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
182
- BlockCompletionFunc *cb, void *opaque);
183
+int qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset);
184
void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
185
unsigned int index, unsigned int n, bool flush,
186
BlockCompletionFunc *cb, void *opaque);
187
--
51
--
188
1.8.3.1
52
2.13.6
189
53
190
54
diff view generated by jsdifflib
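
The qed_find_cluster()/qed_read_l2_table() conversion above follows the pattern used throughout this series: a function that used to deliver its result through a completion callback now simply returns it, so the caller can use plain control flow and error returns. A self-contained toy illustration of the before/after shape (not QED code):

    #include <errno.h>
    #include <stdio.h>

    /* Before: the result is delivered through a completion callback. */
    typedef void CompletionFunc(void *opaque, int ret);

    static void lookup_async(int key, CompletionFunc *cb, void *opaque)
    {
        int ret = (key >= 0) ? 0 : -EINVAL;
        cb(opaque, ret);                  /* caller resumes inside the callback */
    }

    /* After: the same operation simply returns its status. */
    static int lookup(int key)
    {
        return (key >= 0) ? 0 : -EINVAL;  /* caller checks ret and continues inline */
    }

    static void done(void *opaque, int ret)
    {
        (void)opaque;
        printf("callback style: ret=%d\n", ret);
    }

    int main(void)
    {
        lookup_async(-1, done, NULL);
        printf("return style: ret=%d\n", lookup(-1));
        return 0;
    }
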
1
From: Stefan Hajnoczi <stefanha@redhat.com>
2
3
Calling aio_poll() directly may have been fine previously, but this is
4
the future, man! The difference between an aio_poll() loop and
5
BDRV_POLL_WHILE() is that BDRV_POLL_WHILE() releases the AioContext
6
around aio_poll().
7
8
This allows the IOThread to run fd handlers or BHs to complete the
9
request. Failure to release the AioContext causes deadlocks.
10
11
Using BDRV_POLL_WHILE() partially fixes a 'savevm' hang with -object
12
iothread.
13
14
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
15
Reviewed-by: Eric Blake <eblake@redhat.com>
16
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
17
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
1
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2
Reviewed-by: Fam Zheng <famz@redhat.com>
18
---
3
---
19
block/io.c | 4 +---
4
block/io.c | 6 ++++++
20
1 file changed, 1 insertion(+), 3 deletions(-)
5
1 file changed, 6 insertions(+)
21
6
22
diff --git a/block/io.c b/block/io.c
7
diff --git a/block/io.c b/block/io.c
23
index XXXXXXX..XXXXXXX 100644
8
index XXXXXXX..XXXXXXX 100644
24
--- a/block/io.c
9
--- a/block/io.c
25
+++ b/block/io.c
10
+++ b/block/io.c
26
@@ -XXX,XX +XXX,XX @@ bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
11
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
27
Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
12
BdrvNextIterator it;
28
13
GSList *aio_ctxs = NULL, *ctx;
29
bdrv_coroutine_enter(bs, co);
14
30
- while (data.ret == -EINPROGRESS) {
15
+ /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
31
- aio_poll(bdrv_get_aio_context(bs), true);
16
+ * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
32
- }
17
+ * nodes in several different AioContexts, so make sure we're in the main
33
+ BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
18
+ * context. */
34
return data.ret;
19
+ assert(qemu_get_current_aio_context() == qemu_get_aio_context());
35
}
20
+
36
}
21
block_job_pause_all();
22
23
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
37
--
24
--
38
1.8.3.1
25
2.13.6
39
26
40
27
diff view generated by jsdifflib
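
The reason for the conversion above is spelled out in the commit message: the open-coded loop keeps the node's AioContext held across aio_poll(), while BDRV_POLL_WHILE() releases it around each poll so an IOThread can run the fd handlers and bottom halves that complete the request. The two shapes, using the same names as the hunk (sketch only, QEMU tree API):

    /* Old: polls while still holding the AioContext; an IOThread that needs
     * the context to finish the request can never run, so this can deadlock. */
    while (data.ret == -EINPROGRESS) {
        aio_poll(bdrv_get_aio_context(bs), true);
    }

    /* New: BDRV_POLL_WHILE() drops the AioContext around each poll, letting
     * the IOThread make progress until the condition becomes false. */
    BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
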
1
Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
1
bdrv_drained_begin() doesn't increase bs->quiesce_counter recursively
2
just return an error code and let the caller handle it.
2
and also doesn't notify other parent nodes of children, which both means
3
that the child nodes are not actually drained, and bdrv_drained_begin()
4
is providing useful functionality only on a single node.
5
6
To keep things consistent, we also shouldn't call the block driver
7
callbacks recursively.
8
9
A proper recursive drain version that provides an actually working
10
drained section for child nodes will be introduced later.
3
11
4
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
5
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
13
Reviewed-by: Fam Zheng <famz@redhat.com>
6
---
14
---
7
block/qed.c | 43 ++++++++++++++++++++++++++-----------------
15
block/io.c | 16 +++++++++-------
8
1 file changed, 26 insertions(+), 17 deletions(-)
16
1 file changed, 9 insertions(+), 7 deletions(-)
9
17
10
diff --git a/block/qed.c b/block/qed.c
18
diff --git a/block/io.c b/block/io.c
11
index XXXXXXX..XXXXXXX 100644
19
index XXXXXXX..XXXXXXX 100644
12
--- a/block/qed.c
20
--- a/block/io.c
13
+++ b/block/qed.c
21
+++ b/block/io.c
14
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_l1_update(QEDAIOCB *acb)
22
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
15
/**
23
}
16
* Update L2 table with new cluster offsets and write them out
24
17
*/
25
/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
18
-static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
26
-static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
19
+static int qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
27
+static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, bool recursive)
20
{
28
{
21
BDRVQEDState *s = acb_to_s(acb);
29
BdrvChild *child, *tmp;
22
bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
30
BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};
23
- int index;
31
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
24
-
32
bdrv_coroutine_enter(bs, data.co);
25
- if (ret) {
33
BDRV_POLL_WHILE(bs, !data.done);
26
- goto err;
34
27
- }
35
- QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
28
+ int index, ret;
36
- bdrv_drain_invoke(child->bs, begin);
29
37
+ if (recursive) {
30
if (need_alloc) {
38
+ QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
31
qed_unref_l2_cache_entry(acb->request.l2_table);
39
+ bdrv_drain_invoke(child->bs, begin, true);
32
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
33
/* Write out the whole new L2 table */
34
ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true);
35
if (ret) {
36
- goto err;
37
+ return ret;
38
}
39
- ret = qed_aio_write_l1_update(acb);
40
- qed_aio_next_io(acb, ret);
41
-
42
+ return qed_aio_write_l1_update(acb);
43
} else {
44
/* Write out only the updated part of the L2 table */
45
ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
46
false);
47
- qed_aio_next_io(acb, ret);
48
+ if (ret) {
49
+ return ret;
50
+ }
40
+ }
51
}
41
}
52
- return;
53
-
54
-err:
55
- qed_aio_complete(acb, ret);
56
+ return 0;
57
}
42
}
58
43
59
/**
44
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
60
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_main(void *opaque, int ret)
45
bdrv_parent_drained_begin(bs);
61
*/
62
ret = bdrv_flush(s->bs->file->bs);
63
}
64
- qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
65
+ if (ret) {
66
+ goto err;
67
+ }
68
+ ret = qed_aio_write_l2_update(acb, acb->cur_cluster);
69
+ if (ret) {
70
+ goto err;
71
+ }
72
+ qed_aio_next_io(acb, 0);
73
}
46
}
74
+ return;
47
75
+
48
- bdrv_drain_invoke(bs, true);
76
+err:
49
+ bdrv_drain_invoke(bs, true, false);
77
+ qed_aio_complete(acb, ret);
50
bdrv_drain_recurse(bs);
78
}
51
}
79
52
80
/**
53
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
81
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_zero_cluster(void *opaque, int ret)
82
return;
83
}
54
}
84
55
85
- qed_aio_write_l2_update(acb, 0, 1);
56
/* Re-enable things in child-to-parent order */
86
+ ret = qed_aio_write_l2_update(acb, 1);
57
- bdrv_drain_invoke(bs, false);
87
+ if (ret < 0) {
58
+ bdrv_drain_invoke(bs, false, false);
88
+ qed_aio_complete(acb, ret);
59
bdrv_parent_drained_end(bs);
89
+ return;
60
aio_enable_external(bdrv_get_aio_context(bs));
90
+ }
91
+ qed_aio_next_io(acb, 0);
92
}
61
}
93
62
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
94
/**
63
aio_context_acquire(aio_context);
64
aio_disable_external(aio_context);
65
bdrv_parent_drained_begin(bs);
66
- bdrv_drain_invoke(bs, true);
67
+ bdrv_drain_invoke(bs, true, true);
68
aio_context_release(aio_context);
69
70
if (!g_slist_find(aio_ctxs, aio_context)) {
71
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
72
73
/* Re-enable things in child-to-parent order */
74
aio_context_acquire(aio_context);
75
- bdrv_drain_invoke(bs, false);
76
+ bdrv_drain_invoke(bs, false, true);
77
bdrv_parent_drained_end(bs);
78
aio_enable_external(aio_context);
79
aio_context_release(aio_context);
95
--
80
--
96
1.8.3.1
81
2.13.6
97
82
98
83
diff view generated by jsdifflib
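To illustrate the recursive flag added to bdrv_drain_invoke() in the right-hand patch above, here is a small toy model in plain C (the Node type and its fields are made up, nothing here is QEMU code): a single-node call leaves the children untouched, while the recursive variant visits the whole subtree.

    #include <stdio.h>

    /* Toy node tree; names and fields are illustrative only. */
    typedef struct Node {
        const char *name;
        int drain_count;
        struct Node *children[2];
    } Node;

    /* Invoke the "driver callback" on one node and, only when requested,
     * recurse into its children -- the same shape as the recursive flag
     * added to bdrv_drain_invoke() above. */
    static void drain_invoke(Node *n, int begin, int recursive)
    {
        n->drain_count += begin ? 1 : -1;
        if (recursive) {
            for (int i = 0; i < 2; i++) {
                if (n->children[i]) {
                    drain_invoke(n->children[i], begin, recursive);
                }
            }
        }
    }

    int main(void)
    {
        Node backing = { "backing", 0, { NULL, NULL } };
        Node overlay = { "overlay", 0, { &backing, NULL } };

        drain_invoke(&overlay, 1, 0);   /* single-node drain: backing untouched */
        printf("%s=%d %s=%d\n", overlay.name, overlay.drain_count,
               backing.name, backing.drain_count);
        drain_invoke(&overlay, 0, 0);

        drain_invoke(&overlay, 1, 1);   /* recursive drain: backing drained too */
        printf("%s=%d %s=%d\n", overlay.name, overlay.drain_count,
               backing.name, backing.drain_count);
        drain_invoke(&overlay, 0, 1);
        return 0;
    }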
1
qed_commit_l2_update() is unconditionally called at the end of
1
The existing test is for bdrv_drain_all_begin/end() only. Generalise the
2
qed_aio_write_l1_update(). Inline it.
2
test case so that it can be run for the other variants as well. At the
3
moment this is only bdrv_drain_begin/end(), but in a while, we'll add
4
another one.
5
6
Also, add a backing file to the test node to test whether the operations
7
work recursively.
3
8
4
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
5
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
6
---
10
---
7
block/qed.c | 36 ++++++++++++++----------------------
11
tests/test-bdrv-drain.c | 69 ++++++++++++++++++++++++++++++++++++++++++++-----
8
1 file changed, 14 insertions(+), 22 deletions(-)
12
1 file changed, 62 insertions(+), 7 deletions(-)
9
13
10
diff --git a/block/qed.c b/block/qed.c
14
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
11
index XXXXXXX..XXXXXXX 100644
15
index XXXXXXX..XXXXXXX 100644
12
--- a/block/qed.c
16
--- a/tests/test-bdrv-drain.c
13
+++ b/block/qed.c
17
+++ b/tests/test-bdrv-drain.c
14
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
18
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_test = {
19
20
.bdrv_co_drain_begin = bdrv_test_co_drain_begin,
21
.bdrv_co_drain_end = bdrv_test_co_drain_end,
22
+
23
+ .bdrv_child_perm = bdrv_format_default_perms,
24
};
25
26
static void aio_ret_cb(void *opaque, int ret)
27
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
28
*aio_ret = ret;
15
}
29
}
16
30
17
/**
31
-static void test_drv_cb_drain_all(void)
18
- * Commit the current L2 table to the cache
32
+enum drain_type {
19
+ * Update L1 table with new L2 table offset and write it out
33
+ BDRV_DRAIN_ALL,
20
*/
34
+ BDRV_DRAIN,
21
-static void qed_commit_l2_update(void *opaque, int ret)
35
+};
22
+static void qed_aio_write_l1_update(void *opaque, int ret)
36
+
37
+static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
38
+{
39
+ switch (drain_type) {
40
+ case BDRV_DRAIN_ALL: bdrv_drain_all_begin(); break;
41
+ case BDRV_DRAIN: bdrv_drained_begin(bs); break;
42
+ default: g_assert_not_reached();
43
+ }
44
+}
45
+
46
+static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs)
47
+{
48
+ switch (drain_type) {
49
+ case BDRV_DRAIN_ALL: bdrv_drain_all_end(); break;
50
+ case BDRV_DRAIN: bdrv_drained_end(bs); break;
51
+ default: g_assert_not_reached();
52
+ }
53
+}
54
+
55
+static void test_drv_cb_common(enum drain_type drain_type, bool recursive)
23
{
56
{
24
QEDAIOCB *acb = opaque;
57
BlockBackend *blk;
25
BDRVQEDState *s = acb_to_s(acb);
58
- BlockDriverState *bs;
26
CachedL2Table *l2_table = acb->request.l2_table;
59
- BDRVTestState *s;
27
uint64_t l2_offset = l2_table->offset;
60
+ BlockDriverState *bs, *backing;
28
+ int index;
61
+ BDRVTestState *s, *backing_s;
62
BlockAIOCB *acb;
63
int aio_ret;
64
65
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_all(void)
66
s = bs->opaque;
67
blk_insert_bs(blk, bs, &error_abort);
68
69
+ backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
70
+ backing_s = backing->opaque;
71
+ bdrv_set_backing_hd(bs, backing, &error_abort);
29
+
72
+
30
+ if (ret) {
73
/* Simple bdrv_drain_all_begin/end pair, check that CBs are called */
31
+ qed_aio_complete(acb, ret);
74
g_assert_cmpint(s->drain_count, ==, 0);
32
+ return;
75
- bdrv_drain_all_begin();
33
+ }
76
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
34
35
+ index = qed_l1_index(s, acb->cur_pos);
36
+ s->l1_table->offsets[index] = l2_table->offset;
37
+
77
+
38
+ ret = qed_write_l1_table(s, index, 1);
78
+ do_drain_begin(drain_type, bs);
39
+
79
+
40
+ /* Commit the current L2 table to the cache */
80
g_assert_cmpint(s->drain_count, ==, 1);
41
qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
81
- bdrv_drain_all_end();
42
82
+ g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
43
/* This is guaranteed to succeed because we just committed the entry to the
83
+
44
@@ -XXX,XX +XXX,XX @@ static void qed_commit_l2_update(void *opaque, int ret)
84
+ do_drain_end(drain_type, bs);
45
qed_aio_next_io(acb, ret);
85
+
86
g_assert_cmpint(s->drain_count, ==, 0);
87
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
88
89
/* Now do the same while a request is pending */
90
aio_ret = -EINPROGRESS;
91
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_all(void)
92
g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
93
94
g_assert_cmpint(s->drain_count, ==, 0);
95
- bdrv_drain_all_begin();
96
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
97
+
98
+ do_drain_begin(drain_type, bs);
99
+
100
g_assert_cmpint(aio_ret, ==, 0);
101
g_assert_cmpint(s->drain_count, ==, 1);
102
- bdrv_drain_all_end();
103
+ g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
104
+
105
+ do_drain_end(drain_type, bs);
106
+
107
g_assert_cmpint(s->drain_count, ==, 0);
108
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
109
110
+ bdrv_unref(backing);
111
bdrv_unref(bs);
112
blk_unref(blk);
46
}
113
}
47
114
48
-/**
115
+static void test_drv_cb_drain_all(void)
49
- * Update L1 table with new L2 table offset and write it out
116
+{
50
- */
117
+ test_drv_cb_common(BDRV_DRAIN_ALL, true);
51
-static void qed_aio_write_l1_update(void *opaque, int ret)
118
+}
52
-{
119
+
53
- QEDAIOCB *acb = opaque;
120
+static void test_drv_cb_drain(void)
54
- BDRVQEDState *s = acb_to_s(acb);
121
+{
55
- int index;
122
+ test_drv_cb_common(BDRV_DRAIN, false);
56
-
123
+}
57
- if (ret) {
124
+
58
- qed_aio_complete(acb, ret);
125
int main(int argc, char **argv)
59
- return;
126
{
60
- }
127
bdrv_init();
61
-
128
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
62
- index = qed_l1_index(s, acb->cur_pos);
129
g_test_init(&argc, &argv, NULL);
63
- s->l1_table->offsets[index] = acb->request.l2_table->offset;
130
64
-
131
g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
65
- ret = qed_write_l1_table(s, index, 1);
132
+ g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
66
- qed_commit_l2_update(acb, ret);
133
67
-}
134
return g_test_run();
68
135
}
69
/**
70
* Update L2 table with new cluster offsets and write them out
71
--
136
--
72
1.8.3.1
137
2.13.6
73
138
74
139
diff view generated by jsdifflib
1
After _cleanup_qemu(), test cases should be able to start the next qemu
1
This is currently only working correctly for bdrv_drain(), not for
2
process and call _cleanup_qemu() for that one as well. For this to work
2
bdrv_drain_all(). Leave a comment for the drain_all case, we'll address
3
cleanly, we need to improve the cleanup so that the second invocation
3
it later.
4
doesn't try to kill the qemu instances from the first invocation a
5
second time (which would result in error messages).
6
4
7
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
8
Reviewed-by: Eric Blake <eblake@redhat.com>
9
Reviewed-by: Max Reitz <mreitz@redhat.com>
10
---
6
---
11
tests/qemu-iotests/common.qemu | 3 +++
7
tests/test-bdrv-drain.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
12
1 file changed, 3 insertions(+)
8
1 file changed, 45 insertions(+)
13
9
14
diff --git a/tests/qemu-iotests/common.qemu b/tests/qemu-iotests/common.qemu
10
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
15
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
16
--- a/tests/qemu-iotests/common.qemu
12
--- a/tests/test-bdrv-drain.c
17
+++ b/tests/qemu-iotests/common.qemu
13
+++ b/tests/test-bdrv-drain.c
18
@@ -XXX,XX +XXX,XX @@ function _cleanup_qemu()
14
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain(void)
19
rm -f "${QEMU_FIFO_IN}_${i}" "${QEMU_FIFO_OUT}_${i}"
15
test_drv_cb_common(BDRV_DRAIN, false);
20
eval "exec ${QEMU_IN[$i]}<&-" # close file descriptors
16
}
21
eval "exec ${QEMU_OUT[$i]}<&-"
17
18
+static void test_quiesce_common(enum drain_type drain_type, bool recursive)
19
+{
20
+ BlockBackend *blk;
21
+ BlockDriverState *bs, *backing;
22
+
22
+
23
+ unset QEMU_IN[$i]
23
+ blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
24
+ unset QEMU_OUT[$i]
24
+ bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
25
done
25
+ &error_abort);
26
+ blk_insert_bs(blk, bs, &error_abort);
27
+
28
+ backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
29
+ bdrv_set_backing_hd(bs, backing, &error_abort);
30
+
31
+ g_assert_cmpint(bs->quiesce_counter, ==, 0);
32
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
33
+
34
+ do_drain_begin(drain_type, bs);
35
+
36
+ g_assert_cmpint(bs->quiesce_counter, ==, 1);
37
+ g_assert_cmpint(backing->quiesce_counter, ==, !!recursive);
38
+
39
+ do_drain_end(drain_type, bs);
40
+
41
+ g_assert_cmpint(bs->quiesce_counter, ==, 0);
42
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
43
+
44
+ bdrv_unref(backing);
45
+ bdrv_unref(bs);
46
+ blk_unref(blk);
47
+}
48
+
49
+static void test_quiesce_drain_all(void)
50
+{
51
+ // XXX drain_all doesn't quiesce
52
+ //test_quiesce_common(BDRV_DRAIN_ALL, true);
53
+}
54
+
55
+static void test_quiesce_drain(void)
56
+{
57
+ test_quiesce_common(BDRV_DRAIN, false);
58
+}
59
+
60
int main(int argc, char **argv)
61
{
62
bdrv_init();
63
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
64
g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
65
g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
66
67
+ g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
68
+ g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
69
+
70
return g_test_run();
26
}
71
}
27
--
72
--
28
1.8.3.1
73
2.13.6
29
74
30
75
diff view generated by jsdifflib
1
From: "sochin.jiang" <sochin.jiang@huawei.com>
1
Block jobs already paused themselves when their main BlockBackend
2
entered a drained section. This is not good enough: We also want to
3
pause a block job so that it does not submit new requests if, for example, the
4
mirror target node should be drained.
2
5
3
img_commit could fall into an infinite loop calling run_block_job() if
6
This implements .drained_begin/end callbacks in child_job in order to
4
its blockjob fails on any I/O error. Fix this already known problem.
7
consider all block nodes related to the job, and removes the
8
BlockBackend callbacks which are unnecessary now because the root of the
9
job's main BlockBackend is always referenced with a child_job, too.
5
10
6
Signed-off-by: sochin.jiang <sochin.jiang@huawei.com>
11
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
7
Message-id: 1497509253-28941-1-git-send-email-sochin.jiang@huawei.com
8
Signed-off-by: Max Reitz <mreitz@redhat.com>
9
---
12
---
10
blockjob.c | 4 ++--
13
blockjob.c | 22 +++++++++-------------
11
include/block/blockjob.h | 18 ++++++++++++++++++
14
1 file changed, 9 insertions(+), 13 deletions(-)
12
qemu-img.c | 20 +++++++++++++-------
13
3 files changed, 33 insertions(+), 9 deletions(-)
14
15
15
diff --git a/blockjob.c b/blockjob.c
16
diff --git a/blockjob.c b/blockjob.c
16
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
17
--- a/blockjob.c
18
--- a/blockjob.c
18
+++ b/blockjob.c
19
+++ b/blockjob.c
19
@@ -XXX,XX +XXX,XX @@ static void block_job_resume(BlockJob *job)
20
@@ -XXX,XX +XXX,XX @@ static char *child_job_get_parent_desc(BdrvChild *c)
20
block_job_enter(job);
21
job->id);
21
}
22
}
22
23
23
-static void block_job_ref(BlockJob *job)
24
-static const BdrvChildRole child_job = {
24
+void block_job_ref(BlockJob *job)
25
- .get_parent_desc = child_job_get_parent_desc,
26
- .stay_at_node = true,
27
-};
28
-
29
-static void block_job_drained_begin(void *opaque)
30
+static void child_job_drained_begin(BdrvChild *c)
25
{
31
{
26
++job->refcnt;
32
- BlockJob *job = opaque;
33
+ BlockJob *job = c->opaque;
34
block_job_pause(job);
27
}
35
}
28
@@ -XXX,XX +XXX,XX @@ static void block_job_attached_aio_context(AioContext *new_context,
36
29
void *opaque);
37
-static void block_job_drained_end(void *opaque)
30
static void block_job_detach_aio_context(void *opaque);
38
+static void child_job_drained_end(BdrvChild *c)
31
32
-static void block_job_unref(BlockJob *job)
33
+void block_job_unref(BlockJob *job)
34
{
39
{
35
if (--job->refcnt == 0) {
40
- BlockJob *job = opaque;
36
BlockDriverState *bs = blk_bs(job->blk);
41
+ BlockJob *job = c->opaque;
37
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
42
block_job_resume(job);
38
index XXXXXXX..XXXXXXX 100644
39
--- a/include/block/blockjob.h
40
+++ b/include/block/blockjob.h
41
@@ -XXX,XX +XXX,XX @@ void block_job_iostatus_reset(BlockJob *job);
42
BlockJobTxn *block_job_txn_new(void);
43
44
/**
45
+ * block_job_ref:
46
+ *
47
+ * Add a reference to BlockJob refcnt, it will be decreased with
48
+ * block_job_unref, and then be freed if it comes to be the last
49
+ * reference.
50
+ */
51
+void block_job_ref(BlockJob *job);
52
+
53
+/**
54
+ * block_job_unref:
55
+ *
56
+ * Release a reference that was previously acquired with block_job_ref
57
+ * or block_job_create. If it's the last reference to the object, it will be
58
+ * freed.
59
+ */
60
+void block_job_unref(BlockJob *job);
61
+
62
+/**
63
* block_job_txn_unref:
64
*
65
* Release a reference that was previously acquired with block_job_txn_add_job
66
diff --git a/qemu-img.c b/qemu-img.c
67
index XXXXXXX..XXXXXXX 100644
68
--- a/qemu-img.c
69
+++ b/qemu-img.c
70
@@ -XXX,XX +XXX,XX @@ static void common_block_job_cb(void *opaque, int ret)
71
static void run_block_job(BlockJob *job, Error **errp)
72
{
73
AioContext *aio_context = blk_get_aio_context(job->blk);
74
+ int ret = 0;
75
76
- /* FIXME In error cases, the job simply goes away and we access a dangling
77
- * pointer below. */
78
aio_context_acquire(aio_context);
79
+ block_job_ref(job);
80
do {
81
aio_poll(aio_context, true);
82
qemu_progress_print(job->len ?
83
((float)job->offset / job->len * 100.f) : 0.0f, 0);
84
- } while (!job->ready);
85
+ } while (!job->ready && !job->completed);
86
87
- block_job_complete_sync(job, errp);
88
+ if (!job->completed) {
89
+ ret = block_job_complete_sync(job, errp);
90
+ } else {
91
+ ret = job->ret;
92
+ }
93
+ block_job_unref(job);
94
aio_context_release(aio_context);
95
96
- /* A block job may finish instantaneously without publishing any progress,
97
- * so just signal completion here */
98
- qemu_progress_print(100.f, 0);
99
+ /* publish completion progress only when success */
100
+ if (!ret) {
101
+ qemu_progress_print(100.f, 0);
102
+ }
103
}
43
}
104
44
105
static int img_commit(int argc, char **argv)
45
-static const BlockDevOps block_job_dev_ops = {
46
- .drained_begin = block_job_drained_begin,
47
- .drained_end = block_job_drained_end,
48
+static const BdrvChildRole child_job = {
49
+ .get_parent_desc = child_job_get_parent_desc,
50
+ .drained_begin = child_job_drained_begin,
51
+ .drained_end = child_job_drained_end,
52
+ .stay_at_node = true,
53
};
54
55
void block_job_remove_all_bdrv(BlockJob *job)
56
@@ -XXX,XX +XXX,XX @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
57
block_job_add_bdrv(job, "main node", bs, 0, BLK_PERM_ALL, &error_abort);
58
bs->job = job;
59
60
- blk_set_dev_ops(blk, &block_job_dev_ops, job);
61
bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);
62
63
QLIST_INSERT_HEAD(&block_jobs, job, job_list);
106
--
64
--
107
1.8.3.1
65
2.13.6
108
66
109
67
diff view generated by jsdifflib
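The run_block_job() change in the left-hand patch above takes a reference around the polling loop so the job cannot disappear while job->ret is still needed. A stripped-down sketch of that lifetime pattern (toy Job type and helpers, nothing from QEMU) might look like this:

    #include <stdio.h>
    #include <stdlib.h>

    /* Toy job object with a reference count; nothing here is QEMU API. */
    typedef struct Job {
        int refcnt;
        int completed;
        int ret;
    } Job;

    static void job_ref(Job *job)
    {
        job->refcnt++;
    }

    static void job_unref(Job *job)
    {
        if (--job->refcnt == 0) {
            free(job);
        }
    }

    /* The job may complete and drop its own reference while we poll, so the
     * caller holds an extra reference for the whole loop -- the same pattern
     * run_block_job() uses in the patch above. */
    static int run_job(Job *job)
    {
        int ret;

        job_ref(job);
        while (!job->completed) {
            /* stand-in for one aio_poll() iteration that completes the job */
            job->ret = -5;
            job->completed = 1;
            job_unref(job);         /* completion drops the job's own reference */
        }
        ret = job->ret;             /* still valid: we hold our own reference */
        job_unref(job);
        return ret;
    }

    int main(void)
    {
        Job *job = calloc(1, sizeof(*job));

        if (!job) {
            return 1;
        }
        job->refcnt = 1;            /* reference owned by the job itself */
        printf("job finished with %d\n", run_job(job));
        return 0;
    }

Without the extra reference, the final read of job->ret would be a use-after-free whenever the job completes inside the loop, which is exactly the FIXME the patch removes.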
1
From: Alberto Garcia <berto@igalia.com>
1
Block jobs must be paused if any of the involved nodes are drained.
2
2
3
This patch splits do_perform_cow() into three separate functions to
4
read, encrypt and write the COW regions.
5
6
perform_cow() can now read both regions first, then encrypt them and
7
finally write them to disk. The memory allocation is also done in
8
this function now, using one single buffer large enough to hold both
9
regions.
10
11
Signed-off-by: Alberto Garcia <berto@igalia.com>
12
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
13
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
3
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
14
---
4
---
15
block/qcow2-cluster.c | 117 +++++++++++++++++++++++++++++++++++++-------------
5
tests/test-bdrv-drain.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++
16
1 file changed, 87 insertions(+), 30 deletions(-)
6
1 file changed, 121 insertions(+)
17
7
18
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
8
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
19
index XXXXXXX..XXXXXXX 100644
9
index XXXXXXX..XXXXXXX 100644
20
--- a/block/qcow2-cluster.c
10
--- a/tests/test-bdrv-drain.c
21
+++ b/block/qcow2-cluster.c
11
+++ b/tests/test-bdrv-drain.c
22
@@ -XXX,XX +XXX,XX @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
12
@@ -XXX,XX +XXX,XX @@
23
return 0;
13
14
#include "qemu/osdep.h"
15
#include "block/block.h"
16
+#include "block/blockjob_int.h"
17
#include "sysemu/block-backend.h"
18
#include "qapi/error.h"
19
20
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
21
test_quiesce_common(BDRV_DRAIN, false);
24
}
22
}
25
23
26
-static int coroutine_fn do_perform_cow(BlockDriverState *bs,
24
+
27
- uint64_t src_cluster_offset,
25
+typedef struct TestBlockJob {
28
- uint64_t cluster_offset,
26
+ BlockJob common;
29
- unsigned offset_in_cluster,
27
+ bool should_complete;
30
- unsigned bytes)
28
+} TestBlockJob;
31
+static int coroutine_fn do_perform_cow_read(BlockDriverState *bs,
29
+
32
+ uint64_t src_cluster_offset,
30
+static void test_job_completed(BlockJob *job, void *opaque)
33
+ unsigned offset_in_cluster,
31
+{
34
+ uint8_t *buffer,
32
+ block_job_completed(job, 0);
35
+ unsigned bytes)
36
{
37
- BDRVQcow2State *s = bs->opaque;
38
QEMUIOVector qiov;
39
- struct iovec iov;
40
+ struct iovec iov = { .iov_base = buffer, .iov_len = bytes };
41
int ret;
42
43
if (bytes == 0) {
44
return 0;
45
}
46
47
- iov.iov_len = bytes;
48
- iov.iov_base = qemu_try_blockalign(bs, iov.iov_len);
49
- if (iov.iov_base == NULL) {
50
- return -ENOMEM;
51
- }
52
-
53
qemu_iovec_init_external(&qiov, &iov, 1);
54
55
BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
56
57
if (!bs->drv) {
58
- ret = -ENOMEDIUM;
59
- goto out;
60
+ return -ENOMEDIUM;
61
}
62
63
/* Call .bdrv_co_readv() directly instead of using the public block-layer
64
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn do_perform_cow(BlockDriverState *bs,
65
ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster,
66
bytes, &qiov, 0);
67
if (ret < 0) {
68
- goto out;
69
+ return ret;
70
}
71
72
- if (bs->encrypted) {
73
+ return 0;
74
+}
33
+}
75
+
34
+
76
+static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs,
35
+static void coroutine_fn test_job_start(void *opaque)
77
+ uint64_t src_cluster_offset,
78
+ unsigned offset_in_cluster,
79
+ uint8_t *buffer,
80
+ unsigned bytes)
81
+{
36
+{
82
+ if (bytes && bs->encrypted) {
37
+ TestBlockJob *s = opaque;
83
+ BDRVQcow2State *s = bs->opaque;
38
+
84
int64_t sector = (src_cluster_offset + offset_in_cluster)
39
+ while (!s->should_complete) {
85
>> BDRV_SECTOR_BITS;
40
+ block_job_sleep_ns(&s->common, 100000);
86
assert(s->cipher);
41
+ }
87
assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0);
42
+
88
assert((bytes & ~BDRV_SECTOR_MASK) == 0);
43
+ block_job_defer_to_main_loop(&s->common, test_job_completed, NULL);
89
- if (qcow2_encrypt_sectors(s, sector, iov.iov_base, iov.iov_base,
90
+ if (qcow2_encrypt_sectors(s, sector, buffer, buffer,
91
bytes >> BDRV_SECTOR_BITS, true, NULL) < 0) {
92
- ret = -EIO;
93
- goto out;
94
+ return false;
95
}
96
}
97
+ return true;
98
+}
44
+}
99
+
45
+
100
+static int coroutine_fn do_perform_cow_write(BlockDriverState *bs,
46
+static void test_job_complete(BlockJob *job, Error **errp)
101
+ uint64_t cluster_offset,
102
+ unsigned offset_in_cluster,
103
+ uint8_t *buffer,
104
+ unsigned bytes)
105
+{
47
+{
106
+ QEMUIOVector qiov;
48
+ TestBlockJob *s = container_of(job, TestBlockJob, common);
107
+ struct iovec iov = { .iov_base = buffer, .iov_len = bytes };
49
+ s->should_complete = true;
50
+}
51
+
52
+BlockJobDriver test_job_driver = {
53
+ .instance_size = sizeof(TestBlockJob),
54
+ .start = test_job_start,
55
+ .complete = test_job_complete,
56
+};
57
+
58
+static void test_blockjob_common(enum drain_type drain_type)
59
+{
60
+ BlockBackend *blk_src, *blk_target;
61
+ BlockDriverState *src, *target;
62
+ BlockJob *job;
108
+ int ret;
63
+ int ret;
109
+
64
+
110
+ if (bytes == 0) {
65
+ src = bdrv_new_open_driver(&bdrv_test, "source", BDRV_O_RDWR,
111
+ return 0;
66
+ &error_abort);
67
+ blk_src = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
68
+ blk_insert_bs(blk_src, src, &error_abort);
69
+
70
+ target = bdrv_new_open_driver(&bdrv_test, "target", BDRV_O_RDWR,
71
+ &error_abort);
72
+ blk_target = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
73
+ blk_insert_bs(blk_target, target, &error_abort);
74
+
75
+ job = block_job_create("job0", &test_job_driver, src, 0, BLK_PERM_ALL, 0,
76
+ 0, NULL, NULL, &error_abort);
77
+ block_job_add_bdrv(job, "target", target, 0, BLK_PERM_ALL, &error_abort);
78
+ block_job_start(job);
79
+
80
+ g_assert_cmpint(job->pause_count, ==, 0);
81
+ g_assert_false(job->paused);
82
+ g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
83
+
84
+ do_drain_begin(drain_type, src);
85
+
86
+ if (drain_type == BDRV_DRAIN_ALL) {
87
+ /* bdrv_drain_all() drains both src and target, and involves an
88
+ * additional block_job_pause_all() */
89
+ g_assert_cmpint(job->pause_count, ==, 3);
90
+ } else {
91
+ g_assert_cmpint(job->pause_count, ==, 1);
112
+ }
92
+ }
93
+ /* XXX We don't wait until the job is actually paused. Is this okay? */
94
+ /* g_assert_true(job->paused); */
95
+ g_assert_false(job->busy); /* The job is paused */
113
+
96
+
114
+ qemu_iovec_init_external(&qiov, &iov, 1);
97
+ do_drain_end(drain_type, src);
115
98
+
116
ret = qcow2_pre_write_overlap_check(bs, 0,
99
+ g_assert_cmpint(job->pause_count, ==, 0);
117
cluster_offset + offset_in_cluster, bytes);
100
+ g_assert_false(job->paused);
118
if (ret < 0) {
101
+ g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
119
- goto out;
102
+
120
+ return ret;
103
+ do_drain_begin(drain_type, target);
121
}
104
+
122
105
+ if (drain_type == BDRV_DRAIN_ALL) {
123
BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
106
+ /* bdrv_drain_all() drains both src and target, and involves an
124
ret = bdrv_co_pwritev(bs->file, cluster_offset + offset_in_cluster,
107
+ * additional block_job_pause_all() */
125
bytes, &qiov, 0);
108
+ g_assert_cmpint(job->pause_count, ==, 3);
126
if (ret < 0) {
109
+ } else {
127
- goto out;
110
+ g_assert_cmpint(job->pause_count, ==, 1);
128
+ return ret;
111
+ }
129
}
112
+ /* XXX We don't wait until the job is actually paused. Is this okay? */
130
113
+ /* g_assert_true(job->paused); */
131
- ret = 0;
114
+ g_assert_false(job->busy); /* The job is paused */
132
-out:
115
+
133
- qemu_vfree(iov.iov_base);
116
+ do_drain_end(drain_type, target);
134
- return ret;
117
+
135
+ return 0;
118
+ g_assert_cmpint(job->pause_count, ==, 0);
119
+ g_assert_false(job->paused);
120
+ g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
121
+
122
+ ret = block_job_complete_sync(job, &error_abort);
123
+ g_assert_cmpint(ret, ==, 0);
124
+
125
+ blk_unref(blk_src);
126
+ blk_unref(blk_target);
127
+ bdrv_unref(src);
128
+ bdrv_unref(target);
129
+}
130
+
131
+static void test_blockjob_drain_all(void)
132
+{
133
+ test_blockjob_common(BDRV_DRAIN_ALL);
134
+}
135
+
136
+static void test_blockjob_drain(void)
137
+{
138
+ test_blockjob_common(BDRV_DRAIN);
139
+}
140
+
141
int main(int argc, char **argv)
142
{
143
bdrv_init();
144
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
145
g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
146
g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
147
148
+ g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
149
+ g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
150
+
151
return g_test_run();
136
}
152
}
137
138
139
@@ -XXX,XX +XXX,XX @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
140
BDRVQcow2State *s = bs->opaque;
141
Qcow2COWRegion *start = &m->cow_start;
142
Qcow2COWRegion *end = &m->cow_end;
143
+ unsigned buffer_size;
144
+ uint8_t *start_buffer, *end_buffer;
145
int ret;
146
147
+ assert(start->nb_bytes <= UINT_MAX - end->nb_bytes);
148
+
149
if (start->nb_bytes == 0 && end->nb_bytes == 0) {
150
return 0;
151
}
152
153
+ /* Reserve a buffer large enough to store the data from both the
154
+ * start and end COW regions. Add some padding in the middle if
155
+ * necessary to make sure that the end region is optimally aligned */
156
+ buffer_size = QEMU_ALIGN_UP(start->nb_bytes, bdrv_opt_mem_align(bs)) +
157
+ end->nb_bytes;
158
+ start_buffer = qemu_try_blockalign(bs, buffer_size);
159
+ if (start_buffer == NULL) {
160
+ return -ENOMEM;
161
+ }
162
+ /* The part of the buffer where the end region is located */
163
+ end_buffer = start_buffer + buffer_size - end->nb_bytes;
164
+
165
qemu_co_mutex_unlock(&s->lock);
166
- ret = do_perform_cow(bs, m->offset, m->alloc_offset,
167
- start->offset, start->nb_bytes);
168
+ /* First we read the existing data from both COW regions */
169
+ ret = do_perform_cow_read(bs, m->offset, start->offset,
170
+ start_buffer, start->nb_bytes);
171
if (ret < 0) {
172
goto fail;
173
}
174
175
- ret = do_perform_cow(bs, m->offset, m->alloc_offset,
176
- end->offset, end->nb_bytes);
177
+ ret = do_perform_cow_read(bs, m->offset, end->offset,
178
+ end_buffer, end->nb_bytes);
179
+ if (ret < 0) {
180
+ goto fail;
181
+ }
182
+
183
+ /* Encrypt the data if necessary before writing it */
184
+ if (bs->encrypted) {
185
+ if (!do_perform_cow_encrypt(bs, m->offset, start->offset,
186
+ start_buffer, start->nb_bytes) ||
187
+ !do_perform_cow_encrypt(bs, m->offset, end->offset,
188
+ end_buffer, end->nb_bytes)) {
189
+ ret = -EIO;
190
+ goto fail;
191
+ }
192
+ }
193
+
194
+ /* And now we can write everything */
195
+ ret = do_perform_cow_write(bs, m->alloc_offset, start->offset,
196
+ start_buffer, start->nb_bytes);
197
+ if (ret < 0) {
198
+ goto fail;
199
+ }
200
201
+ ret = do_perform_cow_write(bs, m->alloc_offset, end->offset,
202
+ end_buffer, end->nb_bytes);
203
fail:
204
qemu_co_mutex_lock(&s->lock);
205
206
@@ -XXX,XX +XXX,XX @@ fail:
207
qcow2_cache_depends_on_flush(s->l2_table_cache);
208
}
209
210
+ qemu_vfree(start_buffer);
211
return ret;
212
}
213
214
--
153
--
215
1.8.3.1
154
2.13.6
216
155
217
156
diff view generated by jsdifflib
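The qcow2 series in the left-hand column above ends up reserving one buffer for both COW regions, padded so that the end region starts at an aligned offset. A rough standalone illustration of that layout (all sizes are made-up examples, and 4096 merely stands in for bdrv_opt_mem_align()):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

    /* One allocation holds both COW regions; padding in the middle keeps the
     * end region at an aligned offset within the buffer. */
    int main(void)
    {
        unsigned start_bytes = 1536;    /* head COW region (example size) */
        unsigned end_bytes   = 2560;    /* tail COW region (example size) */
        unsigned align       = 4096;

        unsigned buffer_size = ALIGN_UP(start_bytes, align) + end_bytes;
        uint8_t *start_buffer = malloc(buffer_size);  /* QEMU: qemu_try_blockalign() */
        if (!start_buffer) {
            return 1;
        }
        /* The end region lives at the very tail of the buffer. */
        uint8_t *end_buffer = start_buffer + buffer_size - end_bytes;

        printf("buffer %u bytes, end region at offset %td\n",
               buffer_size, end_buffer - start_buffer);
        free(start_buffer);
        return 0;
    }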
1
From: Alberto Garcia <berto@igalia.com>
1
Block jobs are already paused using the BdrvChildRole drain callbacks,
2
so we don't need an additional block_job_pause_all() call.
2
3
3
Instead of calling perform_cow() twice with a different COW region
4
each time, call it just once and make perform_cow() handle both
5
regions.
6
7
This patch simply moves code around. The next one will do the actual
8
reordering of the COW operations.
9
10
Signed-off-by: Alberto Garcia <berto@igalia.com>
11
Reviewed-by: Eric Blake <eblake@redhat.com>
12
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
13
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
4
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
14
---
5
---
15
block/qcow2-cluster.c | 36 ++++++++++++++++++++++--------------
6
block/io.c | 4 ----
16
1 file changed, 22 insertions(+), 14 deletions(-)
7
tests/test-bdrv-drain.c | 10 ++++------
8
2 files changed, 4 insertions(+), 10 deletions(-)
17
9
18
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
10
diff --git a/block/io.c b/block/io.c
19
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
20
--- a/block/qcow2-cluster.c
12
--- a/block/io.c
21
+++ b/block/qcow2-cluster.c
13
+++ b/block/io.c
22
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn do_perform_cow(BlockDriverState *bs,
14
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
23
struct iovec iov;
15
* context. */
24
int ret;
16
assert(qemu_get_current_aio_context() == qemu_get_aio_context());
25
17
26
+ if (bytes == 0) {
18
- block_job_pause_all();
27
+ return 0;
19
-
28
+ }
20
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
29
+
21
AioContext *aio_context = bdrv_get_aio_context(bs);
30
iov.iov_len = bytes;
22
31
iov.iov_base = qemu_try_blockalign(bs, iov.iov_len);
23
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
32
if (iov.iov_base == NULL) {
24
aio_enable_external(aio_context);
33
@@ -XXX,XX +XXX,XX @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
25
aio_context_release(aio_context);
34
return cluster_offset;
26
}
27
-
28
- block_job_resume_all();
35
}
29
}
36
30
37
-static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
31
void bdrv_drain_all(void)
38
+static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
32
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
39
{
33
index XXXXXXX..XXXXXXX 100644
40
BDRVQcow2State *s = bs->opaque;
34
--- a/tests/test-bdrv-drain.c
41
+ Qcow2COWRegion *start = &m->cow_start;
35
+++ b/tests/test-bdrv-drain.c
42
+ Qcow2COWRegion *end = &m->cow_end;
36
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_common(enum drain_type drain_type)
43
int ret;
37
do_drain_begin(drain_type, src);
44
38
45
- if (r->nb_bytes == 0) {
39
if (drain_type == BDRV_DRAIN_ALL) {
46
+ if (start->nb_bytes == 0 && end->nb_bytes == 0) {
40
- /* bdrv_drain_all() drains both src and target, and involves an
47
return 0;
41
- * additional block_job_pause_all() */
42
- g_assert_cmpint(job->pause_count, ==, 3);
43
+ /* bdrv_drain_all() drains both src and target */
44
+ g_assert_cmpint(job->pause_count, ==, 2);
45
} else {
46
g_assert_cmpint(job->pause_count, ==, 1);
48
}
47
}
49
48
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_common(enum drain_type drain_type)
50
qemu_co_mutex_unlock(&s->lock);
49
do_drain_begin(drain_type, target);
51
- ret = do_perform_cow(bs, m->offset, m->alloc_offset, r->offset, r->nb_bytes);
50
52
- qemu_co_mutex_lock(&s->lock);
51
if (drain_type == BDRV_DRAIN_ALL) {
53
-
52
- /* bdrv_drain_all() drains both src and target, and involves an
54
+ ret = do_perform_cow(bs, m->offset, m->alloc_offset,
53
- * additional block_job_pause_all() */
55
+ start->offset, start->nb_bytes);
54
- g_assert_cmpint(job->pause_count, ==, 3);
56
if (ret < 0) {
55
+ /* bdrv_drain_all() drains both src and target */
57
- return ret;
56
+ g_assert_cmpint(job->pause_count, ==, 2);
58
+ goto fail;
57
} else {
59
}
58
g_assert_cmpint(job->pause_count, ==, 1);
60
61
+ ret = do_perform_cow(bs, m->offset, m->alloc_offset,
62
+ end->offset, end->nb_bytes);
63
+
64
+fail:
65
+ qemu_co_mutex_lock(&s->lock);
66
+
67
/*
68
* Before we update the L2 table to actually point to the new cluster, we
69
* need to be sure that the refcounts have been increased and COW was
70
* handled.
71
*/
72
- qcow2_cache_depends_on_flush(s->l2_table_cache);
73
+ if (ret == 0) {
74
+ qcow2_cache_depends_on_flush(s->l2_table_cache);
75
+ }
76
77
- return 0;
78
+ return ret;
79
}
80
81
int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
82
@@ -XXX,XX +XXX,XX @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
83
}
84
85
/* copy content of unmodified sectors */
86
- ret = perform_cow(bs, m, &m->cow_start);
87
- if (ret < 0) {
88
- goto err;
89
- }
90
-
91
- ret = perform_cow(bs, m, &m->cow_end);
92
+ ret = perform_cow(bs, m);
93
if (ret < 0) {
94
goto err;
95
}
59
}
96
--
60
--
97
1.8.3.1
61
2.13.6
98
62
99
63
diff view generated by jsdifflib
1
Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
1
bdrv_do_drained_begin() restricts the call of parent callbacks and
2
just return an error code and let the caller handle it.
2
aio_disable_external() to the outermost drain section, but the block
3
driver callbacks are always called. bdrv_do_drained_end() must match
4
this behaviour; otherwise nodes stay drained even if begin/end calls
5
were balanced.
3
6
4
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
7
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
5
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
6
---
8
---
7
block/qed.c | 19 +++++++++----------
9
block/io.c | 12 +++++++-----
8
1 file changed, 9 insertions(+), 10 deletions(-)
10
1 file changed, 7 insertions(+), 5 deletions(-)
9
11
10
diff --git a/block/qed.c b/block/qed.c
12
diff --git a/block/io.c b/block/io.c
11
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
12
--- a/block/qed.c
14
--- a/block/io.c
13
+++ b/block/qed.c
15
+++ b/block/io.c
14
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
16
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
15
/**
17
16
* Update L1 table with new L2 table offset and write it out
18
void bdrv_drained_end(BlockDriverState *bs)
17
*/
18
-static void qed_aio_write_l1_update(void *opaque, int ret)
19
+static int qed_aio_write_l1_update(QEDAIOCB *acb)
20
{
19
{
21
- QEDAIOCB *acb = opaque;
20
+ int old_quiesce_counter;
22
BDRVQEDState *s = acb_to_s(acb);
21
+
23
CachedL2Table *l2_table = acb->request.l2_table;
22
if (qemu_in_coroutine()) {
24
uint64_t l2_offset = l2_table->offset;
23
bdrv_co_yield_to_drain(bs, false);
25
- int index;
24
return;
26
-
25
}
27
- if (ret) {
26
assert(bs->quiesce_counter > 0);
28
- qed_aio_complete(acb, ret);
27
- if (atomic_fetch_dec(&bs->quiesce_counter) > 1) {
29
- return;
28
- return;
30
- }
29
- }
31
+ int index, ret;
30
+ old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
32
31
33
index = qed_l1_index(s, acb->cur_pos);
32
/* Re-enable things in child-to-parent order */
34
s->l1_table->offsets[index] = l2_table->offset;
33
bdrv_drain_invoke(bs, false, false);
35
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_l1_update(void *opaque, int ret)
34
- bdrv_parent_drained_end(bs);
36
acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
35
- aio_enable_external(bdrv_get_aio_context(bs));
37
assert(acb->request.l2_table != NULL);
36
+ if (old_quiesce_counter == 1) {
38
37
+ bdrv_parent_drained_end(bs);
39
- qed_aio_next_io(acb, ret);
38
+ aio_enable_external(bdrv_get_aio_context(bs));
40
+ return ret;
39
+ }
41
}
40
}
42
41
43
42
/*
44
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
45
if (need_alloc) {
46
/* Write out the whole new L2 table */
47
ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true);
48
- qed_aio_write_l1_update(acb, ret);
49
+ if (ret) {
50
+ goto err;
51
+ }
52
+ ret = qed_aio_write_l1_update(acb);
53
+ qed_aio_next_io(acb, ret);
54
+
55
} else {
56
/* Write out only the updated part of the L2 table */
57
ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
58
--
43
--
59
1.8.3.1
44
2.13.6
60
45
61
46
diff view generated by jsdifflib
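A toy model of the balancing rule described in the right-hand patch above: the block driver callback has to run for every begin/end call, while disabling and re-enabling external event sources happens only at the outermost nesting level. This is plain C with illustrative counters only, not QEMU code:

    #include <stdio.h>

    /* Illustrative counters only; this is not QEMU code. */
    static int quiesce_counter;
    static int driver_drained;          /* balance of driver callback calls */
    static int external_enabled = 1;

    static void drained_begin(void)
    {
        if (quiesce_counter++ == 0) {
            external_enabled = 0;       /* only the outermost begin disables */
        }
        driver_drained++;               /* driver callback on every begin */
    }

    static void drained_end(void)
    {
        int old = quiesce_counter--;

        driver_drained--;               /* driver callback on every end */
        if (old == 1) {
            external_enabled = 1;       /* only the outermost end re-enables */
        }
    }

    int main(void)
    {
        drained_begin();
        drained_begin();                /* nested drained section */
        drained_end();
        printf("after inner end: driver=%d external=%d\n",
               driver_drained, external_enabled);
        drained_end();
        printf("after outer end: driver=%d external=%d\n",
               driver_drained, external_enabled);
        return 0;
    }

If drained_end() only decremented the driver balance for the outermost call, the inner end would leave driver_drained at 1 forever, which is the imbalance the patch fixes.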
1
From: Alberto Garcia <berto@igalia.com>
2
3
If the guest tries to write data that results on the allocation of a
4
new cluster, instead of writing the guest data first and then the data
5
from the COW regions, write everything together using one single I/O
6
operation.
7
8
This can improve the write performance by 25% or more, depending on
9
several factors such as the media type, the cluster size and the I/O
10
request size.
11
12
Signed-off-by: Alberto Garcia <berto@igalia.com>
13
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
14
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
1
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
15
---
2
---
16
block/qcow2-cluster.c | 40 ++++++++++++++++++++++++--------
3
tests/test-bdrv-drain.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++
17
block/qcow2.c | 64 +++++++++++++++++++++++++++++++++++++++++++--------
4
1 file changed, 57 insertions(+)
18
block/qcow2.h | 7 ++++++
19
3 files changed, 91 insertions(+), 20 deletions(-)
20
5
21
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
6
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
22
index XXXXXXX..XXXXXXX 100644
7
index XXXXXXX..XXXXXXX 100644
23
--- a/block/qcow2-cluster.c
8
--- a/tests/test-bdrv-drain.c
24
+++ b/block/qcow2-cluster.c
9
+++ b/tests/test-bdrv-drain.c
25
@@ -XXX,XX +XXX,XX @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
10
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
26
assert(start->nb_bytes <= UINT_MAX - end->nb_bytes);
11
enum drain_type {
27
assert(start->nb_bytes + end->nb_bytes <= UINT_MAX - data_bytes);
12
BDRV_DRAIN_ALL,
28
assert(start->offset + start->nb_bytes <= end->offset);
13
BDRV_DRAIN,
29
+ assert(!m->data_qiov || m->data_qiov->size == data_bytes);
14
+ DRAIN_TYPE_MAX,
30
15
};
31
if (start->nb_bytes == 0 && end->nb_bytes == 0) {
16
32
return 0;
17
static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
33
@@ -XXX,XX +XXX,XX @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
18
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
34
/* The part of the buffer where the end region is located */
19
test_quiesce_common(BDRV_DRAIN, false);
35
end_buffer = start_buffer + buffer_size - end->nb_bytes;
20
}
36
21
37
- qemu_iovec_init(&qiov, 1);
22
+static void test_nested(void)
38
+ qemu_iovec_init(&qiov, 2 + (m->data_qiov ? m->data_qiov->niov : 0));
23
+{
39
24
+ BlockBackend *blk;
40
qemu_co_mutex_unlock(&s->lock);
25
+ BlockDriverState *bs, *backing;
41
/* First we read the existing data from both COW regions. We
26
+ BDRVTestState *s, *backing_s;
42
@@ -XXX,XX +XXX,XX @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
27
+ enum drain_type outer, inner;
43
}
28
+
44
}
29
+ blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
45
30
+ bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
46
- /* And now we can write everything */
31
+ &error_abort);
47
- qemu_iovec_reset(&qiov);
32
+ s = bs->opaque;
48
- qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
33
+ blk_insert_bs(blk, bs, &error_abort);
49
- ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
34
+
50
- if (ret < 0) {
35
+ backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
51
- goto fail;
36
+ backing_s = backing->opaque;
52
+ /* And now we can write everything. If we have the guest data we
37
+ bdrv_set_backing_hd(bs, backing, &error_abort);
53
+ * can write everything in one single operation */
38
+
54
+ if (m->data_qiov) {
39
+ for (outer = 0; outer < DRAIN_TYPE_MAX; outer++) {
55
+ qemu_iovec_reset(&qiov);
40
+ for (inner = 0; inner < DRAIN_TYPE_MAX; inner++) {
56
+ if (start->nb_bytes) {
41
+ /* XXX bdrv_drain_all() doesn't increase the quiesce_counter */
57
+ qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
42
+ int bs_quiesce = (outer != BDRV_DRAIN_ALL) +
43
+ (inner != BDRV_DRAIN_ALL);
44
+ int backing_quiesce = 0;
45
+ int backing_cb_cnt = (outer != BDRV_DRAIN) +
46
+ (inner != BDRV_DRAIN);
47
+
48
+ g_assert_cmpint(bs->quiesce_counter, ==, 0);
49
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
50
+ g_assert_cmpint(s->drain_count, ==, 0);
51
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
52
+
53
+ do_drain_begin(outer, bs);
54
+ do_drain_begin(inner, bs);
55
+
56
+ g_assert_cmpint(bs->quiesce_counter, ==, bs_quiesce);
57
+ g_assert_cmpint(backing->quiesce_counter, ==, backing_quiesce);
58
+ g_assert_cmpint(s->drain_count, ==, 2);
59
+ g_assert_cmpint(backing_s->drain_count, ==, backing_cb_cnt);
60
+
61
+ do_drain_end(inner, bs);
62
+ do_drain_end(outer, bs);
63
+
64
+ g_assert_cmpint(bs->quiesce_counter, ==, 0);
65
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
66
+ g_assert_cmpint(s->drain_count, ==, 0);
67
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
58
+ }
68
+ }
59
+ qemu_iovec_concat(&qiov, m->data_qiov, 0, data_bytes);
60
+ if (end->nb_bytes) {
61
+ qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
62
+ }
63
+ /* NOTE: we have a write_aio blkdebug event here followed by
64
+ * a cow_write one in do_perform_cow_write(), but there's only
65
+ * one single I/O operation */
66
+ BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
67
+ ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
68
+ } else {
69
+ /* If there's no guest data then write both COW regions separately */
70
+ qemu_iovec_reset(&qiov);
71
+ qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
72
+ ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
73
+ if (ret < 0) {
74
+ goto fail;
75
+ }
76
+
77
+ qemu_iovec_reset(&qiov);
78
+ qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
79
+ ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov);
80
}
81
82
- qemu_iovec_reset(&qiov);
83
- qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
84
- ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov);
85
fail:
86
qemu_co_mutex_lock(&s->lock);
87
88
diff --git a/block/qcow2.c b/block/qcow2.c
89
index XXXXXXX..XXXXXXX 100644
90
--- a/block/qcow2.c
91
+++ b/block/qcow2.c
92
@@ -XXX,XX +XXX,XX @@ fail:
93
return ret;
94
}
95
96
+/* Check if it's possible to merge a write request with the writing of
97
+ * the data from the COW regions */
98
+static bool merge_cow(uint64_t offset, unsigned bytes,
99
+ QEMUIOVector *hd_qiov, QCowL2Meta *l2meta)
100
+{
101
+ QCowL2Meta *m;
102
+
103
+ for (m = l2meta; m != NULL; m = m->next) {
104
+ /* If both COW regions are empty then there's nothing to merge */
105
+ if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) {
106
+ continue;
107
+ }
108
+
109
+ /* The data (middle) region must be immediately after the
110
+ * start region */
111
+ if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
112
+ continue;
113
+ }
114
+
115
+ /* The end region must be immediately after the data (middle)
116
+ * region */
117
+ if (m->offset + m->cow_end.offset != offset + bytes) {
118
+ continue;
119
+ }
120
+
121
+ /* Make sure that adding both COW regions to the QEMUIOVector
122
+ * does not exceed IOV_MAX */
123
+ if (hd_qiov->niov > IOV_MAX - 2) {
124
+ continue;
125
+ }
126
+
127
+ m->data_qiov = hd_qiov;
128
+ return true;
129
+ }
69
+ }
130
+
70
+
131
+ return false;
71
+ bdrv_unref(backing);
72
+ bdrv_unref(bs);
73
+ blk_unref(blk);
132
+}
74
+}
133
+
75
+
134
static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
76
135
uint64_t bytes, QEMUIOVector *qiov,
77
typedef struct TestBlockJob {
136
int flags)
78
BlockJob common;
137
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
79
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
138
goto fail;
80
g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
139
}
81
g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
140
82
141
- qemu_co_mutex_unlock(&s->lock);
83
+ g_test_add_func("/bdrv-drain/nested", test_nested);
142
- BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
143
- trace_qcow2_writev_data(qemu_coroutine_self(),
144
- cluster_offset + offset_in_cluster);
145
- ret = bdrv_co_pwritev(bs->file,
146
- cluster_offset + offset_in_cluster,
147
- cur_bytes, &hd_qiov, 0);
148
- qemu_co_mutex_lock(&s->lock);
149
- if (ret < 0) {
150
- goto fail;
151
+ /* If we need to do COW, check if it's possible to merge the
152
+ * writing of the guest data together with that of the COW regions.
153
+ * If it's not possible (or not necessary) then write the
154
+ * guest data now. */
155
+ if (!merge_cow(offset, cur_bytes, &hd_qiov, l2meta)) {
156
+ qemu_co_mutex_unlock(&s->lock);
157
+ BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
158
+ trace_qcow2_writev_data(qemu_coroutine_self(),
159
+ cluster_offset + offset_in_cluster);
160
+ ret = bdrv_co_pwritev(bs->file,
161
+ cluster_offset + offset_in_cluster,
162
+ cur_bytes, &hd_qiov, 0);
163
+ qemu_co_mutex_lock(&s->lock);
164
+ if (ret < 0) {
165
+ goto fail;
166
+ }
167
}
168
169
while (l2meta != NULL) {
170
diff --git a/block/qcow2.h b/block/qcow2.h
171
index XXXXXXX..XXXXXXX 100644
172
--- a/block/qcow2.h
173
+++ b/block/qcow2.h
174
@@ -XXX,XX +XXX,XX @@ typedef struct QCowL2Meta
175
*/
176
Qcow2COWRegion cow_end;
177
178
+ /**
179
+ * The I/O vector with the data from the actual guest write request.
180
+ * If non-NULL, this is meant to be merged together with the data
181
+ * from @cow_start and @cow_end into one single write operation.
182
+ */
183
+ QEMUIOVector *data_qiov;
184
+
84
+
185
/** Pointer to next L2Meta of the same write request */
85
g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
186
struct QCowL2Meta *next;
86
g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
187
87
188
--
88
--
189
1.8.3.1
89
2.13.6
190
90
191
91
diff view generated by jsdifflib
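The adjacency conditions behind merge_cow() in the left-hand patch above can be sketched in isolation like this (a toy Meta struct loosely modelled on QCowL2Meta; all offsets and sizes are made-up example values): the guest write can only be folded into the COW write if it fills the gap between the two COW regions exactly.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Toy structures loosely modelled on QCowL2Meta; values are made up. */
    typedef struct CowRegion {
        uint64_t offset;                /* relative to guest_offset */
        unsigned nb_bytes;
    } CowRegion;

    typedef struct Meta {
        uint64_t guest_offset;          /* guest offset of the allocated area */
        CowRegion cow_start;
        CowRegion cow_end;
    } Meta;

    /* The guest write merges with the COW writes only if it sits exactly
     * between the head and tail COW regions. */
    static bool can_merge(const Meta *m, uint64_t write_offset, unsigned bytes)
    {
        if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) {
            return false;               /* nothing to merge with */
        }
        /* Guest data must start right after the head COW region... */
        if (m->guest_offset + m->cow_start.offset + m->cow_start.nb_bytes
                != write_offset) {
            return false;
        }
        /* ...and end exactly where the tail COW region begins. */
        if (m->guest_offset + m->cow_end.offset != write_offset + bytes) {
            return false;
        }
        return true;
    }

    int main(void)
    {
        Meta m = {
            .guest_offset = 65536,
            .cow_start = { .offset = 0,     .nb_bytes = 4096 },
            .cow_end   = { .offset = 61440, .nb_bytes = 4096 },
        };

        printf("adjacent write merges:    %d\n",
               can_merge(&m, 65536 + 4096, 61440 - 4096));
        printf("misaligned write merges:  %d\n",
               can_merge(&m, 65536 + 8192, 61440 - 8192));
        return 0;
    }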
1
With this change, qed_aio_write_prefill() and qed_aio_write_postfill()
1
This is in preparation for subtree drains, i.e. drained sections that
2
collapse into a single function. This is reflected by a rename of the
2
affect not only a single node, but recursively all child nodes, too.
3
combined function to qed_aio_write_cow().
3
4
Calling the parent callbacks for drain is pointless when we just came
5
from that parent node recursively and leads to multiple increases of
6
bs->quiesce_counter in a single drain call. Don't do it.
7
8
In order for this to work correctly, the parent callback must be called
9
for every bdrv_drain_begin/end() call, not only for the outermost one:
10
11
If we have a node N with two parents A and B, recursive draining of A
12
should cause the quiesce_counter of B to increase because its child N is
13
drained independently of B. If now B is recursively drained, too, A must
14
increase its quiesce_counter because N is drained independently of A
15
only now, even if N is going from quiesce_counter 1 to 2.
4
16
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
17
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
6
Reviewed-by: Eric Blake <eblake@redhat.com>
7
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
8
---
18
---
9
block/qed.c | 57 +++++++++++++++++++++++----------------------------------
19
include/block/block.h | 4 ++--
10
1 file changed, 23 insertions(+), 34 deletions(-)
20
block.c | 13 +++++++++----
11
21
block/io.c | 47 ++++++++++++++++++++++++++++++++++-------------
12
diff --git a/block/qed.c b/block/qed.c
22
3 files changed, 45 insertions(+), 19 deletions(-)
23
24
diff --git a/include/block/block.h b/include/block/block.h
13
index XXXXXXX..XXXXXXX 100644
25
index XXXXXXX..XXXXXXX 100644
14
--- a/block/qed.c
26
--- a/include/block/block.h
15
+++ b/block/qed.c
27
+++ b/include/block/block.h
16
@@ -XXX,XX +XXX,XX @@ static int qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
28
@@ -XXX,XX +XXX,XX @@ void bdrv_io_unplug(BlockDriverState *bs);
17
* @pos: Byte position in device
29
* Begin a quiesced section of all users of @bs. This is part of
18
* @len: Number of bytes
30
* bdrv_drained_begin.
19
* @offset: Byte offset in image file
20
- * @cb: Completion function
21
- * @opaque: User data for completion function
22
*/
31
*/
23
-static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
32
-void bdrv_parent_drained_begin(BlockDriverState *bs);
24
- uint64_t len, uint64_t offset,
33
+void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore);
25
- BlockCompletionFunc *cb,
26
- void *opaque)
27
+static int qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
28
+ uint64_t len, uint64_t offset)
29
{
30
QEMUIOVector qiov;
31
QEMUIOVector *backing_qiov = NULL;
32
@@ -XXX,XX +XXX,XX @@ static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
33
34
/* Skip copy entirely if there is no work to do */
35
if (len == 0) {
36
- cb(opaque, 0);
37
- return;
38
+ return 0;
39
}
40
41
iov = (struct iovec) {
42
@@ -XXX,XX +XXX,XX @@ static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
43
ret = 0;
44
out:
45
qemu_vfree(iov.iov_base);
46
- cb(opaque, ret);
47
+ return ret;
48
}
49
34
50
/**
35
/**
51
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_main(void *opaque, int ret)
36
* bdrv_parent_drained_end:
52
}
37
@@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_begin(BlockDriverState *bs);
38
* End a quiesced section of all users of @bs. This is part of
39
* bdrv_drained_end.
40
*/
41
-void bdrv_parent_drained_end(BlockDriverState *bs);
42
+void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);
53
43
54
/**
44
/**
55
- * Populate back untouched region of new data cluster
45
* bdrv_drained_begin:
56
+ * Populate untouched regions of new data cluster
46
diff --git a/block.c b/block.c
57
*/
47
index XXXXXXX..XXXXXXX 100644
58
-static void qed_aio_write_postfill(void *opaque, int ret)
48
--- a/block.c
59
+static void qed_aio_write_cow(void *opaque, int ret)
49
+++ b/block.c
60
{
50
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
61
QEDAIOCB *acb = opaque;
51
BlockDriverState *new_bs)
62
BDRVQEDState *s = acb_to_s(acb);
52
{
63
- uint64_t start = acb->cur_pos + acb->cur_qiov.size;
53
BlockDriverState *old_bs = child->bs;
64
- uint64_t len =
54
+ int i;
65
- qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
55
66
- uint64_t offset = acb->cur_cluster +
56
if (old_bs && new_bs) {
67
- qed_offset_into_cluster(s, acb->cur_pos) +
57
assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
68
- acb->cur_qiov.size;
58
}
69
+ uint64_t start, len, offset;
59
if (old_bs) {
60
if (old_bs->quiesce_counter && child->role->drained_end) {
61
- child->role->drained_end(child);
62
+ for (i = 0; i < old_bs->quiesce_counter; i++) {
63
+ child->role->drained_end(child);
64
+ }
65
}
66
if (child->role->detach) {
67
child->role->detach(child);
68
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
69
if (new_bs) {
70
QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
71
if (new_bs->quiesce_counter && child->role->drained_begin) {
72
- child->role->drained_begin(child);
73
+ for (i = 0; i < new_bs->quiesce_counter; i++) {
74
+ child->role->drained_begin(child);
75
+ }
76
}
77
78
if (child->role->attach) {
79
@@ -XXX,XX +XXX,XX @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
80
AioContext *ctx = bdrv_get_aio_context(bs);
81
82
aio_disable_external(ctx);
83
- bdrv_parent_drained_begin(bs);
84
+ bdrv_parent_drained_begin(bs, NULL);
85
bdrv_drain(bs); /* ensure there are no in-flight requests */
86
87
while (aio_poll(ctx, false)) {
88
@@ -XXX,XX +XXX,XX @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
89
*/
90
aio_context_acquire(new_context);
91
bdrv_attach_aio_context(bs, new_context);
92
- bdrv_parent_drained_end(bs);
93
+ bdrv_parent_drained_end(bs, NULL);
94
aio_enable_external(ctx);
95
aio_context_release(new_context);
96
}
97
diff --git a/block/io.c b/block/io.c
98
index XXXXXXX..XXXXXXX 100644
99
--- a/block/io.c
100
+++ b/block/io.c
101
@@ -XXX,XX +XXX,XX @@
102
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
103
int64_t offset, int bytes, BdrvRequestFlags flags);
104
105
-void bdrv_parent_drained_begin(BlockDriverState *bs)
106
+void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
107
{
108
BdrvChild *c, *next;
109
110
QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
111
+ if (c == ignore) {
112
+ continue;
113
+ }
114
if (c->role->drained_begin) {
115
c->role->drained_begin(c);
116
}
117
}
118
}
119
120
-void bdrv_parent_drained_end(BlockDriverState *bs)
121
+void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
122
{
123
BdrvChild *c, *next;
124
125
QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
126
+ if (c == ignore) {
127
+ continue;
128
+ }
129
if (c->role->drained_end) {
130
c->role->drained_end(c);
131
}
132
@@ -XXX,XX +XXX,XX @@ typedef struct {
133
BlockDriverState *bs;
134
bool done;
135
bool begin;
136
+ BdrvChild *parent;
137
} BdrvCoDrainData;
138
139
static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
140
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs)
141
return waited;
142
}
143
144
+static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent);
145
+static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent);
70
+
146
+
71
+ /* Populate front untouched region of new data cluster */
147
static void bdrv_co_drain_bh_cb(void *opaque)
72
+ start = qed_start_of_cluster(s, acb->cur_pos);
148
{
73
+ len = qed_offset_into_cluster(s, acb->cur_pos);
149
BdrvCoDrainData *data = opaque;
74
150
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
75
+ trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
151
76
+ ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster);
152
bdrv_dec_in_flight(bs);
77
if (ret) {
153
if (data->begin) {
78
qed_aio_complete(acb, ret);
154
- bdrv_drained_begin(bs);
155
+ bdrv_do_drained_begin(bs, data->parent);
156
} else {
157
- bdrv_drained_end(bs);
158
+ bdrv_do_drained_end(bs, data->parent);
159
}
160
161
data->done = true;
162
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
163
}
164
165
static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
166
- bool begin)
167
+ bool begin, BdrvChild *parent)
168
{
169
BdrvCoDrainData data;
170
171
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
172
.bs = bs,
173
.done = false,
174
.begin = begin,
175
+ .parent = parent,
176
};
177
bdrv_inc_in_flight(bs);
178
aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
179
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
180
assert(data.done);
181
}
182
183
-void bdrv_drained_begin(BlockDriverState *bs)
184
+static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent)
185
{
186
if (qemu_in_coroutine()) {
187
- bdrv_co_yield_to_drain(bs, true);
188
+ bdrv_co_yield_to_drain(bs, true, parent);
79
return;
189
return;
80
}
190
}
81
191
82
- trace_qed_aio_write_postfill(s, acb, start, len, offset);
192
/* Stop things in parent-to-child order */
83
- qed_copy_from_backing_file(s, start, len, offset,
193
if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
84
- qed_aio_write_main, acb);
194
aio_disable_external(bdrv_get_aio_context(bs));
85
-}
195
- bdrv_parent_drained_begin(bs);
86
+ /* Populate back untouched region of new data cluster */
196
}
87
+ start = acb->cur_pos + acb->cur_qiov.size;
197
88
+ len = qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
198
+ bdrv_parent_drained_begin(bs, parent);
89
+ offset = acb->cur_cluster +
199
bdrv_drain_invoke(bs, true, false);
90
+ qed_offset_into_cluster(s, acb->cur_pos) +
200
bdrv_drain_recurse(bs);
91
+ acb->cur_qiov.size;
201
}
92
202
93
-/**
203
-void bdrv_drained_end(BlockDriverState *bs)
94
- * Populate front untouched region of new data cluster
204
+void bdrv_drained_begin(BlockDriverState *bs)
95
- */
205
+{
96
-static void qed_aio_write_prefill(void *opaque, int ret)
206
+ bdrv_do_drained_begin(bs, NULL);
97
-{
207
+}
98
- QEDAIOCB *acb = opaque;
208
+
99
- BDRVQEDState *s = acb_to_s(acb);
209
+static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
100
- uint64_t start = qed_start_of_cluster(s, acb->cur_pos);
210
{
101
- uint64_t len = qed_offset_into_cluster(s, acb->cur_pos);
211
int old_quiesce_counter;
102
+ trace_qed_aio_write_postfill(s, acb, start, len, offset);
212
103
+ ret = qed_copy_from_backing_file(s, start, len, offset);
213
if (qemu_in_coroutine()) {
104
214
- bdrv_co_yield_to_drain(bs, false);
105
- trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
215
+ bdrv_co_yield_to_drain(bs, false, parent);
106
- qed_copy_from_backing_file(s, start, len, acb->cur_cluster,
216
return;
107
- qed_aio_write_postfill, acb);
217
}
108
+ qed_aio_write_main(acb, ret);
218
assert(bs->quiesce_counter > 0);
109
}
219
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
110
220
111
/**
221
/* Re-enable things in child-to-parent order */
112
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
222
bdrv_drain_invoke(bs, false, false);
113
223
+ bdrv_parent_drained_end(bs, parent);
114
cb = qed_aio_write_zero_cluster;
224
if (old_quiesce_counter == 1) {
115
} else {
225
- bdrv_parent_drained_end(bs);
116
- cb = qed_aio_write_prefill;
226
aio_enable_external(bdrv_get_aio_context(bs));
117
+ cb = qed_aio_write_cow;
227
}
118
acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
228
}
119
}
229
120
230
+void bdrv_drained_end(BlockDriverState *bs)
+{
+    bdrv_do_drained_end(bs, NULL);
+}
+
 /*
  * Wait for pending requests to complete on a single BlockDriverState subtree,
  * and suspend block driver's internal I/O until next request arrives.
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         /* Stop things in parent-to-child order */
         aio_context_acquire(aio_context);
         aio_disable_external(aio_context);
-        bdrv_parent_drained_begin(bs);
+        bdrv_parent_drained_begin(bs, NULL);
         bdrv_drain_invoke(bs, true, true);
         aio_context_release(aio_context);

@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         /* Re-enable things in child-to-parent order */
         aio_context_acquire(aio_context);
         bdrv_drain_invoke(bs, false, true);
-        bdrv_parent_drained_end(bs);
+        bdrv_parent_drained_end(bs, NULL);
         aio_enable_external(aio_context);
         aio_context_release(aio_context);
     }
121
--
256
--
122
1.8.3.1
257
2.13.6
123
258
124
259
1
bdrv_drained_begin() waits for the completion of requests in the whole
2
subtree, but it only actually keeps its immediate bs parameter quiesced
3
until bdrv_drained_end().
4
5
Add a version that keeps the whole subtree drained. As of this commit,
6
graph changes cannot be allowed during a subtree drained section, but
7
this will be fixed soon.
8
1
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
3
---
10
---
4
block/qed-table.c | 47 ++++++++++++-----------------------------------
11
include/block/block.h | 13 +++++++++++++
5
block/qed.c | 12 +++++++-----
12
block/io.c | 54 ++++++++++++++++++++++++++++++++++++++++-----------
6
block/qed.h | 8 +++-----
13
2 files changed, 56 insertions(+), 11 deletions(-)
7
3 files changed, 22 insertions(+), 45 deletions(-)
8
14
9
diff --git a/block/qed-table.c b/block/qed-table.c
15
diff --git a/include/block/block.h b/include/block/block.h
10
index XXXXXXX..XXXXXXX 100644
16
index XXXXXXX..XXXXXXX 100644
11
--- a/block/qed-table.c
17
--- a/include/block/block.h
12
+++ b/block/qed-table.c
18
+++ b/include/block/block.h
13
@@ -XXX,XX +XXX,XX @@ out:
19
@@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);
14
* @index: Index of first element
20
void bdrv_drained_begin(BlockDriverState *bs);
15
* @n: Number of elements
21
16
* @flush: Whether or not to sync to disk
22
/**
17
- * @cb: Completion function
23
+ * Like bdrv_drained_begin, but recursively begins a quiesced section for
18
- * @opaque: Argument for completion function
24
+ * exclusive access to all child nodes as well.
25
+ *
26
+ * Graph changes are not allowed during a subtree drain section.
27
+ */
28
+void bdrv_subtree_drained_begin(BlockDriverState *bs);
29
+
30
+/**
31
* bdrv_drained_end:
32
*
33
* End a quiescent section started by bdrv_drained_begin().
19
*/
34
*/
20
-static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
35
void bdrv_drained_end(BlockDriverState *bs);
21
- unsigned int index, unsigned int n, bool flush,
36
22
- BlockCompletionFunc *cb, void *opaque)
37
+/**
23
+static int qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
38
+ * End a quiescent section started by bdrv_subtree_drained_begin().
24
+ unsigned int index, unsigned int n, bool flush)
39
+ */
40
+void bdrv_subtree_drained_end(BlockDriverState *bs);
41
+
42
void bdrv_add_child(BlockDriverState *parent, BlockDriverState *child,
43
Error **errp);
44
void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp);
45
diff --git a/block/io.c b/block/io.c
46
index XXXXXXX..XXXXXXX 100644
47
--- a/block/io.c
48
+++ b/block/io.c
49
@@ -XXX,XX +XXX,XX @@ typedef struct {
50
BlockDriverState *bs;
51
bool done;
52
bool begin;
53
+ bool recursive;
54
BdrvChild *parent;
55
} BdrvCoDrainData;
56
57
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs)
58
return waited;
59
}
60
61
-static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent);
62
-static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent);
63
+static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
64
+ BdrvChild *parent);
65
+static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
66
+ BdrvChild *parent);
67
68
static void bdrv_co_drain_bh_cb(void *opaque)
25
{
69
{
26
unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1;
70
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
27
unsigned int start, end, i;
71
28
@@ -XXX,XX +XXX,XX @@ static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
72
bdrv_dec_in_flight(bs);
29
ret = 0;
73
if (data->begin) {
30
out:
74
- bdrv_do_drained_begin(bs, data->parent);
31
qemu_vfree(new_table);
75
+ bdrv_do_drained_begin(bs, data->recursive, data->parent);
32
- cb(opaque, ret);
76
} else {
33
-}
77
- bdrv_do_drained_end(bs, data->parent);
34
-
78
+ bdrv_do_drained_end(bs, data->recursive, data->parent);
35
-/**
79
}
36
- * Propagate return value from async callback
80
37
- */
81
data->done = true;
38
-static void qed_sync_cb(void *opaque, int ret)
82
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
39
-{
40
- *(int *)opaque = ret;
41
+ return ret;
42
}
83
}
43
84
44
int qed_read_l1_table_sync(BDRVQEDState *s)
85
static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
45
@@ -XXX,XX +XXX,XX @@ int qed_read_l1_table_sync(BDRVQEDState *s)
86
- bool begin, BdrvChild *parent)
46
return qed_read_table(s, s->header.l1_table_offset, s->l1_table);
87
+ bool begin, bool recursive,
88
+ BdrvChild *parent)
89
{
90
BdrvCoDrainData data;
91
92
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
93
.bs = bs,
94
.done = false,
95
.begin = begin,
96
+ .recursive = recursive,
97
.parent = parent,
98
};
99
bdrv_inc_in_flight(bs);
100
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
101
assert(data.done);
47
}
102
}
48
103
49
-void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n,
104
-static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent)
50
- BlockCompletionFunc *cb, void *opaque)
105
+static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
51
+int qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n)
106
+ BdrvChild *parent)
52
{
107
{
53
BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE);
108
+ BdrvChild *child, *next;
54
- qed_write_table(s, s->header.l1_table_offset,
109
+
55
- s->l1_table, index, n, false, cb, opaque);
110
if (qemu_in_coroutine()) {
56
+ return qed_write_table(s, s->header.l1_table_offset,
111
- bdrv_co_yield_to_drain(bs, true, parent);
57
+ s->l1_table, index, n, false);
112
+ bdrv_co_yield_to_drain(bs, true, recursive, parent);
113
return;
114
}
115
116
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent)
117
bdrv_parent_drained_begin(bs, parent);
118
bdrv_drain_invoke(bs, true, false);
119
bdrv_drain_recurse(bs);
120
+
121
+ if (recursive) {
122
+ QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
123
+ bdrv_do_drained_begin(child->bs, true, child);
124
+ }
125
+ }
58
}
126
}
59
127
60
int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
128
void bdrv_drained_begin(BlockDriverState *bs)
61
unsigned int n)
62
{
129
{
63
- int ret = -EINPROGRESS;
130
- bdrv_do_drained_begin(bs, NULL);
64
-
131
+ bdrv_do_drained_begin(bs, false, NULL);
65
- qed_write_l1_table(s, index, n, qed_sync_cb, &ret);
132
+}
66
- BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);
133
+
67
-
134
+void bdrv_subtree_drained_begin(BlockDriverState *bs)
68
- return ret;
135
+{
69
+ return qed_write_l1_table(s, index, n);
136
+ bdrv_do_drained_begin(bs, true, NULL);
70
}
137
}
71
138
72
int qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset)
139
-static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
73
@@ -XXX,XX +XXX,XX @@ int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset
140
+static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
74
return qed_read_l2_table(s, request, offset);
141
+ BdrvChild *parent)
142
{
143
+ BdrvChild *child, *next;
144
int old_quiesce_counter;
145
146
if (qemu_in_coroutine()) {
147
- bdrv_co_yield_to_drain(bs, false, parent);
148
+ bdrv_co_yield_to_drain(bs, false, recursive, parent);
149
return;
150
}
151
assert(bs->quiesce_counter > 0);
152
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
153
if (old_quiesce_counter == 1) {
154
aio_enable_external(bdrv_get_aio_context(bs));
155
}
156
+
157
+ if (recursive) {
158
+ QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
159
+ bdrv_do_drained_end(child->bs, true, child);
160
+ }
161
+ }
75
}
162
}
76
163
77
-void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
164
void bdrv_drained_end(BlockDriverState *bs)
78
- unsigned int index, unsigned int n, bool flush,
79
- BlockCompletionFunc *cb, void *opaque)
80
+int qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
81
+ unsigned int index, unsigned int n, bool flush)
82
{
165
{
83
BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE);
166
- bdrv_do_drained_end(bs, NULL);
84
- qed_write_table(s, request->l2_table->offset,
167
+ bdrv_do_drained_end(bs, false, NULL);
85
- request->l2_table->table, index, n, flush, cb, opaque);
168
+}
86
+ return qed_write_table(s, request->l2_table->offset,
169
+
87
+ request->l2_table->table, index, n, flush);
170
+void bdrv_subtree_drained_end(BlockDriverState *bs)
171
+{
172
+ bdrv_do_drained_end(bs, true, NULL);
88
}
173
}
89
174
90
int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
175
/*
91
unsigned int index, unsigned int n, bool flush)
92
{
93
- int ret = -EINPROGRESS;
94
-
95
- qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret);
96
- BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);
97
-
98
- return ret;
99
+ return qed_write_l2_table(s, request, index, n, flush);
100
}
101
diff --git a/block/qed.c b/block/qed.c
102
index XXXXXXX..XXXXXXX 100644
103
--- a/block/qed.c
104
+++ b/block/qed.c
105
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_l1_update(void *opaque, int ret)
106
index = qed_l1_index(s, acb->cur_pos);
107
s->l1_table->offsets[index] = acb->request.l2_table->offset;
108
109
- qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb);
110
+ ret = qed_write_l1_table(s, index, 1);
111
+ qed_commit_l2_update(acb, ret);
112
}
113
114
/**
115
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
116
117
if (need_alloc) {
118
/* Write out the whole new L2 table */
119
- qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true,
120
- qed_aio_write_l1_update, acb);
121
+ ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true);
122
+ qed_aio_write_l1_update(acb, ret);
123
} else {
124
/* Write out only the updated part of the L2 table */
125
- qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false,
126
- qed_aio_next_io_cb, acb);
127
+ ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
128
+ false);
129
+ qed_aio_next_io(acb, ret);
130
}
131
return;
132
133
diff --git a/block/qed.h b/block/qed.h
134
index XXXXXXX..XXXXXXX 100644
135
--- a/block/qed.h
136
+++ b/block/qed.h
137
@@ -XXX,XX +XXX,XX @@ void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table);
138
* Table I/O functions
139
*/
140
int qed_read_l1_table_sync(BDRVQEDState *s);
141
-void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n,
142
- BlockCompletionFunc *cb, void *opaque);
143
+int qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n);
144
int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
145
unsigned int n);
146
int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
147
uint64_t offset);
148
int qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset);
149
-void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
150
- unsigned int index, unsigned int n, bool flush,
151
- BlockCompletionFunc *cb, void *opaque);
152
+int qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
153
+ unsigned int index, unsigned int n, bool flush);
154
int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
155
unsigned int index, unsigned int n, bool flush);
156
157
--
176
--
158
1.8.3.1
177
2.13.6
159
178
160
179
1
Note that this code is generally not running in coroutine context, so
1
Add a subtree drain version to the existing test cases.
2
this is an actual blocking synchronous operation. We'll fix this in a
3
moment.
4
2
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
3
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
6
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
7
---
4
---
8
block/qed.c | 61 +++++++++++++++++++------------------------------------------
5
tests/test-bdrv-drain.c | 27 ++++++++++++++++++++++++++-
9
1 file changed, 19 insertions(+), 42 deletions(-)
6
1 file changed, 26 insertions(+), 1 deletion(-)
10
7
11
diff --git a/block/qed.c b/block/qed.c
8
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
12
index XXXXXXX..XXXXXXX 100644
9
index XXXXXXX..XXXXXXX 100644
13
--- a/block/qed.c
10
--- a/tests/test-bdrv-drain.c
14
+++ b/block/qed.c
11
+++ b/tests/test-bdrv-drain.c
15
@@ -XXX,XX +XXX,XX @@ static void qed_aio_start_io(QEDAIOCB *acb)
12
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
16
qed_aio_next_io(acb, 0);
13
enum drain_type {
14
BDRV_DRAIN_ALL,
15
BDRV_DRAIN,
16
+ BDRV_SUBTREE_DRAIN,
17
DRAIN_TYPE_MAX,
18
};
19
20
@@ -XXX,XX +XXX,XX @@ static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
21
switch (drain_type) {
22
case BDRV_DRAIN_ALL: bdrv_drain_all_begin(); break;
23
case BDRV_DRAIN: bdrv_drained_begin(bs); break;
24
+ case BDRV_SUBTREE_DRAIN: bdrv_subtree_drained_begin(bs); break;
25
default: g_assert_not_reached();
26
}
17
}
27
}
18
28
@@ -XXX,XX +XXX,XX @@ static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs)
19
-static void qed_aio_next_io_cb(void *opaque, int ret)
29
switch (drain_type) {
20
-{
30
case BDRV_DRAIN_ALL: bdrv_drain_all_end(); break;
21
- QEDAIOCB *acb = opaque;
31
case BDRV_DRAIN: bdrv_drained_end(bs); break;
22
-
32
+ case BDRV_SUBTREE_DRAIN: bdrv_subtree_drained_end(bs); break;
23
- qed_aio_next_io(acb, ret);
33
default: g_assert_not_reached();
24
-}
34
}
25
-
35
}
26
static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
36
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain(void)
37
test_drv_cb_common(BDRV_DRAIN, false);
38
}
39
40
+static void test_drv_cb_drain_subtree(void)
41
+{
42
+ test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
43
+}
44
+
45
static void test_quiesce_common(enum drain_type drain_type, bool recursive)
27
{
46
{
28
assert(!s->allocating_write_reqs_plugged);
47
BlockBackend *blk;
29
@@ -XXX,XX +XXX,XX @@ err:
48
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
30
qed_aio_complete(acb, ret);
49
test_quiesce_common(BDRV_DRAIN, false);
31
}
50
}
32
51
33
-static void qed_aio_write_l2_update_cb(void *opaque, int ret)
52
+static void test_quiesce_drain_subtree(void)
34
-{
53
+{
35
- QEDAIOCB *acb = opaque;
54
+ test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
36
- qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
55
+}
37
-}
38
-
39
-/**
40
- * Flush new data clusters before updating the L2 table
41
- *
42
- * This flush is necessary when a backing file is in use. A crash during an
43
- * allocating write could result in empty clusters in the image. If the write
44
- * only touched a subregion of the cluster, then backing image sectors have
45
- * been lost in the untouched region. The solution is to flush after writing a
46
- * new data cluster and before updating the L2 table.
47
- */
48
-static void qed_aio_write_flush_before_l2_update(void *opaque, int ret)
49
-{
50
- QEDAIOCB *acb = opaque;
51
- BDRVQEDState *s = acb_to_s(acb);
52
-
53
- if (!bdrv_aio_flush(s->bs->file->bs, qed_aio_write_l2_update_cb, opaque)) {
54
- qed_aio_complete(acb, -EIO);
55
- }
56
-}
57
-
58
/**
59
* Write data to the image file
60
*/
61
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_main(void *opaque, int ret)
62
BDRVQEDState *s = acb_to_s(acb);
63
uint64_t offset = acb->cur_cluster +
64
qed_offset_into_cluster(s, acb->cur_pos);
65
- BlockCompletionFunc *next_fn;
66
67
trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size);
68
69
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_main(void *opaque, int ret)
70
return;
71
}
72
73
+ BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
74
+ ret = bdrv_pwritev(s->bs->file, offset, &acb->cur_qiov);
75
+ if (ret >= 0) {
76
+ ret = 0;
77
+ }
78
+
56
+
79
if (acb->find_cluster_ret == QED_CLUSTER_FOUND) {
57
static void test_nested(void)
80
- next_fn = qed_aio_next_io_cb;
58
{
81
+ qed_aio_next_io(acb, ret);
59
BlockBackend *blk;
82
} else {
60
@@ -XXX,XX +XXX,XX @@ static void test_nested(void)
83
if (s->bs->backing) {
61
/* XXX bdrv_drain_all() doesn't increase the quiesce_counter */
84
- next_fn = qed_aio_write_flush_before_l2_update;
62
int bs_quiesce = (outer != BDRV_DRAIN_ALL) +
85
- } else {
63
(inner != BDRV_DRAIN_ALL);
86
- next_fn = qed_aio_write_l2_update_cb;
64
- int backing_quiesce = 0;
87
+ /*
65
+ int backing_quiesce = (outer == BDRV_SUBTREE_DRAIN) +
88
+ * Flush new data clusters before updating the L2 table
66
+ (inner == BDRV_SUBTREE_DRAIN);
89
+ *
67
int backing_cb_cnt = (outer != BDRV_DRAIN) +
90
+ * This flush is necessary when a backing file is in use. A crash
68
(inner != BDRV_DRAIN);
91
+ * during an allocating write could result in empty clusters in the
69
92
+ * image. If the write only touched a subregion of the cluster,
70
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_drain(void)
93
+ * then backing image sectors have been lost in the untouched
71
test_blockjob_common(BDRV_DRAIN);
94
+ * region. The solution is to flush after writing a new data
95
+ * cluster and before updating the L2 table.
96
+ */
97
+ ret = bdrv_flush(s->bs->file->bs);
98
}
99
+ qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
100
}
101
-
102
- BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
103
- bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
104
- &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
105
- next_fn, acb);
106
}
72
}
107
73
108
/**
74
+static void test_blockjob_drain_subtree(void)
75
+{
76
+ test_blockjob_common(BDRV_SUBTREE_DRAIN);
77
+}
78
+
79
int main(int argc, char **argv)
80
{
81
bdrv_init();
82
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
83
84
g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
85
g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
86
+ g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
87
+ test_drv_cb_drain_subtree);
88
89
g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
90
g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
91
+ g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
92
+ test_quiesce_drain_subtree);
93
94
g_test_add_func("/bdrv-drain/nested", test_nested);
95
96
g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
97
g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
98
+ g_test_add_func("/bdrv-drain/blockjob/drain_subtree",
99
+ test_blockjob_drain_subtree);
100
101
return g_test_run();
102
}
109
--
103
--
110
1.8.3.1
104
2.13.6
111
105
112
106
1
Most of the qed code is now synchronous and matches the coroutine model.
1
If bdrv_do_drained_begin/end() are called in coroutine context, they
2
One notable exception is the serialisation between requests which can
2
first use a BH to get out of the coroutine context. Call some existing
3
still schedule a callback. Before we can replace this with coroutine
3
tests again from a coroutine to cover this code path.
4
locks, let's convert the driver's external interfaces to the coroutine
5
versions.
6
7
We need to be careful to handle both requests that call the completion
8
callback directly from the calling coroutine (i.e. fully synchronous
9
code) and requests that involve some callback, so that we need to yield
10
and wait for the completion callback coming from outside the coroutine.
11
4
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
13
Reviewed-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
14
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
15
---
6
---
16
block/qed.c | 97 ++++++++++++++++++++++++++-----------------------------------
7
tests/test-bdrv-drain.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++
17
1 file changed, 42 insertions(+), 55 deletions(-)
8
1 file changed, 59 insertions(+)
18
9
19
diff --git a/block/qed.c b/block/qed.c
10
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
20
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
21
--- a/block/qed.c
12
--- a/tests/test-bdrv-drain.c
22
+++ b/block/qed.c
13
+++ b/tests/test-bdrv-drain.c
23
@@ -XXX,XX +XXX,XX @@ static void qed_aio_next_io(QEDAIOCB *acb)
14
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
24
}
15
*aio_ret = ret;
25
}
16
}
26
17
27
-static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
18
+typedef struct CallInCoroutineData {
28
- int64_t sector_num,
19
+ void (*entry)(void);
29
- QEMUIOVector *qiov, int nb_sectors,
30
- BlockCompletionFunc *cb,
31
- void *opaque, int flags)
32
+typedef struct QEDRequestCo {
33
+ Coroutine *co;
34
+ bool done;
20
+ bool done;
35
+ int ret;
21
+} CallInCoroutineData;
36
+} QEDRequestCo;
37
+
22
+
38
+static void qed_co_request_cb(void *opaque, int ret)
23
+static coroutine_fn void call_in_coroutine_entry(void *opaque)
39
{
24
+{
40
- QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque);
25
+ CallInCoroutineData *data = opaque;
41
+ QEDRequestCo *co = opaque;
26
+
42
27
+ data->entry();
43
- trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors,
28
+ data->done = true;
44
- opaque, flags);
45
+ co->done = true;
46
+ co->ret = ret;
47
+ qemu_coroutine_enter_if_inactive(co->co);
48
+}
29
+}
49
+
30
+
50
+static int coroutine_fn qed_co_request(BlockDriverState *bs, int64_t sector_num,
31
+static void call_in_coroutine(void (*entry)(void))
51
+ QEMUIOVector *qiov, int nb_sectors,
52
+ int flags)
53
+{
32
+{
54
+ QEDRequestCo co = {
33
+ Coroutine *co;
55
+ .co = qemu_coroutine_self(),
34
+ CallInCoroutineData data = {
35
+ .entry = entry,
56
+ .done = false,
36
+ .done = false,
57
+ };
37
+ };
58
+ QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, qed_co_request_cb, &co);
59
+
38
+
60
+ trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors, &co, flags);
39
+ co = qemu_coroutine_create(call_in_coroutine_entry, &data);
61
40
+ qemu_coroutine_enter(co);
62
acb->flags = flags;
41
+ while (!data.done) {
63
acb->qiov = qiov;
42
+ aio_poll(qemu_get_aio_context(), true);
64
@@ -XXX,XX +XXX,XX @@ static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
65
66
/* Start request */
67
qed_aio_start_io(acb);
68
- return &acb->common;
69
-}
70
71
-static BlockAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
72
- int64_t sector_num,
73
- QEMUIOVector *qiov, int nb_sectors,
74
- BlockCompletionFunc *cb,
75
- void *opaque)
76
-{
77
- return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
78
+ if (!co.done) {
79
+ qemu_coroutine_yield();
80
+ }
43
+ }
44
+}
81
+
45
+
82
+ return co.ret;
46
enum drain_type {
47
BDRV_DRAIN_ALL,
48
BDRV_DRAIN,
49
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_subtree(void)
50
test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
83
}
51
}
84
52
85
-static BlockAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
53
+static void test_drv_cb_co_drain(void)
86
- int64_t sector_num,
54
+{
87
- QEMUIOVector *qiov, int nb_sectors,
55
+ call_in_coroutine(test_drv_cb_drain);
88
- BlockCompletionFunc *cb,
56
+}
89
- void *opaque)
57
+
90
+static int coroutine_fn bdrv_qed_co_readv(BlockDriverState *bs,
58
+static void test_drv_cb_co_drain_subtree(void)
91
+ int64_t sector_num, int nb_sectors,
59
+{
92
+ QEMUIOVector *qiov)
60
+ call_in_coroutine(test_drv_cb_drain_subtree);
61
+}
62
+
63
static void test_quiesce_common(enum drain_type drain_type, bool recursive)
93
{
64
{
94
- return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb,
65
BlockBackend *blk;
95
- opaque, QED_AIOCB_WRITE);
66
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain_subtree(void)
96
+ return qed_co_request(bs, sector_num, qiov, nb_sectors, 0);
67
test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
97
}
68
}
98
69
99
-typedef struct {
70
+static void test_quiesce_co_drain(void)
100
- Coroutine *co;
71
+{
101
- int ret;
72
+ call_in_coroutine(test_quiesce_drain);
102
- bool done;
73
+}
103
-} QEDWriteZeroesCB;
74
+
104
-
75
+static void test_quiesce_co_drain_subtree(void)
105
-static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret)
76
+{
106
+static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs,
77
+ call_in_coroutine(test_quiesce_drain_subtree);
107
+ int64_t sector_num, int nb_sectors,
78
+}
108
+ QEMUIOVector *qiov)
79
+
80
static void test_nested(void)
109
{
81
{
110
- QEDWriteZeroesCB *cb = opaque;
82
BlockBackend *blk;
111
-
83
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
112
- cb->done = true;
84
g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
113
- cb->ret = ret;
85
test_drv_cb_drain_subtree);
114
- if (cb->co) {
86
115
- aio_co_wake(cb->co);
87
+ // XXX bdrv_drain_all() doesn't work in coroutine context
116
- }
88
+ g_test_add_func("/bdrv-drain/driver-cb/co/drain", test_drv_cb_co_drain);
117
+ return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE);
89
+ g_test_add_func("/bdrv-drain/driver-cb/co/drain_subtree",
118
}
90
+ test_drv_cb_co_drain_subtree);
119
91
+
120
static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
92
+
121
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
93
g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
122
int count,
94
g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
123
BdrvRequestFlags flags)
95
g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
124
{
96
test_quiesce_drain_subtree);
125
- BlockAIOCB *blockacb;
97
126
BDRVQEDState *s = bs->opaque;
98
+ // XXX bdrv_drain_all() doesn't work in coroutine context
127
- QEDWriteZeroesCB cb = { .done = false };
99
+ g_test_add_func("/bdrv-drain/quiesce/co/drain", test_quiesce_co_drain);
128
QEMUIOVector qiov;
100
+ g_test_add_func("/bdrv-drain/quiesce/co/drain_subtree",
129
struct iovec iov;
101
+ test_quiesce_co_drain_subtree);
130
102
+
131
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
103
g_test_add_func("/bdrv-drain/nested", test_nested);
132
iov.iov_len = count;
104
133
105
g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
134
qemu_iovec_init_external(&qiov, &iov, 1);
135
- blockacb = qed_aio_setup(bs, offset >> BDRV_SECTOR_BITS, &qiov,
136
- count >> BDRV_SECTOR_BITS,
137
- qed_co_pwrite_zeroes_cb, &cb,
138
- QED_AIOCB_WRITE | QED_AIOCB_ZERO);
139
- if (!blockacb) {
140
- return -EIO;
141
- }
142
- if (!cb.done) {
143
- cb.co = qemu_coroutine_self();
144
- qemu_coroutine_yield();
145
- }
146
- assert(cb.done);
147
- return cb.ret;
148
+ return qed_co_request(bs, offset >> BDRV_SECTOR_BITS, &qiov,
149
+ count >> BDRV_SECTOR_BITS,
150
+ QED_AIOCB_WRITE | QED_AIOCB_ZERO);
151
}
152
153
static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset, Error **errp)
154
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_qed = {
155
.bdrv_create = bdrv_qed_create,
156
.bdrv_has_zero_init = bdrv_has_zero_init_1,
157
.bdrv_co_get_block_status = bdrv_qed_co_get_block_status,
158
- .bdrv_aio_readv = bdrv_qed_aio_readv,
159
- .bdrv_aio_writev = bdrv_qed_aio_writev,
160
+ .bdrv_co_readv = bdrv_qed_co_readv,
161
+ .bdrv_co_writev = bdrv_qed_co_writev,
162
.bdrv_co_pwrite_zeroes = bdrv_qed_co_pwrite_zeroes,
163
.bdrv_truncate = bdrv_qed_truncate,
164
.bdrv_getlength = bdrv_qed_getlength,
165
--
106
--
166
1.8.3.1
107
2.13.6
167
108
168
109
1
From: Stefan Hajnoczi <stefanha@redhat.com>
1
Test that drain sections are correctly propagated through the graph.
2
2
3
Perform the savevm/loadvm test with both iothread on and off. This
4
covers the recently found savevm/loadvm hang when iothread is enabled.
5
6
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
7
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
3
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
8
---
4
---
9
tests/qemu-iotests/068 | 23 ++++++++++++++---------
5
tests/test-bdrv-drain.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++
10
tests/qemu-iotests/068.out | 11 ++++++++++-
6
1 file changed, 74 insertions(+)
11
2 files changed, 24 insertions(+), 10 deletions(-)
12
7
13
diff --git a/tests/qemu-iotests/068 b/tests/qemu-iotests/068
8
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
14
index XXXXXXX..XXXXXXX 100755
9
index XXXXXXX..XXXXXXX 100644
15
--- a/tests/qemu-iotests/068
10
--- a/tests/test-bdrv-drain.c
16
+++ b/tests/qemu-iotests/068
11
+++ b/tests/test-bdrv-drain.c
17
@@ -XXX,XX +XXX,XX @@ _supported_os Linux
12
@@ -XXX,XX +XXX,XX @@ static void test_nested(void)
18
IMGOPTS="compat=1.1"
13
blk_unref(blk);
19
IMG_SIZE=128K
20
21
-echo
22
-echo "=== Saving and reloading a VM state to/from a qcow2 image ==="
23
-echo
24
-_make_test_img $IMG_SIZE
25
-
26
case "$QEMU_DEFAULT_MACHINE" in
27
s390-ccw-virtio)
28
platform_parm="-no-shutdown"
29
@@ -XXX,XX +XXX,XX @@ _qemu()
30
_filter_qemu | _filter_hmp
31
}
14
}
32
15
33
-# Give qemu some time to boot before saving the VM state
16
+static void test_multiparent(void)
34
-bash -c 'sleep 1; echo -e "savevm 0\nquit"' | _qemu
17
+{
35
-# Now try to continue from that VM state (this should just work)
18
+ BlockBackend *blk_a, *blk_b;
36
-echo quit | _qemu -loadvm 0
19
+ BlockDriverState *bs_a, *bs_b, *backing;
37
+for extra_args in \
20
+ BDRVTestState *a_s, *b_s, *backing_s;
38
+ "" \
39
+ "-object iothread,id=iothread0 -set device.hba0.iothread=iothread0"; do
40
+ echo
41
+ echo "=== Saving and reloading a VM state to/from a qcow2 image ($extra_args) ==="
42
+ echo
43
+
21
+
44
+ _make_test_img $IMG_SIZE
22
+ blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
23
+ bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
24
+ &error_abort);
25
+ a_s = bs_a->opaque;
26
+ blk_insert_bs(blk_a, bs_a, &error_abort);
45
+
27
+
46
+ # Give qemu some time to boot before saving the VM state
28
+ blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
47
+ bash -c 'sleep 1; echo -e "savevm 0\nquit"' | _qemu $extra_args
29
+ bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
48
+ # Now try to continue from that VM state (this should just work)
30
+ &error_abort);
49
+ echo quit | _qemu $extra_args -loadvm 0
31
+ b_s = bs_b->opaque;
50
+done
32
+ blk_insert_bs(blk_b, bs_b, &error_abort);
51
52
# success, all done
53
echo "*** done"
54
diff --git a/tests/qemu-iotests/068.out b/tests/qemu-iotests/068.out
55
index XXXXXXX..XXXXXXX 100644
56
--- a/tests/qemu-iotests/068.out
57
+++ b/tests/qemu-iotests/068.out
58
@@ -XXX,XX +XXX,XX @@
59
QA output created by 068
60
61
-=== Saving and reloading a VM state to/from a qcow2 image ===
62
+=== Saving and reloading a VM state to/from a qcow2 image () ===
63
+
33
+
64
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=131072
34
+ backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
65
+QEMU X.Y.Z monitor - type 'help' for more information
35
+ backing_s = backing->opaque;
66
+(qemu) savevm 0
36
+ bdrv_set_backing_hd(bs_a, backing, &error_abort);
67
+(qemu) quit
37
+ bdrv_set_backing_hd(bs_b, backing, &error_abort);
68
+QEMU X.Y.Z monitor - type 'help' for more information
69
+(qemu) quit
70
+
38
+
71
+=== Saving and reloading a VM state to/from a qcow2 image (-object iothread,id=iothread0 -set device.hba0.iothread=iothread0) ===
39
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
72
40
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
73
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=131072
41
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
74
QEMU X.Y.Z monitor - type 'help' for more information
42
+ g_assert_cmpint(a_s->drain_count, ==, 0);
43
+ g_assert_cmpint(b_s->drain_count, ==, 0);
44
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
45
+
46
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
47
+
48
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
49
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
50
+ g_assert_cmpint(backing->quiesce_counter, ==, 1);
51
+ g_assert_cmpint(a_s->drain_count, ==, 1);
52
+ g_assert_cmpint(b_s->drain_count, ==, 1);
53
+ g_assert_cmpint(backing_s->drain_count, ==, 1);
54
+
55
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
56
+
57
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 2);
58
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
59
+ g_assert_cmpint(backing->quiesce_counter, ==, 2);
60
+ g_assert_cmpint(a_s->drain_count, ==, 2);
61
+ g_assert_cmpint(b_s->drain_count, ==, 2);
62
+ g_assert_cmpint(backing_s->drain_count, ==, 2);
63
+
64
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
65
+
66
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
67
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
68
+ g_assert_cmpint(backing->quiesce_counter, ==, 1);
69
+ g_assert_cmpint(a_s->drain_count, ==, 1);
70
+ g_assert_cmpint(b_s->drain_count, ==, 1);
71
+ g_assert_cmpint(backing_s->drain_count, ==, 1);
72
+
73
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
74
+
75
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
76
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
77
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
78
+ g_assert_cmpint(a_s->drain_count, ==, 0);
79
+ g_assert_cmpint(b_s->drain_count, ==, 0);
80
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
81
+
82
+ bdrv_unref(backing);
83
+ bdrv_unref(bs_a);
84
+ bdrv_unref(bs_b);
85
+ blk_unref(blk_a);
86
+ blk_unref(blk_b);
87
+}
88
+
89
90
typedef struct TestBlockJob {
91
BlockJob common;
92
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
93
test_quiesce_co_drain_subtree);
94
95
g_test_add_func("/bdrv-drain/nested", test_nested);
96
+ g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
97
98
g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
99
g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
75
--
100
--
76
1.8.3.1
101
2.13.6
77
102
78
103
1
Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
1
We need to remember how many of the drain sections in which a node is
2
just return an error code and let the caller handle it.
2
were recursive (i.e. subtree drain rather than node drain), so that they
3
can be correctly applied when children are added or removed during the
4
drained section.
5
6
With this change, it is safe to modify the graph even inside a
7
bdrv_subtree_drained_begin/end() section.
3
8
4
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
5
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
6
---
10
---
7
block/qed.c | 43 ++++++++++++++++++++-----------------------
11
include/block/block.h | 2 --
8
1 file changed, 20 insertions(+), 23 deletions(-)
12
include/block/block_int.h | 5 +++++
9
13
block.c | 32 +++++++++++++++++++++++++++++---
10
diff --git a/block/qed.c b/block/qed.c
14
block/io.c | 28 ++++++++++++++++++++++++----
11
index XXXXXXX..XXXXXXX 100644
15
4 files changed, 58 insertions(+), 9 deletions(-)
12
--- a/block/qed.c
16
13
+++ b/block/qed.c
17
diff --git a/include/block/block.h b/include/block/block.h
14
@@ -XXX,XX +XXX,XX @@ static bool qed_should_set_need_check(BDRVQEDState *s)
18
index XXXXXXX..XXXXXXX 100644
15
*
19
--- a/include/block/block.h
16
* This path is taken when writing to previously unallocated clusters.
20
+++ b/include/block/block.h
21
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs);
22
/**
23
* Like bdrv_drained_begin, but recursively begins a quiesced section for
24
* exclusive access to all child nodes as well.
25
- *
26
- * Graph changes are not allowed during a subtree drain section.
17
*/
27
*/
18
-static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
28
void bdrv_subtree_drained_begin(BlockDriverState *bs);
19
+static int qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
29
30
diff --git a/include/block/block_int.h b/include/block/block_int.h
31
index XXXXXXX..XXXXXXX 100644
32
--- a/include/block/block_int.h
33
+++ b/include/block/block_int.h
34
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
35
36
/* Accessed with atomic ops. */
37
int quiesce_counter;
38
+ int recursive_quiesce_counter;
39
+
40
unsigned int write_gen; /* Current data generation */
41
42
/* Protected by reqs_lock. */
43
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
44
int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
45
BdrvRequestFlags flags);
46
47
+void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
48
+void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);
49
+
50
int get_tmp_filename(char *filename, int size);
51
BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
52
const char *filename);
53
diff --git a/block.c b/block.c
54
index XXXXXXX..XXXXXXX 100644
55
--- a/block.c
56
+++ b/block.c
57
@@ -XXX,XX +XXX,XX @@ static void bdrv_child_cb_drained_end(BdrvChild *child)
58
bdrv_drained_end(bs);
59
}
60
61
+static void bdrv_child_cb_attach(BdrvChild *child)
62
+{
63
+ BlockDriverState *bs = child->opaque;
64
+ bdrv_apply_subtree_drain(child, bs);
65
+}
66
+
67
+static void bdrv_child_cb_detach(BdrvChild *child)
68
+{
69
+ BlockDriverState *bs = child->opaque;
70
+ bdrv_unapply_subtree_drain(child, bs);
71
+}
72
+
73
static int bdrv_child_cb_inactivate(BdrvChild *child)
20
{
74
{
21
BDRVQEDState *s = acb_to_s(acb);
75
BlockDriverState *bs = child->opaque;
22
int ret;
76
@@ -XXX,XX +XXX,XX @@ const BdrvChildRole child_file = {
23
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
77
.inherit_options = bdrv_inherited_options,
78
.drained_begin = bdrv_child_cb_drained_begin,
79
.drained_end = bdrv_child_cb_drained_end,
80
+ .attach = bdrv_child_cb_attach,
81
+ .detach = bdrv_child_cb_detach,
82
.inactivate = bdrv_child_cb_inactivate,
83
};
84
85
@@ -XXX,XX +XXX,XX @@ const BdrvChildRole child_format = {
86
.inherit_options = bdrv_inherited_fmt_options,
87
.drained_begin = bdrv_child_cb_drained_begin,
88
.drained_end = bdrv_child_cb_drained_end,
89
+ .attach = bdrv_child_cb_attach,
90
+ .detach = bdrv_child_cb_detach,
91
.inactivate = bdrv_child_cb_inactivate,
92
};
93
94
@@ -XXX,XX +XXX,XX @@ static void bdrv_backing_attach(BdrvChild *c)
95
parent->backing_blocker);
96
bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET,
97
parent->backing_blocker);
98
+
99
+ bdrv_child_cb_attach(c);
100
}
101
102
static void bdrv_backing_detach(BdrvChild *c)
103
@@ -XXX,XX +XXX,XX @@ static void bdrv_backing_detach(BdrvChild *c)
104
bdrv_op_unblock_all(c->bs, parent->backing_blocker);
105
error_free(parent->backing_blocker);
106
parent->backing_blocker = NULL;
107
+
108
+ bdrv_child_cb_detach(c);
109
}
110
111
/*
112
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
113
assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
24
}
114
}
25
if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) ||
115
if (old_bs) {
26
s->allocating_write_reqs_plugged) {
116
+ /* Detach first so that the recursive drain sections coming from @child
27
- return; /* wait for existing request to finish */
117
+ * are already gone and we only end the drain sections that came from
28
+ return -EINPROGRESS; /* wait for existing request to finish */
118
+ * elsewhere. */
119
+ if (child->role->detach) {
120
+ child->role->detach(child);
121
+ }
122
if (old_bs->quiesce_counter && child->role->drained_end) {
123
for (i = 0; i < old_bs->quiesce_counter; i++) {
124
child->role->drained_end(child);
125
}
126
}
127
- if (child->role->detach) {
128
- child->role->detach(child);
129
- }
130
QLIST_REMOVE(child, next_parent);
29
}
131
}
30
132
31
acb->cur_nclusters = qed_bytes_to_clusters(s,
133
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
32
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
134
}
33
if (acb->flags & QED_AIOCB_ZERO) {
135
}
34
/* Skip ahead if the clusters are already zero */
136
35
if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
137
+ /* Attach only after starting new drained sections, so that recursive
36
- qed_aio_start_io(acb);
138
+ * drain sections coming from @child don't get an extra .drained_begin
37
- return;
139
+ * callback. */
38
+ return 0;
140
if (child->role->attach) {
39
}
141
child->role->attach(child);
40
} else {
142
}
41
acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
143
diff --git a/block/io.c b/block/io.c
42
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
144
index XXXXXXX..XXXXXXX 100644
43
s->header.features |= QED_F_NEED_CHECK;
145
--- a/block/io.c
44
ret = qed_write_header(s);
146
+++ b/block/io.c
45
if (ret < 0) {
147
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
46
- qed_aio_complete(acb, ret);
148
assert(data.done);
47
- return;
149
}
48
+ return ret;
150
49
}
151
-static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
152
- BdrvChild *parent)
153
+void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
154
+ BdrvChild *parent)
155
{
156
BdrvChild *child, *next;
157
158
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
159
bdrv_drain_recurse(bs);
160
161
if (recursive) {
162
+ bs->recursive_quiesce_counter++;
163
QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
164
bdrv_do_drained_begin(child->bs, true, child);
165
}
166
@@ -XXX,XX +XXX,XX @@ void bdrv_subtree_drained_begin(BlockDriverState *bs)
167
bdrv_do_drained_begin(bs, true, NULL);
168
}
169
170
-static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
171
- BdrvChild *parent)
172
+void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
173
+ BdrvChild *parent)
174
{
175
BdrvChild *child, *next;
176
int old_quiesce_counter;
177
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
50
}
178
}
51
179
52
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
180
if (recursive) {
53
ret = qed_aio_write_cow(acb);
181
+ bs->recursive_quiesce_counter--;
54
}
182
QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
55
if (ret < 0) {
183
bdrv_do_drained_end(child->bs, true, child);
56
- qed_aio_complete(acb, ret);
184
}
57
- return;
185
@@ -XXX,XX +XXX,XX @@ void bdrv_subtree_drained_end(BlockDriverState *bs)
58
+ return ret;
186
bdrv_do_drained_end(bs, true, NULL);
59
}
187
}
60
- qed_aio_next_io(acb, 0);
188
61
+ return 0;
189
+void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
62
}
190
+{
63
191
+ int i;
64
/**
192
+
65
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
193
+ for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
66
*
194
+ bdrv_do_drained_begin(child->bs, true, child);
67
* This path is taken when writing to already allocated clusters.
68
*/
69
-static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
70
+static int qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
71
{
72
- int ret;
73
-
74
/* Allocate buffer for zero writes */
75
if (acb->flags & QED_AIOCB_ZERO) {
76
struct iovec *iov = acb->qiov->iov;
77
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
78
if (!iov->iov_base) {
79
iov->iov_base = qemu_try_blockalign(acb->common.bs, iov->iov_len);
80
if (iov->iov_base == NULL) {
81
- qed_aio_complete(acb, -ENOMEM);
82
- return;
83
+ return -ENOMEM;
84
}
85
memset(iov->iov_base, 0, iov->iov_len);
86
}
87
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
88
qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
89
90
/* Do the actual write */
91
- ret = qed_aio_write_main(acb);
92
- if (ret < 0) {
93
- qed_aio_complete(acb, ret);
94
- return;
95
- }
96
- qed_aio_next_io(acb, 0);
97
+ return qed_aio_write_main(acb);
98
}
99
100
/**
101
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_data(void *opaque, int ret,
102
103
switch (ret) {
104
case QED_CLUSTER_FOUND:
105
- qed_aio_write_inplace(acb, offset, len);
106
+ ret = qed_aio_write_inplace(acb, offset, len);
107
break;
108
109
case QED_CLUSTER_L2:
110
case QED_CLUSTER_L1:
111
case QED_CLUSTER_ZERO:
112
- qed_aio_write_alloc(acb, len);
113
+ ret = qed_aio_write_alloc(acb, len);
114
break;
115
116
default:
117
- qed_aio_complete(acb, ret);
118
+ assert(ret < 0);
119
break;
120
}
121
+
122
+ if (ret < 0) {
123
+ if (ret != -EINPROGRESS) {
124
+ qed_aio_complete(acb, ret);
125
+ }
126
+ return;
127
+ }
195
+ }
128
+ qed_aio_next_io(acb, 0);
196
+}
129
}
197
+
130
198
+void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
131
/**
199
+{
200
+ int i;
201
+
202
+ for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
203
+ bdrv_do_drained_end(child->bs, true, child);
204
+ }
205
+}
206
+
207
/*
208
* Wait for pending requests to complete on a single BlockDriverState subtree,
209
* and suspend block driver's internal I/O until next request arrives.
132
--
210
--
133
1.8.3.1
211
2.13.6
134
212
135
213
1
From: Stefan Hajnoczi <stefanha@redhat.com>
2
3
Avoid duplicating the QEMU command-line.
4
5
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
6
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
1
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
7
---
2
---
8
tests/qemu-iotests/068 | 15 +++++++++------
3
tests/test-bdrv-drain.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++
9
1 file changed, 9 insertions(+), 6 deletions(-)
4
1 file changed, 80 insertions(+)
10
5
11
diff --git a/tests/qemu-iotests/068 b/tests/qemu-iotests/068
6
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
12
index XXXXXXX..XXXXXXX 100755
7
index XXXXXXX..XXXXXXX 100644
13
--- a/tests/qemu-iotests/068
8
--- a/tests/test-bdrv-drain.c
14
+++ b/tests/qemu-iotests/068
9
+++ b/tests/test-bdrv-drain.c
15
@@ -XXX,XX +XXX,XX @@ case "$QEMU_DEFAULT_MACHINE" in
10
@@ -XXX,XX +XXX,XX @@ static void test_multiparent(void)
16
;;
11
blk_unref(blk_b);
17
esac
12
}
18
13
19
-# Give qemu some time to boot before saving the VM state
14
+static void test_graph_change(void)
20
-bash -c 'sleep 1; echo -e "savevm 0\nquit"' |\
21
- $QEMU $platform_parm -nographic -monitor stdio -serial none -hda "$TEST_IMG" |\
22
+_qemu()
23
+{
15
+{
24
+ $QEMU $platform_parm -nographic -monitor stdio -serial none -hda "$TEST_IMG" \
16
+ BlockBackend *blk_a, *blk_b;
25
+ "$@" |\
17
+ BlockDriverState *bs_a, *bs_b, *backing;
26
_filter_qemu | _filter_hmp
18
+ BDRVTestState *a_s, *b_s, *backing_s;
19
+
20
+ blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
21
+ bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
22
+ &error_abort);
23
+ a_s = bs_a->opaque;
24
+ blk_insert_bs(blk_a, bs_a, &error_abort);
25
+
26
+ blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
27
+ bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
28
+ &error_abort);
29
+ b_s = bs_b->opaque;
30
+ blk_insert_bs(blk_b, bs_b, &error_abort);
31
+
32
+ backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
33
+ backing_s = backing->opaque;
34
+ bdrv_set_backing_hd(bs_a, backing, &error_abort);
35
+
36
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
37
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
38
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
39
+ g_assert_cmpint(a_s->drain_count, ==, 0);
40
+ g_assert_cmpint(b_s->drain_count, ==, 0);
41
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
42
+
43
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
44
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
45
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
46
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
47
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
48
+
49
+ bdrv_set_backing_hd(bs_b, backing, &error_abort);
50
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
51
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
52
+ g_assert_cmpint(backing->quiesce_counter, ==, 5);
53
+ g_assert_cmpint(a_s->drain_count, ==, 5);
54
+ g_assert_cmpint(b_s->drain_count, ==, 5);
55
+ g_assert_cmpint(backing_s->drain_count, ==, 5);
56
+
57
+ bdrv_set_backing_hd(bs_b, NULL, &error_abort);
58
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 3);
59
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
60
+ g_assert_cmpint(backing->quiesce_counter, ==, 3);
61
+ g_assert_cmpint(a_s->drain_count, ==, 3);
62
+ g_assert_cmpint(b_s->drain_count, ==, 2);
63
+ g_assert_cmpint(backing_s->drain_count, ==, 3);
64
+
65
+ bdrv_set_backing_hd(bs_b, backing, &error_abort);
66
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
67
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
68
+ g_assert_cmpint(backing->quiesce_counter, ==, 5);
69
+ g_assert_cmpint(a_s->drain_count, ==, 5);
70
+ g_assert_cmpint(b_s->drain_count, ==, 5);
71
+ g_assert_cmpint(backing_s->drain_count, ==, 5);
72
+
73
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
74
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
75
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
76
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
77
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
78
+
79
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
80
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
81
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
82
+ g_assert_cmpint(a_s->drain_count, ==, 0);
83
+ g_assert_cmpint(b_s->drain_count, ==, 0);
84
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
85
+
86
+ bdrv_unref(backing);
87
+ bdrv_unref(bs_a);
88
+ bdrv_unref(bs_b);
89
+ blk_unref(blk_a);
90
+ blk_unref(blk_b);
27
+}
91
+}
28
+
92
+
29
+# Give qemu some time to boot before saving the VM state
93
30
+bash -c 'sleep 1; echo -e "savevm 0\nquit"' | _qemu
94
typedef struct TestBlockJob {
31
# Now try to continue from that VM state (this should just work)
95
BlockJob common;
32
-echo quit |\
96
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
33
- $QEMU $platform_parm -nographic -monitor stdio -serial none -hda "$TEST_IMG" -loadvm 0 |\
97
34
- _filter_qemu | _filter_hmp
98
g_test_add_func("/bdrv-drain/nested", test_nested);
35
+echo quit | _qemu -loadvm 0
99
g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
36
100
+ g_test_add_func("/bdrv-drain/graph-change", test_graph_change);
37
# success, all done
101
38
echo "*** done"
102
g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
103
g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
39
--
104
--
40
1.8.3.1
105
2.13.6
41
106
42
107
1
commit_complete() can't assume that after its block_job_completed() the
1
Since commit bde70715, base is the only node that is reopened in
2
job is actually immediately freed; someone else may still be holding
2
commit_start(). This means that the code, which still involves an
3
references. In this case, the op blockers on the intermediate nodes make
3
explicit BlockReopenQueue, can now be simplified by using bdrv_reopen().
4
the graph reconfiguration in the completion code fail.
5
4
6
Call block_job_remove_all_bdrv() manually so that we know for sure that
7
any blockers on intermediate nodes are given up.
8
9
Cc: qemu-stable@nongnu.org
10
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
11
Reviewed-by: Eric Blake <eblake@redhat.com>
6
Reviewed-by: Fam Zheng <famz@redhat.com>
12
Reviewed-by: Max Reitz <mreitz@redhat.com>
13
---
7
---
14
block/commit.c | 7 +++++++
8
block/commit.c | 8 +-------
15
1 file changed, 7 insertions(+)
9
1 file changed, 1 insertion(+), 7 deletions(-)
16
10
17
diff --git a/block/commit.c b/block/commit.c
11
diff --git a/block/commit.c b/block/commit.c
18
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
19
--- a/block/commit.c
13
--- a/block/commit.c
20
+++ b/block/commit.c
14
+++ b/block/commit.c
21
@@ -XXX,XX +XXX,XX @@ static void commit_complete(BlockJob *job, void *opaque)
15
@@ -XXX,XX +XXX,XX @@ void commit_start(const char *job_id, BlockDriverState *bs,
22
}
16
const char *filter_node_name, Error **errp)
23
g_free(s->backing_file_str);
17
{
24
blk_unref(s->top);
18
CommitBlockJob *s;
25
+
19
- BlockReopenQueue *reopen_queue = NULL;
26
+ /* If there is more than one reference to the job (e.g. if called from
20
int orig_base_flags;
27
+ * block_job_finish_sync()), block_job_completed() won't free it and
21
BlockDriverState *iter;
28
+ * therefore the blockers on the intermediate nodes remain. This would
22
BlockDriverState *commit_top_bs = NULL;
29
+ * cause bdrv_set_backing_hd() to fail. */
23
@@ -XXX,XX +XXX,XX @@ void commit_start(const char *job_id, BlockDriverState *bs,
30
+ block_job_remove_all_bdrv(job);
24
/* convert base to r/w, if necessary */
31
+
25
orig_base_flags = bdrv_get_flags(base);
32
block_job_completed(&s->common, ret);
26
if (!(orig_base_flags & BDRV_O_RDWR)) {
33
g_free(data);
27
- reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
34
28
- orig_base_flags | BDRV_O_RDWR);
29
- }
30
-
31
- if (reopen_queue) {
32
- bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, &local_err);
33
+ bdrv_reopen(base, orig_base_flags | BDRV_O_RDWR, &local_err);
34
if (local_err != NULL) {
35
error_propagate(errp, local_err);
36
goto fail;
35
--
37
--
36
1.8.3.1
38
2.13.6
37
39
38
40
Deleted patch

From: Stefan Hajnoczi <stefanha@redhat.com>

AioContext was designed to allow nested acquire/release calls. It uses
a recursive mutex so callers don't need to worry about nesting...or so
we thought.

BDRV_POLL_WHILE() is used to wait for block I/O requests. It releases
the AioContext temporarily around aio_poll(). This gives IOThreads a
chance to acquire the AioContext to process I/O completions.

It turns out that recursive locking and BDRV_POLL_WHILE() don't mix.
BDRV_POLL_WHILE() only releases the AioContext once, so the IOThread
will not be able to acquire the AioContext if it was acquired
multiple times.

Instead of trying to release AioContext n times in BDRV_POLL_WHILE(),
this patch simply avoids nested locking in save_vmstate(). It's the
simplest fix and we should step back to consider the big picture with
all the recent changes to block layer threading.

This patch is the final fix to solve 'savevm' hanging with -object
iothread.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
migration/savevm.c | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/migration/savevm.c b/migration/savevm.c
index XXXXXXX..XXXXXXX 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -XXX,XX +XXX,XX @@ int save_snapshot(const char *name, Error **errp)
goto the_end;
}

+ /* The bdrv_all_create_snapshot() call that follows acquires the AioContext
+ * for itself. BDRV_POLL_WHILE() does not support nested locking because
+ * it only releases the lock once. Therefore synchronous I/O will deadlock
+ * unless we release the AioContext before bdrv_all_create_snapshot().
+ */
+ aio_context_release(aio_context);
+ aio_context = NULL;
+
ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, &bs);
if (ret < 0) {
error_setg(errp, "Error while creating snapshot on '%s'",
@@ -XXX,XX +XXX,XX @@ int save_snapshot(const char *name, Error **errp)
ret = 0;

the_end:
- aio_context_release(aio_context);
+ if (aio_context) {
+ aio_context_release(aio_context);
+ }
if (saved_vm_running) {
vm_start();
}
--
1.8.3.1
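A note for readers who have not hit the locking problem described above: releasing a recursively held lock once does not make it available to other threads. The following standalone sketch is an illustration only, using plain POSIX threads instead of QEMU's AioContext; all names in it are invented for the demo. Build with cc -pthread.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t m;

/* Stands in for the IOThread: it can only make progress once the lock's
 * recursion count has dropped to zero. */
static void *iothread_stand_in(void *arg)
{
    if (pthread_mutex_trylock(&m) == EBUSY) {
        printf("lock still held elsewhere, the real IOThread would block\n");
    } else {
        printf("got the lock\n");
        pthread_mutex_unlock(&m);
    }
    return NULL;
}

int main(void)
{
    pthread_mutexattr_t attr;
    pthread_t t;

    pthread_mutexattr_init(&attr);
    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
    pthread_mutex_init(&m, &attr);

    pthread_mutex_lock(&m);     /* outer acquire, e.g. the monitor command */
    pthread_mutex_lock(&m);     /* nested acquire by a callee */
    pthread_mutex_unlock(&m);   /* release only once, as BDRV_POLL_WHILE() does */

    pthread_create(&t, NULL, iothread_stand_in, NULL);
    pthread_join(&t, NULL);     /* reports that the lock is still held */

    pthread_mutex_unlock(&m);   /* drop the remaining hold */
    pthread_mutex_destroy(&m);
    pthread_mutexattr_destroy(&attr);
    return 0;
}

The second thread's trylock fails with EBUSY precisely because only one of the two holds was dropped, which is why the patch releases the AioContext entirely before calling bdrv_all_create_snapshot().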
diff view generated by jsdifflib
Deleted patch

From: Alberto Garcia <berto@igalia.com>

There used to be throttle_timers_{detach,attach}_aio_context() calls
in bdrv_set_aio_context(), but since 7ca7f0f6db1fedd28d490795d778cf239
they are now in blk_set_aio_context().

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
block/throttle-groups.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/throttle-groups.c b/block/throttle-groups.c
index XXXXXXX..XXXXXXX 100644
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -XXX,XX +XXX,XX @@
* Again, all this is handled internally and is mostly transparent to
* the outside. The 'throttle_timers' field however has an additional
* constraint because it may be temporarily invalid (see for example
- * bdrv_set_aio_context()). Therefore in this file a thread will
+ * blk_set_aio_context()). Therefore in this file a thread will
* access some other BlockBackend's timers only after verifying that
* that BlockBackend has throttled requests in the queue.
*/
--
1.8.3.1
diff view generated by jsdifflib
Deleted patch

From: Stefan Hajnoczi <stefanha@redhat.com>

Old kvm.ko versions only supported a tiny number of ioeventfds so
virtio-pci avoids ioeventfds when kvm_has_many_ioeventfds() returns 0.

Do not check kvm_has_many_ioeventfds() when KVM is disabled since it
always returns 0. Since commit 8c56c1a592b5092d91da8d8943c17777d6462a6f
("memory: emulate ioeventfd") it has been possible to use ioeventfds in
qtest or TCG mode.

This patch makes -device virtio-blk-pci,iothread=iothread0 work even
when KVM is disabled.

I have tested that virtio-blk-pci works under TCG both with and without
iothread.

Cc: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/virtio/virtio-pci.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -XXX,XX +XXX,XX @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
bool pcie_port = pci_bus_is_express(pci_dev->bus) &&
!pci_bus_is_root(pci_dev->bus);

- if (!kvm_has_many_ioeventfds()) {
+ if (kvm_enabled() && !kvm_has_many_ioeventfds()) {
proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD;
}

--
1.8.3.1
diff view generated by jsdifflib
Deleted patch

From: Stefan Hajnoczi <stefanha@redhat.com>

migration_incoming_state_destroy() uses qemu_fclose() on the vmstate
file. Make sure to call it inside an AioContext acquire/release region.

This fixes a 'qemu: qemu_mutex_unlock: Operation not permitted' abort
in loadvm.

This patch closes the vmstate file before ending the drained region.
Previously we closed the vmstate file after ending the drained region.
The order does not matter.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
migration/savevm.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/savevm.c b/migration/savevm.c
index XXXXXXX..XXXXXXX 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -XXX,XX +XXX,XX @@ int load_snapshot(const char *name, Error **errp)

aio_context_acquire(aio_context);
ret = qemu_loadvm_state(f);
+ migration_incoming_state_destroy();
aio_context_release(aio_context);

bdrv_drain_all_end();

- migration_incoming_state_destroy();
if (ret < 0) {
error_setg(errp, "Error %d while loading VM state", ret);
return ret;
--
1.8.3.1
diff view generated by jsdifflib
Deleted patch

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
block/qed-cluster.c | 39 ++++++++++++++++++++++-----------------
block/qed.c | 24 +++++++++++-------------
block/qed.h | 4 ++--
3 files changed, 35 insertions(+), 32 deletions(-)

diff --git a/block/qed-cluster.c b/block/qed-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed-cluster.c
+++ b/block/qed-cluster.c
@@ -XXX,XX +XXX,XX @@ static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s,
* @s: QED state
* @request: L2 cache entry
* @pos: Byte position in device
- * @len: Number of bytes
- * @cb: Completion function
- * @opaque: User data for completion function
+ * @len: Number of bytes (may be shortened on return)
+ * @img_offset: Contains offset in the image file on success
*
* This function translates a position in the block device to an offset in the
- * image file. It invokes the cb completion callback to report back the
- * translated offset or unallocated range in the image file.
+ * image file. The translated offset or unallocated range in the image file is
+ * reported back in *img_offset and *len.
*
* If the L2 table exists, request->l2_table points to the L2 table cache entry
* and the caller must free the reference when they are finished. The cache
* entry is exposed in this way to avoid callers having to read the L2 table
* again later during request processing. If request->l2_table is non-NULL it
* will be unreferenced before taking on the new cache entry.
+ *
+ * On success QED_CLUSTER_FOUND is returned and img_offset/len are a contiguous
+ * range in the image file.
+ *
+ * On failure QED_CLUSTER_L2 or QED_CLUSTER_L1 is returned for missing L2 or L1
+ * table offset, respectively. len is number of contiguous unallocated bytes.
*/
-void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
- size_t len, QEDFindClusterFunc *cb, void *opaque)
+int qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
+ size_t *len, uint64_t *img_offset)
{
uint64_t l2_offset;
uint64_t offset = 0;
@@ -XXX,XX +XXX,XX @@ void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
/* Limit length to L2 boundary. Requests are broken up at the L2 boundary
* so that a request acts on one L2 table at a time.
*/
- len = MIN(len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos);
+ *len = MIN(*len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos);

l2_offset = s->l1_table->offsets[qed_l1_index(s, pos)];
if (qed_offset_is_unalloc_cluster(l2_offset)) {
- cb(opaque, QED_CLUSTER_L1, 0, len);
- return;
+ *img_offset = 0;
+ return QED_CLUSTER_L1;
}
if (!qed_check_table_offset(s, l2_offset)) {
- cb(opaque, -EINVAL, 0, 0);
- return;
+ *img_offset = *len = 0;
+ return -EINVAL;
}

ret = qed_read_l2_table(s, request, l2_offset);
@@ -XXX,XX +XXX,XX @@ void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
}

index = qed_l2_index(s, pos);
- n = qed_bytes_to_clusters(s,
- qed_offset_into_cluster(s, pos) + len);
+ n = qed_bytes_to_clusters(s, qed_offset_into_cluster(s, pos) + *len);
n = qed_count_contiguous_clusters(s, request->l2_table->table,
index, n, &offset);

@@ -XXX,XX +XXX,XX @@ void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
ret = -EINVAL;
}

- len = MIN(len,
- n * s->header.cluster_size - qed_offset_into_cluster(s, pos));
+ *len = MIN(*len,
+ n * s->header.cluster_size - qed_offset_into_cluster(s, pos));

out:
- cb(opaque, ret, offset, len);
+ *img_offset = offset;
qed_release(s);
+ return ret;
}
diff --git a/block/qed.c b/block/qed.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs,
.file = file,
};
QEDRequest request = { .l2_table = NULL };
+ uint64_t offset;
+ int ret;

- qed_find_cluster(s, &request, cb.pos, len, qed_is_allocated_cb, &cb);
+ ret = qed_find_cluster(s, &request, cb.pos, &len, &offset);
+ qed_is_allocated_cb(&cb, ret, offset, len);

- /* Now sleep if the callback wasn't invoked immediately */
- while (cb.status == BDRV_BLOCK_OFFSET_MASK) {
- cb.co = qemu_coroutine_self();
- qemu_coroutine_yield();
- }
+ /* The callback was invoked immediately */
+ assert(cb.status != BDRV_BLOCK_OFFSET_MASK);

qed_unref_l2_cache_entry(request.l2_table);

@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
* or -errno
* @offset: Cluster offset in bytes
* @len: Length in bytes
- *
- * Callback from qed_find_cluster().
*/
static void qed_aio_write_data(void *opaque, int ret,
uint64_t offset, size_t len)
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_data(void *opaque, int ret,
* or -errno
* @offset: Cluster offset in bytes
* @len: Length in bytes
- *
- * Callback from qed_find_cluster().
*/
static void qed_aio_read_data(void *opaque, int ret,
uint64_t offset, size_t len)
@@ -XXX,XX +XXX,XX @@ static void qed_aio_next_io(QEDAIOCB *acb, int ret)
BDRVQEDState *s = acb_to_s(acb);
QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
qed_aio_write_data : qed_aio_read_data;
+ uint64_t offset;
+ size_t len;

trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size);

@@ -XXX,XX +XXX,XX @@ static void qed_aio_next_io(QEDAIOCB *acb, int ret)
}

/* Find next cluster and start I/O */
- qed_find_cluster(s, &acb->request,
- acb->cur_pos, acb->end_pos - acb->cur_pos,
- io_fn, acb);
+ len = acb->end_pos - acb->cur_pos;
+ ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset);
+ io_fn(acb, ret, offset, len);
}

static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
diff --git a/block/qed.h b/block/qed.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.h
+++ b/block/qed.h
@@ -XXX,XX +XXX,XX @@ int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
/**
* Cluster functions
*/
-void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
- size_t len, QEDFindClusterFunc *cb, void *opaque);
+int qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
+ size_t *len, uint64_t *img_offset);

/**
* Consistency check
--
1.8.3.1
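The shape of this conversion, independent of the QED specifics: a function that used to deliver its result through a completion callback now returns a status code and fills in out-parameters, and the caller invokes the former callback body directly afterwards. The following generic standalone sketch is an illustration only; find_cluster and is_allocated_cb are invented stand-ins, not the QED functions.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Old shape: the result is delivered through a callback like this. */
typedef void FindClusterCb(void *opaque, int ret, uint64_t offset, size_t len);

/* New shape: status through the return value, data through out-parameters.
 * The "translation" below is made up purely for the demo. */
static int find_cluster(uint64_t pos, size_t *len, uint64_t *offset)
{
    if (*len > 65536) {
        *len = 65536;        /* e.g. clamp the request to a table boundary */
    }
    *offset = pos + 0x10000;
    return 0;                /* 0 stands in for "found" */
}

/* The former callback survives as an ordinary function. */
static void is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len)
{
    printf("ret=%d offset=0x%" PRIx64 " len=%zu (opaque=%p)\n",
           ret, offset, len, opaque);
}

int main(void)
{
    size_t len = 1 << 20;
    uint64_t offset;

    /* Caller pattern after the conversion: call synchronously, then run the
     * old callback body with the returned values. */
    int ret = find_cluster(4096, &len, &offset);
    is_allocated_cb(NULL, ret, offset, len);
    return 0;
}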
diff view generated by jsdifflib
1
All callers pass ret = 0, so we can just remove it.
1
The bdrv_reopen*() implementation doesn't like it if the graph is
2
changed between queuing nodes for reopen and actually reopening them
3
(one of the reasons is that queuing can be recursive).
4
5
So instead of draining the device only in bdrv_reopen_multiple(),
6
require that callers already drained all affected nodes, and assert this
7
in bdrv_reopen_queue().
2
8
3
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
4
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Reviewed-by: Fam Zheng <famz@redhat.com>
5
---
11
---
6
block/qed.c | 17 ++++++-----------
12
block.c | 23 ++++++++++++++++-------
7
1 file changed, 6 insertions(+), 11 deletions(-)
13
block/replication.c | 6 ++++++
14
qemu-io-cmds.c | 3 +++
15
3 files changed, 25 insertions(+), 7 deletions(-)
8
16
9
diff --git a/block/qed.c b/block/qed.c
17
diff --git a/block.c b/block.c
10
index XXXXXXX..XXXXXXX 100644
18
index XXXXXXX..XXXXXXX 100644
11
--- a/block/qed.c
19
--- a/block.c
12
+++ b/block/qed.c
20
+++ b/block.c
13
@@ -XXX,XX +XXX,XX @@ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
21
@@ -XXX,XX +XXX,XX @@ BlockDriverState *bdrv_open(const char *filename, const char *reference,
14
return l2_table;
22
* returns a pointer to bs_queue, which is either the newly allocated
23
* bs_queue, or the existing bs_queue being used.
24
*
25
+ * bs must be drained between bdrv_reopen_queue() and bdrv_reopen_multiple().
26
*/
27
static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
28
BlockDriverState *bs,
29
@@ -XXX,XX +XXX,XX @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
30
BdrvChild *child;
31
QDict *old_options, *explicit_options;
32
33
+ /* Make sure that the caller remembered to use a drained section. This is
34
+ * important to avoid graph changes between the recursive queuing here and
35
+ * bdrv_reopen_multiple(). */
36
+ assert(bs->quiesce_counter > 0);
37
+
38
if (bs_queue == NULL) {
39
bs_queue = g_new0(BlockReopenQueue, 1);
40
QSIMPLEQ_INIT(bs_queue);
41
@@ -XXX,XX +XXX,XX @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
42
* If all devices prepare successfully, then the changes are committed
43
* to all devices.
44
*
45
+ * All affected nodes must be drained between bdrv_reopen_queue() and
46
+ * bdrv_reopen_multiple().
47
*/
48
int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **errp)
49
{
50
@@ -XXX,XX +XXX,XX @@ int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **er
51
52
assert(bs_queue != NULL);
53
54
- aio_context_release(ctx);
55
- bdrv_drain_all_begin();
56
- aio_context_acquire(ctx);
57
-
58
QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
59
+ assert(bs_entry->state.bs->quiesce_counter > 0);
60
if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
61
error_propagate(errp, local_err);
62
goto cleanup;
63
@@ -XXX,XX +XXX,XX @@ cleanup:
64
}
65
g_free(bs_queue);
66
67
- bdrv_drain_all_end();
68
-
69
return ret;
15
}
70
}
16
71
17
-static void qed_aio_next_io(QEDAIOCB *acb, int ret);
72
@@ -XXX,XX +XXX,XX @@ int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
18
+static void qed_aio_next_io(QEDAIOCB *acb);
19
20
static void qed_aio_start_io(QEDAIOCB *acb)
21
{
73
{
22
- qed_aio_next_io(acb, 0);
74
int ret = -1;
23
+ qed_aio_next_io(acb);
75
Error *local_err = NULL;
76
- BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);
77
+ BlockReopenQueue *queue;
78
79
+ bdrv_subtree_drained_begin(bs);
80
+
81
+ queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);
82
ret = bdrv_reopen_multiple(bdrv_get_aio_context(bs), queue, &local_err);
83
if (local_err != NULL) {
84
error_propagate(errp, local_err);
85
}
86
+
87
+ bdrv_subtree_drained_end(bs);
88
+
89
return ret;
24
}
90
}
25
91
26
static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
92
diff --git a/block/replication.c b/block/replication.c
27
@@ -XXX,XX +XXX,XX @@ static int qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len)
93
index XXXXXXX..XXXXXXX 100644
28
/**
94
--- a/block/replication.c
29
* Begin next I/O or complete the request
95
+++ b/block/replication.c
30
*/
96
@@ -XXX,XX +XXX,XX @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
31
-static void qed_aio_next_io(QEDAIOCB *acb, int ret)
97
new_secondary_flags = s->orig_secondary_flags;
32
+static void qed_aio_next_io(QEDAIOCB *acb)
33
{
34
BDRVQEDState *s = acb_to_s(acb);
35
uint64_t offset;
36
size_t len;
37
+ int ret;
38
39
- trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size);
40
+ trace_qed_aio_next_io(s, acb, 0, acb->cur_pos + acb->cur_qiov.size);
41
42
if (acb->backing_qiov) {
43
qemu_iovec_destroy(acb->backing_qiov);
44
@@ -XXX,XX +XXX,XX @@ static void qed_aio_next_io(QEDAIOCB *acb, int ret)
45
acb->backing_qiov = NULL;
46
}
98
}
47
99
48
- /* Handle I/O error */
100
+ bdrv_subtree_drained_begin(s->hidden_disk->bs);
49
- if (ret) {
101
+ bdrv_subtree_drained_begin(s->secondary_disk->bs);
50
- qed_aio_complete(acb, ret);
102
+
51
- return;
103
if (orig_hidden_flags != new_hidden_flags) {
52
- }
104
reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs, NULL,
53
-
105
new_hidden_flags);
54
acb->qiov_offset += acb->cur_qiov.size;
106
@@ -XXX,XX +XXX,XX @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
55
acb->cur_pos += acb->cur_qiov.size;
107
reopen_queue, &local_err);
56
qemu_iovec_reset(&acb->cur_qiov);
108
error_propagate(errp, local_err);
57
@@ -XXX,XX +XXX,XX @@ static void qed_aio_next_io(QEDAIOCB *acb, int ret)
58
}
59
return;
60
}
109
}
61
- qed_aio_next_io(acb, 0);
110
+
62
+ qed_aio_next_io(acb);
111
+ bdrv_subtree_drained_end(s->hidden_disk->bs);
112
+ bdrv_subtree_drained_end(s->secondary_disk->bs);
63
}
113
}
64
114
65
static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
115
static void backup_job_cleanup(BlockDriverState *bs)
116
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
117
index XXXXXXX..XXXXXXX 100644
118
--- a/qemu-io-cmds.c
119
+++ b/qemu-io-cmds.c
120
@@ -XXX,XX +XXX,XX @@ static int reopen_f(BlockBackend *blk, int argc, char **argv)
121
opts = qopts ? qemu_opts_to_qdict(qopts, NULL) : NULL;
122
qemu_opts_reset(&reopen_opts);
123
124
+ bdrv_subtree_drained_begin(bs);
125
brq = bdrv_reopen_queue(NULL, bs, opts, flags);
126
bdrv_reopen_multiple(bdrv_get_aio_context(bs), brq, &local_err);
127
+ bdrv_subtree_drained_end(bs);
128
+
129
if (local_err) {
130
error_report_err(local_err);
131
} else {
66
--
132
--
67
1.8.3.1
133
2.13.6
68
134
69
135
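The patch above moves responsibility for draining to the callers of bdrv_reopen_queue()/bdrv_reopen_multiple() and enforces the new contract with an assertion on bs->quiesce_counter. The general "assert the precondition the caller was told to establish" pattern is sketched below as an illustration only, with a plain counter standing in for QEMU's drained sections; every name is invented for the demo.

#include <assert.h>
#include <stdio.h>

typedef struct Node {
    int quiesce_counter;   /* >0 while the node is "drained" */
} Node;

static void drained_begin(Node *n) { n->quiesce_counter++; }
static void drained_end(Node *n)   { n->quiesce_counter--; }

static void queue_for_reopen(Node *n)
{
    /* Fail loudly if the caller forgot the drained section; catching the
     * mistake here is much cheaper than debugging a graph change that
     * happens between queuing and the actual reopen. */
    assert(n->quiesce_counter > 0);
    printf("queued\n");
}

int main(void)
{
    Node n = { 0 };

    drained_begin(&n);     /* the caller's responsibility under the new contract */
    queue_for_reopen(&n);
    drained_end(&n);
    return 0;
}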
diff view generated by jsdifflib
Deleted patch

From: Max Reitz <mreitz@redhat.com>

The bs->exact_filename field may not be sufficient to store the full
blkdebug node filename. In this case, we should not generate a filename
at all instead of an unusable one.

Cc: qemu-stable@nongnu.org
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170613172006.19685-2-mreitz@redhat.com
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
block/blkdebug.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index XXXXXXX..XXXXXXX 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -XXX,XX +XXX,XX @@ static void blkdebug_refresh_filename(BlockDriverState *bs, QDict *options)
}

if (!force_json && bs->file->bs->exact_filename[0]) {
- snprintf(bs->exact_filename, sizeof(bs->exact_filename),
- "blkdebug:%s:%s", s->config_file ?: "",
- bs->file->bs->exact_filename);
+ int ret = snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+ "blkdebug:%s:%s", s->config_file ?: "",
+ bs->file->bs->exact_filename);
+ if (ret >= sizeof(bs->exact_filename)) {
+ /* An overflow makes the filename unusable, so do not report any */
+ bs->exact_filename[0] = 0;
+ }
}

opts = qdict_new();
--
1.8.3.1
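The mechanism this patch relies on, shown standalone (illustration only, not the blkdebug code): snprintf() returns the length the formatted string would have had without truncation, so a return value equal to or larger than the buffer size means the output was cut short.

#include <stdio.h>

int main(void)
{
    char buf[16];
    int ret = snprintf(buf, sizeof(buf), "blkdebug:%s:%s",
                       "some-config-file", "some-rather-long-filename");

    if (ret >= (int)sizeof(buf)) {
        /* Same policy as the patch: a truncated filename is unusable,
         * so report none at all. */
        buf[0] = 0;
    }
    printf("result: '%s' (snprintf returned %d)\n", buf, ret);
    return 0;
}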
diff view generated by jsdifflib
Deleted patch

From: Max Reitz <mreitz@redhat.com>

uri_parse(...)->scheme may be NULL. In fact, probably every field may be
NULL, and the callers do test this for all of the other fields but not
for scheme (except for block/gluster.c; block/vxhs.c does not access
that field at all).

We can easily fix this by using g_strcmp0() instead of strcmp().

Cc: qemu-stable@nongnu.org
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170613205726.13544-1-mreitz@redhat.com
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
block/nbd.c | 6 +++---
block/nfs.c | 2 +-
block/sheepdog.c | 6 +++---
block/ssh.c | 2 +-
4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -XXX,XX +XXX,XX @@ static int nbd_parse_uri(const char *filename, QDict *options)
}

/* transport */
- if (!strcmp(uri->scheme, "nbd")) {
+ if (!g_strcmp0(uri->scheme, "nbd")) {
is_unix = false;
- } else if (!strcmp(uri->scheme, "nbd+tcp")) {
+ } else if (!g_strcmp0(uri->scheme, "nbd+tcp")) {
is_unix = false;
- } else if (!strcmp(uri->scheme, "nbd+unix")) {
+ } else if (!g_strcmp0(uri->scheme, "nbd+unix")) {
is_unix = true;
} else {
ret = -EINVAL;
diff --git a/block/nfs.c b/block/nfs.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -XXX,XX +XXX,XX @@ static int nfs_parse_uri(const char *filename, QDict *options, Error **errp)
error_setg(errp, "Invalid URI specified");
goto out;
}
- if (strcmp(uri->scheme, "nfs") != 0) {
+ if (g_strcmp0(uri->scheme, "nfs") != 0) {
error_setg(errp, "URI scheme must be 'nfs'");
goto out;
}
diff --git a/block/sheepdog.c b/block/sheepdog.c
index XXXXXXX..XXXXXXX 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static void sd_parse_uri(SheepdogConfig *cfg, const char *filename,
}

/* transport */
- if (!strcmp(uri->scheme, "sheepdog")) {
+ if (!g_strcmp0(uri->scheme, "sheepdog")) {
is_unix = false;
- } else if (!strcmp(uri->scheme, "sheepdog+tcp")) {
+ } else if (!g_strcmp0(uri->scheme, "sheepdog+tcp")) {
is_unix = false;
- } else if (!strcmp(uri->scheme, "sheepdog+unix")) {
+ } else if (!g_strcmp0(uri->scheme, "sheepdog+unix")) {
is_unix = true;
} else {
error_setg(&err, "URI scheme must be 'sheepdog', 'sheepdog+tcp',"
diff --git a/block/ssh.c b/block/ssh.c
index XXXXXXX..XXXXXXX 100644
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -XXX,XX +XXX,XX @@ static int parse_uri(const char *filename, QDict *options, Error **errp)
return -EINVAL;
}

- if (strcmp(uri->scheme, "ssh") != 0) {
+ if (g_strcmp0(uri->scheme, "ssh") != 0) {
error_setg(errp, "URI scheme must be 'ssh'");
goto err;
}
--
1.8.3.1
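For reference, the GLib behaviour this patch depends on: g_strcmp0() accepts NULL and orders it before any non-NULL string, so a URI without a scheme simply fails the comparison instead of crashing. A standalone sketch, illustration only, which needs GLib to build (e.g. cc demo.c `pkg-config --cflags --libs glib-2.0`):

#include <glib.h>
#include <stdio.h>

int main(void)
{
    const char *scheme = NULL;   /* e.g. the parsed URI had no scheme */

    /* strcmp(scheme, "nfs") would dereference NULL here. */
    if (g_strcmp0(scheme, "nfs") != 0) {
        printf("URI scheme must be 'nfs'\n");
    }

    scheme = "nbd+unix";
    printf("nbd+unix matches: %s\n",
           g_strcmp0(scheme, "nbd+unix") == 0 ? "yes" : "no");
    return 0;
}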
diff view generated by jsdifflib