The following changes since commit 281f327487c9c9b1599f93c589a408bbf4a651b8:

  Merge remote-tracking branch 'remotes/vivier/tags/m68k-for-2.12-pull-request' into staging (2017-12-22 00:11:36 +0000)

are available in the git repository at:

  git://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to 1a63a907507fbbcfaee3f622907ec244b7eabda8:

  block: Keep nodes drained between reopen_queue/multiple (2017-12-22 15:05:32 +0100)

----------------------------------------------------------------
Block layer patches
----------------------------------------------------------------

Doug Gale (1):
      nvme: Add tracing

Edgar Kaziakhmedov (1):
      qcow2: get rid of qcow2_backing_read1 routine

Fam Zheng (2):
      block: Open backing image in force share mode for size probe
      block: Remove unused bdrv_requests_pending

John Snow (1):
      iotests: fix 197 for vpc

Kevin Wolf (27):
      block: Formats don't need CONSISTENT_READ with NO_IO
      block: Make bdrv_drain_invoke() recursive
      block: Call .drain_begin only once in bdrv_drain_all_begin()
      test-bdrv-drain: Test BlockDriver callbacks for drain
      block: bdrv_drain_recurse(): Remove unused begin parameter
      block: Don't wait for requests in bdrv_drain*_end()
      block: Unify order in drain functions
      block: Don't acquire AioContext in hmp_qemu_io()
      block: Document that x-blockdev-change breaks quorum children list
      block: Assert drain_all is only called from main AioContext
      block: Make bdrv_drain() driver callbacks non-recursive
      test-bdrv-drain: Test callback for bdrv_drain
      test-bdrv-drain: Test bs->quiesce_counter
      blockjob: Pause job on draining any job BDS
      test-bdrv-drain: Test drain vs. block jobs
      block: Don't block_job_pause_all() in bdrv_drain_all()
      block: Nested drain_end must still call callbacks
      test-bdrv-drain: Test nested drain sections
      block: Don't notify parents in drain call chain
      block: Add bdrv_subtree_drained_begin/end()
      test-bdrv-drain: Tests for bdrv_subtree_drain
      test-bdrv-drain: Test behaviour in coroutine context
      test-bdrv-drain: Recursive draining with multiple parents
      block: Allow graph changes in subtree drained section
      test-bdrv-drain: Test graph changes in drained section
      commit: Simplify reopen of base
      block: Keep nodes drained between reopen_queue/multiple

Thomas Huth (3):
      block: Remove the obsolete -drive boot=on|off parameter
      block: Remove the deprecated -hdachs option
      block: Mention -drive cyls/heads/secs/trans/serial/addr in deprecation chapter

 qapi/block-core.json             |   4 +
 block/qcow2.h                    |   3 -
 include/block/block.h            |  15 +-
 include/block/block_int.h        |   6 +-
 block.c                          |  75 ++++-
 block/commit.c                   |   8 +-
 block/io.c                       | 164 +++++++---
 block/qcow2.c                    |  51 +--
 block/replication.c              |   6 +
 blockdev.c                       |  11 -
 blockjob.c                       |  22 +-
 hmp.c                            |   6 -
 hw/block/nvme.c                  | 349 +++++++++++++++++----
 qemu-io-cmds.c                   |   3 +
 tests/test-bdrv-drain.c          | 651 +++++++++++++++++++++++++++++++++++++++
 vl.c                             |  86 +-----
 hw/block/trace-events            |  93 ++++++
 qemu-doc.texi                    |  29 +-
 qemu-options.hx                  |  19 +-
 tests/Makefile.include           |   2 +
 tests/qemu-iotests/197           |   4 +
 tests/qemu-iotests/common.filter |   3 +-
 22 files changed, 1294 insertions(+), 316 deletions(-)
 create mode 100644 tests/test-bdrv-drain.c

block: Formats don't need CONSISTENT_READ with NO_IO

Commit 1f4ad7d fixed 'qemu-img info' for raw images that are currently
in use as a mirror target. It is not enough for image formats, though,
as these still unconditionally request BLK_PERM_CONSISTENT_READ.

As this permission is geared towards whether the guest-visible data is
consistent, and has no impact on whether the metadata is sane, and
'qemu-img info' does not read guest-visible data (except for the raw
format), it makes sense to not require BLK_PERM_CONSISTENT_READ if there
is not going to be any guest I/O performed, regardless of image format.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
     assert(role == &child_backing || role == &child_file);

     if (!backing) {
+        int flags = bdrv_reopen_get_flags(reopen_queue, bs);
+
         /* Apart from the modifications below, the same permissions are
          * forwarded and left alone as for filters */
         bdrv_filter_default_perms(bs, c, role, reopen_queue, perm, shared,
@@ -XXX,XX +XXX,XX @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,

         /* bs->file always needs to be consistent because of the metadata. We
          * can never allow other users to resize or write to it. */
-        perm |= BLK_PERM_CONSISTENT_READ;
+        if (!(flags & BDRV_O_NO_IO)) {
+            perm |= BLK_PERM_CONSISTENT_READ;
+        }
         shared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
     } else {
         /* We want consistent read from backing files if the parent needs it.
--
2.13.6

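The decision this patch adds is small enough to model outside QEMU. Below is a
minimal standalone sketch of the gating logic; the flag and permission bit
values are invented for illustration and are not QEMU's actual definitions:

#include <stdio.h>

/* Invented stand-ins for QEMU's BDRV_O_NO_IO flag and BLK_PERM_* bits */
enum { O_NO_IO = 1 << 0 };
enum { PERM_CONSISTENT_READ = 1 << 0 };

/* Models the patched branch: demand a consistent view of bs->file only
 * when the node will actually serve guest I/O. */
static unsigned format_file_perms(unsigned flags, unsigned perm)
{
    if (!(flags & O_NO_IO)) {
        perm |= PERM_CONSISTENT_READ;
    }
    return perm;
}

int main(void)
{
    printf("guest I/O:       perm=%#x\n", format_file_perms(0, 0));
    printf("'qemu-img info': perm=%#x\n", format_file_perms(O_NO_IO, 0));
    return 0;
}
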
iotests: fix 197 for vpc

From: John Snow <jsnow@redhat.com>

VPC has some difficulty creating geometries of particular size.
However, we can indeed force it to use a literal one, so let's
do that for the sake of test 197, which is testing some specific
offsets.

Signed-off-by: John Snow <jsnow@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Lukáš Doktor <ldoktor@redhat.com>
---
 tests/qemu-iotests/197           | 4 ++++
 tests/qemu-iotests/common.filter | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/197 b/tests/qemu-iotests/197
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/197
+++ b/tests/qemu-iotests/197
@@ -XXX,XX +XXX,XX @@ echo '=== Copy-on-read ==='
 echo

 # Prep the images
+# VPC rounds image sizes to a specific geometry, force a specific size.
+if [ "$IMGFMT" = "vpc" ]; then
+    IMGOPTS=$(_optstr_add "$IMGOPTS" "force_size")
+fi
 _make_test_img 4G
 $QEMU_IO -c "write -P 55 3G 1k" "$TEST_IMG" | _filter_qemu_io
 IMGPROTO=file IMGFMT=qcow2 IMGOPTS= TEST_IMG_FILE="$TEST_WRAP" \
diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/common.filter
+++ b/tests/qemu-iotests/common.filter
@@ -XXX,XX +XXX,XX @@ _filter_img_create()
         -e "s# log_size=[0-9]\\+##g" \
         -e "s# refcount_bits=[0-9]\\+##g" \
         -e "s# key-secret=[a-zA-Z0-9]\\+##g" \
-        -e "s# iter-time=[0-9]\\+##g"
+        -e "s# iter-time=[0-9]\\+##g" \
+        -e "s# force_size=\\(on\\|off\\)##g"
 }

 _filter_img_info()
--
2.13.6

block: Make bdrv_drain_invoke() recursive

This change separates bdrv_drain_invoke(), which calls the BlockDriver
drain callbacks, from bdrv_drain_recurse(). Instead, the function
performs its own recursion now.

One reason for this is that bdrv_drain_recurse() can be called multiple
times by bdrv_drain_all_begin(), but the callbacks may only be called
once. The separation is necessary to fix this bug.

The other reason is that we intend to go to a model where we call all
driver callbacks first, and only then start polling. This is not fully
achieved yet with this patch, as bdrv_drain_invoke() contains a
BDRV_POLL_WHILE() loop for the block driver callbacks, which can still
call callbacks for any unrelated event. It's a step in this direction
anyway.

Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
     bdrv_wakeup(bs);
 }

+/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
 static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
 {
+    BdrvChild *child, *tmp;
     BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};

     if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
     data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
     bdrv_coroutine_enter(bs, data.co);
     BDRV_POLL_WHILE(bs, !data.done);
+
+    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
+        bdrv_drain_invoke(child->bs, begin);
+    }
 }

 static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
     BdrvChild *child, *tmp;
     bool waited;

-    /* Ensure any pending metadata writes are submitted to bs->file. */
-    bdrv_drain_invoke(bs, begin);
-
     /* Wait for drained requests to finish */
     waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);

@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
         bdrv_parent_drained_begin(bs);
     }

+    bdrv_drain_invoke(bs, true);
     bdrv_drain_recurse(bs, true);
 }

@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
     }

     bdrv_parent_drained_end(bs);
+    bdrv_drain_invoke(bs, false);
     bdrv_drain_recurse(bs, false);
     aio_enable_external(bdrv_get_aio_context(bs));
 }
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         aio_context_acquire(aio_context);
         for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
             if (aio_context == bdrv_get_aio_context(bs)) {
+                /* FIXME Calling this multiple times is wrong */
+                bdrv_drain_invoke(bs, true);
                 waited |= bdrv_drain_recurse(bs, true);
             }
         }
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         aio_context_acquire(aio_context);
         aio_enable_external(aio_context);
         bdrv_parent_drained_end(bs);
+        bdrv_drain_invoke(bs, false);
         bdrv_drain_recurse(bs, false);
         aio_context_release(aio_context);
     }
--
2.13.6

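The new call structure is easiest to see in isolation. Here is a toy
standalone model — invented types, not QEMU's BlockDriverState — of how the
patched bdrv_drain_invoke() now visits every node in the subtree exactly once
per begin/end:

#include <stdio.h>

/* NULL-terminated toy child list standing in for bs->children */
typedef struct Node {
    const char *name;
    struct Node *children[4];
    int drain_count;            /* how often the "driver callback" ran */
} Node;

/* Mirrors the shape of the patched function: run this node's callback,
 * then recurse into all children. */
static void drain_invoke(Node *n, int begin)
{
    n->drain_count += begin ? 1 : -1;       /* the "driver callback" */
    for (int i = 0; i < 4 && n->children[i]; i++) {
        drain_invoke(n->children[i], begin);
    }
}

int main(void)
{
    Node file = { .name = "file" };
    Node qcow2 = { .name = "qcow2", .children = { &file } };

    drain_invoke(&qcow2, 1);    /* begin: each node sees the callback once */
    drain_invoke(&qcow2, 0);    /* end: every counter returns to zero */
    printf("%s=%d %s=%d\n", qcow2.name, qcow2.drain_count,
           file.name, file.drain_count);
    return 0;
}
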
block: Call .drain_begin only once in bdrv_drain_all_begin()

bdrv_drain_all_begin() used to call the .bdrv_co_drain_begin() driver
callback inside its polling loop. This means that how many times it got
called for each node depended on how long it had to poll the event loop.

This is obviously not right and results in nodes that stay drained even
after bdrv_drain_all_end(), which calls .bdrv_co_drain_end() once per
node.

Fix bdrv_drain_all_begin() to call the callback only once, too.

Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         aio_context_acquire(aio_context);
         bdrv_parent_drained_begin(bs);
         aio_disable_external(aio_context);
+        bdrv_drain_invoke(bs, true);
         aio_context_release(aio_context);

         if (!g_slist_find(aio_ctxs, aio_context)) {
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         aio_context_acquire(aio_context);
         for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
             if (aio_context == bdrv_get_aio_context(bs)) {
-                /* FIXME Calling this multiple times is wrong */
-                bdrv_drain_invoke(bs, true);
                 waited |= bdrv_drain_recurse(bs, true);
             }
         }
--
2.13.6

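The bug fixed here is a plain counting imbalance. A toy model (standalone C,
no QEMU types) makes it visible: if the begin callback runs once per polling
iteration while the end callback runs only once, the node never leaves the
drained state:

#include <stdio.h>

int main(void)
{
    int drain_count = 0;

    /* buggy behaviour: .bdrv_co_drain_begin invoked inside the poll loop */
    for (int poll_iterations = 0; poll_iterations < 3; poll_iterations++) {
        drain_count++;
    }

    /* .bdrv_co_drain_end invoked exactly once by bdrv_drain_all_end() */
    drain_count--;

    printf("leaked drain count: %d (node stays drained)\n", drain_count);
    return 0;
}
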
test-bdrv-drain: Test BlockDriver callbacks for drain

This adds a test case that the BlockDriver callbacks for drain are
called in bdrv_drain_all_begin/end(), and that both of them are called
exactly once.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
---
 tests/test-bdrv-drain.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++
 tests/Makefile.include  |   2 +
 2 files changed, 139 insertions(+)
 create mode 100644 tests/test-bdrv-drain.c

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Block node draining tests
+ *
+ * Copyright (c) 2017 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "block/block.h"
+#include "sysemu/block-backend.h"
+#include "qapi/error.h"
+
+typedef struct BDRVTestState {
+    int drain_count;
+} BDRVTestState;
+
+static void coroutine_fn bdrv_test_co_drain_begin(BlockDriverState *bs)
+{
+    BDRVTestState *s = bs->opaque;
+    s->drain_count++;
+}
+
+static void coroutine_fn bdrv_test_co_drain_end(BlockDriverState *bs)
+{
+    BDRVTestState *s = bs->opaque;
+    s->drain_count--;
+}
+
+static void bdrv_test_close(BlockDriverState *bs)
+{
+    BDRVTestState *s = bs->opaque;
+    g_assert_cmpint(s->drain_count, >, 0);
+}
+
+static int coroutine_fn bdrv_test_co_preadv(BlockDriverState *bs,
+                                            uint64_t offset, uint64_t bytes,
+                                            QEMUIOVector *qiov, int flags)
+{
+    /* We want this request to stay until the polling loop in drain waits for
+     * it to complete. We need to sleep a while as bdrv_drain_invoke() comes
+     * first and polls its result, too, but it shouldn't accidentally complete
+     * this request yet. */
+    qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
+
+    return 0;
+}
+
+static BlockDriver bdrv_test = {
+    .format_name            = "test",
+    .instance_size          = sizeof(BDRVTestState),
+
+    .bdrv_close             = bdrv_test_close,
+    .bdrv_co_preadv         = bdrv_test_co_preadv,
+
+    .bdrv_co_drain_begin    = bdrv_test_co_drain_begin,
+    .bdrv_co_drain_end      = bdrv_test_co_drain_end,
+};
+
+static void aio_ret_cb(void *opaque, int ret)
+{
+    int *aio_ret = opaque;
+    *aio_ret = ret;
+}
+
+static void test_drv_cb_drain_all(void)
+{
+    BlockBackend *blk;
+    BlockDriverState *bs;
+    BDRVTestState *s;
+    BlockAIOCB *acb;
+    int aio_ret;
+
+    QEMUIOVector qiov;
+    struct iovec iov = {
+        .iov_base = NULL,
+        .iov_len = 0,
+    };
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
+                              &error_abort);
+    s = bs->opaque;
+    blk_insert_bs(blk, bs, &error_abort);
+
+    /* Simple bdrv_drain_all_begin/end pair, check that CBs are called */
+    g_assert_cmpint(s->drain_count, ==, 0);
+    bdrv_drain_all_begin();
+    g_assert_cmpint(s->drain_count, ==, 1);
+    bdrv_drain_all_end();
+    g_assert_cmpint(s->drain_count, ==, 0);
+
+    /* Now do the same while a request is pending */
+    aio_ret = -EINPROGRESS;
+    acb = blk_aio_preadv(blk, 0, &qiov, 0, aio_ret_cb, &aio_ret);
+    g_assert(acb != NULL);
+    g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
+
+    g_assert_cmpint(s->drain_count, ==, 0);
+    bdrv_drain_all_begin();
+    g_assert_cmpint(aio_ret, ==, 0);
+    g_assert_cmpint(s->drain_count, ==, 1);
+    bdrv_drain_all_end();
+    g_assert_cmpint(s->drain_count, ==, 0);
+
+    bdrv_unref(bs);
+    blk_unref(blk);
+}
+
+int main(int argc, char **argv)
+{
+    bdrv_init();
+    qemu_init_main_loop(&error_abort);
+
+    g_test_init(&argc, &argv, NULL);
+
+    g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
+
+    return g_test_run();
+}
diff --git a/tests/Makefile.include b/tests/Makefile.include
index XXXXXXX..XXXXXXX 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -XXX,XX +XXX,XX @@ gcov-files-test-thread-pool-y = thread-pool.c
 gcov-files-test-hbitmap-y = util/hbitmap.c
 check-unit-y += tests/test-hbitmap$(EXESUF)
 gcov-files-test-hbitmap-y = blockjob.c
+check-unit-y += tests/test-bdrv-drain$(EXESUF)
 check-unit-y += tests/test-blockjob$(EXESUF)
 check-unit-y += tests/test-blockjob-txn$(EXESUF)
 check-unit-y += tests/test-x86-cpuid$(EXESUF)
@@ -XXX,XX +XXX,XX @@ tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
 tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
 tests/test-aio-multithread$(EXESUF): tests/test-aio-multithread.o $(test-block-obj-y)
 tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
+tests/test-bdrv-drain$(EXESUF): tests/test-bdrv-drain.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(test-block-obj-y)
--
2.13.6

block: bdrv_drain_recurse(): Remove unused begin parameter

Now that the bdrv_drain_invoke() calls are pulled up to the callers of
bdrv_drain_recurse(), the 'begin' parameter isn't needed any more.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
     }
 }

-static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
+static bool bdrv_drain_recurse(BlockDriverState *bs)
 {
     BdrvChild *child, *tmp;
     bool waited;
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
              */
             bdrv_ref(bs);
         }
-        waited |= bdrv_drain_recurse(bs, begin);
+        waited |= bdrv_drain_recurse(bs);
         if (in_main_loop) {
             bdrv_unref(bs);
         }
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
     }

     bdrv_drain_invoke(bs, true);
-    bdrv_drain_recurse(bs, true);
+    bdrv_drain_recurse(bs);
 }

 void bdrv_drained_end(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)

     bdrv_parent_drained_end(bs);
     bdrv_drain_invoke(bs, false);
-    bdrv_drain_recurse(bs, false);
+    bdrv_drain_recurse(bs);
     aio_enable_external(bdrv_get_aio_context(bs));
 }

@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         aio_context_acquire(aio_context);
         for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
             if (aio_context == bdrv_get_aio_context(bs)) {
-                waited |= bdrv_drain_recurse(bs, true);
+                waited |= bdrv_drain_recurse(bs);
             }
         }
         aio_context_release(aio_context);
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         aio_enable_external(aio_context);
         bdrv_parent_drained_end(bs);
         bdrv_drain_invoke(bs, false);
-        bdrv_drain_recurse(bs, false);
+        bdrv_drain_recurse(bs);
         aio_context_release(aio_context);
     }

--
2.13.6

block: Don't wait for requests in bdrv_drain*_end()

The device is drained, so there is no point in waiting for requests at
the end of the drained section. Remove the bdrv_drain_recurse() calls
there.

The bdrv_drain_recurse() calls were introduced in commit 481cad48e5e
in order to call the .bdrv_co_drain_end() driver callback. This is now
done by a separate bdrv_drain_invoke() call.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)

     bdrv_parent_drained_end(bs);
     bdrv_drain_invoke(bs, false);
-    bdrv_drain_recurse(bs);
     aio_enable_external(bdrv_get_aio_context(bs));
 }

@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         aio_enable_external(aio_context);
         bdrv_parent_drained_end(bs);
         bdrv_drain_invoke(bs, false);
-        bdrv_drain_recurse(bs);
         aio_context_release(aio_context);
     }

--
2.13.6

block: Unify order in drain functions

Drain requests are propagated to child nodes, parent nodes and directly
to the AioContext. The order in which this happened was different
between all combinations of drain/drain_all and begin/end.

The correct order is to keep children only drained when their parents
are also drained. This means that at the start of a drained section, the
AioContext needs to be drained first, the parents second and only then
the children. The correct order for the end of a drained section is the
opposite.

This patch changes the three other functions to follow the example of
bdrv_drained_begin(), which is the only one that got it right.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
         return;
     }

+    /* Stop things in parent-to-child order */
     if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
         aio_disable_external(bdrv_get_aio_context(bs));
         bdrv_parent_drained_begin(bs);
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
         return;
     }

-    bdrv_parent_drained_end(bs);
+    /* Re-enable things in child-to-parent order */
     bdrv_drain_invoke(bs, false);
+    bdrv_parent_drained_end(bs);
     aio_enable_external(bdrv_get_aio_context(bs));
 }

@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *aio_context = bdrv_get_aio_context(bs);

+        /* Stop things in parent-to-child order */
         aio_context_acquire(aio_context);
-        bdrv_parent_drained_begin(bs);
         aio_disable_external(aio_context);
+        bdrv_parent_drained_begin(bs);
         bdrv_drain_invoke(bs, true);
         aio_context_release(aio_context);

@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *aio_context = bdrv_get_aio_context(bs);

+        /* Re-enable things in child-to-parent order */
         aio_context_acquire(aio_context);
-        aio_enable_external(aio_context);
-        bdrv_parent_drained_end(bs);
         bdrv_drain_invoke(bs, false);
+        bdrv_parent_drained_end(bs);
+        aio_enable_external(aio_context);
         aio_context_release(aio_context);
     }

--
2.13.6

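The invariant can be stated as executable assertions. This standalone sketch
(toy functions, not QEMU code) encodes it: a child may only be quiesced while
its parent already is, which forces begin to run parent-to-child and end to
run child-to-parent:

#include <assert.h>
#include <stdio.h>

static int parent_quiesced, child_quiesced;

static void quiesce_parent(void) { parent_quiesced = 1; }
static void quiesce_child(void)  { assert(parent_quiesced); child_quiesced = 1; }
static void resume_child(void)   { child_quiesced = 0; }
static void resume_parent(void)  { assert(!child_quiesced); parent_quiesced = 0; }

int main(void)
{
    /* begin: parent first, then child */
    quiesce_parent();
    quiesce_child();

    /* end: exact reverse order, child first */
    resume_child();
    resume_parent();

    puts("ordering invariant held");
    return 0;
}
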
block: Don't acquire AioContext in hmp_qemu_io()

Commit 15afd94a047 added code to acquire and release the AioContext in
qemuio_command(). This means that the lock is taken twice now in the
call path from hmp_qemu_io(). This causes BDRV_POLL_WHILE() to hang for
any requests issued to nodes in a non-mainloop AioContext.

Dropping the first locking from hmp_qemu_io() fixes the problem.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 hmp.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/hmp.c b/hmp.c
index XXXXXXX..XXXXXXX 100644
--- a/hmp.c
+++ b/hmp.c
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
 {
     BlockBackend *blk;
     BlockBackend *local_blk = NULL;
-    AioContext *aio_context;
     const char* device = qdict_get_str(qdict, "device");
     const char* command = qdict_get_str(qdict, "command");
     Error *err = NULL;
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
         }
     }

-    aio_context = blk_get_aio_context(blk);
-    aio_context_acquire(aio_context);
-
     /*
      * Notably absent: Proper permission management. This is sad, but it seems
      * almost impossible to achieve without changing the semantics and thereby
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
      */
     qemuio_command(blk, command);

-    aio_context_release(aio_context);
-
 fail:
     blk_unref(local_blk);
     hmp_handle_error(mon, &err);
--
2.13.6

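Why double locking makes BDRV_POLL_WHILE() hang can be sketched with a toy
recursive-lock model (invented types; QEMU's real AioContext lock is
recursive). The poll helper must drop the lock completely so the I/O thread
can make progress, which only works when it is entered at lock depth one:

#include <assert.h>
#include <stdio.h>

typedef struct { int depth; } ToyCtx;

static void ctx_acquire(ToyCtx *c) { c->depth++; }
static void ctx_release(ToyCtx *c) { c->depth--; }

/* Stand-in for BDRV_POLL_WHILE(): drops one lock level and expects the
 * context to be fully unlocked while the event loop runs. */
static void poll_while(ToyCtx *c)
{
    ctx_release(c);
    assert(c->depth == 0 && "would hang: lock still held by an outer caller");
    /* ...event loop runs, I/O thread can acquire the lock... */
    ctx_acquire(c);
}

int main(void)
{
    ToyCtx c = { 0 };

    ctx_acquire(&c);    /* qemuio_command() takes the lock itself; */
    poll_while(&c);     /* a second acquire in hmp_qemu_io() would leave */
    ctx_release(&c);    /* depth == 1 here and trip the assertion. */
    puts("ok");
    return 0;
}
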
qcow2: get rid of qcow2_backing_read1 routine

From: Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com>

Since bdrv_co_preadv does all necessary checks including
reading after the end of the backing file, avoid duplication
of verification before bdrv_co_preadv call.

Signed-off-by: Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2.h |  3 ---
 block/qcow2.c | 51 ++++++++-------------------------------------------
 2 files changed, 8 insertions(+), 46 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -XXX,XX +XXX,XX @@ uint32_t offset_to_reftable_index(BDRVQcow2State *s, uint64_t offset)
 }

 /* qcow2.c functions */
-int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
-                        int64_t sector_num, int nb_sectors);
-
 int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,
                                      int refcount_order, bool generous_increase,
                                      uint64_t *refblock_count);
diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
     return status;
 }

-/* handle reading after the end of the backing file */
-int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
-                        int64_t offset, int bytes)
-{
-    uint64_t bs_size = bs->total_sectors * BDRV_SECTOR_SIZE;
-    int n1;
-
-    if ((offset + bytes) <= bs_size) {
-        return bytes;
-    }
-
-    if (offset >= bs_size) {
-        n1 = 0;
-    } else {
-        n1 = bs_size - offset;
-    }
-
-    qemu_iovec_memset(qiov, n1, 0, bytes - n1);
-
-    return n1;
-}
-
 static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
                                         uint64_t bytes, QEMUIOVector *qiov,
                                         int flags)
 {
     BDRVQcow2State *s = bs->opaque;
-    int offset_in_cluster, n1;
+    int offset_in_cluster;
     int ret;
     unsigned int cur_bytes; /* number of bytes in current iteration */
     uint64_t cluster_offset = 0;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
         case QCOW2_CLUSTER_UNALLOCATED:

             if (bs->backing) {
-                /* read from the base image */
-                n1 = qcow2_backing_read1(bs->backing->bs, &hd_qiov,
-                                         offset, cur_bytes);
-                if (n1 > 0) {
-                    QEMUIOVector local_qiov;
-
-                    qemu_iovec_init(&local_qiov, hd_qiov.niov);
-                    qemu_iovec_concat(&local_qiov, &hd_qiov, 0, n1);
-
-                    BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
-                    qemu_co_mutex_unlock(&s->lock);
-                    ret = bdrv_co_preadv(bs->backing, offset, n1,
-                                         &local_qiov, 0);
-                    qemu_co_mutex_lock(&s->lock);
-
-                    qemu_iovec_destroy(&local_qiov);
-
-                    if (ret < 0) {
-                        goto fail;
-                    }
+                BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
+                qemu_co_mutex_unlock(&s->lock);
+                ret = bdrv_co_preadv(bs->backing, offset, cur_bytes,
+                                     &hd_qiov, 0);
+                qemu_co_mutex_lock(&s->lock);
+                if (ret < 0) {
+                    goto fail;
                 }
             } else {
                 /* Note: in this case, no need to wait */
--
2.13.6

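For reference, the semantics of the removed helper are easy to reproduce
standalone. This sketch (plain C with an invented signature) shows the
clamp-and-zero behaviour that bdrv_co_preadv() now provides internally when a
read extends past the end of the backing file:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same idea as the removed qcow2_backing_read1(): return how many bytes
 * the backing file can serve and zero-fill the tail of the buffer. */
static int64_t clamp_backing_read(int64_t bs_size, int64_t offset,
                                  int64_t bytes, uint8_t *buf)
{
    int64_t n = 0;

    if (offset + bytes <= bs_size) {
        return bytes;               /* read lies fully inside the file */
    }
    if (offset < bs_size) {
        n = bs_size - offset;       /* head comes from the backing file */
    }
    memset(buf + n, 0, bytes - n);  /* the tail reads as zeros */
    return n;
}

int main(void)
{
    uint8_t buf[8];
    memset(buf, 0xaa, sizeof(buf));

    /* 4-byte backing file, 8-byte read at offset 0: 4 valid + 4 zeroed */
    printf("valid bytes: %lld\n",
           (long long)clamp_backing_read(4, 0, sizeof(buf), buf));
    return 0;
}
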
block: Document that x-blockdev-change breaks quorum children list

Removing a quorum child node with x-blockdev-change results in a quorum
driver state that cannot be recreated with create options because it
would require a list with gaps. This causes trouble in at least
.bdrv_refresh_filename().

Document this problem so that we won't accidentally mark the command
stable without having addressed it.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
---
 qapi/block-core.json | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -XXX,XX +XXX,XX @@
 # does not support all kinds of operations, all kinds of children, nor
 # all block drivers.
 #
+# FIXME Removing children from a quorum node means introducing gaps in the
+# child indices. This cannot be represented in the 'children' list of
+# BlockdevOptionsQuorum, as returned by .bdrv_refresh_filename().
+#
 # Warning: The data in a new quorum child MUST be consistent with that of
 # the rest of the array.
 #
--
2.13.6

1
From: Eric Blake <eblake@redhat.com>
1
From: Doug Gale <doug16k@gmail.com>
2
2
3
Omitting the check for whether bdrv_getlength() and bdrv_truncate()
3
Add trace output for commands, errors, and undefined behavior.
4
failed meant that it was theoretically possible to return an
4
Add guest error log output for undefined behavior.
5
incorrect offset to the caller. More likely, conditions for either
5
Report invalid undefined accesses to MMIO.
6
of these functions to fail would also cause one of our other calls
6
Annotate unlikely error checks with unlikely.
7
(such as bdrv_pread() or bdrv_pwrite_sync()) to also fail, but
8
auditing that we are safe is difficult compared to just patching
9
things to always forward on the error rather than ignoring it.
10
7
11
Use osdep.h macros instead of open-coded rounding while in the
8
Signed-off-by: Doug Gale <doug16k@gmail.com>
12
area.
9
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
13
10
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
14
Reported-by: Markus Armbruster <armbru@redhat.com>
15
Signed-off-by: Eric Blake <eblake@redhat.com>
16
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
11
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
17
---
12
---
18
block/qcow.c | 30 ++++++++++++++++++++++--------
13
hw/block/nvme.c | 349 ++++++++++++++++++++++++++++++++++++++++++--------
19
1 file changed, 22 insertions(+), 8 deletions(-)
14
hw/block/trace-events | 93 ++++++++++++++
15
2 files changed, 390 insertions(+), 52 deletions(-)
20
16
21
diff --git a/block/qcow.c b/block/qcow.c
17
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
22
index XXXXXXX..XXXXXXX 100644
18
index XXXXXXX..XXXXXXX 100644
23
--- a/block/qcow.c
19
--- a/hw/block/nvme.c
24
+++ b/block/qcow.c
20
+++ b/hw/block/nvme.c
25
@@ -XXX,XX +XXX,XX @@ static int get_cluster_offset(BlockDriverState *bs,
21
@@ -XXX,XX +XXX,XX @@
22
#include "qapi/visitor.h"
23
#include "sysemu/block-backend.h"
24
25
+#include "qemu/log.h"
26
+#include "trace.h"
27
#include "nvme.h"
28
29
+#define NVME_GUEST_ERR(trace, fmt, ...) \
30
+ do { \
31
+ (trace_##trace)(__VA_ARGS__); \
32
+ qemu_log_mask(LOG_GUEST_ERROR, #trace \
33
+ " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
34
+ } while (0)
35
+
36
static void nvme_process_sq(void *opaque);
37
38
static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
39
@@ -XXX,XX +XXX,XX @@ static void nvme_isr_notify(NvmeCtrl *n, NvmeCQueue *cq)
26
{
40
{
27
BDRVQcowState *s = bs->opaque;
41
if (cq->irq_enabled) {
28
int min_index, i, j, l1_index, l2_index, ret;
42
if (msix_enabled(&(n->parent_obj))) {
29
- uint64_t l2_offset, *l2_table, cluster_offset, tmp;
43
+ trace_nvme_irq_msix(cq->vector);
30
+ int64_t l2_offset;
44
msix_notify(&(n->parent_obj), cq->vector);
31
+ uint64_t *l2_table, cluster_offset, tmp;
45
} else {
32
uint32_t min_count;
46
+ trace_nvme_irq_pin();
33
int new_l2_table;
47
pci_irq_pulse(&n->parent_obj);
34
48
}
35
@@ -XXX,XX +XXX,XX @@ static int get_cluster_offset(BlockDriverState *bs,
49
+ } else {
36
return 0;
50
+ trace_nvme_irq_masked();
37
/* allocate a new l2 entry */
51
}
38
l2_offset = bdrv_getlength(bs->file->bs);
52
}
39
+ if (l2_offset < 0) {
53
40
+ return l2_offset;
54
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
41
+ }
55
trans_len = MIN(len, trans_len);
42
/* round to cluster size */
56
int num_prps = (len >> n->page_bits) + 1;
43
- l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1);
57
44
+ l2_offset = QEMU_ALIGN_UP(l2_offset, s->cluster_size);
58
- if (!prp1) {
45
/* update the L1 entry */
59
+ if (unlikely(!prp1)) {
46
s->l1_table[l1_index] = l2_offset;
60
+ trace_nvme_err_invalid_prp();
47
tmp = cpu_to_be64(l2_offset);
61
return NVME_INVALID_FIELD | NVME_DNR;
48
@@ -XXX,XX +XXX,XX @@ static int get_cluster_offset(BlockDriverState *bs,
62
} else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
49
return -EIO;
63
prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
50
}
64
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
51
cluster_offset = bdrv_getlength(bs->file->bs);
65
}
52
- cluster_offset = (cluster_offset + s->cluster_size - 1) &
66
len -= trans_len;
53
- ~(s->cluster_size - 1);
67
if (len) {
54
+ if ((int64_t) cluster_offset < 0) {
68
- if (!prp2) {
55
+ return cluster_offset;
69
+ if (unlikely(!prp2)) {
56
+ }
70
+ trace_nvme_err_invalid_prp2_missing();
57
+ cluster_offset = QEMU_ALIGN_UP(cluster_offset, s->cluster_size);
71
goto unmap;
58
/* write the cluster content */
72
}
59
ret = bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache,
73
if (len > n->page_size) {
60
s->cluster_size);
74
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
61
@@ -XXX,XX +XXX,XX @@ static int get_cluster_offset(BlockDriverState *bs,
75
uint64_t prp_ent = le64_to_cpu(prp_list[i]);
76
77
if (i == n->max_prp_ents - 1 && len > n->page_size) {
78
- if (!prp_ent || prp_ent & (n->page_size - 1)) {
79
+ if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
80
+ trace_nvme_err_invalid_prplist_ent(prp_ent);
81
goto unmap;
82
}
83
84
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
85
prp_ent = le64_to_cpu(prp_list[i]);
86
}
87
88
-        if (!prp_ent || prp_ent & (n->page_size - 1)) {
+        if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
+            trace_nvme_err_invalid_prplist_ent(prp_ent);
             goto unmap;
         }

@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
             i++;
         }
     } else {
-        if (prp2 & (n->page_size - 1)) {
+        if (unlikely(prp2 & (n->page_size - 1))) {
+            trace_nvme_err_invalid_prp2_align(prp2);
             goto unmap;
         }
         if (qsg->nsg) {
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
     QEMUIOVector iov;
     uint16_t status = NVME_SUCCESS;

+    trace_nvme_dma_read(prp1, prp2);
+
     if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
         return NVME_INVALID_FIELD | NVME_DNR;
     }
     if (qsg.nsg > 0) {
-        if (dma_buf_read(ptr, len, &qsg)) {
+        if (unlikely(dma_buf_read(ptr, len, &qsg))) {
+            trace_nvme_err_invalid_dma();
             status = NVME_INVALID_FIELD | NVME_DNR;
         }
         qemu_sglist_destroy(&qsg);
     } else {
-        if (qemu_iovec_to_buf(&iov, 0, ptr, len) != len) {
+        if (unlikely(qemu_iovec_to_buf(&iov, 0, ptr, len) != len)) {
+            trace_nvme_err_invalid_dma();
             status = NVME_INVALID_FIELD | NVME_DNR;
         }
         qemu_iovec_destroy(&iov);
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS);
     uint32_t aio_nlb = nlb << (data_shift - BDRV_SECTOR_BITS);

-    if (slba + nlb > ns->id_ns.nsze) {
+    if (unlikely(slba + nlb > ns->id_ns.nsze)) {
+        trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
         return NVME_LBA_RANGE | NVME_DNR;
     }

@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
     enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;

-    if ((slba + nlb) > ns->id_ns.nsze) {
+    trace_nvme_rw(is_write ? "write" : "read", nlb, data_size, slba);
+
+    if (unlikely((slba + nlb) > ns->id_ns.nsze)) {
         block_acct_invalid(blk_get_stats(n->conf.blk), acct);
+        trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
         return NVME_LBA_RANGE | NVME_DNR;
     }

@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     NvmeNamespace *ns;
     uint32_t nsid = le32_to_cpu(cmd->nsid);

-    if (nsid == 0 || nsid > n->num_namespaces) {
+    if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
+        trace_nvme_err_invalid_ns(nsid, n->num_namespaces);
         return NVME_INVALID_NSID | NVME_DNR;
     }

@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     case NVME_CMD_READ:
         return nvme_rw(n, ns, cmd, req);
     default:
+        trace_nvme_err_invalid_opc(cmd->opcode);
         return NVME_INVALID_OPCODE | NVME_DNR;
     }
 }
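The hunks above establish the pattern used throughout this patch: every guest-triggerable failure path is wrapped in unlikely() and fires a dedicated trace point just before the error status is returned. A minimal stand-alone sketch of the same idea, outside of QEMU (the device name, trace helper and page size are invented for illustration):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define unlikely(x) __builtin_expect(!!(x), 0)

    /* stand-in for a generated trace_*() helper */
    static void trace_mydev_err_unaligned(uint64_t addr)
    {
        fprintf(stderr, "mydev_err_unaligned: addr=0x%" PRIx64 "\n", addr);
    }

    /* Reject a null or non-page-aligned guest address. The error branch
     * is annotated as cold, so the hot path compiles to a straight
     * fall-through, and the trace records why the request was refused. */
    static int mydev_check_addr(uint64_t addr, uint64_t page_size)
    {
        if (unlikely(!addr || (addr & (page_size - 1)))) {
            trace_mydev_err_unaligned(addr);
            return -1;
        }
        return 0;
    }

    int main(void)
    {
        printf("aligned:   %d\n", mydev_check_addr(0x2000, 4096));
        printf("unaligned: %d\n", mydev_check_addr(0x1008, 4096));
        return 0;
    }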
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
     NvmeCQueue *cq;
     uint16_t qid = le16_to_cpu(c->qid);

-    if (!qid || nvme_check_sqid(n, qid)) {
+    if (unlikely(!qid || nvme_check_sqid(n, qid))) {
+        trace_nvme_err_invalid_del_sq(qid);
         return NVME_INVALID_QID | NVME_DNR;
     }

+    trace_nvme_del_sq(qid);
+
     sq = n->sq[qid];
     while (!QTAILQ_EMPTY(&sq->out_req_list)) {
         req = QTAILQ_FIRST(&sq->out_req_list);
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
     uint16_t qflags = le16_to_cpu(c->sq_flags);
     uint64_t prp1 = le64_to_cpu(c->prp1);

-    if (!cqid || nvme_check_cqid(n, cqid)) {
+    trace_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
+
+    if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
+        trace_nvme_err_invalid_create_sq_cqid(cqid);
         return NVME_INVALID_CQID | NVME_DNR;
     }
-    if (!sqid || !nvme_check_sqid(n, sqid)) {
+    if (unlikely(!sqid || !nvme_check_sqid(n, sqid))) {
+        trace_nvme_err_invalid_create_sq_sqid(sqid);
         return NVME_INVALID_QID | NVME_DNR;
     }
-    if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
+    if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
+        trace_nvme_err_invalid_create_sq_size(qsize);
         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
     }
-    if (!prp1 || prp1 & (n->page_size - 1)) {
+    if (unlikely(!prp1 || prp1 & (n->page_size - 1))) {
+        trace_nvme_err_invalid_create_sq_addr(prp1);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
-    if (!(NVME_SQ_FLAGS_PC(qflags))) {
+    if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
+        trace_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
         return NVME_INVALID_FIELD | NVME_DNR;
     }
     sq = g_malloc0(sizeof(*sq));
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
     NvmeCQueue *cq;
     uint16_t qid = le16_to_cpu(c->qid);

-    if (!qid || nvme_check_cqid(n, qid)) {
+    if (unlikely(!qid || nvme_check_cqid(n, qid))) {
+        trace_nvme_err_invalid_del_cq_cqid(qid);
         return NVME_INVALID_CQID | NVME_DNR;
     }

     cq = n->cq[qid];
-    if (!QTAILQ_EMPTY(&cq->sq_list)) {
+    if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
+        trace_nvme_err_invalid_del_cq_notempty(qid);
         return NVME_INVALID_QUEUE_DEL;
     }
+    trace_nvme_del_cq(qid);
     nvme_free_cq(cq, n);
     return NVME_SUCCESS;
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
     uint16_t qflags = le16_to_cpu(c->cq_flags);
     uint64_t prp1 = le64_to_cpu(c->prp1);

-    if (!cqid || !nvme_check_cqid(n, cqid)) {
+    trace_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
+                         NVME_CQ_FLAGS_IEN(qflags) != 0);
+
+    if (unlikely(!cqid || !nvme_check_cqid(n, cqid))) {
+        trace_nvme_err_invalid_create_cq_cqid(cqid);
         return NVME_INVALID_CQID | NVME_DNR;
     }
-    if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
+    if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
+        trace_nvme_err_invalid_create_cq_size(qsize);
         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
     }
-    if (!prp1) {
+    if (unlikely(!prp1)) {
+        trace_nvme_err_invalid_create_cq_addr(prp1);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
-    if (vector > n->num_queues) {
+    if (unlikely(vector > n->num_queues)) {
+        trace_nvme_err_invalid_create_cq_vector(vector);
         return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
     }
-    if (!(NVME_CQ_FLAGS_PC(qflags))) {
+    if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
+        trace_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
         return NVME_INVALID_FIELD | NVME_DNR;
     }

@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c)
     uint64_t prp1 = le64_to_cpu(c->prp1);
     uint64_t prp2 = le64_to_cpu(c->prp2);

+    trace_nvme_identify_ctrl();
+
     return nvme_dma_read_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
         prp1, prp2);
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c)
     uint64_t prp1 = le64_to_cpu(c->prp1);
     uint64_t prp2 = le64_to_cpu(c->prp2);

-    if (nsid == 0 || nsid > n->num_namespaces) {
+    trace_nvme_identify_ns(nsid);
+
+    if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
+        trace_nvme_err_invalid_ns(nsid, n->num_namespaces);
         return NVME_INVALID_NSID | NVME_DNR;
     }

     ns = &n->namespaces[nsid - 1];
+
     return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
         prp1, prp2);
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c)
     uint16_t ret;
     int i, j = 0;

+    trace_nvme_identify_nslist(min_nsid);
+
     list = g_malloc0(data_len);
     for (i = 0; i < n->num_namespaces; i++) {
         if (i < min_nsid) {
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
     case 0x02:
         return nvme_identify_nslist(n, c);
     default:
+        trace_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     switch (dw10) {
     case NVME_VOLATILE_WRITE_CACHE:
         result = blk_enable_write_cache(n->conf.blk);
+        trace_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
         break;
     case NVME_NUMBER_OF_QUEUES:
         result = cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
+        trace_nvme_getfeat_numq(result);
         break;
     default:
+        trace_nvme_err_invalid_getfeat(dw10);
         return NVME_INVALID_FIELD | NVME_DNR;
     }

@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
         blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
         break;
     case NVME_NUMBER_OF_QUEUES:
+        trace_nvme_setfeat_numq((dw11 & 0xFFFF) + 1,
+                                ((dw11 >> 16) & 0xFFFF) + 1,
+                                n->num_queues - 1, n->num_queues - 1);
         req->cqe.result =
             cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
         break;
     default:
+        trace_nvme_err_invalid_setfeat(dw10);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
     return NVME_SUCCESS;
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     case NVME_ADM_CMD_GET_FEATURES:
         return nvme_get_feature(n, cmd, req);
     default:
+        trace_nvme_err_invalid_admin_opc(cmd->opcode);
         return NVME_INVALID_OPCODE | NVME_DNR;
     }
 }
@@ -XXX,XX +XXX,XX @@ static int nvme_start_ctrl(NvmeCtrl *n)
     uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
     uint32_t page_size = 1 << page_bits;

-    if (n->cq[0] || n->sq[0] || !n->bar.asq || !n->bar.acq ||
-        n->bar.asq & (page_size - 1) || n->bar.acq & (page_size - 1) ||
-        NVME_CC_MPS(n->bar.cc) < NVME_CAP_MPSMIN(n->bar.cap) ||
-        NVME_CC_MPS(n->bar.cc) > NVME_CAP_MPSMAX(n->bar.cap) ||
-        NVME_CC_IOCQES(n->bar.cc) < NVME_CTRL_CQES_MIN(n->id_ctrl.cqes) ||
-        NVME_CC_IOCQES(n->bar.cc) > NVME_CTRL_CQES_MAX(n->id_ctrl.cqes) ||
-        NVME_CC_IOSQES(n->bar.cc) < NVME_CTRL_SQES_MIN(n->id_ctrl.sqes) ||
-        NVME_CC_IOSQES(n->bar.cc) > NVME_CTRL_SQES_MAX(n->id_ctrl.sqes) ||
-        !NVME_AQA_ASQS(n->bar.aqa) || !NVME_AQA_ACQS(n->bar.aqa)) {
+    if (unlikely(n->cq[0])) {
+        trace_nvme_err_startfail_cq();
+        return -1;
+    }
+    if (unlikely(n->sq[0])) {
+        trace_nvme_err_startfail_sq();
+        return -1;
+    }
+    if (unlikely(!n->bar.asq)) {
+        trace_nvme_err_startfail_nbarasq();
+        return -1;
+    }
+    if (unlikely(!n->bar.acq)) {
+        trace_nvme_err_startfail_nbaracq();
+        return -1;
+    }
+    if (unlikely(n->bar.asq & (page_size - 1))) {
+        trace_nvme_err_startfail_asq_misaligned(n->bar.asq);
+        return -1;
+    }
+    if (unlikely(n->bar.acq & (page_size - 1))) {
+        trace_nvme_err_startfail_acq_misaligned(n->bar.acq);
+        return -1;
+    }
+    if (unlikely(NVME_CC_MPS(n->bar.cc) <
+                 NVME_CAP_MPSMIN(n->bar.cap))) {
+        trace_nvme_err_startfail_page_too_small(
+                    NVME_CC_MPS(n->bar.cc),
+                    NVME_CAP_MPSMIN(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_MPS(n->bar.cc) >
+                 NVME_CAP_MPSMAX(n->bar.cap))) {
+        trace_nvme_err_startfail_page_too_large(
+                    NVME_CC_MPS(n->bar.cc),
+                    NVME_CAP_MPSMAX(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_IOCQES(n->bar.cc) <
+                 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
+        trace_nvme_err_startfail_cqent_too_small(
+                    NVME_CC_IOCQES(n->bar.cc),
+                    NVME_CTRL_CQES_MIN(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_IOCQES(n->bar.cc) >
+                 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
+        trace_nvme_err_startfail_cqent_too_large(
+                    NVME_CC_IOCQES(n->bar.cc),
+                    NVME_CTRL_CQES_MAX(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_IOSQES(n->bar.cc) <
+                 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
+        trace_nvme_err_startfail_sqent_too_small(
+                    NVME_CC_IOSQES(n->bar.cc),
+                    NVME_CTRL_SQES_MIN(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_IOSQES(n->bar.cc) >
+                 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
+        trace_nvme_err_startfail_sqent_too_large(
+                    NVME_CC_IOSQES(n->bar.cc),
+                    NVME_CTRL_SQES_MAX(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(!NVME_AQA_ASQS(n->bar.aqa))) {
+        trace_nvme_err_startfail_asqent_sz_zero();
+        return -1;
+    }
+    if (unlikely(!NVME_AQA_ACQS(n->bar.aqa))) {
+        trace_nvme_err_startfail_acqent_sz_zero();
         return -1;
     }

@@ -XXX,XX +XXX,XX @@ static int nvme_start_ctrl(NvmeCtrl *n)
 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
     unsigned size)
 {
+    if (unlikely(offset & (sizeof(uint32_t) - 1))) {
+        NVME_GUEST_ERR(nvme_ub_mmiowr_misaligned32,
+                       "MMIO write not 32-bit aligned,"
+                       " offset=0x%"PRIx64"", offset);
+        /* should be ignored, fall through for now */
+    }
+
+    if (unlikely(size < sizeof(uint32_t))) {
+        NVME_GUEST_ERR(nvme_ub_mmiowr_toosmall,
+                       "MMIO write smaller than 32-bits,"
+                       " offset=0x%"PRIx64", size=%u",
+                       offset, size);
+        /* should be ignored, fall through for now */
+    }
+
     switch (offset) {
-    case 0xc:
+    case 0xc:   /* INTMS */
+        if (unlikely(msix_enabled(&(n->parent_obj)))) {
+            NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix,
+                           "undefined access to interrupt mask set"
+                           " when MSI-X is enabled");
+            /* should be ignored, fall through for now */
+        }
         n->bar.intms |= data & 0xffffffff;
         n->bar.intmc = n->bar.intms;
+        trace_nvme_mmio_intm_set(data & 0xffffffff,
+                                 n->bar.intmc);
         break;
-    case 0x10:
+    case 0x10:  /* INTMC */
+        if (unlikely(msix_enabled(&(n->parent_obj)))) {
+            NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix,
+                           "undefined access to interrupt mask clr"
+                           " when MSI-X is enabled");
+            /* should be ignored, fall through for now */
+        }
         n->bar.intms &= ~(data & 0xffffffff);
         n->bar.intmc = n->bar.intms;
+        trace_nvme_mmio_intm_clr(data & 0xffffffff,
+                                 n->bar.intmc);
         break;
-    case 0x14:
+    case 0x14:  /* CC */
+        trace_nvme_mmio_cfg(data & 0xffffffff);
         /* Windows first sends data, then sends enable bit */
         if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) &&
             !NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc))
@@ -XXX,XX +XXX,XX @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,

         if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
             n->bar.cc = data;
-            if (nvme_start_ctrl(n)) {
+            if (unlikely(nvme_start_ctrl(n))) {
+                trace_nvme_err_startfail();
                 n->bar.csts = NVME_CSTS_FAILED;
             } else {
+                trace_nvme_mmio_start_success();
                 n->bar.csts = NVME_CSTS_READY;
             }
         } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
+            trace_nvme_mmio_stopped();
             nvme_clear_ctrl(n);
             n->bar.csts &= ~NVME_CSTS_READY;
         }
         if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
-                nvme_clear_ctrl(n);
-                n->bar.cc = data;
-                n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
+            trace_nvme_mmio_shutdown_set();
+            nvme_clear_ctrl(n);
+            n->bar.cc = data;
+            n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
         } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
-                n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
-                n->bar.cc = data;
+            trace_nvme_mmio_shutdown_cleared();
+            n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
+            n->bar.cc = data;
+        }
+        break;
+    case 0x1C:  /* CSTS */
+        if (data & (1 << 4)) {
+            NVME_GUEST_ERR(nvme_ub_mmiowr_ssreset_w1c_unsupported,
+                           "attempted to W1C CSTS.NSSRO"
+                           " but CAP.NSSRS is zero (not supported)");
+        } else if (data != 0) {
+            NVME_GUEST_ERR(nvme_ub_mmiowr_ro_csts,
+                           "attempted to set a read only bit"
+                           " of controller status");
+        }
+        break;
+    case 0x20:  /* NSSR */
+        if (data == 0x4E564D65) {
+            trace_nvme_ub_mmiowr_ssreset_unsupported();
+        } else {
+            /* The spec says that writes of other values have no effect */
+            return;
         }
         break;
-    case 0x24:
+    case 0x24:  /* AQA */
         n->bar.aqa = data & 0xffffffff;
+        trace_nvme_mmio_aqattr(data & 0xffffffff);
         break;
-    case 0x28:
+    case 0x28:  /* ASQ */
         n->bar.asq = data;
+        trace_nvme_mmio_asqaddr(data);
         break;
-    case 0x2c:
+    case 0x2c:  /* ASQ hi */
         n->bar.asq |= data << 32;
+        trace_nvme_mmio_asqaddr_hi(data, n->bar.asq);
         break;
-    case 0x30:
+    case 0x30:  /* ACQ */
+        trace_nvme_mmio_acqaddr(data);
         n->bar.acq = data;
         break;
-    case 0x34:
+    case 0x34:  /* ACQ hi */
         n->bar.acq |= data << 32;
+        trace_nvme_mmio_acqaddr_hi(data, n->bar.acq);
         break;
+    case 0x38:  /* CMBLOC */
+        NVME_GUEST_ERR(nvme_ub_mmiowr_cmbloc_reserved,
+                       "invalid write to reserved CMBLOC"
+                       " when CMBSZ is zero, ignored");
+        return;
+    case 0x3C:  /* CMBSZ */
+        NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
+                       "invalid write to read only CMBSZ, ignored");
+        return;
     default:
+        NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
+                       "invalid MMIO write,"
+                       " offset=0x%"PRIx64", data=%"PRIx64"",
+                       offset, data);
         break;
     }
 }
@@ -XXX,XX +XXX,XX @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
     uint8_t *ptr = (uint8_t *)&n->bar;
     uint64_t val = 0;

+    if (unlikely(addr & (sizeof(uint32_t) - 1))) {
+        NVME_GUEST_ERR(nvme_ub_mmiord_misaligned32,
+                       "MMIO read not 32-bit aligned,"
+                       " offset=0x%"PRIx64"", addr);
+        /* should RAZ, fall through for now */
+    } else if (unlikely(size < sizeof(uint32_t))) {
+        NVME_GUEST_ERR(nvme_ub_mmiord_toosmall,
+                       "MMIO read smaller than 32-bits,"
+                       " offset=0x%"PRIx64"", addr);
+        /* should RAZ, fall through for now */
+    }
+
     if (addr < sizeof(n->bar)) {
         memcpy(&val, ptr + addr, size);
+    } else {
+        NVME_GUEST_ERR(nvme_ub_mmiord_invalid_ofs,
+                       "MMIO read beyond last register,"
+                       " offset=0x%"PRIx64", returning 0", addr);
     }
+
     return val;
 }

@@ -XXX,XX +XXX,XX @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
 {
     uint32_t qid;

-    if (addr & ((1 << 2) - 1)) {
+    if (unlikely(addr & ((1 << 2) - 1))) {
+        NVME_GUEST_ERR(nvme_ub_db_wr_misaligned,
+                       "doorbell write not 32-bit aligned,"
+                       " offset=0x%"PRIx64", ignoring", addr);
         return;
     }

     if (((addr - 0x1000) >> 2) & 1) {
+        /* Completion queue doorbell write */
+
         uint16_t new_head = val & 0xffff;
         int start_sqs;
         NvmeCQueue *cq;

         qid = (addr - (0x1000 + (1 << 2))) >> 3;
-        if (nvme_check_cqid(n, qid)) {
+        if (unlikely(nvme_check_cqid(n, qid))) {
+            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cq,
+                           "completion queue doorbell write"
+                           " for nonexistent queue,"
+                           " sqid=%"PRIu32", ignoring", qid);
             return;
         }

         cq = n->cq[qid];
-        if (new_head >= cq->size) {
+        if (unlikely(new_head >= cq->size)) {
+            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cqhead,
+                           "completion queue doorbell write value"
+                           " beyond queue size, sqid=%"PRIu32","
+                           " new_head=%"PRIu16", ignoring",
+                           qid, new_head);
             return;
         }

@@ -XXX,XX +XXX,XX @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
             nvme_isr_notify(n, cq);
         }
     } else {
+        /* Submission queue doorbell write */
+
         uint16_t new_tail = val & 0xffff;
         NvmeSQueue *sq;

         qid = (addr - 0x1000) >> 3;
-        if (nvme_check_sqid(n, qid)) {
+        if (unlikely(nvme_check_sqid(n, qid))) {
+            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sq,
+                           "submission queue doorbell write"
+                           " for nonexistent queue,"
+                           " sqid=%"PRIu32", ignoring", qid);
             return;
         }

         sq = n->sq[qid];
-        if (new_tail >= sq->size) {
+        if (unlikely(new_tail >= sq->size)) {
+            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sqtail,
+                           "submission queue doorbell write value"
+                           " beyond queue size, sqid=%"PRIu32","
+                           " new_tail=%"PRIu16", ignoring",
+                           qid, new_tail);
             return;
         }

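The MMIO and doorbell hunks above rely on NVME_GUEST_ERR, whose definition is not part of these hunks (it is added to the nvme headers elsewhere in the patch). A plausible sketch of such a macro, pairing a trace point with an always-available guest-error log line; the exact body here is an assumption, not quoted from the patch:

    #include <stdarg.h>
    #include <stdio.h>

    /* stand-in for QEMU's qemu_log_mask(LOG_GUEST_ERROR, ...) */
    static void guest_error_log(const char *fmt, ...)
    {
        va_list ap;
        va_start(ap, fmt);
        vfprintf(stderr, fmt, ap);
        va_end(ap);
    }

    /* Fire the matching trace point, then log the same event so it is
     * visible even when tracing is disabled. trace_##trace assumes one
     * generated helper per event name; ## __VA_ARGS__ is the GNU
     * extension that drops the trailing comma when no extra arguments
     * are given. */
    #define NVME_GUEST_ERR(trace, fmt, ...) \
        do { \
            trace_##trace(__VA_ARGS__); \
            guest_error_log(#trace " in %s: " fmt "\n", \
                            __func__, ## __VA_ARGS__); \
        } while (0)

    /* hypothetical trace helper for one event */
    static void trace_mydev_bad_write(unsigned long off)
    {
        fprintf(stderr, "trace: mydev_bad_write off=0x%lx\n", off);
    }

    int main(void)
    {
        unsigned long off = 0x38;
        NVME_GUEST_ERR(mydev_bad_write, "offset=0x%lx", off);
        return 0;
    }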
diff --git a/hw/block/trace-events b/hw/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -XXX,XX +XXX,XX @@ virtio_blk_submit_multireq(void *vdev, void *mrb, int start, int num_reqs, uint6
 hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p LCHS %d %d %d"
 hd_geometry_guess(void *blk, uint32_t cyls, uint32_t heads, uint32_t secs, int trans) "blk %p CHS %u %u %u trans %d"

+# hw/block/nvme.c
+# nvme traces for successful events
+nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u"
+nvme_irq_pin(void) "pulsing IRQ pin"
+nvme_irq_masked(void) "IRQ is masked"
+nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
+nvme_rw(char const *verb, uint32_t blk_count, uint64_t byte_count, uint64_t lba) "%s %"PRIu32" blocks (%"PRIu64" bytes) from LBA %"PRIu64""
+nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
+nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
+nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
+nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16""
+nvme_identify_ctrl(void) "identify controller"
+nvme_identify_ns(uint16_t ns) "identify namespace, nsid=%"PRIu16""
+nvme_identify_nslist(uint16_t ns) "identify namespace list, nsid=%"PRIu16""
+nvme_getfeat_vwcache(char const* result) "get feature volatile write cache, result=%s"
+nvme_getfeat_numq(int result) "get feature number of queues, result=%d"
+nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d"
+nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64""
+nvme_mmio_intm_clr(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask clr, data=0x%"PRIx64", new_mask=0x%"PRIx64""
+nvme_mmio_cfg(uint64_t data) "wrote MMIO, config controller config=0x%"PRIx64""
+nvme_mmio_aqattr(uint64_t data) "wrote MMIO, admin queue attributes=0x%"PRIx64""
+nvme_mmio_asqaddr(uint64_t data) "wrote MMIO, admin submission queue address=0x%"PRIx64""
+nvme_mmio_acqaddr(uint64_t data) "wrote MMIO, admin completion queue address=0x%"PRIx64""
+nvme_mmio_asqaddr_hi(uint64_t data, uint64_t new_addr) "wrote MMIO, admin submission queue high half=0x%"PRIx64", new_address=0x%"PRIx64""
+nvme_mmio_acqaddr_hi(uint64_t data, uint64_t new_addr) "wrote MMIO, admin completion queue high half=0x%"PRIx64", new_address=0x%"PRIx64""
+nvme_mmio_start_success(void) "setting controller enable bit succeeded"
+nvme_mmio_stopped(void) "cleared controller enable bit"
+nvme_mmio_shutdown_set(void) "shutdown bit set"
+nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
+
+# nvme traces for error conditions
+nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
+nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
+nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""
+nvme_err_invalid_prp2_missing(void) "PRP2 is null and more data to be transferred"
+nvme_err_invalid_field(void) "invalid field"
+nvme_err_invalid_prp(void) "invalid PRP"
+nvme_err_invalid_sgl(void) "invalid SGL"
+nvme_err_invalid_ns(uint32_t ns, uint32_t limit) "invalid namespace %u not within 1-%u"
+nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8""
+nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8""
+nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64""
+nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16""
+nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16""
+nvme_err_invalid_create_sq_sqid(uint16_t sqid) "failed creating submission queue, invalid sqid=%"PRIu16""
+nvme_err_invalid_create_sq_size(uint16_t qsize) "failed creating submission queue, invalid qsize=%"PRIu16""
+nvme_err_invalid_create_sq_addr(uint64_t addr) "failed creating submission queue, addr=0x%"PRIx64""
+nvme_err_invalid_create_sq_qflags(uint16_t qflags) "failed creating submission queue, qflags=%"PRIu16""
+nvme_err_invalid_del_cq_cqid(uint16_t cqid) "failed deleting completion queue, cqid=%"PRIu16""
+nvme_err_invalid_del_cq_notempty(uint16_t cqid) "failed deleting completion queue, it is not empty, cqid=%"PRIu16""
+nvme_err_invalid_create_cq_cqid(uint16_t cqid) "failed creating completion queue, cqid=%"PRIu16""
+nvme_err_invalid_create_cq_size(uint16_t size) "failed creating completion queue, size=%"PRIu16""
+nvme_err_invalid_create_cq_addr(uint64_t addr) "failed creating completion queue, addr=0x%"PRIx64""
+nvme_err_invalid_create_cq_vector(uint16_t vector) "failed creating completion queue, vector=%"PRIu16""
+nvme_err_invalid_create_cq_qflags(uint16_t qflags) "failed creating completion queue, qflags=%"PRIu16""
+nvme_err_invalid_identify_cns(uint16_t cns) "identify, invalid cns=0x%"PRIx16""
+nvme_err_invalid_getfeat(int dw10) "invalid get features, dw10=0x%"PRIx32""
+nvme_err_invalid_setfeat(uint32_t dw10) "invalid set features, dw10=0x%"PRIx32""
+nvme_err_startfail_cq(void) "nvme_start_ctrl failed because there are non-admin completion queues"
+nvme_err_startfail_sq(void) "nvme_start_ctrl failed because there are non-admin submission queues"
+nvme_err_startfail_nbarasq(void) "nvme_start_ctrl failed because the admin submission queue address is null"
+nvme_err_startfail_nbaracq(void) "nvme_start_ctrl failed because the admin completion queue address is null"
+nvme_err_startfail_asq_misaligned(uint64_t addr) "nvme_start_ctrl failed because the admin submission queue address is misaligned: 0x%"PRIx64""
+nvme_err_startfail_acq_misaligned(uint64_t addr) "nvme_start_ctrl failed because the admin completion queue address is misaligned: 0x%"PRIx64""
+nvme_err_startfail_page_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the page size is too small: log2size=%u, min=%u"
+nvme_err_startfail_page_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the page size is too large: log2size=%u, max=%u"
+nvme_err_startfail_cqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too small: log2size=%u, min=%u"
+nvme_err_startfail_cqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too large: log2size=%u, max=%u"
+nvme_err_startfail_sqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too small: log2size=%u, min=%u"
+nvme_err_startfail_sqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too large: log2size=%u, max=%u"
+nvme_err_startfail_asqent_sz_zero(void) "nvme_start_ctrl failed because the admin submission queue size is zero"
+nvme_err_startfail_acqent_sz_zero(void) "nvme_start_ctrl failed because the admin completion queue size is zero"
+nvme_err_startfail(void) "setting controller enable bit failed"
+
+# Traces for undefined behavior
+nvme_ub_mmiowr_misaligned32(uint64_t offset) "MMIO write not 32-bit aligned, offset=0x%"PRIx64""
+nvme_ub_mmiowr_toosmall(uint64_t offset, unsigned size) "MMIO write smaller than 32 bits, offset=0x%"PRIx64", size=%u"
+nvme_ub_mmiowr_intmask_with_msix(void) "undefined access to interrupt mask set when MSI-X is enabled"
+nvme_ub_mmiowr_ro_csts(void) "attempted to set a read only bit of controller status"
+nvme_ub_mmiowr_ssreset_w1c_unsupported(void) "attempted to W1C CSTS.NSSRO but CAP.NSSRS is zero (not supported)"
+nvme_ub_mmiowr_ssreset_unsupported(void) "attempted NVM subsystem reset but CAP.NSSRS is zero (not supported)"
+nvme_ub_mmiowr_cmbloc_reserved(void) "invalid write to reserved CMBLOC when CMBSZ is zero, ignored"
+nvme_ub_mmiowr_cmbsz_readonly(void) "invalid write to read only CMBSZ, ignored"
+nvme_ub_mmiowr_invalid(uint64_t offset, uint64_t data) "invalid MMIO write, offset=0x%"PRIx64", data=0x%"PRIx64""
+nvme_ub_mmiord_misaligned32(uint64_t offset) "MMIO read not 32-bit aligned, offset=0x%"PRIx64""
+nvme_ub_mmiord_toosmall(uint64_t offset) "MMIO read smaller than 32-bits, offset=0x%"PRIx64""
+nvme_ub_mmiord_invalid_ofs(uint64_t offset) "MMIO read beyond last register, offset=0x%"PRIx64", returning 0"
+nvme_ub_db_wr_misaligned(uint64_t offset) "doorbell write not 32-bit aligned, offset=0x%"PRIx64", ignoring"
+nvme_ub_db_wr_invalid_cq(uint32_t qid) "completion queue doorbell write for nonexistent queue, cqid=%"PRIu32", ignoring"
+nvme_ub_db_wr_invalid_cqhead(uint32_t qid, uint16_t new_head) "completion queue doorbell write value beyond queue size, cqid=%"PRIu32", new_head=%"PRIu16", ignoring"
+nvme_ub_db_wr_invalid_sq(uint32_t qid) "submission queue doorbell write for nonexistent queue, sqid=%"PRIu32", ignoring"
+nvme_ub_db_wr_invalid_sqtail(uint32_t qid, uint16_t new_tail) "submission queue doorbell write value beyond queue size, sqid=%"PRIu32", new_tail=%"PRIu16", ignoring"
+
 # hw/block/xen_disk.c
 xen_disk_alloc(char *name) "%s"
 xen_disk_init(char *name) "%s"
--
2.13.6

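Once these events are in place, they can be switched on at run time like any other QEMU trace points, for instance by glob pattern (this assumes a trace backend such as "log" is compiled in, and the rest of the command line is elided):

    $ qemu-system-x86_64 -trace "nvme_*" ...

Because the successful, error and undefined-behaviour events use distinct name prefixes, the undefined-behaviour group can also be enabled on its own with -trace "nvme_ub_*".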
From: Fam Zheng <famz@redhat.com>

Management tools create overlays of running guests with qemu-img:

  $ qemu-img create -b /image/in/use.qcow2 -f qcow2 /overlay/image.qcow2

but this doesn't work anymore due to image locking:

    qemu-img: /overlay/image.qcow2: Failed to get shared "write" lock
    Is another process using the image?
    Could not open backing image to determine size.

Use the force share option to allow this use case again.

Cc: qemu-stable@nongnu.org
Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ void bdrv_img_create(const char *filename, const char *fmt,
         back_flags = flags;
         back_flags &= ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

+        backing_options = qdict_new();
         if (backing_fmt) {
-            backing_options = qdict_new();
             qdict_put_str(backing_options, "driver", backing_fmt);
         }
+        qdict_put_bool(backing_options, BDRV_OPT_FORCE_SHARE, true);

         bs = bdrv_open(full_backing, NULL, backing_options, back_flags,
                        &local_err);
--
2.13.6

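With the patch applied, the exact command from the commit message works again unchanged, because the size probe now builds the options QDict unconditionally and sets BDRV_OPT_FORCE_SHARE on it before bdrv_open():

    $ qemu-img create -b /image/in/use.qcow2 -f qcow2 /overlay/image.qcow2

This is safe for this particular open because the backing image is only consulted for its length here; no data is read or written through that handle, so sharing everything with the running guest cannot corrupt anything.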
From: Thomas Huth <thuth@redhat.com>

It's not working anymore since QEMU v1.3.0 - time to remove it now.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c    | 11 -----------
 qemu-doc.texi |  6 ------
 2 files changed, 17 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index XXXXXXX..XXXXXXX 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -XXX,XX +XXX,XX @@ QemuOptsList qemu_legacy_drive_opts = {
             .type = QEMU_OPT_STRING,
             .help = "chs translation (auto, lba, none)",
         },{
-            .name = "boot",
-            .type = QEMU_OPT_BOOL,
-            .help = "(deprecated, ignored)",
-        },{
             .name = "addr",
             .type = QEMU_OPT_STRING,
             .help = "pci address (virtio only)",
@@ -XXX,XX +XXX,XX @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type)
         goto fail;
     }

-    /* Deprecated option boot=[on|off] */
-    if (qemu_opt_get(legacy_opts, "boot") != NULL) {
-        fprintf(stderr, "qemu-kvm: boot=on|off is deprecated and will be "
-                "ignored. Future versions will reject this parameter. Please "
-                "update your scripts.\n");
-    }
-
     /* Other deprecated options */
     if (!qtest_enabled()) {
         for (i = 0; i < ARRAY_SIZE(deprecated); i++) {
diff --git a/qemu-doc.texi b/qemu-doc.texi
index XXXXXXX..XXXXXXX 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -XXX,XX +XXX,XX @@ deprecated.

 @section System emulator command line arguments

-@subsection -drive boot=on|off (since 1.3.0)
-
-The ``boot=on|off'' option to the ``-drive'' argument is
-ignored. Applications should use the ``bootindex=N'' parameter
-to set an absolute ordering between devices instead.
-
 @subsection -tdf (since 1.3.0)

 The ``-tdf'' argument is ignored. The behaviour implemented
--
2.13.6

From: Thomas Huth <thuth@redhat.com>

It's been marked as deprecated since QEMU v2.10.0, and so far nobody
complained that we should keep it, so let's remove this legacy option
now to simplify the code quite a bit.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 vl.c            | 86 ++-------------------------------------------------------
 qemu-doc.texi   |  8 ------
 qemu-options.hx | 19 ++-----------
 3 files changed, 4 insertions(+), 109 deletions(-)

diff --git a/vl.c b/vl.c
index XXXXXXX..XXXXXXX 100644
--- a/vl.c
+++ b/vl.c
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
     const char *boot_order = NULL;
     const char *boot_once = NULL;
     DisplayState *ds;
-    int cyls, heads, secs, translation;
     QemuOpts *opts, *machine_opts;
-    QemuOpts *hda_opts = NULL, *icount_opts = NULL, *accel_opts = NULL;
+    QemuOpts *icount_opts = NULL, *accel_opts = NULL;
     QemuOptsList *olist;
     int optind;
     const char *optarg;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)

     cpu_model = NULL;
     snapshot = 0;
-    cyls = heads = secs = 0;
-    translation = BIOS_ATA_TRANSLATION_AUTO;

     nb_nics = 0;

@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
         if (optind >= argc)
             break;
         if (argv[optind][0] != '-') {
-            hda_opts = drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);
+            drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);
         } else {
             const QEMUOption *popt;

@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
                 cpu_model = optarg;
                 break;
             case QEMU_OPTION_hda:
-                {
-                    char buf[256];
-                    if (cyls == 0)
-                        snprintf(buf, sizeof(buf), "%s", HD_OPTS);
-                    else
-                        snprintf(buf, sizeof(buf),
-                                 "%s,cyls=%d,heads=%d,secs=%d%s",
-                                 HD_OPTS , cyls, heads, secs,
-                                 translation == BIOS_ATA_TRANSLATION_LBA ?
-                                 ",trans=lba" :
-                                 translation == BIOS_ATA_TRANSLATION_NONE ?
-                                 ",trans=none" : "");
-                    drive_add(IF_DEFAULT, 0, optarg, buf);
-                    break;
-                }
             case QEMU_OPTION_hdb:
             case QEMU_OPTION_hdc:
             case QEMU_OPTION_hdd:
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
             case QEMU_OPTION_snapshot:
                 snapshot = 1;
                 break;
-            case QEMU_OPTION_hdachs:
-                {
-                    const char *p;
-                    p = optarg;
-                    cyls = strtol(p, (char **)&p, 0);
-                    if (cyls < 1 || cyls > 16383)
-                        goto chs_fail;
-                    if (*p != ',')
-                        goto chs_fail;
-                    p++;
-                    heads = strtol(p, (char **)&p, 0);
-                    if (heads < 1 || heads > 16)
-                        goto chs_fail;
-                    if (*p != ',')
-                        goto chs_fail;
-                    p++;
-                    secs = strtol(p, (char **)&p, 0);
-                    if (secs < 1 || secs > 63)
-                        goto chs_fail;
-                    if (*p == ',') {
-                        p++;
-                        if (!strcmp(p, "large")) {
-                            translation = BIOS_ATA_TRANSLATION_LARGE;
-                        } else if (!strcmp(p, "rechs")) {
-                            translation = BIOS_ATA_TRANSLATION_RECHS;
-                        } else if (!strcmp(p, "none")) {
-                            translation = BIOS_ATA_TRANSLATION_NONE;
-                        } else if (!strcmp(p, "lba")) {
-                            translation = BIOS_ATA_TRANSLATION_LBA;
-                        } else if (!strcmp(p, "auto")) {
-                            translation = BIOS_ATA_TRANSLATION_AUTO;
-                        } else {
-                            goto chs_fail;
-                        }
-                    } else if (*p != '\0') {
-                    chs_fail:
-                        error_report("invalid physical CHS format");
-                        exit(1);
-                    }
-                    if (hda_opts != NULL) {
-                        qemu_opt_set_number(hda_opts, "cyls", cyls,
-                                            &error_abort);
-                        qemu_opt_set_number(hda_opts, "heads", heads,
-                                            &error_abort);
-                        qemu_opt_set_number(hda_opts, "secs", secs,
-                                            &error_abort);
-                        if (translation == BIOS_ATA_TRANSLATION_LARGE) {
-                            qemu_opt_set(hda_opts, "trans", "large",
-                                         &error_abort);
-                        } else if (translation == BIOS_ATA_TRANSLATION_RECHS) {
-                            qemu_opt_set(hda_opts, "trans", "rechs",
-                                         &error_abort);
-                        } else if (translation == BIOS_ATA_TRANSLATION_LBA) {
-                            qemu_opt_set(hda_opts, "trans", "lba",
-                                         &error_abort);
-                        } else if (translation == BIOS_ATA_TRANSLATION_NONE) {
-                            qemu_opt_set(hda_opts, "trans", "none",
-                                         &error_abort);
-                        }
-                    }
-                }
-                error_report("'-hdachs' is deprecated, please use '-device"
-                             " ide-hd,cyls=c,heads=h,secs=s,...' instead");
-                break;
             case QEMU_OPTION_numa:
                 opts = qemu_opts_parse_noisily(qemu_find_opts("numa"),
                                                optarg, true);
diff --git a/qemu-doc.texi b/qemu-doc.texi
index XXXXXXX..XXXXXXX 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -XXX,XX +XXX,XX @@ The ``--net dump'' argument is now replaced with the
 ``-object filter-dump'' argument which works in combination
 with the modern ``-netdev`` backends instead.

-@subsection -hdachs (since 2.10.0)
-
-The ``-hdachs'' argument is now a synonym for setting
-the ``cyls'', ``heads'', ``secs'', and ``trans'' properties
-on the ``ide-hd'' device using the ``-device'' argument.
-The new syntax allows different settings to be provided
-per disk.
-
 @subsection -usbdevice (since 2.10.0)

 The ``-usbdevice DEV'' argument is now a synonym for setting
diff --git a/qemu-options.hx b/qemu-options.hx
index XXXXXXX..XXXXXXX 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -XXX,XX +XXX,XX @@ of available connectors of a given interface type.
 @item media=@var{media}
 This option defines the type of the media: disk or cdrom.
 @item cyls=@var{c},heads=@var{h},secs=@var{s}[,trans=@var{t}]
-These options have the same definition as they have in @option{-hdachs}.
-These parameters are deprecated, use the corresponding parameters
+Force disk physical geometry and the optional BIOS translation (trans=none or
+lba). These parameters are deprecated, use the corresponding parameters
 of @code{-device} instead.
 @item snapshot=@var{snapshot}
 @var{snapshot} is "on" or "off" and controls snapshot mode for the given drive
@@ -XXX,XX +XXX,XX @@ the raw disk image you use is not written back. You can however force
 the write back by pressing @key{C-a s} (@pxref{disk_images}).
 ETEXI

-DEF("hdachs", HAS_ARG, QEMU_OPTION_hdachs, \
-    "-hdachs c,h,s[,t]\n" \
-    "                force hard disk 0 physical geometry and the optional BIOS\n" \
-    "                translation (t=none or lba) (usually QEMU can guess them)\n",
-    QEMU_ARCH_ALL)
-STEXI
-@item -hdachs @var{c},@var{h},@var{s},[,@var{t}]
-@findex -hdachs
-Force hard disk 0 physical geometry (1 <= @var{c} <= 16383, 1 <=
-@var{h} <= 16, 1 <= @var{s} <= 63) and optionally force the BIOS
-translation mode (@var{t}=none, lba or auto). Usually QEMU can guess
-all those parameters. This option is deprecated, please use
-@code{-device ide-hd,cyls=c,heads=h,secs=s,...} instead.
-ETEXI
-
 DEF("fsdev", HAS_ARG, QEMU_OPTION_fsdev,
     "-fsdev fsdriver,id=id[,path=path,][security_model={mapped-xattr|mapped-file|passthrough|none}]\n"
     "    [,writeout=immediate][,readonly][,socket=socket|sock_fd=sock_fd][,fmode=fmode][,dmode=dmode]\n"
--
2.13.6

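As the removed error message suggests, a geometry that used to be forced with something like ``-hdachs 1024,16,63'' is now expressed through device properties. An illustrative equivalent, with made-up drive and device ids rather than anything copied from a manual, would be:

    -drive if=none,id=disk0,file=disk.img \
    -device ide-hd,drive=disk0,cyls=1024,heads=16,secs=63

A BIOS translation mode, if one is needed, is likewise set through the corresponding ide-hd device property instead of a trailing -hdachs argument; this is what lets each disk carry its own geometry.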
From: Thomas Huth <thuth@redhat.com>

Looks like we forgot to announce the deprecation of these options in
the corresponding chapter of the qemu-doc text, so let's do that now.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qemu-doc.texi | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/qemu-doc.texi b/qemu-doc.texi
index XXXXXXX..XXXXXXX 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -XXX,XX +XXX,XX @@ longer be directly supported in QEMU.
 The ``-drive if=scsi'' argument is replaced by the the
 ``-device BUS-TYPE'' argument combined with ``-drive if=none''.

+@subsection -drive cyls=...,heads=...,secs=...,trans=... (since 2.10.0)
+
+The drive geometry arguments are replaced by the geometry arguments
+that can be specified with the ``-device'' parameter.
+
+@subsection -drive serial=... (since 2.10.0)
+
+The drive serial argument is replaced by the serial argument
+that can be specified with the ``-device'' parameter.
+
+@subsection -drive addr=... (since 2.10.0)
+
+The drive addr argument is replaced by the addr argument
+that can be specified with the ``-device'' parameter.
+
 @subsection -net dump (since 2.10.0)

 The ``--net dump'' argument is now replaced with the
--
2.13.6

From: Fam Zheng <famz@redhat.com>

Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block_int.h |  1 -
 block/io.c                | 18 ------------------
 2 files changed, 19 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ bool blk_dev_is_tray_open(BlockBackend *blk);
 bool blk_dev_is_medium_locked(BlockBackend *blk);

 void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes);
-bool bdrv_requests_pending(BlockDriverState *bs);

 void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
 void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in);
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_disable_copy_on_read(BlockDriverState *bs)
     assert(old >= 1);
 }

-/* Check if any requests are in-flight (including throttled requests) */
-bool bdrv_requests_pending(BlockDriverState *bs)
-{
-    BdrvChild *child;
-
-    if (atomic_read(&bs->in_flight)) {
-        return true;
-    }
-
-    QLIST_FOREACH(child, &bs->children, next) {
-        if (bdrv_requests_pending(child->bs)) {
-            return true;
-        }
-    }
-
-    return false;
-}
-
 typedef struct {
     Coroutine *co;
     BlockDriverState *bs;
--
2.13.6

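The deleted helper answered one question: is any request still in flight on this node or anywhere below it? A stand-alone sketch of that computation, with toy types instead of BlockDriverState (the QLIST macros and the atomic counter read of the original are replaced by plain C here):

    #include <stdbool.h>
    #include <stddef.h>

    typedef struct Node {
        int in_flight;                /* requests active on this node */
        struct Node *children[4];
        size_t nb_children;
    } Node;

    /* same shape as the removed bdrv_requests_pending() */
    static bool requests_pending(const Node *n)
    {
        if (n->in_flight) {
            return true;
        }
        for (size_t i = 0; i < n->nb_children; i++) {
            if (requests_pending(n->children[i])) {
                return true;
            }
        }
        return false;
    }

    int main(void)
    {
        Node leaf = { .in_flight = 1 };
        Node root = { .children = { &leaf }, .nb_children = 1 };
        return requests_pending(&root) ? 0 : 1;
    }

Callers that needed this guarantee now get it from the drain machinery instead, which polls until each node's in_flight counter has dropped to zero, so the stand-alone predicate had no remaining users.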
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
---
 block/io.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
     BdrvNextIterator it;
     GSList *aio_ctxs = NULL, *ctx;

+    /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
+     * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
+     * nodes in several different AioContexts, so make sure we're in the main
+     * context. */
+    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+
     block_job_pause_all();

     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
--
2.13.6

bdrv_drained_begin() doesn't increase bs->quiesce_counter recursively
and also doesn't notify other parent nodes of children, which both means
that the child nodes are not actually drained, and bdrv_drained_begin()
is providing useful functionality only on a single node.

To keep things consistent, we also shouldn't call the block driver
callbacks recursively.

A proper recursive drain version that provides an actually working
drained section for child nodes will be introduced later.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
---
 block/io.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
 }

 /* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
-static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
+static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, bool recursive)
 {
     BdrvChild *child, *tmp;
     BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
     bdrv_coroutine_enter(bs, data.co);
     BDRV_POLL_WHILE(bs, !data.done);

-    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
-        bdrv_drain_invoke(child->bs, begin);
+    if (recursive) {
+        QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
+            bdrv_drain_invoke(child->bs, begin, true);
+        }
     }
 }

@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
         bdrv_parent_drained_begin(bs);
     }

-    bdrv_drain_invoke(bs, true);
+    bdrv_drain_invoke(bs, true, false);
     bdrv_drain_recurse(bs);
 }

@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
     }

     /* Re-enable things in child-to-parent order */
-    bdrv_drain_invoke(bs, false);
+    bdrv_drain_invoke(bs, false, false);
     bdrv_parent_drained_end(bs);
     aio_enable_external(bdrv_get_aio_context(bs));
 }
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         aio_context_acquire(aio_context);
         aio_disable_external(aio_context);
         bdrv_parent_drained_begin(bs);
-        bdrv_drain_invoke(bs, true);
+        bdrv_drain_invoke(bs, true, true);
         aio_context_release(aio_context);

         if (!g_slist_find(aio_ctxs, aio_context)) {
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)

     /* Re-enable things in child-to-parent order */
     aio_context_acquire(aio_context);
-    bdrv_drain_invoke(bs, false);
+    bdrv_drain_invoke(bs, false, true);
     bdrv_parent_drained_end(bs);
     aio_enable_external(aio_context);
     aio_context_release(aio_context);
--
2.13.6

83
diff view generated by jsdifflib
The existing test is for bdrv_drain_all_begin/end() only. Generalise the
test case so that it can be run for the other variants as well. At the
moment this is only bdrv_drain_begin/end(), but in a while, we'll add
another one.

Also, add a backing file to the test node to test whether the operations
work recursively.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 69 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 62 insertions(+), 7 deletions(-)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_test = {
 
     .bdrv_co_drain_begin = bdrv_test_co_drain_begin,
     .bdrv_co_drain_end = bdrv_test_co_drain_end,
+
+    .bdrv_child_perm = bdrv_format_default_perms,
 };
 
 static void aio_ret_cb(void *opaque, int ret)
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
     *aio_ret = ret;
 }
 
-static void test_drv_cb_drain_all(void)
+enum drain_type {
+    BDRV_DRAIN_ALL,
+    BDRV_DRAIN,
+};
+
+static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
+{
+    switch (drain_type) {
+    case BDRV_DRAIN_ALL: bdrv_drain_all_begin(); break;
+    case BDRV_DRAIN: bdrv_drained_begin(bs); break;
+    default: g_assert_not_reached();
+    }
+}
+
+static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs)
+{
+    switch (drain_type) {
+    case BDRV_DRAIN_ALL: bdrv_drain_all_end(); break;
+    case BDRV_DRAIN: bdrv_drained_end(bs); break;
+    default: g_assert_not_reached();
+    }
+}
+
+static void test_drv_cb_common(enum drain_type drain_type, bool recursive)
 {
     BlockBackend *blk;
-    BlockDriverState *bs;
-    BDRVTestState *s;
+    BlockDriverState *bs, *backing;
+    BDRVTestState *s, *backing_s;
     BlockAIOCB *acb;
     int aio_ret;
 
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_all(void)
     s = bs->opaque;
     blk_insert_bs(blk, bs, &error_abort);
 
+    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
+    backing_s = backing->opaque;
+    bdrv_set_backing_hd(bs, backing, &error_abort);
+
     /* Simple bdrv_drain_all_begin/end pair, check that CBs are called */
     g_assert_cmpint(s->drain_count, ==, 0);
-    bdrv_drain_all_begin();
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+    do_drain_begin(drain_type, bs);
+
     g_assert_cmpint(s->drain_count, ==, 1);
-    bdrv_drain_all_end();
+    g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
+
+    do_drain_end(drain_type, bs);
+
     g_assert_cmpint(s->drain_count, ==, 0);
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
 
     /* Now do the same while a request is pending */
     aio_ret = -EINPROGRESS;
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_all(void)
     g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
 
     g_assert_cmpint(s->drain_count, ==, 0);
-    bdrv_drain_all_begin();
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+    do_drain_begin(drain_type, bs);
+
     g_assert_cmpint(aio_ret, ==, 0);
     g_assert_cmpint(s->drain_count, ==, 1);
-    bdrv_drain_all_end();
+    g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
+
+    do_drain_end(drain_type, bs);
+
     g_assert_cmpint(s->drain_count, ==, 0);
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
 
+    bdrv_unref(backing);
     bdrv_unref(bs);
     blk_unref(blk);
 }
 
+static void test_drv_cb_drain_all(void)
+{
+    test_drv_cb_common(BDRV_DRAIN_ALL, true);
+}
+
+static void test_drv_cb_drain(void)
+{
+    test_drv_cb_common(BDRV_DRAIN, false);
+}
+
 int main(int argc, char **argv)
 {
     bdrv_init();
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_init(&argc, &argv, NULL);
 
     g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
+    g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
 
     return g_test_run();
 }
--
2.13.6

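The wrapper functions above follow the usual GLib testing pattern: g_test_add_func() only accepts a void (*)(void), so each drain variant gets a trivial wrapper around one parameterised body. As a rough sketch detached from QEMU (the DrainType enum and test paths here are invented; only the g_test_* calls are real GLib API), the structure is:

#include <glib.h>

typedef enum { DRAIN_ALL, DRAIN_ONE } DrainType;

/* One common body, parameterised by the variant under test. */
static void test_common(DrainType type, gboolean recursive)
{
    /* The real test would build a node tree, drain it with the chosen
     * variant and assert on the driver callback counters. */
    g_assert_cmpint(recursive, ==, type == DRAIN_ALL);
}

/* One trivial wrapper per variant, like test_drv_cb_drain_all/_drain. */
static void test_drain_all(void) { test_common(DRAIN_ALL, TRUE); }
static void test_drain_one(void) { test_common(DRAIN_ONE, FALSE); }

int main(int argc, char **argv)
{
    g_test_init(&argc, &argv, NULL);
    g_test_add_func("/demo/drain_all", test_drain_all);
    g_test_add_func("/demo/drain", test_drain_one);
    return g_test_run();
}
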
This is currently only working correctly for bdrv_drain(), not for
bdrv_drain_all(). Leave a comment for the drain_all case, we'll address
it later.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain(void)
     test_drv_cb_common(BDRV_DRAIN, false);
 }
 
+static void test_quiesce_common(enum drain_type drain_type, bool recursive)
+{
+    BlockBackend *blk;
+    BlockDriverState *bs, *backing;
+
+    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
+                              &error_abort);
+    blk_insert_bs(blk, bs, &error_abort);
+
+    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
+    bdrv_set_backing_hd(bs, backing, &error_abort);
+
+    g_assert_cmpint(bs->quiesce_counter, ==, 0);
+    g_assert_cmpint(backing->quiesce_counter, ==, 0);
+
+    do_drain_begin(drain_type, bs);
+
+    g_assert_cmpint(bs->quiesce_counter, ==, 1);
+    g_assert_cmpint(backing->quiesce_counter, ==, !!recursive);
+
+    do_drain_end(drain_type, bs);
+
+    g_assert_cmpint(bs->quiesce_counter, ==, 0);
+    g_assert_cmpint(backing->quiesce_counter, ==, 0);
+
+    bdrv_unref(backing);
+    bdrv_unref(bs);
+    blk_unref(blk);
+}
+
+static void test_quiesce_drain_all(void)
+{
+    // XXX drain_all doesn't quiesce
+    //test_quiesce_common(BDRV_DRAIN_ALL, true);
+}
+
+static void test_quiesce_drain(void)
+{
+    test_quiesce_common(BDRV_DRAIN, false);
+}
+
 int main(int argc, char **argv)
 {
     bdrv_init();
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
     g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
 
+    g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
+    g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
+
     return g_test_run();
 }
--
2.13.6

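bs->quiesce_counter is a counter rather than a flag so that drained sections can nest. A minimal standalone model of the invariant the test checks follows; the Node type is invented, and the outermost-only side effect stands in for aio_disable/enable_external():

#include <assert.h>
#include <stdio.h>

typedef struct {
    int quiesce_counter;        /* like bs->quiesce_counter */
    int external_disabled;      /* like aio_disable/enable_external() */
} Node;

static void drained_begin(Node *n)
{
    if (n->quiesce_counter++ == 0) {
        n->external_disabled = 1;   /* only the outermost begin disables */
    }
}

static void drained_end(Node *n)
{
    assert(n->quiesce_counter > 0);
    if (--n->quiesce_counter == 0) {
        n->external_disabled = 0;   /* only the outermost end re-enables */
    }
}

int main(void)
{
    Node n = { 0, 0 };

    drained_begin(&n);          /* outer section */
    drained_begin(&n);          /* nested section */
    assert(n.quiesce_counter == 2 && n.external_disabled == 1);

    drained_end(&n);            /* inner end: still drained */
    assert(n.quiesce_counter == 1 && n.external_disabled == 1);

    drained_end(&n);
    assert(n.quiesce_counter == 0 && n.external_disabled == 0);
    printf("nested quiesce model OK\n");
    return 0;
}
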
Block jobs already paused themselves when their main BlockBackend
entered a drained section. This is not good enough: We also want to
pause a block job and may not submit new requests if, for example, the
mirror target node should be drained.

This implements .drained_begin/end callbacks in child_job in order to
consider all block nodes related to the job, and removes the
BlockBackend callbacks which are unnecessary now because the root of the
job main BlockBackend is always referenced with a child_job, too.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockjob.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/blockjob.c b/blockjob.c
index XXXXXXX..XXXXXXX 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -XXX,XX +XXX,XX @@ static char *child_job_get_parent_desc(BdrvChild *c)
              job->id);
 }
 
-static const BdrvChildRole child_job = {
-    .get_parent_desc = child_job_get_parent_desc,
-    .stay_at_node = true,
-};
-
-static void block_job_drained_begin(void *opaque)
+static void child_job_drained_begin(BdrvChild *c)
 {
-    BlockJob *job = opaque;
+    BlockJob *job = c->opaque;
     block_job_pause(job);
 }
 
-static void block_job_drained_end(void *opaque)
+static void child_job_drained_end(BdrvChild *c)
 {
-    BlockJob *job = opaque;
+    BlockJob *job = c->opaque;
     block_job_resume(job);
 }
 
-static const BlockDevOps block_job_dev_ops = {
-    .drained_begin = block_job_drained_begin,
-    .drained_end = block_job_drained_end,
+static const BdrvChildRole child_job = {
+    .get_parent_desc = child_job_get_parent_desc,
+    .drained_begin = child_job_drained_begin,
+    .drained_end = child_job_drained_end,
+    .stay_at_node = true,
 };
 
 void block_job_remove_all_bdrv(BlockJob *job)
@@ -XXX,XX +XXX,XX @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
     block_job_add_bdrv(job, "main node", bs, 0, BLK_PERM_ALL, &error_abort);
     bs->job = job;
 
-    blk_set_dev_ops(blk, &block_job_dev_ops, job);
     bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);
 
     QLIST_INSERT_HEAD(&block_jobs, job, job_list);
--
2.13.6

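The pattern at work here is a per-edge callback table: every parent keeps a child edge to the node, and draining the node notifies each edge, so a job attached to several nodes is paused no matter which of them is drained. A standalone sketch of that idea, with invented types that only loosely imitate BdrvChildRole:

#include <assert.h>
#include <stdio.h>

/* Toy per-edge callback table, in the spirit of BdrvChildRole. */
typedef struct Edge Edge;
typedef struct {
    void (*drained_begin)(Edge *e);
    void (*drained_end)(Edge *e);
} EdgeRole;

struct Edge {
    const EdgeRole *role;
    void *opaque;               /* the parent, e.g. a job */
};

typedef struct {
    int pause_count;            /* like BlockJob.pause_count */
} Job;

static void job_edge_drained_begin(Edge *e)
{
    Job *job = e->opaque;
    job->pause_count++;         /* stands in for block_job_pause() */
}

static void job_edge_drained_end(Edge *e)
{
    Job *job = e->opaque;
    job->pause_count--;         /* stands in for block_job_resume() */
}

static const EdgeRole job_edge_role = {
    .drained_begin = job_edge_drained_begin,
    .drained_end   = job_edge_drained_end,
};

int main(void)
{
    /* One job attached to two nodes (source and target) via two edges;
     * draining either node pauses the job. */
    Job job = { 0 };
    Edge src_edge = { &job_edge_role, &job };
    Edge tgt_edge = { &job_edge_role, &job };

    src_edge.role->drained_begin(&src_edge);    /* drain the source */
    assert(job.pause_count == 1);
    tgt_edge.role->drained_begin(&tgt_edge);    /* drain the target too */
    assert(job.pause_count == 2);

    tgt_edge.role->drained_end(&tgt_edge);
    src_edge.role->drained_end(&src_edge);
    assert(job.pause_count == 0);
    printf("per-edge drain notification model OK\n");
    return 0;
}
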
Block jobs must be paused if any of the involved nodes are drained.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu/osdep.h"
 #include "block/block.h"
+#include "block/blockjob_int.h"
 #include "sysemu/block-backend.h"
 #include "qapi/error.h"
 
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
     test_quiesce_common(BDRV_DRAIN, false);
 }
 
+
+typedef struct TestBlockJob {
+    BlockJob common;
+    bool should_complete;
+} TestBlockJob;
+
+static void test_job_completed(BlockJob *job, void *opaque)
+{
+    block_job_completed(job, 0);
+}
+
+static void coroutine_fn test_job_start(void *opaque)
+{
+    TestBlockJob *s = opaque;
+
+    while (!s->should_complete) {
+        block_job_sleep_ns(&s->common, 100000);
+    }
+
+    block_job_defer_to_main_loop(&s->common, test_job_completed, NULL);
+}
+
+static void test_job_complete(BlockJob *job, Error **errp)
+{
+    TestBlockJob *s = container_of(job, TestBlockJob, common);
+    s->should_complete = true;
+}
+
+BlockJobDriver test_job_driver = {
+    .instance_size = sizeof(TestBlockJob),
+    .start = test_job_start,
+    .complete = test_job_complete,
+};
+
+static void test_blockjob_common(enum drain_type drain_type)
+{
+    BlockBackend *blk_src, *blk_target;
+    BlockDriverState *src, *target;
+    BlockJob *job;
+    int ret;
+
+    src = bdrv_new_open_driver(&bdrv_test, "source", BDRV_O_RDWR,
+                               &error_abort);
+    blk_src = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_insert_bs(blk_src, src, &error_abort);
+
+    target = bdrv_new_open_driver(&bdrv_test, "target", BDRV_O_RDWR,
+                                  &error_abort);
+    blk_target = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_insert_bs(blk_target, target, &error_abort);
+
+    job = block_job_create("job0", &test_job_driver, src, 0, BLK_PERM_ALL, 0,
+                           0, NULL, NULL, &error_abort);
+    block_job_add_bdrv(job, "target", target, 0, BLK_PERM_ALL, &error_abort);
+    block_job_start(job);
+
+    g_assert_cmpint(job->pause_count, ==, 0);
+    g_assert_false(job->paused);
+    g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
+
+    do_drain_begin(drain_type, src);
+
+    if (drain_type == BDRV_DRAIN_ALL) {
+        /* bdrv_drain_all() drains both src and target, and involves an
+         * additional block_job_pause_all() */
+        g_assert_cmpint(job->pause_count, ==, 3);
+    } else {
+        g_assert_cmpint(job->pause_count, ==, 1);
+    }
+    /* XXX We don't wait until the job is actually paused. Is this okay? */
+    /* g_assert_true(job->paused); */
+    g_assert_false(job->busy); /* The job is paused */
+
+    do_drain_end(drain_type, src);
+
+    g_assert_cmpint(job->pause_count, ==, 0);
+    g_assert_false(job->paused);
+    g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
+
+    do_drain_begin(drain_type, target);
+
+    if (drain_type == BDRV_DRAIN_ALL) {
+        /* bdrv_drain_all() drains both src and target, and involves an
+         * additional block_job_pause_all() */
+        g_assert_cmpint(job->pause_count, ==, 3);
+    } else {
+        g_assert_cmpint(job->pause_count, ==, 1);
+    }
+    /* XXX We don't wait until the job is actually paused. Is this okay? */
+    /* g_assert_true(job->paused); */
+    g_assert_false(job->busy); /* The job is paused */
+
+    do_drain_end(drain_type, target);
+
+    g_assert_cmpint(job->pause_count, ==, 0);
+    g_assert_false(job->paused);
+    g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
+
+    ret = block_job_complete_sync(job, &error_abort);
+    g_assert_cmpint(ret, ==, 0);
+
+    blk_unref(blk_src);
+    blk_unref(blk_target);
+    bdrv_unref(src);
+    bdrv_unref(target);
+}
+
+static void test_blockjob_drain_all(void)
+{
+    test_blockjob_common(BDRV_DRAIN_ALL);
+}
+
+static void test_blockjob_drain(void)
+{
+    test_blockjob_common(BDRV_DRAIN);
+}
+
 int main(int argc, char **argv)
 {
     bdrv_init();
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
     g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
 
+    g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
+    g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
+
     return g_test_run();
 }
--
2.13.6

Block jobs are already paused using the BdrvChildRole drain callbacks,
so we don't need an additional block_job_pause_all() call.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/io.c              |  4 ----
 tests/test-bdrv-drain.c | 10 ++++------
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
      * context. */
     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 
-    block_job_pause_all();
-
     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         aio_enable_external(aio_context);
         aio_context_release(aio_context);
     }
-
-    block_job_resume_all();
 }
 
 void bdrv_drain_all(void)
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_common(enum drain_type drain_type)
     do_drain_begin(drain_type, src);
 
     if (drain_type == BDRV_DRAIN_ALL) {
-        /* bdrv_drain_all() drains both src and target, and involves an
-         * additional block_job_pause_all() */
-        g_assert_cmpint(job->pause_count, ==, 3);
+        /* bdrv_drain_all() drains both src and target */
+        g_assert_cmpint(job->pause_count, ==, 2);
     } else {
         g_assert_cmpint(job->pause_count, ==, 1);
     }
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_common(enum drain_type drain_type)
     do_drain_begin(drain_type, target);
 
     if (drain_type == BDRV_DRAIN_ALL) {
-        /* bdrv_drain_all() drains both src and target, and involves an
-         * additional block_job_pause_all() */
-        g_assert_cmpint(job->pause_count, ==, 3);
+        /* bdrv_drain_all() drains both src and target */
+        g_assert_cmpint(job->pause_count, ==, 2);
     } else {
         g_assert_cmpint(job->pause_count, ==, 1);
     }
--
2.13.6

bdrv_do_drained_begin() restricts the call of parent callbacks and
aio_disable_external() to the outermost drain section, but the block
driver callbacks are always called. bdrv_do_drained_end() must match
this behaviour, otherwise nodes stay drained even if begin/end calls
were balanced.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/io.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
 
 void bdrv_drained_end(BlockDriverState *bs)
 {
+    int old_quiesce_counter;
+
     if (qemu_in_coroutine()) {
         bdrv_co_yield_to_drain(bs, false);
         return;
     }
     assert(bs->quiesce_counter > 0);
-    if (atomic_fetch_dec(&bs->quiesce_counter) > 1) {
-        return;
-    }
+    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
 
     /* Re-enable things in child-to-parent order */
     bdrv_drain_invoke(bs, false, false);
-    bdrv_parent_drained_end(bs);
-    aio_enable_external(bdrv_get_aio_context(bs));
+    if (old_quiesce_counter == 1) {
+        bdrv_parent_drained_end(bs);
+        aio_enable_external(bdrv_get_aio_context(bs));
+    }
 }
 
 /*
--
2.13.6

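A standalone model of the imbalance this fixes (the Node type is invented; the counter stands in for the driver callback bookkeeping): with the early return, two nested begin/end pairs leave the node drained, while running the driver callback on every end restores the balance.

#include <assert.h>
#include <stdio.h>

typedef struct {
    int quiesce_counter;
    int drv_drain_count;        /* driver begin/end callback balance */
} Node;

static void drained_begin(Node *n)
{
    n->quiesce_counter++;
    n->drv_drain_count++;       /* driver callback runs on every begin */
}

/* Pre-patch behaviour: inner ends returned early, so the driver
 * callback was skipped for all but the outermost end. */
static void drained_end_buggy(Node *n)
{
    if (--n->quiesce_counter > 0) {
        return;
    }
    n->drv_drain_count--;
}

/* Patched behaviour: the driver callback runs on every end, matching
 * drained_begin(); only the outermost-only work stays conditional. */
static void drained_end_fixed(Node *n)
{
    int old_quiesce_counter = n->quiesce_counter--;
    n->drv_drain_count--;
    (void)old_quiesce_counter;  /* would gate parent callbacks etc. */
}

int main(void)
{
    Node n = { 0, 0 };

    drained_begin(&n);
    drained_begin(&n);
    drained_end_buggy(&n);
    drained_end_buggy(&n);
    assert(n.drv_drain_count == 1);     /* leaked: node stays drained */

    n = (Node){ 0, 0 };
    drained_begin(&n);
    drained_begin(&n);
    drained_end_fixed(&n);
    drained_end_fixed(&n);
    assert(n.drv_drain_count == 0);     /* balanced again */

    printf("drained_end balance model OK\n");
    return 0;
}
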
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
 enum drain_type {
     BDRV_DRAIN_ALL,
     BDRV_DRAIN,
+    DRAIN_TYPE_MAX,
 };
 
 static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
     test_quiesce_common(BDRV_DRAIN, false);
 }
 
+static void test_nested(void)
+{
+    BlockBackend *blk;
+    BlockDriverState *bs, *backing;
+    BDRVTestState *s, *backing_s;
+    enum drain_type outer, inner;
+
+    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
+                              &error_abort);
+    s = bs->opaque;
+    blk_insert_bs(blk, bs, &error_abort);
+
+    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
+    backing_s = backing->opaque;
+    bdrv_set_backing_hd(bs, backing, &error_abort);
+
+    for (outer = 0; outer < DRAIN_TYPE_MAX; outer++) {
+        for (inner = 0; inner < DRAIN_TYPE_MAX; inner++) {
+            /* XXX bdrv_drain_all() doesn't increase the quiesce_counter */
+            int bs_quiesce = (outer != BDRV_DRAIN_ALL) +
+                             (inner != BDRV_DRAIN_ALL);
+            int backing_quiesce = 0;
+            int backing_cb_cnt = (outer != BDRV_DRAIN) +
+                                 (inner != BDRV_DRAIN);
+
+            g_assert_cmpint(bs->quiesce_counter, ==, 0);
+            g_assert_cmpint(backing->quiesce_counter, ==, 0);
+            g_assert_cmpint(s->drain_count, ==, 0);
+            g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+            do_drain_begin(outer, bs);
+            do_drain_begin(inner, bs);
+
+            g_assert_cmpint(bs->quiesce_counter, ==, bs_quiesce);
+            g_assert_cmpint(backing->quiesce_counter, ==, backing_quiesce);
+            g_assert_cmpint(s->drain_count, ==, 2);
+            g_assert_cmpint(backing_s->drain_count, ==, backing_cb_cnt);
+
+            do_drain_end(inner, bs);
+            do_drain_end(outer, bs);
+
+            g_assert_cmpint(bs->quiesce_counter, ==, 0);
+            g_assert_cmpint(backing->quiesce_counter, ==, 0);
+            g_assert_cmpint(s->drain_count, ==, 0);
+            g_assert_cmpint(backing_s->drain_count, ==, 0);
+        }
+    }
+
+    bdrv_unref(backing);
+    bdrv_unref(bs);
+    blk_unref(blk);
+}
+
 
 typedef struct TestBlockJob {
     BlockJob common;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
     g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
 
+    g_test_add_func("/bdrv-drain/nested", test_nested);
+
     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
     g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
 
--
2.13.6

This is in preparation for subtree drains, i.e. drained sections that
affect not only a single node, but recursively all child nodes, too.

Calling the parent callbacks for drain is pointless when we just came
from that parent node recursively and leads to multiple increases of
bs->quiesce_counter in a single drain call. Don't do it.

In order for this to work correctly, the parent callback must be called
for every bdrv_drain_begin/end() call, not only for the outermost one:

If we have a node N with two parents A and B, recursive draining of A
should cause the quiesce_counter of B to increase because its child N is
drained independently of B. If now B is recursively drained, too, A must
increase its quiesce_counter because N is drained independently of A
only now, even if N is going from quiesce_counter 1 to 2.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block.h |  4 ++--
 block.c               | 13 +++++++++----
 block/io.c            | 47 ++++++++++++++++++++++++++++++++++-------------
 3 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ void bdrv_io_unplug(BlockDriverState *bs);
 * Begin a quiesced section of all users of @bs. This is part of
 * bdrv_drained_begin.
 */
-void bdrv_parent_drained_begin(BlockDriverState *bs);
+void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore);
 
 /**
 * bdrv_parent_drained_end:
@@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_begin(BlockDriverState *bs);
 * End a quiesced section of all users of @bs. This is part of
 * bdrv_drained_end.
 */
-void bdrv_parent_drained_end(BlockDriverState *bs);
+void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);
 
 /**
 * bdrv_drained_begin:
diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
                                       BlockDriverState *new_bs)
 {
     BlockDriverState *old_bs = child->bs;
+    int i;
 
     if (old_bs && new_bs) {
         assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
     }
     if (old_bs) {
         if (old_bs->quiesce_counter && child->role->drained_end) {
-            child->role->drained_end(child);
+            for (i = 0; i < old_bs->quiesce_counter; i++) {
+                child->role->drained_end(child);
+            }
         }
         if (child->role->detach) {
             child->role->detach(child);
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
     if (new_bs) {
         QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
         if (new_bs->quiesce_counter && child->role->drained_begin) {
-            child->role->drained_begin(child);
+            for (i = 0; i < new_bs->quiesce_counter; i++) {
+                child->role->drained_begin(child);
+            }
         }
 
         if (child->role->attach) {
@@ -XXX,XX +XXX,XX @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
     AioContext *ctx = bdrv_get_aio_context(bs);
 
     aio_disable_external(ctx);
-    bdrv_parent_drained_begin(bs);
+    bdrv_parent_drained_begin(bs, NULL);
     bdrv_drain(bs); /* ensure there are no in-flight requests */
 
     while (aio_poll(ctx, false)) {
@@ -XXX,XX +XXX,XX @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
      */
     aio_context_acquire(new_context);
     bdrv_attach_aio_context(bs, new_context);
-    bdrv_parent_drained_end(bs);
+    bdrv_parent_drained_end(bs, NULL);
     aio_enable_external(ctx);
     aio_context_release(new_context);
 }
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@
 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
     int64_t offset, int bytes, BdrvRequestFlags flags);
 
-void bdrv_parent_drained_begin(BlockDriverState *bs)
+void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
 {
     BdrvChild *c, *next;
 
     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
+        if (c == ignore) {
+            continue;
+        }
         if (c->role->drained_begin) {
             c->role->drained_begin(c);
         }
     }
 }
 
-void bdrv_parent_drained_end(BlockDriverState *bs)
+void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
 {
     BdrvChild *c, *next;
 
     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
+        if (c == ignore) {
+            continue;
+        }
         if (c->role->drained_end) {
             c->role->drained_end(c);
         }
@@ -XXX,XX +XXX,XX @@ typedef struct {
     BlockDriverState *bs;
     bool done;
     bool begin;
+    BdrvChild *parent;
 } BdrvCoDrainData;
 
 static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs)
     return waited;
 }
 
+static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent);
+static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent);
+
 static void bdrv_co_drain_bh_cb(void *opaque)
 {
     BdrvCoDrainData *data = opaque;
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
 
     bdrv_dec_in_flight(bs);
     if (data->begin) {
-        bdrv_drained_begin(bs);
+        bdrv_do_drained_begin(bs, data->parent);
     } else {
-        bdrv_drained_end(bs);
+        bdrv_do_drained_end(bs, data->parent);
     }
 
     data->done = true;
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
 }
 
 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
-                                                bool begin)
+                                                bool begin, BdrvChild *parent)
 {
     BdrvCoDrainData data;
 
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
         .bs = bs,
         .done = false,
         .begin = begin,
+        .parent = parent,
     };
     bdrv_inc_in_flight(bs);
     aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
     assert(data.done);
 }
 
-void bdrv_drained_begin(BlockDriverState *bs)
+static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent)
 {
     if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, true);
+        bdrv_co_yield_to_drain(bs, true, parent);
         return;
     }
 
     /* Stop things in parent-to-child order */
     if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
         aio_disable_external(bdrv_get_aio_context(bs));
-        bdrv_parent_drained_begin(bs);
     }
 
+    bdrv_parent_drained_begin(bs, parent);
     bdrv_drain_invoke(bs, true, false);
     bdrv_drain_recurse(bs);
 }
 
-void bdrv_drained_end(BlockDriverState *bs)
+void bdrv_drained_begin(BlockDriverState *bs)
+{
+    bdrv_do_drained_begin(bs, NULL);
+}
+
+static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
 {
     int old_quiesce_counter;
 
     if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, false);
+        bdrv_co_yield_to_drain(bs, false, parent);
         return;
     }
     assert(bs->quiesce_counter > 0);
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
 
     /* Re-enable things in child-to-parent order */
     bdrv_drain_invoke(bs, false, false);
+    bdrv_parent_drained_end(bs, parent);
     if (old_quiesce_counter == 1) {
-        bdrv_parent_drained_end(bs);
         aio_enable_external(bdrv_get_aio_context(bs));
     }
 }
 
+void bdrv_drained_end(BlockDriverState *bs)
+{
+    bdrv_do_drained_end(bs, NULL);
+}
+
 /*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend block driver's internal I/O until next request arrives.
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         /* Stop things in parent-to-child order */
         aio_context_acquire(aio_context);
         aio_disable_external(aio_context);
-        bdrv_parent_drained_begin(bs);
+        bdrv_parent_drained_begin(bs, NULL);
         bdrv_drain_invoke(bs, true, true);
         aio_context_release(aio_context);
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         /* Re-enable things in child-to-parent order */
         aio_context_acquire(aio_context);
         bdrv_drain_invoke(bs, false, true);
-        bdrv_parent_drained_end(bs);
+        bdrv_parent_drained_end(bs, NULL);
         aio_enable_external(aio_context);
         aio_context_release(aio_context);
     }
--
2.13.6

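The A/B scenario from the commit message can be played through in a standalone model (invented types; the ignore argument mimics the new parameter of bdrv_parent_drained_begin/end): each recursive drain notifies every parent edge except the one it came from, so both orders of nesting leave all counters balanced.

#include <assert.h>
#include <stdio.h>

/* Toy model: node N with two parent edges A and B. Draining through one
 * edge must quiesce the other parent, but not notify the edge we came
 * from. */
typedef struct {
    int quiesce_counter;        /* per-parent drain bookkeeping */
} Parent;

static void parents_drained_begin(Parent *parents[], int n, Parent *ignore)
{
    for (int i = 0; i < n; i++) {
        if (parents[i] == ignore) {
            continue;           /* we came from this edge; skip it */
        }
        parents[i]->quiesce_counter++;
    }
}

static void parents_drained_end(Parent *parents[], int n, Parent *ignore)
{
    for (int i = 0; i < n; i++) {
        if (parents[i] == ignore) {
            continue;
        }
        parents[i]->quiesce_counter--;
    }
}

int main(void)
{
    Parent a = { 0 }, b = { 0 };
    Parent *parents[] = { &a, &b };

    /* Recursive drain initiated by A: only B gets notified. */
    parents_drained_begin(parents, 2, &a);
    assert(a.quiesce_counter == 0 && b.quiesce_counter == 1);

    /* Now B drains recursively too: A must be notified even though the
     * node is already drained once. */
    parents_drained_begin(parents, 2, &b);
    assert(a.quiesce_counter == 1 && b.quiesce_counter == 1);

    parents_drained_end(parents, 2, &b);
    parents_drained_end(parents, 2, &a);
    assert(a.quiesce_counter == 0 && b.quiesce_counter == 0);
    printf("parent-ignore drain model OK\n");
    return 0;
}
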
1
From: Manos Pitsidianakis <el13635@mail.ntua.gr>
1
bdrv_drained_begin() waits for the completion of requests in the whole
2
subtree, but it only actually keeps its immediate bs parameter quiesced
3
until bdrv_drained_end().
2
4
3
timer_cb() needs to know about the current Aio context of the throttle
5
Add a version that keeps the whole subtree drained. As of this commit,
4
request that is woken up. In order to make ThrottleGroupMember backend
6
graph changes cannot be allowed during a subtree drained section, but
5
agnostic, this information is stored in an aio_context field instead of
7
this will be fixed soon.
6
accessing it from BlockBackend.
7
8
8
Reviewed-by: Alberto Garcia <berto@igalia.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Signed-off-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
11
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
12
---
10
---
13
include/block/throttle-groups.h | 7 ++++-
11
include/block/block.h | 13 +++++++++++++
14
block/block-backend.c | 15 ++++------
12
block/io.c | 54 ++++++++++++++++++++++++++++++++++++++++-----------
15
block/throttle-groups.c | 38 ++++++++++++++++---------
13
2 files changed, 56 insertions(+), 11 deletions(-)
16
tests/test-throttle.c | 63 +++++++++++++++++++++--------------------
17
4 files changed, 69 insertions(+), 54 deletions(-)
18
14
19
diff --git a/include/block/throttle-groups.h b/include/block/throttle-groups.h
15
diff --git a/include/block/block.h b/include/block/block.h
20
index XXXXXXX..XXXXXXX 100644
16
index XXXXXXX..XXXXXXX 100644
21
--- a/include/block/throttle-groups.h
17
--- a/include/block/block.h
22
+++ b/include/block/throttle-groups.h
18
+++ b/include/block/block.h
23
@@ -XXX,XX +XXX,XX @@
19
@@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);
20
void bdrv_drained_begin(BlockDriverState *bs);
21
22
/**
23
+ * Like bdrv_drained_begin, but recursively begins a quiesced section for
24
+ * exclusive access to all child nodes as well.
25
+ *
26
+ * Graph changes are not allowed during a subtree drain section.
27
+ */
28
+void bdrv_subtree_drained_begin(BlockDriverState *bs);
29
+
30
+/**
31
* bdrv_drained_end:
32
*
33
* End a quiescent section started by bdrv_drained_begin().
24
*/
34
*/
25
35
void bdrv_drained_end(BlockDriverState *bs);
26
typedef struct ThrottleGroupMember {
36
27
+ AioContext *aio_context;
37
+/**
28
/* throttled_reqs_lock protects the CoQueues for throttled requests. */
38
+ * End a quiescent section started by bdrv_subtree_drained_begin().
29
CoMutex throttled_reqs_lock;
39
+ */
30
CoQueue throttled_reqs[2];
40
+void bdrv_subtree_drained_end(BlockDriverState *bs);
31
@@ -XXX,XX +XXX,XX @@ void throttle_group_config(ThrottleGroupMember *tgm, ThrottleConfig *cfg);
41
+
32
void throttle_group_get_config(ThrottleGroupMember *tgm, ThrottleConfig *cfg);
agnostic, this information is stored in an aio_context field instead of
accessing it from BlockBackend.

Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/throttle-groups.h |  7 ++++-
 block/block-backend.c           | 15 ++++------
 block/throttle-groups.c         | 38 ++++++++++++++++---------
 tests/test-throttle.c           | 63 +++++++++++++++++++++--------------------
 4 files changed, 69 insertions(+), 54 deletions(-)

diff --git a/include/block/throttle-groups.h b/include/block/throttle-groups.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/throttle-groups.h
+++ b/include/block/throttle-groups.h
@@ -XXX,XX +XXX,XX @@
  */
 
 typedef struct ThrottleGroupMember {
+    AioContext *aio_context;
     /* throttled_reqs_lock protects the CoQueues for throttled requests. */
     CoMutex throttled_reqs_lock;
     CoQueue throttled_reqs[2];
@@ -XXX,XX +XXX,XX @@ void throttle_group_config(ThrottleGroupMember *tgm, ThrottleConfig *cfg);
 void throttle_group_get_config(ThrottleGroupMember *tgm, ThrottleConfig *cfg);
 
 void throttle_group_register_tgm(ThrottleGroupMember *tgm,
-                                 const char *groupname);
+                                 const char *groupname,
+                                 AioContext *ctx);
 void throttle_group_unregister_tgm(ThrottleGroupMember *tgm);
 void throttle_group_restart_tgm(ThrottleGroupMember *tgm);
 
 void coroutine_fn throttle_group_co_io_limits_intercept(ThrottleGroupMember *tgm,
                                                         unsigned int bytes,
                                                         bool is_write);
+void throttle_group_attach_aio_context(ThrottleGroupMember *tgm,
+                                       AioContext *new_context);
+void throttle_group_detach_aio_context(ThrottleGroupMember *tgm);
 
 #endif
diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
 void blk_set_aio_context(BlockBackend *blk, AioContext *new_context)
 {
     BlockDriverState *bs = blk_bs(blk);
-    ThrottleTimers *tt;
+    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
 
     if (bs) {
-        if (blk->public.throttle_group_member.throttle_state) {
-            tt = &blk->public.throttle_group_member.throttle_timers;
-            throttle_timers_detach_aio_context(tt);
+        if (tgm->throttle_state) {
+            throttle_group_detach_aio_context(tgm);
+            throttle_group_attach_aio_context(tgm, new_context);
         }
         bdrv_set_aio_context(bs, new_context);
-        if (blk->public.throttle_group_member.throttle_state) {
-            tt = &blk->public.throttle_group_member.throttle_timers;
-            throttle_timers_attach_aio_context(tt, new_context);
-        }
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void blk_io_limits_disable(BlockBackend *blk)
 void blk_io_limits_enable(BlockBackend *blk, const char *group)
 {
     assert(!blk->public.throttle_group_member.throttle_state);
-    throttle_group_register_tgm(&blk->public.throttle_group_member, group);
+    throttle_group_register_tgm(&blk->public.throttle_group_member,
+                                group, blk_get_aio_context(blk));
 }
 
 void blk_io_limits_update_group(BlockBackend *blk, const char *group)
diff --git a/block/throttle-groups.c b/block/throttle-groups.c
index XXXXXXX..XXXXXXX 100644
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn throttle_group_restart_queue_entry(void *opaque)
 
 static void throttle_group_restart_queue(ThrottleGroupMember *tgm, bool is_write)
 {
-    BlockBackendPublic *blkp = container_of(tgm, BlockBackendPublic,
-                                            throttle_group_member);
-    BlockBackend *blk = blk_by_public(blkp);
     Coroutine *co;
     RestartData rd = {
         .tgm = tgm,
@@ -XXX,XX +XXX,XX @@ static void throttle_group_restart_queue(ThrottleGroupMember *tgm, bool is_write
     };
 
     co = qemu_coroutine_create(throttle_group_restart_queue_entry, &rd);
-    aio_co_enter(blk_get_aio_context(blk), co);
+    aio_co_enter(tgm->aio_context, co);
 }
 
 void throttle_group_restart_tgm(ThrottleGroupMember *tgm)
@@ -XXX,XX +XXX,XX @@ void throttle_group_get_config(ThrottleGroupMember *tgm, ThrottleConfig *cfg)
 /* ThrottleTimers callback. This wakes up a request that was waiting
  * because it had been throttled.
  *
- * @blk: the BlockBackend whose request had been throttled
+ * @tgm: the ThrottleGroupMember whose request had been throttled
  * @is_write: the type of operation (read/write)
  */
-static void timer_cb(BlockBackend *blk, bool is_write)
+static void timer_cb(ThrottleGroupMember *tgm, bool is_write)
 {
-    BlockBackendPublic *blkp = blk_get_public(blk);
-    ThrottleGroupMember *tgm = &blkp->throttle_group_member;
     ThrottleState *ts = tgm->throttle_state;
     ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
 
@@ -XXX,XX +XXX,XX @@ static void write_timer_cb(void *opaque)
  *
  * @tgm: the ThrottleGroupMember to insert
  * @groupname: the name of the group
+ * @ctx: the AioContext to use
  */
 void throttle_group_register_tgm(ThrottleGroupMember *tgm,
-                                 const char *groupname)
+                                 const char *groupname,
+                                 AioContext *ctx)
 {
     int i;
-    BlockBackendPublic *blkp = container_of(tgm, BlockBackendPublic,
-                                            throttle_group_member);
-    BlockBackend *blk = blk_by_public(blkp);
     ThrottleState *ts = throttle_group_incref(groupname);
     ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
 
     tgm->throttle_state = ts;
+    tgm->aio_context = ctx;
 
     qemu_mutex_lock(&tg->lock);
     /* If the ThrottleGroup is new set this ThrottleGroupMember as the token */
@@ -XXX,XX +XXX,XX @@ void throttle_group_register_tgm(ThrottleGroupMember *tgm,
     QLIST_INSERT_HEAD(&tg->head, tgm, round_robin);
 
     throttle_timers_init(&tgm->throttle_timers,
-                         blk_get_aio_context(blk),
+                         tgm->aio_context,
                          tg->clock_type,
                          read_timer_cb,
                          write_timer_cb,
-                         blk);
+                         tgm);
 
     qemu_mutex_unlock(&tg->lock);
 }
@@ -XXX,XX +XXX,XX @@ void throttle_group_unregister_tgm(ThrottleGroupMember *tgm)
     tgm->throttle_state = NULL;
 }
 
+void throttle_group_attach_aio_context(ThrottleGroupMember *tgm,
+                                       AioContext *new_context)
+{
+    ThrottleTimers *tt = &tgm->throttle_timers;
+    throttle_timers_attach_aio_context(tt, new_context);
+    tgm->aio_context = new_context;
+}
+
+void throttle_group_detach_aio_context(ThrottleGroupMember *tgm)
+{
+    ThrottleTimers *tt = &tgm->throttle_timers;
+    throttle_timers_detach_aio_context(tt);
+    tgm->aio_context = NULL;
+}
+
 static void throttle_groups_init(void)
 {
     qemu_mutex_init(&throttle_groups_lock);
diff --git a/tests/test-throttle.c b/tests/test-throttle.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-throttle.c
+++ b/tests/test-throttle.c
@@ -XXX,XX +XXX,XX @@
 static AioContext *ctx;
 static LeakyBucket bkt;
 static ThrottleConfig cfg;
+static ThrottleGroupMember tgm;
 static ThrottleState ts;
-static ThrottleTimers tt;
+static ThrottleTimers *tt;
 
 /* useful function */
 static bool double_cmp(double x, double y)
@@ -XXX,XX +XXX,XX @@ static void test_init(void)
 {
     int i;
 
+    tt = &tgm.throttle_timers;
+
     /* fill the structures with crap */
     memset(&ts, 1, sizeof(ts));
-    memset(&tt, 1, sizeof(tt));
+    memset(tt, 1, sizeof(*tt));
 
     /* init structures */
     throttle_init(&ts);
-    throttle_timers_init(&tt, ctx, QEMU_CLOCK_VIRTUAL,
+    throttle_timers_init(tt, ctx, QEMU_CLOCK_VIRTUAL,
                          read_timer_cb, write_timer_cb, &ts);
 
     /* check initialized fields */
-    g_assert(tt.clock_type == QEMU_CLOCK_VIRTUAL);
-    g_assert(tt.timers[0]);
-    g_assert(tt.timers[1]);
+    g_assert(tt->clock_type == QEMU_CLOCK_VIRTUAL);
+    g_assert(tt->timers[0]);
+    g_assert(tt->timers[1]);
 
     /* check other fields where cleared */
     g_assert(!ts.previous_leak);
@@ -XXX,XX +XXX,XX @@ static void test_init(void)
         g_assert(!ts.cfg.buckets[i].level);
     }
 
-    throttle_timers_destroy(&tt);
+    throttle_timers_destroy(tt);
 }
 
 static void test_destroy(void)
 {
     int i;
     throttle_init(&ts);
-    throttle_timers_init(&tt, ctx, QEMU_CLOCK_VIRTUAL,
+    throttle_timers_init(tt, ctx, QEMU_CLOCK_VIRTUAL,
                          read_timer_cb, write_timer_cb, &ts);
-    throttle_timers_destroy(&tt);
+    throttle_timers_destroy(tt);
     for (i = 0; i < 2; i++) {
-        g_assert(!tt.timers[i]);
+        g_assert(!tt->timers[i]);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void test_config_functions(void)
     orig_cfg.op_size = 1;
 
     throttle_init(&ts);
-    throttle_timers_init(&tt, ctx, QEMU_CLOCK_VIRTUAL,
+    throttle_timers_init(tt, ctx, QEMU_CLOCK_VIRTUAL,
                          read_timer_cb, write_timer_cb, &ts);
     /* structure reset by throttle_init previous_leak should be null */
     g_assert(!ts.previous_leak);
@@ -XXX,XX +XXX,XX @@ static void test_config_functions(void)
     /* get back the fixed configuration */
     throttle_get_config(&ts, &final_cfg);
 
-    throttle_timers_destroy(&tt);
+    throttle_timers_destroy(tt);
 
     g_assert(final_cfg.buckets[THROTTLE_BPS_TOTAL].avg == 153);
     g_assert(final_cfg.buckets[THROTTLE_BPS_READ].avg == 56);
@@ -XXX,XX +XXX,XX @@ static void test_have_timer(void)
 {
     /* zero structures */
     memset(&ts, 0, sizeof(ts));
-    memset(&tt, 0, sizeof(tt));
+    memset(tt, 0, sizeof(*tt));
 
     /* no timer set should return false */
-    g_assert(!throttle_timers_are_initialized(&tt));
+    g_assert(!throttle_timers_are_initialized(tt));
 
     /* init structures */
     throttle_init(&ts);
-    throttle_timers_init(&tt, ctx, QEMU_CLOCK_VIRTUAL,
+    throttle_timers_init(tt, ctx, QEMU_CLOCK_VIRTUAL,
                          read_timer_cb, write_timer_cb, &ts);
 
     /* timer set by init should return true */
-    g_assert(throttle_timers_are_initialized(&tt));
+    g_assert(throttle_timers_are_initialized(tt));
 
-    throttle_timers_destroy(&tt);
+    throttle_timers_destroy(tt);
 }
 
 static void test_detach_attach(void)
 {
     /* zero structures */
     memset(&ts, 0, sizeof(ts));
-    memset(&tt, 0, sizeof(tt));
+    memset(tt, 0, sizeof(*tt));
 
     /* init the structure */
     throttle_init(&ts);
-    throttle_timers_init(&tt, ctx, QEMU_CLOCK_VIRTUAL,
+    throttle_timers_init(tt, ctx, QEMU_CLOCK_VIRTUAL,
                          read_timer_cb, write_timer_cb, &ts);
 
     /* timer set by init should return true */
-    g_assert(throttle_timers_are_initialized(&tt));
+    g_assert(throttle_timers_are_initialized(tt));
 
     /* timer should no longer exist after detaching */
-    throttle_timers_detach_aio_context(&tt);
-    g_assert(!throttle_timers_are_initialized(&tt));
+    throttle_timers_detach_aio_context(tt);
+    g_assert(!throttle_timers_are_initialized(tt));
 
     /* timer should exist again after attaching */
-    throttle_timers_attach_aio_context(&tt, ctx);
-    g_assert(throttle_timers_are_initialized(&tt));
+    throttle_timers_attach_aio_context(tt, ctx);
+    g_assert(throttle_timers_are_initialized(tt));
 
-    throttle_timers_destroy(&tt);
+    throttle_timers_destroy(tt);
 }
 
 static bool do_test_accounting(bool is_ops, /* are we testing bps or ops */
@@ -XXX,XX +XXX,XX @@ static bool do_test_accounting(bool is_ops, /* are we testing bps or ops */
     cfg.op_size = op_size;
 
     throttle_init(&ts);
-    throttle_timers_init(&tt, ctx, QEMU_CLOCK_VIRTUAL,
+    throttle_timers_init(tt, ctx, QEMU_CLOCK_VIRTUAL,
                          read_timer_cb, write_timer_cb, &ts);
     throttle_config(&ts, QEMU_CLOCK_VIRTUAL, &cfg);
 
@@ -XXX,XX +XXX,XX @@ static bool do_test_accounting(bool is_ops, /* are we testing bps or ops */
         return false;
     }
 
-    throttle_timers_destroy(&tt);
+    throttle_timers_destroy(tt);
 
     return true;
 }
@@ -XXX,XX +XXX,XX @@ static void test_groups(void)
     g_assert(tgm2->throttle_state == NULL);
     g_assert(tgm3->throttle_state == NULL);
 
-    throttle_group_register_tgm(tgm1, "bar");
-    throttle_group_register_tgm(tgm2, "foo");
-    throttle_group_register_tgm(tgm3, "bar");
+    throttle_group_register_tgm(tgm1, "bar", blk_get_aio_context(blk1));
+    throttle_group_register_tgm(tgm2, "foo", blk_get_aio_context(blk2));
+    throttle_group_register_tgm(tgm3, "bar", blk_get_aio_context(blk3));
 
     g_assert(tgm1->throttle_state != NULL);
     g_assert(tgm2->throttle_state != NULL);
     g_assert(tgm3->throttle_state != NULL);
--
2.13.5
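To make the new registration API above concrete, here is a minimal sketch
(not part of the patch; the group name, variables and the function itself
are invented for illustration) of how a ThrottleGroupMember is registered
with an explicit AioContext and later moved between contexts:

    #include "block/throttle-groups.h"

    /* Hypothetical caller: "tgm" is embedded in some user structure, and
     * "old_ctx"/"new_ctx" are AioContexts owned by the caller. */
    static void example_switch_context(ThrottleGroupMember *tgm,
                                       AioContext *old_ctx,
                                       AioContext *new_ctx)
    {
        /* Registration now takes the AioContext explicitly... */
        throttle_group_register_tgm(tgm, "group0", old_ctx);

        /* ...and a context switch pairs detach with attach, mirroring
         * what blk_set_aio_context() does after this patch. */
        throttle_group_detach_aio_context(tgm);
        throttle_group_attach_aio_context(tgm, new_ctx);

        throttle_group_unregister_tgm(tgm);
    }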
From: Manos Pitsidianakis <el13635@mail.ntua.gr>
1
Add a subtree drain version to the existing test cases.
2
2
3
This commit eliminates the 1:1 relationship between BlockBackend and
4
throttle group state. Users will be able to create multiple throttle
5
nodes, each with its own throttle group state, in the future. The
6
throttle group state cannot be per-BlockBackend anymore, it must be
7
per-throttle node. This is done by gathering ThrottleGroup membership
8
details from BlockBackendPublic into ThrottleGroupMember and refactoring
9
existing code to use the structure.
10
11
Reviewed-by: Alberto Garcia <berto@igalia.com>
12
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
13
Signed-off-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
14
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
3
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
15
---
4
---
16
include/block/throttle-groups.h | 39 +++++-
5
tests/test-bdrv-drain.c | 27 ++++++++++++++++++++++++++-
17
include/sysemu/block-backend.h | 20 +--
6
1 file changed, 26 insertions(+), 1 deletion(-)
18
block/block-backend.c | 66 +++++----
19
block/qapi.c | 8 +-
20
block/throttle-groups.c | 288 ++++++++++++++++++++--------------------
21
blockdev.c | 4 +-
22
tests/test-throttle.c | 53 ++++----
23
7 files changed, 252 insertions(+), 226 deletions(-)
24
7
25
diff --git a/include/block/throttle-groups.h b/include/block/throttle-groups.h
8
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
26
index XXXXXXX..XXXXXXX 100644
9
index XXXXXXX..XXXXXXX 100644
27
--- a/include/block/throttle-groups.h
10
--- a/tests/test-bdrv-drain.c
28
+++ b/include/block/throttle-groups.h
11
+++ b/tests/test-bdrv-drain.c
29
@@ -XXX,XX +XXX,XX @@
12
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
30
#include "qemu/throttle.h"
13
enum drain_type {
31
#include "block/block_int.h"
14
BDRV_DRAIN_ALL,
32
15
BDRV_DRAIN,
33
-const char *throttle_group_get_name(BlockBackend *blk);
16
+ BDRV_SUBTREE_DRAIN,
34
+/* The ThrottleGroupMember structure indicates membership in a ThrottleGroup
17
DRAIN_TYPE_MAX,
35
+ * and holds related data.
18
};
36
+ */
19
37
+
20
@@ -XXX,XX +XXX,XX @@ static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
38
+typedef struct ThrottleGroupMember {
21
switch (drain_type) {
39
+ /* throttled_reqs_lock protects the CoQueues for throttled requests. */
22
case BDRV_DRAIN_ALL: bdrv_drain_all_begin(); break;
40
+ CoMutex throttled_reqs_lock;
23
case BDRV_DRAIN: bdrv_drained_begin(bs); break;
41
+ CoQueue throttled_reqs[2];
24
+ case BDRV_SUBTREE_DRAIN: bdrv_subtree_drained_begin(bs); break;
42
+
25
default: g_assert_not_reached();
43
+ /* Nonzero if the I/O limits are currently being ignored; generally
44
+ * it is zero. Accessed with atomic operations.
45
+ */
46
+ unsigned int io_limits_disabled;
47
+
48
+ /* The following fields are protected by the ThrottleGroup lock.
49
+ * See the ThrottleGroup documentation for details.
50
+ * throttle_state tells us if I/O limits are configured. */
51
+ ThrottleState *throttle_state;
52
+ ThrottleTimers throttle_timers;
53
+ unsigned pending_reqs[2];
54
+ QLIST_ENTRY(ThrottleGroupMember) round_robin;
55
+
56
+} ThrottleGroupMember;
57
+
58
+const char *throttle_group_get_name(ThrottleGroupMember *tgm);
59
60
ThrottleState *throttle_group_incref(const char *name);
61
void throttle_group_unref(ThrottleState *ts);
62
63
-void throttle_group_config(BlockBackend *blk, ThrottleConfig *cfg);
64
-void throttle_group_get_config(BlockBackend *blk, ThrottleConfig *cfg);
65
+void throttle_group_config(ThrottleGroupMember *tgm, ThrottleConfig *cfg);
66
+void throttle_group_get_config(ThrottleGroupMember *tgm, ThrottleConfig *cfg);
67
68
-void throttle_group_register_blk(BlockBackend *blk, const char *groupname);
69
-void throttle_group_unregister_blk(BlockBackend *blk);
70
-void throttle_group_restart_blk(BlockBackend *blk);
71
+void throttle_group_register_tgm(ThrottleGroupMember *tgm,
72
+ const char *groupname);
73
+void throttle_group_unregister_tgm(ThrottleGroupMember *tgm);
74
+void throttle_group_restart_tgm(ThrottleGroupMember *tgm);
75
76
-void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk,
77
+void coroutine_fn throttle_group_co_io_limits_intercept(ThrottleGroupMember *tgm,
78
unsigned int bytes,
79
bool is_write);
80
81
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
82
index XXXXXXX..XXXXXXX 100644
83
--- a/include/sysemu/block-backend.h
84
+++ b/include/sysemu/block-backend.h
85
@@ -XXX,XX +XXX,XX @@ typedef struct BlockDevOps {
86
87
/* This struct is embedded in (the private) BlockBackend struct and contains
88
* fields that must be public. This is in particular for QLIST_ENTRY() and
89
- * friends so that BlockBackends can be kept in lists outside block-backend.c */
90
+ * friends so that BlockBackends can be kept in lists outside block-backend.c
91
+ * */
92
typedef struct BlockBackendPublic {
93
- /* throttled_reqs_lock protects the CoQueues for throttled requests. */
94
- CoMutex throttled_reqs_lock;
95
- CoQueue throttled_reqs[2];
96
-
97
- /* Nonzero if the I/O limits are currently being ignored; generally
98
- * it is zero. Accessed with atomic operations.
99
- */
100
- unsigned int io_limits_disabled;
101
-
102
- /* The following fields are protected by the ThrottleGroup lock.
103
- * See the ThrottleGroup documentation for details.
104
- * throttle_state tells us if I/O limits are configured. */
105
- ThrottleState *throttle_state;
106
- ThrottleTimers throttle_timers;
107
- unsigned pending_reqs[2];
108
- QLIST_ENTRY(BlockBackendPublic) round_robin;
109
+ ThrottleGroupMember throttle_group_member;
110
} BlockBackendPublic;
111
112
BlockBackend *blk_new(uint64_t perm, uint64_t shared_perm);
113
diff --git a/block/block-backend.c b/block/block-backend.c
114
index XXXXXXX..XXXXXXX 100644
115
--- a/block/block-backend.c
116
+++ b/block/block-backend.c
117
@@ -XXX,XX +XXX,XX @@ BlockBackend *blk_new(uint64_t perm, uint64_t shared_perm)
118
blk->shared_perm = shared_perm;
119
blk_set_enable_write_cache(blk, true);
120
121
- qemu_co_mutex_init(&blk->public.throttled_reqs_lock);
122
- qemu_co_queue_init(&blk->public.throttled_reqs[0]);
123
- qemu_co_queue_init(&blk->public.throttled_reqs[1]);
124
+ qemu_co_mutex_init(&blk->public.throttle_group_member.throttled_reqs_lock);
125
+ qemu_co_queue_init(&blk->public.throttle_group_member.throttled_reqs[0]);
126
+ qemu_co_queue_init(&blk->public.throttle_group_member.throttled_reqs[1]);
127
block_acct_init(&blk->stats);
128
129
notifier_list_init(&blk->remove_bs_notifiers);
130
@@ -XXX,XX +XXX,XX @@ static void blk_delete(BlockBackend *blk)
131
assert(!blk->refcnt);
132
assert(!blk->name);
133
assert(!blk->dev);
134
- if (blk->public.throttle_state) {
135
+ if (blk->public.throttle_group_member.throttle_state) {
136
blk_io_limits_disable(blk);
137
}
138
if (blk->root) {
139
@@ -XXX,XX +XXX,XX @@ BlockBackend *blk_by_public(BlockBackendPublic *public)
140
*/
141
void blk_remove_bs(BlockBackend *blk)
142
{
143
+ ThrottleTimers *tt;
144
+
145
notifier_list_notify(&blk->remove_bs_notifiers, blk);
146
- if (blk->public.throttle_state) {
147
- throttle_timers_detach_aio_context(&blk->public.throttle_timers);
148
+ if (blk->public.throttle_group_member.throttle_state) {
149
+ tt = &blk->public.throttle_group_member.throttle_timers;
150
+ throttle_timers_detach_aio_context(tt);
151
}
152
153
blk_update_root_state(blk);
154
@@ -XXX,XX +XXX,XX @@ int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
155
bdrv_ref(bs);
156
157
notifier_list_notify(&blk->insert_bs_notifiers, blk);
158
- if (blk->public.throttle_state) {
159
+ if (blk->public.throttle_group_member.throttle_state) {
160
throttle_timers_attach_aio_context(
161
- &blk->public.throttle_timers, bdrv_get_aio_context(bs));
162
+ &blk->public.throttle_group_member.throttle_timers,
163
+ bdrv_get_aio_context(bs));
164
}
165
166
return 0;
167
@@ -XXX,XX +XXX,XX @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
168
bdrv_inc_in_flight(bs);
169
170
/* throttling disk I/O */
171
- if (blk->public.throttle_state) {
172
- throttle_group_co_io_limits_intercept(blk, bytes, false);
173
+ if (blk->public.throttle_group_member.throttle_state) {
174
+ throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
175
+ bytes, false);
176
}
177
178
ret = bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
179
@@ -XXX,XX +XXX,XX @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
180
}
181
182
bdrv_inc_in_flight(bs);
183
-
184
/* throttling disk I/O */
185
- if (blk->public.throttle_state) {
186
- throttle_group_co_io_limits_intercept(blk, bytes, true);
187
+ if (blk->public.throttle_group_member.throttle_state) {
188
+ throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
189
+ bytes, true);
190
}
191
192
if (!blk->enable_write_cache) {
193
@@ -XXX,XX +XXX,XX @@ static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
194
void blk_set_aio_context(BlockBackend *blk, AioContext *new_context)
195
{
196
BlockDriverState *bs = blk_bs(blk);
197
+ ThrottleTimers *tt;
198
199
if (bs) {
200
- if (blk->public.throttle_state) {
201
- throttle_timers_detach_aio_context(&blk->public.throttle_timers);
202
+ if (blk->public.throttle_group_member.throttle_state) {
203
+ tt = &blk->public.throttle_group_member.throttle_timers;
204
+ throttle_timers_detach_aio_context(tt);
205
}
206
bdrv_set_aio_context(bs, new_context);
207
- if (blk->public.throttle_state) {
208
- throttle_timers_attach_aio_context(&blk->public.throttle_timers,
209
- new_context);
210
+ if (blk->public.throttle_group_member.throttle_state) {
211
+ tt = &blk->public.throttle_group_member.throttle_timers;
212
+ throttle_timers_attach_aio_context(tt, new_context);
213
}
214
}
26
}
215
}
27
}
216
@@ -XXX,XX +XXX,XX @@ int blk_commit_all(void)
28
@@ -XXX,XX +XXX,XX @@ static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs)
217
/* throttling disk I/O limits */
29
switch (drain_type) {
218
void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
30
case BDRV_DRAIN_ALL: bdrv_drain_all_end(); break;
219
{
31
case BDRV_DRAIN: bdrv_drained_end(bs); break;
220
- throttle_group_config(blk, cfg);
32
+ case BDRV_SUBTREE_DRAIN: bdrv_subtree_drained_end(bs); break;
221
+ throttle_group_config(&blk->public.throttle_group_member, cfg);
33
default: g_assert_not_reached();
222
}
223
224
void blk_io_limits_disable(BlockBackend *blk)
225
{
226
- assert(blk->public.throttle_state);
227
+ assert(blk->public.throttle_group_member.throttle_state);
228
bdrv_drained_begin(blk_bs(blk));
229
- throttle_group_unregister_blk(blk);
230
+ throttle_group_unregister_tgm(&blk->public.throttle_group_member);
231
bdrv_drained_end(blk_bs(blk));
232
}
233
234
/* should be called before blk_set_io_limits if a limit is set */
235
void blk_io_limits_enable(BlockBackend *blk, const char *group)
236
{
237
- assert(!blk->public.throttle_state);
238
- throttle_group_register_blk(blk, group);
239
+ assert(!blk->public.throttle_group_member.throttle_state);
240
+ throttle_group_register_tgm(&blk->public.throttle_group_member, group);
241
}
242
243
void blk_io_limits_update_group(BlockBackend *blk, const char *group)
244
{
245
/* this BB is not part of any group */
246
- if (!blk->public.throttle_state) {
247
+ if (!blk->public.throttle_group_member.throttle_state) {
248
return;
249
}
250
251
/* this BB is a part of the same group than the one we want */
252
- if (!g_strcmp0(throttle_group_get_name(blk), group)) {
253
+ if (!g_strcmp0(throttle_group_get_name(&blk->public.throttle_group_member),
254
+ group)) {
255
return;
256
}
257
258
@@ -XXX,XX +XXX,XX @@ static void blk_root_drained_begin(BdrvChild *child)
259
/* Note that blk->root may not be accessible here yet if we are just
260
* attaching to a BlockDriverState that is drained. Use child instead. */
261
262
- if (atomic_fetch_inc(&blk->public.io_limits_disabled) == 0) {
263
- throttle_group_restart_blk(blk);
264
+ if (atomic_fetch_inc(&blk->public.throttle_group_member.io_limits_disabled) == 0) {
265
+ throttle_group_restart_tgm(&blk->public.throttle_group_member);
266
}
34
}
267
}
35
}
268
36
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain(void)
269
@@ -XXX,XX +XXX,XX @@ static void blk_root_drained_end(BdrvChild *child)
37
test_drv_cb_common(BDRV_DRAIN, false);
270
BlockBackend *blk = child->opaque;
271
assert(blk->quiesce_counter);
272
273
- assert(blk->public.io_limits_disabled);
274
- atomic_dec(&blk->public.io_limits_disabled);
275
+ assert(blk->public.throttle_group_member.io_limits_disabled);
276
+ atomic_dec(&blk->public.throttle_group_member.io_limits_disabled);
277
278
if (--blk->quiesce_counter == 0) {
279
if (blk->dev_ops && blk->dev_ops->drained_end) {
280
diff --git a/block/qapi.c b/block/qapi.c
281
index XXXXXXX..XXXXXXX 100644
282
--- a/block/qapi.c
283
+++ b/block/qapi.c
284
@@ -XXX,XX +XXX,XX @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
285
286
info->detect_zeroes = bs->detect_zeroes;
287
288
- if (blk && blk_get_public(blk)->throttle_state) {
289
+ if (blk && blk_get_public(blk)->throttle_group_member.throttle_state) {
290
ThrottleConfig cfg;
291
+ BlockBackendPublic *blkp = blk_get_public(blk);
292
293
- throttle_group_get_config(blk, &cfg);
294
+ throttle_group_get_config(&blkp->throttle_group_member, &cfg);
295
296
info->bps = cfg.buckets[THROTTLE_BPS_TOTAL].avg;
297
info->bps_rd = cfg.buckets[THROTTLE_BPS_READ].avg;
298
@@ -XXX,XX +XXX,XX @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
299
info->iops_size = cfg.op_size;
300
301
info->has_group = true;
302
- info->group = g_strdup(throttle_group_get_name(blk));
303
+ info->group =
304
+ g_strdup(throttle_group_get_name(&blkp->throttle_group_member));
305
}
306
307
info->write_threshold = bdrv_write_threshold_get(bs);
308
diff --git a/block/throttle-groups.c b/block/throttle-groups.c
309
index XXXXXXX..XXXXXXX 100644
310
--- a/block/throttle-groups.c
311
+++ b/block/throttle-groups.c
312
@@ -XXX,XX +XXX,XX @@
313
#include "sysemu/qtest.h"
314
315
/* The ThrottleGroup structure (with its ThrottleState) is shared
316
- * among different BlockBackends and it's independent from
317
+ * among different ThrottleGroupMembers and it's independent from
318
* AioContext, so in order to use it from different threads it needs
319
* its own locking.
320
*
321
@@ -XXX,XX +XXX,XX @@
322
* The whole ThrottleGroup structure is private and invisible to
323
* outside users, that only use it through its ThrottleState.
324
*
325
- * In addition to the ThrottleGroup structure, BlockBackendPublic has
326
+ * In addition to the ThrottleGroup structure, ThrottleGroupMember has
327
* fields that need to be accessed by other members of the group and
328
* therefore also need to be protected by this lock. Once a
329
- * BlockBackend is registered in a group those fields can be accessed
330
+ * ThrottleGroupMember is registered in a group those fields can be accessed
331
* by other threads any time.
332
*
333
* Again, all this is handled internally and is mostly transparent to
334
* the outside. The 'throttle_timers' field however has an additional
335
* constraint because it may be temporarily invalid (see for example
336
* blk_set_aio_context()). Therefore in this file a thread will
337
- * access some other BlockBackend's timers only after verifying that
338
- * that BlockBackend has throttled requests in the queue.
339
+ * access some other ThrottleGroupMember's timers only after verifying that
340
+ * that ThrottleGroupMember has throttled requests in the queue.
341
*/
342
typedef struct ThrottleGroup {
343
char *name; /* This is constant during the lifetime of the group */
344
345
QemuMutex lock; /* This lock protects the following four fields */
346
ThrottleState ts;
347
- QLIST_HEAD(, BlockBackendPublic) head;
348
- BlockBackend *tokens[2];
349
+ QLIST_HEAD(, ThrottleGroupMember) head;
350
+ ThrottleGroupMember *tokens[2];
351
bool any_timer_armed[2];
352
QEMUClockType clock_type;
353
354
@@ -XXX,XX +XXX,XX @@ void throttle_group_unref(ThrottleState *ts)
355
qemu_mutex_unlock(&throttle_groups_lock);
356
}
38
}
357
39
358
-/* Get the name from a BlockBackend's ThrottleGroup. The name (and the pointer)
40
+static void test_drv_cb_drain_subtree(void)
359
+/* Get the name from a ThrottleGroupMember's group. The name (and the pointer)
41
+{
360
* is guaranteed to remain constant during the lifetime of the group.
42
+ test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
361
*
43
+}
362
- * @blk: a BlockBackend that is member of a throttling group
44
+
363
+ * @tgm: a ThrottleGroupMember
45
static void test_quiesce_common(enum drain_type drain_type, bool recursive)
364
* @ret: the name of the group.
365
*/
366
-const char *throttle_group_get_name(BlockBackend *blk)
367
+const char *throttle_group_get_name(ThrottleGroupMember *tgm)
368
{
46
{
369
- BlockBackendPublic *blkp = blk_get_public(blk);
47
BlockBackend *blk;
370
- ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
48
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
371
+ ThrottleGroup *tg = container_of(tgm->throttle_state, ThrottleGroup, ts);
49
test_quiesce_common(BDRV_DRAIN, false);
372
return tg->name;
373
}
50
}
374
51
375
-/* Return the next BlockBackend in the round-robin sequence, simulating a
52
+static void test_quiesce_drain_subtree(void)
376
- * circular list.
53
+{
377
+/* Return the next ThrottleGroupMember in the round-robin sequence, simulating
54
+ test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
378
+ * a circular list.
55
+}
379
*
56
+
380
* This assumes that tg->lock is held.
57
static void test_nested(void)
381
*
382
- * @blk: the current BlockBackend
383
- * @ret: the next BlockBackend in the sequence
384
+ * @tgm: the current ThrottleGroupMember
385
+ * @ret: the next ThrottleGroupMember in the sequence
386
*/
387
-static BlockBackend *throttle_group_next_blk(BlockBackend *blk)
388
+static ThrottleGroupMember *throttle_group_next_tgm(ThrottleGroupMember *tgm)
389
{
58
{
390
- BlockBackendPublic *blkp = blk_get_public(blk);
59
BlockBackend *blk;
391
- ThrottleState *ts = blkp->throttle_state;
60
@@ -XXX,XX +XXX,XX @@ static void test_nested(void)
392
+ ThrottleState *ts = tgm->throttle_state;
61
/* XXX bdrv_drain_all() doesn't increase the quiesce_counter */
393
ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
62
int bs_quiesce = (outer != BDRV_DRAIN_ALL) +
394
- BlockBackendPublic *next = QLIST_NEXT(blkp, round_robin);
63
(inner != BDRV_DRAIN_ALL);
395
+ ThrottleGroupMember *next = QLIST_NEXT(tgm, round_robin);
64
- int backing_quiesce = 0;
396
65
+ int backing_quiesce = (outer == BDRV_SUBTREE_DRAIN) +
397
if (!next) {
66
+ (inner == BDRV_SUBTREE_DRAIN);
398
next = QLIST_FIRST(&tg->head);
67
int backing_cb_cnt = (outer != BDRV_DRAIN) +
399
}
68
(inner != BDRV_DRAIN);
400
69
401
- return blk_by_public(next);
70
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_drain(void)
402
+ return next;
71
test_blockjob_common(BDRV_DRAIN);
403
}
72
}
404
73
405
/*
74
+static void test_blockjob_drain_subtree(void)
406
- * Return whether a BlockBackend has pending requests.
75
+{
407
+ * Return whether a ThrottleGroupMember has pending requests.
76
+ test_blockjob_common(BDRV_SUBTREE_DRAIN);
408
*
77
+}
409
* This assumes that tg->lock is held.
78
+
410
*
79
int main(int argc, char **argv)
411
- * @blk: the BlockBackend
412
- * @is_write: the type of operation (read/write)
413
- * @ret: whether the BlockBackend has pending requests.
414
+ * @tgm: the ThrottleGroupMember
415
+ * @is_write: the type of operation (read/write)
416
+ * @ret: whether the ThrottleGroupMember has pending requests.
417
*/
418
-static inline bool blk_has_pending_reqs(BlockBackend *blk,
419
+static inline bool tgm_has_pending_reqs(ThrottleGroupMember *tgm,
420
bool is_write)
421
{
80
{
422
- const BlockBackendPublic *blkp = blk_get_public(blk);
81
bdrv_init();
423
- return blkp->pending_reqs[is_write];
82
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
424
+ return tgm->pending_reqs[is_write];
83
84
g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
85
g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
86
+ g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
87
+ test_drv_cb_drain_subtree);
88
89
g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
90
g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
91
+ g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
92
+ test_quiesce_drain_subtree);
93
94
g_test_add_func("/bdrv-drain/nested", test_nested);
95
96
g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
97
g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
98
+ g_test_add_func("/bdrv-drain/blockjob/drain_subtree",
99
+ test_blockjob_drain_subtree);
100
101
return g_test_run();
425
}
102
}
426
427
-/* Return the next BlockBackend in the round-robin sequence with pending I/O
428
- * requests.
429
+/* Return the next ThrottleGroupMember in the round-robin sequence with pending
430
+ * I/O requests.
431
*
432
* This assumes that tg->lock is held.
433
*
434
- * @blk: the current BlockBackend
435
+ * @tgm: the current ThrottleGroupMember
436
* @is_write: the type of operation (read/write)
437
- * @ret: the next BlockBackend with pending requests, or blk if there is
438
- * none.
439
+ * @ret: the next ThrottleGroupMember with pending requests, or tgm if
440
+ * there is none.
441
*/
442
-static BlockBackend *next_throttle_token(BlockBackend *blk, bool is_write)
443
+static ThrottleGroupMember *next_throttle_token(ThrottleGroupMember *tgm,
444
+ bool is_write)
445
{
446
- BlockBackendPublic *blkp = blk_get_public(blk);
447
- ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
448
- BlockBackend *token, *start;
449
+ ThrottleState *ts = tgm->throttle_state;
450
+ ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
451
+ ThrottleGroupMember *token, *start;
452
453
start = token = tg->tokens[is_write];
454
455
/* get next bs round in round robin style */
456
- token = throttle_group_next_blk(token);
457
- while (token != start && !blk_has_pending_reqs(token, is_write)) {
458
- token = throttle_group_next_blk(token);
459
+ token = throttle_group_next_tgm(token);
460
+ while (token != start && !tgm_has_pending_reqs(token, is_write)) {
461
+ token = throttle_group_next_tgm(token);
462
}
463
464
/* If no IO are queued for scheduling on the next round robin token
465
- * then decide the token is the current bs because chances are
466
- * the current bs get the current request queued.
467
+ * then decide the token is the current tgm because chances are
468
+ * the current tgm got the current request queued.
469
*/
470
- if (token == start && !blk_has_pending_reqs(token, is_write)) {
471
- token = blk;
472
+ if (token == start && !tgm_has_pending_reqs(token, is_write)) {
473
+ token = tgm;
474
}
475
476
- /* Either we return the original BB, or one with pending requests */
477
- assert(token == blk || blk_has_pending_reqs(token, is_write));
478
+ /* Either we return the original TGM, or one with pending requests */
479
+ assert(token == tgm || tgm_has_pending_reqs(token, is_write));
480
481
return token;
482
}
483
484
-/* Check if the next I/O request for a BlockBackend needs to be throttled or
485
- * not. If there's no timer set in this group, set one and update the token
486
- * accordingly.
487
+/* Check if the next I/O request for a ThrottleGroupMember needs to be
488
+ * throttled or not. If there's no timer set in this group, set one and update
489
+ * the token accordingly.
490
*
491
* This assumes that tg->lock is held.
492
*
493
- * @blk: the current BlockBackend
494
+ * @tgm: the current ThrottleGroupMember
495
* @is_write: the type of operation (read/write)
496
* @ret: whether the I/O request needs to be throttled or not
497
*/
498
-static bool throttle_group_schedule_timer(BlockBackend *blk, bool is_write)
499
+static bool throttle_group_schedule_timer(ThrottleGroupMember *tgm,
500
+ bool is_write)
501
{
502
- BlockBackendPublic *blkp = blk_get_public(blk);
503
- ThrottleState *ts = blkp->throttle_state;
504
- ThrottleTimers *tt = &blkp->throttle_timers;
505
+ ThrottleState *ts = tgm->throttle_state;
506
ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
507
+ ThrottleTimers *tt = &tgm->throttle_timers;
508
bool must_wait;
509
510
- if (atomic_read(&blkp->io_limits_disabled)) {
511
+ if (atomic_read(&tgm->io_limits_disabled)) {
512
return false;
513
}
514
515
@@ -XXX,XX +XXX,XX @@ static bool throttle_group_schedule_timer(BlockBackend *blk, bool is_write)
516
517
must_wait = throttle_schedule_timer(ts, tt, is_write);
518
519
- /* If a timer just got armed, set blk as the current token */
520
+ /* If a timer just got armed, set tgm as the current token */
521
if (must_wait) {
522
- tg->tokens[is_write] = blk;
523
+ tg->tokens[is_write] = tgm;
524
tg->any_timer_armed[is_write] = true;
525
}
526
527
return must_wait;
528
}
529
530
-/* Start the next pending I/O request for a BlockBackend. Return whether
531
+/* Start the next pending I/O request for a ThrottleGroupMember. Return whether
532
* any request was actually pending.
533
*
534
- * @blk: the current BlockBackend
535
+ * @tgm: the current ThrottleGroupMember
536
* @is_write: the type of operation (read/write)
537
*/
538
-static bool coroutine_fn throttle_group_co_restart_queue(BlockBackend *blk,
539
+static bool coroutine_fn throttle_group_co_restart_queue(ThrottleGroupMember *tgm,
540
bool is_write)
541
{
542
- BlockBackendPublic *blkp = blk_get_public(blk);
543
bool ret;
544
545
- qemu_co_mutex_lock(&blkp->throttled_reqs_lock);
546
- ret = qemu_co_queue_next(&blkp->throttled_reqs[is_write]);
547
- qemu_co_mutex_unlock(&blkp->throttled_reqs_lock);
548
+ qemu_co_mutex_lock(&tgm->throttled_reqs_lock);
549
+ ret = qemu_co_queue_next(&tgm->throttled_reqs[is_write]);
550
+ qemu_co_mutex_unlock(&tgm->throttled_reqs_lock);
551
552
return ret;
553
}
554
@@ -XXX,XX +XXX,XX @@ static bool coroutine_fn throttle_group_co_restart_queue(BlockBackend *blk,
555
*
556
* This assumes that tg->lock is held.
557
*
558
- * @blk: the current BlockBackend
559
+ * @tgm: the current ThrottleGroupMember
560
* @is_write: the type of operation (read/write)
561
*/
562
-static void schedule_next_request(BlockBackend *blk, bool is_write)
563
+static void schedule_next_request(ThrottleGroupMember *tgm, bool is_write)
564
{
565
- BlockBackendPublic *blkp = blk_get_public(blk);
566
- ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
567
+ ThrottleState *ts = tgm->throttle_state;
568
+ ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
569
bool must_wait;
570
- BlockBackend *token;
571
+ ThrottleGroupMember *token;
572
573
/* Check if there's any pending request to schedule next */
574
- token = next_throttle_token(blk, is_write);
575
- if (!blk_has_pending_reqs(token, is_write)) {
576
+ token = next_throttle_token(tgm, is_write);
577
+ if (!tgm_has_pending_reqs(token, is_write)) {
578
return;
579
}
580
581
@@ -XXX,XX +XXX,XX @@ static void schedule_next_request(BlockBackend *blk, bool is_write)
582
583
/* If it doesn't have to wait, queue it for immediate execution */
584
if (!must_wait) {
585
- /* Give preference to requests from the current blk */
586
+ /* Give preference to requests from the current tgm */
587
if (qemu_in_coroutine() &&
588
- throttle_group_co_restart_queue(blk, is_write)) {
589
- token = blk;
590
+ throttle_group_co_restart_queue(tgm, is_write)) {
591
+ token = tgm;
592
} else {
593
- ThrottleTimers *tt = &blk_get_public(token)->throttle_timers;
594
+ ThrottleTimers *tt = &token->throttle_timers;
595
int64_t now = qemu_clock_get_ns(tg->clock_type);
596
timer_mod(tt->timers[is_write], now);
597
tg->any_timer_armed[is_write] = true;
598
@@ -XXX,XX +XXX,XX @@ static void schedule_next_request(BlockBackend *blk, bool is_write)
599
* if necessary, and schedule the next request using a round robin
600
* algorithm.
601
*
602
- * @blk: the current BlockBackend
603
+ * @tgm: the current ThrottleGroupMember
604
* @bytes: the number of bytes for this I/O
605
* @is_write: the type of operation (read/write)
606
*/
607
-void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk,
608
+void coroutine_fn throttle_group_co_io_limits_intercept(ThrottleGroupMember *tgm,
609
unsigned int bytes,
610
bool is_write)
611
{
612
bool must_wait;
613
- BlockBackend *token;
614
-
615
- BlockBackendPublic *blkp = blk_get_public(blk);
616
- ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
617
+ ThrottleGroupMember *token;
618
+ ThrottleGroup *tg = container_of(tgm->throttle_state, ThrottleGroup, ts);
619
qemu_mutex_lock(&tg->lock);
620
621
/* First we check if this I/O has to be throttled. */
622
- token = next_throttle_token(blk, is_write);
623
+ token = next_throttle_token(tgm, is_write);
624
must_wait = throttle_group_schedule_timer(token, is_write);
625
626
/* Wait if there's a timer set or queued requests of this type */
627
- if (must_wait || blkp->pending_reqs[is_write]) {
628
- blkp->pending_reqs[is_write]++;
629
+ if (must_wait || tgm->pending_reqs[is_write]) {
630
+ tgm->pending_reqs[is_write]++;
631
qemu_mutex_unlock(&tg->lock);
632
- qemu_co_mutex_lock(&blkp->throttled_reqs_lock);
633
- qemu_co_queue_wait(&blkp->throttled_reqs[is_write],
634
- &blkp->throttled_reqs_lock);
635
- qemu_co_mutex_unlock(&blkp->throttled_reqs_lock);
636
+ qemu_co_mutex_lock(&tgm->throttled_reqs_lock);
637
+ qemu_co_queue_wait(&tgm->throttled_reqs[is_write],
638
+ &tgm->throttled_reqs_lock);
639
+ qemu_co_mutex_unlock(&tgm->throttled_reqs_lock);
640
qemu_mutex_lock(&tg->lock);
641
- blkp->pending_reqs[is_write]--;
642
+ tgm->pending_reqs[is_write]--;
643
}
644
645
/* The I/O will be executed, so do the accounting */
646
- throttle_account(blkp->throttle_state, is_write, bytes);
647
+ throttle_account(tgm->throttle_state, is_write, bytes);
648
649
/* Schedule the next request */
650
- schedule_next_request(blk, is_write);
651
+ schedule_next_request(tgm, is_write);
652
653
qemu_mutex_unlock(&tg->lock);
654
}
655
656
typedef struct {
657
- BlockBackend *blk;
658
+ ThrottleGroupMember *tgm;
659
bool is_write;
660
} RestartData;
661
662
static void coroutine_fn throttle_group_restart_queue_entry(void *opaque)
663
{
664
RestartData *data = opaque;
665
- BlockBackend *blk = data->blk;
666
+ ThrottleGroupMember *tgm = data->tgm;
667
+ ThrottleState *ts = tgm->throttle_state;
668
+ ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
669
bool is_write = data->is_write;
670
- BlockBackendPublic *blkp = blk_get_public(blk);
671
- ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
672
bool empty_queue;
673
674
- empty_queue = !throttle_group_co_restart_queue(blk, is_write);
675
+ empty_queue = !throttle_group_co_restart_queue(tgm, is_write);
676
677
/* If the request queue was empty then we have to take care of
678
* scheduling the next one */
679
if (empty_queue) {
680
qemu_mutex_lock(&tg->lock);
681
- schedule_next_request(blk, is_write);
682
+ schedule_next_request(tgm, is_write);
683
qemu_mutex_unlock(&tg->lock);
684
}
685
}
686
687
-static void throttle_group_restart_queue(BlockBackend *blk, bool is_write)
688
+static void throttle_group_restart_queue(ThrottleGroupMember *tgm, bool is_write)
689
{
690
+ BlockBackendPublic *blkp = container_of(tgm, BlockBackendPublic,
691
+ throttle_group_member);
692
+ BlockBackend *blk = blk_by_public(blkp);
693
Coroutine *co;
694
RestartData rd = {
695
- .blk = blk,
696
+ .tgm = tgm,
697
.is_write = is_write
698
};
699
700
@@ -XXX,XX +XXX,XX @@ static void throttle_group_restart_queue(BlockBackend *blk, bool is_write)
701
aio_co_enter(blk_get_aio_context(blk), co);
702
}
703
704
-void throttle_group_restart_blk(BlockBackend *blk)
705
+void throttle_group_restart_tgm(ThrottleGroupMember *tgm)
706
{
707
- BlockBackendPublic *blkp = blk_get_public(blk);
708
-
709
- if (blkp->throttle_state) {
710
- throttle_group_restart_queue(blk, 0);
711
- throttle_group_restart_queue(blk, 1);
712
+ if (tgm->throttle_state) {
713
+ throttle_group_restart_queue(tgm, 0);
714
+ throttle_group_restart_queue(tgm, 1);
715
}
716
}
717
718
@@ -XXX,XX +XXX,XX @@ void throttle_group_restart_blk(BlockBackend *blk)
719
* to throttle_config(), but guarantees atomicity within the
720
* throttling group.
721
*
722
- * @blk: a BlockBackend that is a member of the group
723
+ * @tgm: a ThrottleGroupMember that is a member of the group
724
* @cfg: the configuration to set
725
*/
726
-void throttle_group_config(BlockBackend *blk, ThrottleConfig *cfg)
727
+void throttle_group_config(ThrottleGroupMember *tgm, ThrottleConfig *cfg)
728
{
729
- BlockBackendPublic *blkp = blk_get_public(blk);
730
- ThrottleState *ts = blkp->throttle_state;
731
+ ThrottleState *ts = tgm->throttle_state;
732
ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
733
qemu_mutex_lock(&tg->lock);
734
throttle_config(ts, tg->clock_type, cfg);
735
qemu_mutex_unlock(&tg->lock);
736
737
- throttle_group_restart_blk(blk);
738
+ throttle_group_restart_tgm(tgm);
739
}
740
741
/* Get the throttle configuration from a particular group. Similar to
742
* throttle_get_config(), but guarantees atomicity within the
743
* throttling group.
744
*
745
- * @blk: a BlockBackend that is a member of the group
746
+ * @tgm: a ThrottleGroupMember that is a member of the group
747
* @cfg: the configuration will be written here
748
*/
749
-void throttle_group_get_config(BlockBackend *blk, ThrottleConfig *cfg)
750
+void throttle_group_get_config(ThrottleGroupMember *tgm, ThrottleConfig *cfg)
751
{
752
- BlockBackendPublic *blkp = blk_get_public(blk);
753
- ThrottleState *ts = blkp->throttle_state;
754
+ ThrottleState *ts = tgm->throttle_state;
755
ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
756
qemu_mutex_lock(&tg->lock);
757
throttle_get_config(ts, cfg);
758
@@ -XXX,XX +XXX,XX @@ void throttle_group_get_config(BlockBackend *blk, ThrottleConfig *cfg)
759
static void timer_cb(BlockBackend *blk, bool is_write)
760
{
761
BlockBackendPublic *blkp = blk_get_public(blk);
762
- ThrottleState *ts = blkp->throttle_state;
763
+ ThrottleGroupMember *tgm = &blkp->throttle_group_member;
764
+ ThrottleState *ts = tgm->throttle_state;
765
ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
766
767
/* The timer has just been fired, so we can update the flag */
768
@@ -XXX,XX +XXX,XX @@ static void timer_cb(BlockBackend *blk, bool is_write)
769
qemu_mutex_unlock(&tg->lock);
770
771
/* Run the request that was waiting for this timer */
772
- throttle_group_restart_queue(blk, is_write);
773
+ throttle_group_restart_queue(tgm, is_write);
774
}
775
776
static void read_timer_cb(void *opaque)
777
@@ -XXX,XX +XXX,XX @@ static void write_timer_cb(void *opaque)
778
timer_cb(opaque, true);
779
}
780
781
-/* Register a BlockBackend in the throttling group, also initializing its
782
- * timers and updating its throttle_state pointer to point to it. If a
783
+/* Register a ThrottleGroupMember from the throttling group, also initializing
784
+ * its timers and updating its throttle_state pointer to point to it. If a
785
* throttling group with that name does not exist yet, it will be created.
786
*
787
- * @blk: the BlockBackend to insert
788
+ * @tgm: the ThrottleGroupMember to insert
789
* @groupname: the name of the group
790
*/
791
-void throttle_group_register_blk(BlockBackend *blk, const char *groupname)
792
+void throttle_group_register_tgm(ThrottleGroupMember *tgm,
793
+ const char *groupname)
794
{
795
int i;
796
- BlockBackendPublic *blkp = blk_get_public(blk);
797
+ BlockBackendPublic *blkp = container_of(tgm, BlockBackendPublic,
798
+ throttle_group_member);
799
+ BlockBackend *blk = blk_by_public(blkp);
800
ThrottleState *ts = throttle_group_incref(groupname);
801
ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
802
- blkp->throttle_state = ts;
803
+
804
+ tgm->throttle_state = ts;
805
806
qemu_mutex_lock(&tg->lock);
807
- /* If the ThrottleGroup is new set this BlockBackend as the token */
808
+ /* If the ThrottleGroup is new set this ThrottleGroupMember as the token */
809
for (i = 0; i < 2; i++) {
810
if (!tg->tokens[i]) {
811
- tg->tokens[i] = blk;
812
+ tg->tokens[i] = tgm;
813
}
814
}
815
816
- QLIST_INSERT_HEAD(&tg->head, blkp, round_robin);
817
+ QLIST_INSERT_HEAD(&tg->head, tgm, round_robin);
818
819
- throttle_timers_init(&blkp->throttle_timers,
820
+ throttle_timers_init(&tgm->throttle_timers,
821
blk_get_aio_context(blk),
822
tg->clock_type,
823
read_timer_cb,
824
@@ -XXX,XX +XXX,XX @@ void throttle_group_register_blk(BlockBackend *blk, const char *groupname)
825
qemu_mutex_unlock(&tg->lock);
826
}
827
828
-/* Unregister a BlockBackend from its group, removing it from the list,
829
+/* Unregister a ThrottleGroupMember from its group, removing it from the list,
830
* destroying the timers and setting the throttle_state pointer to NULL.
831
*
832
- * The BlockBackend must not have pending throttled requests, so the caller has
833
- * to drain them first.
834
+ * The ThrottleGroupMember must not have pending throttled requests, so the
835
+ * caller has to drain them first.
836
*
837
* The group will be destroyed if it's empty after this operation.
838
*
839
- * @blk: the BlockBackend to remove
840
+ * @tgm the ThrottleGroupMember to remove
841
*/
842
-void throttle_group_unregister_blk(BlockBackend *blk)
843
+void throttle_group_unregister_tgm(ThrottleGroupMember *tgm)
844
{
845
- BlockBackendPublic *blkp = blk_get_public(blk);
846
- ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
847
+ ThrottleState *ts = tgm->throttle_state;
848
+ ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
849
+ ThrottleGroupMember *token;
850
int i;
851
852
- assert(blkp->pending_reqs[0] == 0 && blkp->pending_reqs[1] == 0);
853
- assert(qemu_co_queue_empty(&blkp->throttled_reqs[0]));
854
- assert(qemu_co_queue_empty(&blkp->throttled_reqs[1]));
855
+ assert(tgm->pending_reqs[0] == 0 && tgm->pending_reqs[1] == 0);
856
+ assert(qemu_co_queue_empty(&tgm->throttled_reqs[0]));
857
+ assert(qemu_co_queue_empty(&tgm->throttled_reqs[1]));
858
859
qemu_mutex_lock(&tg->lock);
860
for (i = 0; i < 2; i++) {
861
- if (tg->tokens[i] == blk) {
862
- BlockBackend *token = throttle_group_next_blk(blk);
863
- /* Take care of the case where this is the last blk in the group */
864
- if (token == blk) {
865
+ if (tg->tokens[i] == tgm) {
866
+ token = throttle_group_next_tgm(tgm);
867
+            /* Take care of the case where this is the last tgm in the group */
+            if (token == tgm) {
                 token = NULL;
             }
             tg->tokens[i] = token;
         }
     }
 
-    /* remove the current blk from the list */
-    QLIST_REMOVE(blkp, round_robin);
-    throttle_timers_destroy(&blkp->throttle_timers);
+    /* remove the current tgm from the list */
+    QLIST_REMOVE(tgm, round_robin);
+    throttle_timers_destroy(&tgm->throttle_timers);
     qemu_mutex_unlock(&tg->lock);
 
     throttle_group_unref(&tg->ts);
-    blkp->throttle_state = NULL;
+    tgm->throttle_state = NULL;
 }
 
 static void throttle_groups_init(void)
diff --git a/blockdev.c b/blockdev.c
index XXXXXXX..XXXXXXX 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -XXX,XX +XXX,XX @@ void qmp_block_set_io_throttle(BlockIOThrottle *arg, Error **errp)
     if (throttle_enabled(&cfg)) {
         /* Enable I/O limits if they're not enabled yet, otherwise
          * just update the throttling group. */
-        if (!blk_get_public(blk)->throttle_state) {
+        if (!blk_get_public(blk)->throttle_group_member.throttle_state) {
             blk_io_limits_enable(blk,
                                  arg->has_group ? arg->group :
                                  arg->has_device ? arg->device :
@@ -XXX,XX +XXX,XX @@ void qmp_block_set_io_throttle(BlockIOThrottle *arg, Error **errp)
         }
         /* Set the new throttling configuration */
         blk_set_io_limits(blk, &cfg);
-    } else if (blk_get_public(blk)->throttle_state) {
+    } else if (blk_get_public(blk)->throttle_group_member.throttle_state) {
         /* If all throttling settings are set to 0, disable I/O limits */
         blk_io_limits_disable(blk);
     }
diff --git a/tests/test-throttle.c b/tests/test-throttle.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-throttle.c
+++ b/tests/test-throttle.c
@@ -XXX,XX +XXX,XX @@ static void test_groups(void)
     ThrottleConfig cfg1, cfg2;
     BlockBackend *blk1, *blk2, *blk3;
     BlockBackendPublic *blkp1, *blkp2, *blkp3;
+    ThrottleGroupMember *tgm1, *tgm2, *tgm3;
 
     /* No actual I/O is performed on these devices */
     blk1 = blk_new(0, BLK_PERM_ALL);
@@ -XXX,XX +XXX,XX @@ static void test_groups(void)
     blkp2 = blk_get_public(blk2);
     blkp3 = blk_get_public(blk3);
 
-    g_assert(blkp1->throttle_state == NULL);
-    g_assert(blkp2->throttle_state == NULL);
-    g_assert(blkp3->throttle_state == NULL);
+    tgm1 = &blkp1->throttle_group_member;
+    tgm2 = &blkp2->throttle_group_member;
+    tgm3 = &blkp3->throttle_group_member;
 
-    throttle_group_register_blk(blk1, "bar");
-    throttle_group_register_blk(blk2, "foo");
-    throttle_group_register_blk(blk3, "bar");
+    g_assert(tgm1->throttle_state == NULL);
+    g_assert(tgm2->throttle_state == NULL);
+    g_assert(tgm3->throttle_state == NULL);
 
-    g_assert(blkp1->throttle_state != NULL);
-    g_assert(blkp2->throttle_state != NULL);
-    g_assert(blkp3->throttle_state != NULL);
+    throttle_group_register_tgm(tgm1, "bar");
+    throttle_group_register_tgm(tgm2, "foo");
+    throttle_group_register_tgm(tgm3, "bar");
 
-    g_assert(!strcmp(throttle_group_get_name(blk1), "bar"));
-    g_assert(!strcmp(throttle_group_get_name(blk2), "foo"));
-    g_assert(blkp1->throttle_state == blkp3->throttle_state);
+    g_assert(tgm1->throttle_state != NULL);
+    g_assert(tgm2->throttle_state != NULL);
+    g_assert(tgm3->throttle_state != NULL);
+
+    g_assert(!strcmp(throttle_group_get_name(tgm1), "bar"));
+    g_assert(!strcmp(throttle_group_get_name(tgm2), "foo"));
+    g_assert(tgm1->throttle_state == tgm3->throttle_state);
 
     /* Setting the config of a group member affects the whole group */
     throttle_config_init(&cfg1);
@@ -XXX,XX +XXX,XX @@ static void test_groups(void)
     cfg1.buckets[THROTTLE_BPS_WRITE].avg = 285000;
     cfg1.buckets[THROTTLE_OPS_READ].avg = 20000;
     cfg1.buckets[THROTTLE_OPS_WRITE].avg = 12000;
-    throttle_group_config(blk1, &cfg1);
+    throttle_group_config(tgm1, &cfg1);
 
-    throttle_group_get_config(blk1, &cfg1);
-    throttle_group_get_config(blk3, &cfg2);
+    throttle_group_get_config(tgm1, &cfg1);
+    throttle_group_get_config(tgm3, &cfg2);
     g_assert(!memcmp(&cfg1, &cfg2, sizeof(cfg1)));
 
     cfg2.buckets[THROTTLE_BPS_READ].avg = 4547;
     cfg2.buckets[THROTTLE_BPS_WRITE].avg = 1349;
     cfg2.buckets[THROTTLE_OPS_READ].avg = 123;
     cfg2.buckets[THROTTLE_OPS_WRITE].avg = 86;
-    throttle_group_config(blk3, &cfg1);
+    throttle_group_config(tgm3, &cfg1);
 
-    throttle_group_get_config(blk1, &cfg1);
-    throttle_group_get_config(blk3, &cfg2);
+    throttle_group_get_config(tgm1, &cfg1);
+    throttle_group_get_config(tgm3, &cfg2);
     g_assert(!memcmp(&cfg1, &cfg2, sizeof(cfg1)));
 
-    throttle_group_unregister_blk(blk1);
-    throttle_group_unregister_blk(blk2);
-    throttle_group_unregister_blk(blk3);
+    throttle_group_unregister_tgm(tgm1);
+    throttle_group_unregister_tgm(tgm2);
+    throttle_group_unregister_tgm(tgm3);
 
-    g_assert(blkp1->throttle_state == NULL);
-    g_assert(blkp2->throttle_state == NULL);
-    g_assert(blkp3->throttle_state == NULL);
+    g_assert(tgm1->throttle_state == NULL);
+    g_assert(tgm2->throttle_state == NULL);
+    g_assert(tgm3->throttle_state == NULL);
 }
 
 int main(int argc, char **argv)
--
2.13.5

If bdrv_do_drained_begin/end() are called in coroutine context, they
first use a BH to get out of the coroutine context. Call some existing
tests again from a coroutine to cover this code path.
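
For context, the BH dance being exercised looks roughly like this (a
simplified sketch of bdrv_co_yield_to_drain() and its BH callback in
block/io.c; fields and error handling are elided, so treat the details
as illustrative rather than authoritative):

    /* Scheduled as a BH so that it runs outside coroutine context; it
     * performs the drain and then re-enters the waiting coroutine. */
    static void bdrv_co_drain_bh_cb(void *opaque)
    {
        BdrvCoDrainData *data = opaque;

        bdrv_drained_begin(data->bs);   /* safe: not in coroutine context */
        data->done = true;
        aio_co_wake(data->co);          /* resume the coroutine that yielded */
    }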

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
     *aio_ret = ret;
 }
 
+typedef struct CallInCoroutineData {
+    void (*entry)(void);
+    bool done;
+} CallInCoroutineData;
+
+static coroutine_fn void call_in_coroutine_entry(void *opaque)
+{
+    CallInCoroutineData *data = opaque;
+
+    data->entry();
+    data->done = true;
+}
+
+static void call_in_coroutine(void (*entry)(void))
+{
+    Coroutine *co;
+    CallInCoroutineData data = {
+        .entry = entry,
+        .done = false,
+    };
+
+    co = qemu_coroutine_create(call_in_coroutine_entry, &data);
+    qemu_coroutine_enter(co);
+    while (!data.done) {
+        aio_poll(qemu_get_aio_context(), true);
+    }
+}
+
 enum drain_type {
     BDRV_DRAIN_ALL,
     BDRV_DRAIN,
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_subtree(void)
     test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
 }
 
+static void test_drv_cb_co_drain(void)
+{
+    call_in_coroutine(test_drv_cb_drain);
+}
+
+static void test_drv_cb_co_drain_subtree(void)
+{
+    call_in_coroutine(test_drv_cb_drain_subtree);
+}
+
 static void test_quiesce_common(enum drain_type drain_type, bool recursive)
 {
     BlockBackend *blk;
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain_subtree(void)
     test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
 }
 
+static void test_quiesce_co_drain(void)
+{
+    call_in_coroutine(test_quiesce_drain);
+}
+
+static void test_quiesce_co_drain_subtree(void)
+{
+    call_in_coroutine(test_quiesce_drain_subtree);
+}
+
 static void test_nested(void)
 {
     BlockBackend *blk;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
                     test_drv_cb_drain_subtree);
 
+    // XXX bdrv_drain_all() doesn't work in coroutine context
+    g_test_add_func("/bdrv-drain/driver-cb/co/drain", test_drv_cb_co_drain);
+    g_test_add_func("/bdrv-drain/driver-cb/co/drain_subtree",
+                    test_drv_cb_co_drain_subtree);
+
     g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
     g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
     g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
                     test_quiesce_drain_subtree);
 
+    // XXX bdrv_drain_all() doesn't work in coroutine context
+    g_test_add_func("/bdrv-drain/quiesce/co/drain", test_quiesce_co_drain);
+    g_test_add_func("/bdrv-drain/quiesce/co/drain_subtree",
+                    test_quiesce_co_drain_subtree);
+
     g_test_add_func("/bdrv-drain/nested", test_nested);
 
     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
--
2.13.6

Test that drain sections are correctly propagated through the graph.
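
As a reminder of the counting semantics exercised here (a sketch based
on the assertions in the test below, not part of the patch): every
subtree drain on a parent increments quiesce_counter on all nodes it
reaches, so a backing node shared by two drained parents counts both
sections:

    bdrv_subtree_drained_begin(bs_a);   /* backing->quiesce_counter == 1 */
    bdrv_subtree_drained_begin(bs_b);   /* shared backing node: now 2    */
    bdrv_subtree_drained_end(bs_b);     /* back to 1                     */
    bdrv_subtree_drained_end(bs_a);     /* 0 again, I/O may resume       */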

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_nested(void)
     blk_unref(blk);
 }
 
+static void test_multiparent(void)
+{
+    BlockBackend *blk_a, *blk_b;
+    BlockDriverState *bs_a, *bs_b, *backing;
+    BDRVTestState *a_s, *b_s, *backing_s;
+
+    blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
+                                &error_abort);
+    a_s = bs_a->opaque;
+    blk_insert_bs(blk_a, bs_a, &error_abort);
+
+    blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
+                                &error_abort);
+    b_s = bs_b->opaque;
+    blk_insert_bs(blk_b, bs_b, &error_abort);
+
+    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
+    backing_s = backing->opaque;
+    bdrv_set_backing_hd(bs_a, backing, &error_abort);
+    bdrv_set_backing_hd(bs_b, backing, &error_abort);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
+    g_assert_cmpint(backing->quiesce_counter, ==, 0);
+    g_assert_cmpint(a_s->drain_count, ==, 0);
+    g_assert_cmpint(b_s->drain_count, ==, 0);
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
+    g_assert_cmpint(backing->quiesce_counter, ==, 1);
+    g_assert_cmpint(a_s->drain_count, ==, 1);
+    g_assert_cmpint(b_s->drain_count, ==, 1);
+    g_assert_cmpint(backing_s->drain_count, ==, 1);
+
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 2);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
+    g_assert_cmpint(backing->quiesce_counter, ==, 2);
+    g_assert_cmpint(a_s->drain_count, ==, 2);
+    g_assert_cmpint(b_s->drain_count, ==, 2);
+    g_assert_cmpint(backing_s->drain_count, ==, 2);
+
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
+    g_assert_cmpint(backing->quiesce_counter, ==, 1);
+    g_assert_cmpint(a_s->drain_count, ==, 1);
+    g_assert_cmpint(b_s->drain_count, ==, 1);
+    g_assert_cmpint(backing_s->drain_count, ==, 1);
+
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
+    g_assert_cmpint(backing->quiesce_counter, ==, 0);
+    g_assert_cmpint(a_s->drain_count, ==, 0);
+    g_assert_cmpint(b_s->drain_count, ==, 0);
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+    bdrv_unref(backing);
+    bdrv_unref(bs_a);
+    bdrv_unref(bs_b);
+    blk_unref(blk_a);
+    blk_unref(blk_b);
+}
+
 
 typedef struct TestBlockJob {
     BlockJob common;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
                     test_quiesce_co_drain_subtree);
 
     g_test_add_func("/bdrv-drain/nested", test_nested);
+    g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
 
     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
     g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
--
2.13.6

We need to remember how many of the drain sections a node is in were
recursive (i.e. subtree drain rather than node drain), so that they can
be correctly applied when children are added or removed during the
drained section.

With this change, it is safe to modify the graph even inside a
bdrv_subtree_drained_begin/end() section.
7
bdrv_subtree_drained_begin/end() section.
8
went away in 21fcf36 "fdc: simplify media change handling".
8
9
Probably broke host floppy media change. Host floppy pass-through
10
was dropped in commit f709623. bdrv_media_changed() has never been
11
used for anything else. Remove it.
12
(Source is Message-ID: <87y3ruaypm.fsf@dusky.pond.sub.org>)
13
14
Reviewed-by: Eric Blake <eblake@redhat.com>
15
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
16
Signed-off-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
17
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
18
---
10
---
19
include/block/block.h | 1 -
11
include/block/block.h | 2 --
20
include/block/block_int.h | 1 -
12
include/block/block_int.h | 5 +++++
21
block.c | 14 --------------
13
block.c | 32 +++++++++++++++++++++++++++++---
22
block/raw-format.c | 6 ------
14
block/io.c | 28 ++++++++++++++++++++++++----
23
4 files changed, 22 deletions(-)
15
4 files changed, 58 insertions(+), 9 deletions(-)
24
16
25
diff --git a/include/block/block.h b/include/block/block.h
17
diff --git a/include/block/block.h b/include/block/block.h
26
index XXXXXXX..XXXXXXX 100644
18
index XXXXXXX..XXXXXXX 100644
27
--- a/include/block/block.h
19
--- a/include/block/block.h
28
+++ b/include/block/block.h
20
+++ b/include/block/block.h
29
@@ -XXX,XX +XXX,XX @@ int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only,
21
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs);
30
int bdrv_set_read_only(BlockDriverState *bs, bool read_only, Error **errp);
22
/**
31
bool bdrv_is_sg(BlockDriverState *bs);
23
* Like bdrv_drained_begin, but recursively begins a quiesced section for
32
bool bdrv_is_inserted(BlockDriverState *bs);
24
* exclusive access to all child nodes as well.
33
-int bdrv_media_changed(BlockDriverState *bs);
25
- *
34
void bdrv_lock_medium(BlockDriverState *bs, bool locked);
26
- * Graph changes are not allowed during a subtree drain section.
35
void bdrv_eject(BlockDriverState *bs, bool eject_flag);
27
*/
36
const char *bdrv_get_format_name(BlockDriverState *bs);
28
void bdrv_subtree_drained_begin(BlockDriverState *bs);
29
37
diff --git a/include/block/block_int.h b/include/block/block_int.h
30
diff --git a/include/block/block_int.h b/include/block/block_int.h
38
index XXXXXXX..XXXXXXX 100644
31
index XXXXXXX..XXXXXXX 100644
39
--- a/include/block/block_int.h
32
--- a/include/block/block_int.h
40
+++ b/include/block/block_int.h
33
+++ b/include/block/block_int.h
41
@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
34
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
42
35
43
/* removable device specific */
36
/* Accessed with atomic ops. */
44
bool (*bdrv_is_inserted)(BlockDriverState *bs);
37
int quiesce_counter;
45
- int (*bdrv_media_changed)(BlockDriverState *bs);
38
+ int recursive_quiesce_counter;
46
void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
39
+
47
void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked);
40
unsigned int write_gen; /* Current data generation */
48
41
42
/* Protected by reqs_lock. */
43
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
44
int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
45
BdrvRequestFlags flags);
46
47
+void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
48
+void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);
49
+
50
int get_tmp_filename(char *filename, int size);
51
BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
52
const char *filename);
49
diff --git a/block.c b/block.c
53
diff --git a/block.c b/block.c
50
index XXXXXXX..XXXXXXX 100644
54
index XXXXXXX..XXXXXXX 100644
51
--- a/block.c
55
--- a/block.c
52
+++ b/block.c
56
+++ b/block.c
53
@@ -XXX,XX +XXX,XX @@ bool bdrv_is_inserted(BlockDriverState *bs)
57
@@ -XXX,XX +XXX,XX @@ static void bdrv_child_cb_drained_end(BdrvChild *child)
54
}
58
bdrv_drained_end(bs);
55
59
}
56
/**
60
57
- * Return whether the media changed since the last call to this
61
+static void bdrv_child_cb_attach(BdrvChild *child)
58
- * function, or -ENOTSUP if we don't know. Most drivers don't know.
62
+{
59
- */
63
+ BlockDriverState *bs = child->opaque;
60
-int bdrv_media_changed(BlockDriverState *bs)
64
+ bdrv_apply_subtree_drain(child, bs);
61
-{
65
+}
62
- BlockDriver *drv = bs->drv;
66
+
63
-
67
+static void bdrv_child_cb_detach(BdrvChild *child)
64
- if (drv && drv->bdrv_media_changed) {
68
+{
65
- return drv->bdrv_media_changed(bs);
69
+ BlockDriverState *bs = child->opaque;
66
- }
70
+ bdrv_unapply_subtree_drain(child, bs);
67
- return -ENOTSUP;
71
+}
68
-}
72
+
69
-
73
static int bdrv_child_cb_inactivate(BdrvChild *child)
70
-/**
71
* If eject_flag is TRUE, eject the media. Otherwise, close the tray
72
*/
73
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
74
diff --git a/block/raw-format.c b/block/raw-format.c
75
index XXXXXXX..XXXXXXX 100644
76
--- a/block/raw-format.c
77
+++ b/block/raw-format.c
78
@@ -XXX,XX +XXX,XX @@ static int raw_truncate(BlockDriverState *bs, int64_t offset,
79
return bdrv_truncate(bs->file, offset, prealloc, errp);
80
}
81
82
-static int raw_media_changed(BlockDriverState *bs)
83
-{
84
- return bdrv_media_changed(bs->file->bs);
85
-}
86
-
87
static void raw_eject(BlockDriverState *bs, bool eject_flag)
88
{
74
{
89
bdrv_eject(bs->file->bs, eject_flag);
75
BlockDriverState *bs = child->opaque;
90
@@ -XXX,XX +XXX,XX @@ BlockDriver bdrv_raw = {
76
@@ -XXX,XX +XXX,XX @@ const BdrvChildRole child_file = {
91
.bdrv_refresh_limits = &raw_refresh_limits,
77
.inherit_options = bdrv_inherited_options,
92
.bdrv_probe_blocksizes = &raw_probe_blocksizes,
78
.drained_begin = bdrv_child_cb_drained_begin,
93
.bdrv_probe_geometry = &raw_probe_geometry,
79
.drained_end = bdrv_child_cb_drained_end,
94
- .bdrv_media_changed = &raw_media_changed,
80
+ .attach = bdrv_child_cb_attach,
95
.bdrv_eject = &raw_eject,
81
+ .detach = bdrv_child_cb_detach,
96
.bdrv_lock_medium = &raw_lock_medium,
82
.inactivate = bdrv_child_cb_inactivate,
97
.bdrv_co_ioctl = &raw_co_ioctl,
83
};
84
85
@@ -XXX,XX +XXX,XX @@ const BdrvChildRole child_format = {
86
.inherit_options = bdrv_inherited_fmt_options,
87
.drained_begin = bdrv_child_cb_drained_begin,
88
.drained_end = bdrv_child_cb_drained_end,
89
+ .attach = bdrv_child_cb_attach,
90
+ .detach = bdrv_child_cb_detach,
91
.inactivate = bdrv_child_cb_inactivate,
92
};
93
94
@@ -XXX,XX +XXX,XX @@ static void bdrv_backing_attach(BdrvChild *c)
95
parent->backing_blocker);
96
bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET,
97
parent->backing_blocker);
98
+
99
+ bdrv_child_cb_attach(c);
100
}
101
102
static void bdrv_backing_detach(BdrvChild *c)
103
@@ -XXX,XX +XXX,XX @@ static void bdrv_backing_detach(BdrvChild *c)
104
bdrv_op_unblock_all(c->bs, parent->backing_blocker);
105
error_free(parent->backing_blocker);
106
parent->backing_blocker = NULL;
107
+
108
+ bdrv_child_cb_detach(c);
109
}
110
111
/*
112
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
113
assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
114
}
115
if (old_bs) {
116
+ /* Detach first so that the recursive drain sections coming from @child
117
+ * are already gone and we only end the drain sections that came from
118
+ * elsewhere. */
119
+ if (child->role->detach) {
120
+ child->role->detach(child);
121
+ }
122
if (old_bs->quiesce_counter && child->role->drained_end) {
123
for (i = 0; i < old_bs->quiesce_counter; i++) {
124
child->role->drained_end(child);
125
}
126
}
127
- if (child->role->detach) {
128
- child->role->detach(child);
129
- }
130
QLIST_REMOVE(child, next_parent);
131
}
132
133
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
134
}
135
}
136
137
+ /* Attach only after starting new drained sections, so that recursive
138
+ * drain sections coming from @child don't get an extra .drained_begin
139
+ * callback. */
140
if (child->role->attach) {
141
child->role->attach(child);
142
}
143
diff --git a/block/io.c b/block/io.c
144
index XXXXXXX..XXXXXXX 100644
145
--- a/block/io.c
146
+++ b/block/io.c
147
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
148
assert(data.done);
149
}
150
151
-static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
152
- BdrvChild *parent)
153
+void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
154
+ BdrvChild *parent)
155
{
156
BdrvChild *child, *next;
157
158
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
159
bdrv_drain_recurse(bs);
160
161
if (recursive) {
162
+ bs->recursive_quiesce_counter++;
163
QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
164
bdrv_do_drained_begin(child->bs, true, child);
165
}
166
@@ -XXX,XX +XXX,XX @@ void bdrv_subtree_drained_begin(BlockDriverState *bs)
167
bdrv_do_drained_begin(bs, true, NULL);
168
}
169
170
-static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
171
- BdrvChild *parent)
172
+void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
173
+ BdrvChild *parent)
174
{
175
BdrvChild *child, *next;
176
int old_quiesce_counter;
177
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
178
}
179
180
if (recursive) {
181
+ bs->recursive_quiesce_counter--;
182
QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
183
bdrv_do_drained_end(child->bs, true, child);
184
}
185
@@ -XXX,XX +XXX,XX @@ void bdrv_subtree_drained_end(BlockDriverState *bs)
186
bdrv_do_drained_end(bs, true, NULL);
187
}
188
189
+void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
190
+{
191
+ int i;
192
+
193
+ for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
194
+ bdrv_do_drained_begin(child->bs, true, child);
195
+ }
196
+}
197
+
198
+void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
199
+{
200
+ int i;
201
+
202
+ for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
203
+ bdrv_do_drained_end(child->bs, true, child);
204
+ }
205
+}
206
+
207
/*
208
* Wait for pending requests to complete on a single BlockDriverState subtree,
209
* and suspend block driver's internal I/O until next request arrives.
98
--
210
--
99
2.13.5
211
2.13.6
100
212
101
213
diff view generated by jsdifflib
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_multiparent(void)
     blk_unref(blk_b);
 }
 
+static void test_graph_change(void)
+{
+    BlockBackend *blk_a, *blk_b;
+    BlockDriverState *bs_a, *bs_b, *backing;
+    BDRVTestState *a_s, *b_s, *backing_s;
+
+    blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
+                                &error_abort);
+    a_s = bs_a->opaque;
+    blk_insert_bs(blk_a, bs_a, &error_abort);
+
+    blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
+                                &error_abort);
+    b_s = bs_b->opaque;
+    blk_insert_bs(blk_b, bs_b, &error_abort);
+
+    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
+    backing_s = backing->opaque;
+    bdrv_set_backing_hd(bs_a, backing, &error_abort);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
+    g_assert_cmpint(backing->quiesce_counter, ==, 0);
+    g_assert_cmpint(a_s->drain_count, ==, 0);
+    g_assert_cmpint(b_s->drain_count, ==, 0);
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
+
+    bdrv_set_backing_hd(bs_b, backing, &error_abort);
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
+    g_assert_cmpint(backing->quiesce_counter, ==, 5);
+    g_assert_cmpint(a_s->drain_count, ==, 5);
+    g_assert_cmpint(b_s->drain_count, ==, 5);
+    g_assert_cmpint(backing_s->drain_count, ==, 5);
+
+    bdrv_set_backing_hd(bs_b, NULL, &error_abort);
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 3);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
+    g_assert_cmpint(backing->quiesce_counter, ==, 3);
+    g_assert_cmpint(a_s->drain_count, ==, 3);
+    g_assert_cmpint(b_s->drain_count, ==, 2);
+    g_assert_cmpint(backing_s->drain_count, ==, 3);
+
+    bdrv_set_backing_hd(bs_b, backing, &error_abort);
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
+    g_assert_cmpint(backing->quiesce_counter, ==, 5);
+    g_assert_cmpint(a_s->drain_count, ==, 5);
+    g_assert_cmpint(b_s->drain_count, ==, 5);
+    g_assert_cmpint(backing_s->drain_count, ==, 5);
+
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
+    g_assert_cmpint(backing->quiesce_counter, ==, 0);
+    g_assert_cmpint(a_s->drain_count, ==, 0);
+    g_assert_cmpint(b_s->drain_count, ==, 0);
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+    bdrv_unref(backing);
+    bdrv_unref(bs_a);
+    bdrv_unref(bs_b);
+    blk_unref(blk_a);
+    blk_unref(blk_b);
+}
+
 
 typedef struct TestBlockJob {
     BlockJob common;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
 
     g_test_add_func("/bdrv-drain/nested", test_nested);
     g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
+    g_test_add_func("/bdrv-drain/graph-change", test_graph_change);
 
     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
     g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
--
2.13.6

diff view generated by jsdifflib
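
The arithmetic behind the asserts in test_graph_change() above: a subtree
drain on a node also drains every node below it, so after three drained
sections on bs_a and two on bs_b, a backing node reachable from both ends up
with quiesce_counter == 3 + 2 == 5, and detaching it from bs_b drops it back
to 3. A minimal sketch of the invariant the test keeps re-checking
(check_node() is a hypothetical helper, not part of the patch):

    /* Hypothetical helper: a node's quiesce_counter and the drain_count
     * seen by its test driver must both equal the number of drained
     * sections currently covering the node. */
    static void check_node(BlockDriverState *bs, int expected)
    {
        BDRVTestState *s = bs->opaque;
        g_assert_cmpint(bs->quiesce_counter, ==, expected);
        g_assert_cmpint(s->drain_count, ==, expected);
    }

    /* After the five do_drain_begin() calls above: */
    check_node(backing, 3 + 2);    /* 3 via bs_a, 2 via bs_b */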
New patch
Since commit bde70715, base is the only node that is reopened in
commit_start(). This means that the code, which still involves an
explicit BlockReopenQueue, can now be simplified by using bdrv_reopen().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
---
 block/commit.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/block/commit.c b/block/commit.c
index XXXXXXX..XXXXXXX 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -XXX,XX +XXX,XX @@ void commit_start(const char *job_id, BlockDriverState *bs,
                   const char *filter_node_name, Error **errp)
 {
     CommitBlockJob *s;
-    BlockReopenQueue *reopen_queue = NULL;
     int orig_base_flags;
     BlockDriverState *iter;
     BlockDriverState *commit_top_bs = NULL;
@@ -XXX,XX +XXX,XX @@ void commit_start(const char *job_id, BlockDriverState *bs,
     /* convert base to r/w, if necessary */
     orig_base_flags = bdrv_get_flags(base);
     if (!(orig_base_flags & BDRV_O_RDWR)) {
-        reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
-                                         orig_base_flags | BDRV_O_RDWR);
-    }
-
-    if (reopen_queue) {
-        bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, &local_err);
+        bdrv_reopen(base, orig_base_flags | BDRV_O_RDWR, &local_err);
         if (local_err != NULL) {
             error_propagate(errp, local_err);
             goto fail;
--
2.13.6
diff view generated by jsdifflib
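
For context on the simplification above: bdrv_reopen() is the single-node
convenience wrapper around the queue API, so for exactly one node the two
call sequences are equivalent. Roughly (a sketch with error handling elided;
the real bdrv_reopen() body is visible in the next patch):

    /* What the removed lines did by hand ... */
    queue = bdrv_reopen_queue(NULL, base, NULL, orig_base_flags | BDRV_O_RDWR);
    ret = bdrv_reopen_multiple(bdrv_get_aio_context(base), queue, &local_err);

    /* ... and the one-liner that replaces them: */
    bdrv_reopen(base, orig_base_flags | BDRV_O_RDWR, &local_err);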
New patch
The bdrv_reopen*() implementation doesn't like it if the graph is
changed between queuing nodes for reopen and actually reopening them
(one of the reasons is that queuing can be recursive).

So instead of draining the device only in bdrv_reopen_multiple(),
require that callers already drained all affected nodes, and assert this
in bdrv_reopen_queue().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
---
 block.c             | 23 ++++++++++++++++-------
 block/replication.c |  6 ++++++
 qemu-io-cmds.c      |  3 +++
 3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ BlockDriverState *bdrv_open(const char *filename, const char *reference,
  * returns a pointer to bs_queue, which is either the newly allocated
  * bs_queue, or the existing bs_queue being used.
  *
+ * bs must be drained between bdrv_reopen_queue() and bdrv_reopen_multiple().
  */
 static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
                                                  BlockDriverState *bs,
@@ -XXX,XX +XXX,XX @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
     BdrvChild *child;
     QDict *old_options, *explicit_options;

+    /* Make sure that the caller remembered to use a drained section. This is
+     * important to avoid graph changes between the recursive queuing here and
+     * bdrv_reopen_multiple(). */
+    assert(bs->quiesce_counter > 0);
+
     if (bs_queue == NULL) {
         bs_queue = g_new0(BlockReopenQueue, 1);
         QSIMPLEQ_INIT(bs_queue);
@@ -XXX,XX +XXX,XX @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
  * If all devices prepare successfully, then the changes are committed
  * to all devices.
  *
+ * All affected nodes must be drained between bdrv_reopen_queue() and
+ * bdrv_reopen_multiple().
  */
 int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **errp)
 {
@@ -XXX,XX +XXX,XX @@ int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **er

     assert(bs_queue != NULL);

-    aio_context_release(ctx);
-    bdrv_drain_all_begin();
-    aio_context_acquire(ctx);
-
     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
+        assert(bs_entry->state.bs->quiesce_counter > 0);
         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
             error_propagate(errp, local_err);
             goto cleanup;
@@ -XXX,XX +XXX,XX @@ cleanup:
     }
     g_free(bs_queue);

-    bdrv_drain_all_end();
-
     return ret;
 }

@@ -XXX,XX +XXX,XX @@ int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
 {
     int ret = -1;
     Error *local_err = NULL;
-    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);
+    BlockReopenQueue *queue;

+    bdrv_subtree_drained_begin(bs);
+
+    queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);
     ret = bdrv_reopen_multiple(bdrv_get_aio_context(bs), queue, &local_err);
     if (local_err != NULL) {
         error_propagate(errp, local_err);
     }
+
+    bdrv_subtree_drained_end(bs);
+
     return ret;
 }

diff --git a/block/replication.c b/block/replication.c
index XXXXXXX..XXXXXXX 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -XXX,XX +XXX,XX @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
         new_secondary_flags = s->orig_secondary_flags;
     }

+    bdrv_subtree_drained_begin(s->hidden_disk->bs);
+    bdrv_subtree_drained_begin(s->secondary_disk->bs);
+
     if (orig_hidden_flags != new_hidden_flags) {
         reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs, NULL,
                                          new_hidden_flags);
@@ -XXX,XX +XXX,XX @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
                              reopen_queue, &local_err);
         error_propagate(errp, local_err);
     }
+
+    bdrv_subtree_drained_end(s->hidden_disk->bs);
+    bdrv_subtree_drained_end(s->secondary_disk->bs);
 }

 static void backup_job_cleanup(BlockDriverState *bs)
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -XXX,XX +XXX,XX @@ static int reopen_f(BlockBackend *blk, int argc, char **argv)
     opts = qopts ? qemu_opts_to_qdict(qopts, NULL) : NULL;
     qemu_opts_reset(&reopen_opts);

+    bdrv_subtree_drained_begin(bs);
     brq = bdrv_reopen_queue(NULL, bs, opts, flags);
     bdrv_reopen_multiple(bdrv_get_aio_context(bs), brq, &local_err);
+    bdrv_subtree_drained_end(bs);
+
     if (local_err) {
         error_report_err(local_err);
     } else {
--
2.13.6
diff view generated by jsdifflib
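
Taken together, the new calling convention is: begin a drained section,
queue, reopen, then end the drained section. The qemu-io-cmds.c hunk above
is the minimal instance of the pattern every bdrv_reopen_queue() caller now
has to follow (sketch, variable declarations elided):

    bdrv_subtree_drained_begin(bs);                    /* freeze the subtree */
    queue = bdrv_reopen_queue(NULL, bs, opts, flags);  /* asserts quiesce_counter > 0 */
    bdrv_reopen_multiple(bdrv_get_aio_context(bs), queue, &local_err);
    bdrv_subtree_drained_end(bs);                      /* graph may change again */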