Series comparison

-[Qemu-devel] [PULL 00/55] Block layer patches
+[Qemu-devel] [PULL v3 00/35] Block layer patches
-The following changes since commit fb68096da3d35e64c88cd610c1fa42766c58e92a:
+The following changes since commit 281f327487c9c9b1599f93c589a408bbf4a651b8:
-  Revert "tests: use memfd in vhost-user-test" (2018-02-13 09:51:52 +0000)
+  Merge remote-tracking branch 'remotes/vivier/tags/m68k-for-2.12-pull-request' into staging (2017-12-22 00:11:36 +0000)
 are available in the git repository at:
   git://repo.or.cz/qemu/kevin.git tags/for-upstream
-for you to fetch changes up to 0a4dc980e6c935e9be745ce3ee1a4c71629ecd00:
+for you to fetch changes up to 1a63a907507fbbcfaee3f622907ec244b7eabda8:
-  Merge remote-tracking branch 'mreitz/tags/pull-block-2018-02-13' into queue-block (2018-02-13 17:01:13 +0100)
+  block: Keep nodes drained between reopen_queue/multiple (2017-12-22 15:05:32 +0100)
 ----------------------------------------------------------------
 Block layer patches
 ----------------------------------------------------------------
-Alberto Garcia (40):
+Doug Gale (1):
-      qcow2: Use g_try_realloc() in qcow2_expand_zero_clusters()
+      nvme: Add tracing
       qcow2: Fix documentation of get_cluster_table()
       qcow2: Add table size field to Qcow2Cache
       qcow2: Remove BDS parameter from qcow2_cache_get_table_addr()
       qcow2: Remove BDS parameter from qcow2_cache_get_table_idx()
       qcow2: Remove BDS parameter from qcow2_cache_table_release()
       qcow2: Remove BDS parameter from qcow2_cache_entry_mark_dirty()
       qcow2: Remove BDS parameter from qcow2_cache_put()
       qcow2: Remove BDS parameter from qcow2_cache_destroy()
       qcow2: Remove BDS parameter from qcow2_cache_clean_unused()
       qcow2: Remove BDS parameter from qcow2_cache_discard()
       qcow2: Remove BDS parameter from qcow2_cache_is_table_offset()
       qcow2: Add offset_to_l1_index()
       qcow2: Add l2_slice_size field to BDRVQcow2State
       qcow2: Add offset_to_l2_slice_index()
       qcow2: Update l2_load() to support L2 slices
       qcow2: Prepare l2_allocate() for adding L2 slice support
       qcow2: Update l2_allocate() to support L2 slices
       qcow2: Refactor get_cluster_table()
       qcow2: Update get_cluster_table() to support L2 slices
       qcow2: Update qcow2_get_cluster_offset() to support L2 slices
       qcow2: Update qcow2_alloc_cluster_link_l2() to support L2 slices
       qcow2: Update handle_copied() to support L2 slices
       qcow2: Update handle_alloc() to support L2 slices
       qcow2: Update discard_single_l2() to support L2 slices
       qcow2: Update zero_single_l2() to support L2 slices
       qcow2: Prepare qcow2_update_snapshot_refcount() for adding L2 slice support
       qcow2: Update qcow2_update_snapshot_refcount() to support L2 slices
       qcow2: Read refcount before L2 table in expand_zero_clusters_in_l1()
       qcow2: Prepare expand_zero_clusters_in_l1() for adding L2 slice support
       qcow2: Update expand_zero_clusters_in_l1() to support L2 slices
       qcow2: Update qcow2_truncate() to support L2 slices
       qcow2: Rename l2_table in qcow2_alloc_compressed_cluster_offset()
       qcow2: Rename l2_table in count_contiguous_clusters()
       qcow2: Rename l2_table in count_contiguous_clusters_unallocated()
       qcow2: Rename l2_table in count_cow_clusters()
       qcow2: Allow configuring the L2 slice size
       iotests: Test valid values of l2-cache-entry-size
       iotests: Test downgrading an image using a small L2 slice size
       iotests: Add l2-cache-entry-size to iotest 137
-Daniel P. Berrangé (1):
+Edgar Kaziakhmedov (1):
-      qemu-io: fix EOF Ctrl-D handling in qemu-io readline code
+      qcow2: get rid of qcow2_backing_read1 routine
-Fam Zheng (4):
+Fam Zheng (2):
-      iotests: Fix CID for VMDK afl image
+      block: Open backing image in force share mode for size probe
-      qemu-img.texi: Clean up parameter list
+      block: Remove unused bdrv_requests_pending
       qemu-img: Document --force-share / -U
       docs: Document share-rw property more thoroughly
-Kevin Wolf (1):
+John Snow (1):
-      Merge remote-tracking branch 'mreitz/tags/pull-block-2018-02-13' into queue-block
+      iotests: fix 197 for vpc
-Max Reitz (8):
+Kevin Wolf (27):
-      iotests: Use virtio-blk in 155
+      block: Formats don't need CONSISTENT_READ with NO_IO
-      gluster: Move glfs_close() to create's clean-up
+      block: Make bdrv_drain_invoke() recursive
-      gluster: Pull truncation from qemu_gluster_create
+      block: Call .drain_begin only once in bdrv_drain_all_begin()
-      gluster: Query current size in do_truncate()
+      test-bdrv-drain: Test BlockDriver callbacks for drain
-      gluster: Add preallocated truncation
+      block: bdrv_drain_recurse(): Remove unused begin parameter
-      sheepdog: Make sd_prealloc() take a BDS
+      block: Don't wait for requests in bdrv_drain*_end()
-      sheepdog: Pass old and new size to sd_prealloc()
+      block: Unify order in drain functions
-      sheepdog: Allow fully preallocated truncation
+      block: Don't acquire AioContext in hmp_qemu_io()
       block: Document that x-blockdev-change breaks quorum children list
       block: Assert drain_all is only called from main AioContext
       block: Make bdrv_drain() driver callbacks non-recursive
       test-bdrv-drain: Test callback for bdrv_drain
       test-bdrv-drain: Test bs->quiesce_counter
       blockjob: Pause job on draining any job BDS
       test-bdrv-drain: Test drain vs. block jobs
       block: Don't block_job_pause_all() in bdrv_drain_all()
       block: Nested drain_end must still call callbacks
       test-bdrv-drain: Test nested drain sections
       block: Don't notify parents in drain call chain
       block: Add bdrv_subtree_drained_begin/end()
       test-bdrv-drain: Tests for bdrv_subtree_drain
       test-bdrv-drain: Test behaviour in coroutine context
       test-bdrv-drain: Recursive draining with multiple parents
       block: Allow graph changes in subtree drained section
       test-bdrv-drain: Test graph changes in drained section
       commit: Simplify reopen of base
       block: Keep nodes drained between reopen_queue/multiple
-Paolo Bonzini (1):
+Thomas Huth (3):
-      block: early check for blockers on drive-mirror
+      block: Remove the obsolete -drive boot=on|off parameter
       block: Remove the deprecated -hdachs option
       block: Mention -drive cyls/heads/secs/trans/serial/addr in deprecation chapter
-Vladimir Sementsov-Ogievskiy (1):
+ qapi/block-core.json             |   4 +
-      block: maintain persistent disabled bitmaps
+ block/qcow2.h                    |   3 -
  include/block/block.h            |  15 +-
  include/block/block_int.h        |   6 +-
  block.c                          |  75 ++++-
  block/commit.c                   |   8 +-
  block/io.c                       | 164 +++++++---
  block/qcow2.c                    |  51 +--
  block/replication.c              |   6 +
  blockdev.c                       |  11 -
  blockjob.c                       |  22 +-
  hmp.c                            |   6 -
  hw/block/nvme.c                  | 349 +++++++++++++++++----
  qemu-io-cmds.c                   |   3 +
  tests/test-bdrv-drain.c          | 651 +++++++++++++++++++++++++++++++++++++++
  vl.c                             |  86 +-----
  hw/block/trace-events            |  93 ++++++
  qemu-doc.texi                    |  29 +-
  qemu-options.hx                  |  19 +-
  tests/Makefile.include           |   2 +
  tests/qemu-iotests/197           |   4 +
  tests/qemu-iotests/common.filter |   3 +-
 files changed, 1294 insertions(+), 316 deletions(-)
  create mode 100644 tests/test-bdrv-drain.c
- qapi/block-core.json                           |  12 +-
- block/qcow2.h                                  |  33 +-
- include/block/dirty-bitmap.h                   |   1 -
- block/dirty-bitmap.c                           |  18 -
- block/gluster.c                                | 116 +++---
- block/qcow2-bitmap.c                           |  12 +-
- block/qcow2-cache.c                            |  80 ++--
- block/qcow2-cluster.c                          | 519 +++++++++++++------------
- block/qcow2-refcount.c                         | 206 +++++-----
- block/qcow2.c                                  |  63 ++-
- block/sheepdog.c                               |  56 ++-
- blockdev.c                                     |  15 +-
- qemu-io.c                                      |  27 +-
- docs/qemu-block-drivers.texi                   |  10 +
- qemu-doc.texi                                  |   7 +
- qemu-img.texi                                  |  74 ++--
- tests/qemu-iotests/059.out                     |   2 +-
- tests/qemu-iotests/061                         |  16 +
- tests/qemu-iotests/061.out                     |  61 +++
- tests/qemu-iotests/103                         |  17 +
- tests/qemu-iotests/103.out                     |   3 +
- tests/qemu-iotests/137                         |   5 +
- tests/qemu-iotests/137.out                     |   2 +
- tests/qemu-iotests/155                         |  14 +-
- tests/qemu-iotests/165                         |   2 +-
- tests/qemu-iotests/176                         |   2 +-
- tests/qemu-iotests/sample_images/afl9.vmdk.bz2 | Bin 178 -> 618 bytes
-files changed, 816 insertions(+), 557 deletions(-)

-[Qemu-devel] [PULL 09/55] gluster: Move glfs_close() to create's clean-up
+[Qemu-devel] [PULL v3 01/35] block: Formats don't need CONSISTENT_READ with NO_IO
-From: Max Reitz <mreitz@redhat.com>
+Commit 1f4ad7d fixed 'qemu-img info' for raw images that are currently
 in use as a mirror target. It is not enough for image formats, though,
 as these still unconditionally request BLK_PERM_CONSISTENT_READ.
-glfs_close() is a classical clean-up operation, as can be seen by the
+As this permission is geared towards whether the guest-visible data is
-fact that it is executed even if the truncation before it failed.
+consistent, and has no impact on whether the metadata is sane, and
-Also, moving it to clean-up makes it more clear that if it fails, we do
+'qemu-img info' does not read guest-visible data (except for the raw
-not want it to overwrite the current ret value if that signifies an
+format), it makes sense to not require BLK_PERM_CONSISTENT_READ if there
-error already.
+is not going to be any guest I/O performed, regardless of image format.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- block/gluster.c | 10 ++++++----
+ block.c | 6 +++++-
-file changed, 6 insertions(+), 4 deletions(-)
+file changed, 5 insertions(+), 1 deletion(-)
-diff --git a/block/gluster.c b/block/gluster.c
+diff --git a/block.c b/block.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/gluster.c
+--- a/block.c
-+++ b/block/gluster.c
++++ b/block.c
-@@ -XXX,XX +XXX,XX @@ static int qemu_gluster_create(const char *filename,
+@@ -XXX,XX +XXX,XX @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
- {
+     assert(role == &child_backing || role == &child_file);
-     BlockdevOptionsGluster *gconf;
-     struct glfs *glfs;
+     if (!backing) {
--    struct glfs_fd *fd;
++        int flags = bdrv_reopen_get_flags(reopen_queue, bs);
-+    struct glfs_fd *fd = NULL;
++
-     int ret = 0;
+         /* Apart from the modifications below, the same permissions are
-     PreallocMode prealloc;
+          * forwarded and left alone as for filters */
-     int64_t total_size = 0;
+         bdrv_filter_default_perms(bs, c, role, reopen_queue, perm, shared,
-@@ -XXX,XX +XXX,XX @@ static int qemu_gluster_create(const char *filename,
+@@ -XXX,XX +XXX,XX @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
-         break;
-     }
+         /* bs->file always needs to be consistent because of the metadata. We
+          * can never allow other users to resize or write to it. */
--    if (glfs_close(fd) != 0) {
+-        perm |= BLK_PERM_CONSISTENT_READ;
--        ret = -errno;
++        if (!(flags & BDRV_O_NO_IO)) {
--    }
++            perm |= BLK_PERM_CONSISTENT_READ;
  out:
 +    if (fd) {
 +        if (glfs_close(fd) != 0 && ret == 0) {
 +            ret = -errno;
 +        }
-+    }
+         shared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
-     qapi_free_BlockdevOptionsGluster(gconf);
+     } else {
-     glfs_clear_preopened(glfs);
+         /* We want consistent read from backing files if the parent needs it.
      return ret;
 --
 .13.6

-[Qemu-devel] [PULL 36/55] qcow2: Update qcow2_get_cluster_offset() to support L2 slices
+[Qemu-devel] [PULL v3 02/35] iotests: fix 197 for vpc
-From: Alberto Garcia <berto@igalia.com>
+From: John Snow <jsnow@redhat.com>
-qcow2_get_cluster_offset() checks how many contiguous bytes are
+VPC has some difficulty creating geometries of particular size.
-available at a given offset. The returned number of bytes is limited
+However, we can indeed force it to use a literal one, so let's
-by the amount that can be addressed without having to load more than
+do that for the sake of test 197, which is testing some specific
-one L2 table.
+offsets.
-Since we'll be loading L2 slices instead of full tables this patch
+Signed-off-by: John Snow <jsnow@redhat.com>
-changes the limit accordingly using the size of the L2 slice for the
+Reviewed-by: Eric Blake <eblake@redhat.com>
-calculations instead of the full table size.
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 Reviewed-by: Lukáš Doktor <ldoktor@redhat.com>
 ---
  tests/qemu-iotests/197           | 4 ++++
  tests/qemu-iotests/common.filter | 3 ++-
 files changed, 6 insertions(+), 1 deletion(-)
-One consequence of this is that with small L2 slices operations such
+diff --git a/tests/qemu-iotests/197 b/tests/qemu-iotests/197
-as 'qemu-img map' will need to iterate in more steps because each
+index XXXXXXX..XXXXXXX 100755
-qcow2_get_cluster_offset() call will potentially return a smaller
+--- a/tests/qemu-iotests/197
-number. However the code is already prepared for that so this doesn't
++++ b/tests/qemu-iotests/197
-break semantics.
+@@ -XXX,XX +XXX,XX @@ echo '=== Copy-on-read ==='
+ echo
-The l2_table variable is also renamed to l2_slice to reflect this, and
-offset_to_l2_index() is replaced with offset_to_l2_slice_index().
+ # Prep the images
++# VPC rounds image sizes to a specific geometry, force a specific size.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
++if [ "$IMGFMT" = "vpc" ]; then
-Reviewed-by: Eric Blake <eblake@redhat.com>
++    IMGOPTS=$(_optstr_add "$IMGOPTS" "force_size")
-Reviewed-by: Max Reitz <mreitz@redhat.com>
++fi
-Message-id: 6b602260acb33da56ed6af9611731cb7acd110eb.1517840877.git.berto@igalia.com
+ _make_test_img 4G
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+ $QEMU_IO -c "write -P 55 3G 1k" "$TEST_IMG" | _filter_qemu_io
----
+ IMGPROTO=file IMGFMT=qcow2 IMGOPTS= TEST_IMG_FILE="$TEST_WRAP" \
- block/qcow2-cluster.c | 30 +++++++++++++++---------------
+diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter
 file changed, 15 insertions(+), 15 deletions(-)
 diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
+--- a/tests/qemu-iotests/common.filter
-+++ b/block/qcow2-cluster.c
++++ b/tests/qemu-iotests/common.filter
-@@ -XXX,XX +XXX,XX @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
+@@ -XXX,XX +XXX,XX @@ _filter_img_create()
- {
+         -e "s# log_size=[0-9]\\+##g" \
-     BDRVQcow2State *s = bs->opaque;
+         -e "s# refcount_bits=[0-9]\\+##g" \
-     unsigned int l2_index;
+         -e "s# key-secret=[a-zA-Z0-9]\\+##g" \
--    uint64_t l1_index, l2_offset, *l2_table;
+-        -e "s# iter-time=[0-9]\\+##g"
--    int l1_bits, c;
++        -e "s# iter-time=[0-9]\\+##g" \
-+    uint64_t l1_index, l2_offset, *l2_slice;
++        -e "s# force_size=\\(on\\|off\\)##g"
 +    int c;
      unsigned int offset_in_cluster;
      uint64_t bytes_available, bytes_needed, nb_clusters;
      QCow2ClusterType type;
@@ -XXX,XX +XXX,XX @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
      offset_in_cluster = offset_into_cluster(s, offset);
      bytes_needed = (uint64_t) *bytes + offset_in_cluster;
 -    l1_bits = s->l2_bits + s->cluster_bits;
 -
      /* compute how many bytes there are between the start of the cluster
 -     * containing offset and the end of the l1 entry */
 -    bytes_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1))
 -                    + offset_in_cluster;
 +     * containing offset and the end of the l2 slice that contains
 +     * the entry pointing to it */
 +    bytes_available =
 +        ((uint64_t) (s->l2_slice_size - offset_to_l2_slice_index(s, offset)))
 +        << s->cluster_bits;
      if (bytes_needed > bytes_available) {
          bytes_needed = bytes_available;
@@ -XXX,XX +XXX,XX @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
          return -EIO;
      }
 -    /* load the l2 table in memory */
 +    /* load the l2 slice in memory */
 -    ret = l2_load(bs, offset, l2_offset, &l2_table);
 +    ret = l2_load(bs, offset, l2_offset, &l2_slice);
      if (ret < 0) {
          return ret;
      }
      /* find the cluster offset for the given disk offset */
 -    l2_index = offset_to_l2_index(s, offset);
 -    *cluster_offset = be64_to_cpu(l2_table[l2_index]);
 +    l2_index = offset_to_l2_slice_index(s, offset);
 +    *cluster_offset = be64_to_cpu(l2_slice[l2_index]);
      nb_clusters = size_to_clusters(s, bytes_needed);
      /* bytes_needed <= *bytes + offset_in_cluster, both of which are unsigned
@@ -XXX,XX +XXX,XX @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
      case QCOW2_CLUSTER_UNALLOCATED:
          /* how many empty clusters ? */
          c = count_contiguous_clusters_unallocated(nb_clusters,
 -                                                  &l2_table[l2_index], type);
 +                                                  &l2_slice[l2_index], type);
          *cluster_offset = 0;
          break;
      case QCOW2_CLUSTER_ZERO_ALLOC:
      case QCOW2_CLUSTER_NORMAL:
          /* how many allocated clusters ? */
          c = count_contiguous_clusters(nb_clusters, s->cluster_size,
 -                                      &l2_table[l2_index], QCOW_OFLAG_ZERO);
 +                                      &l2_slice[l2_index], QCOW_OFLAG_ZERO);
          *cluster_offset &= L2E_OFFSET_MASK;
          if (offset_into_cluster(s, *cluster_offset)) {
              qcow2_signal_corruption(bs, true, -1, -1,
@@ -XXX,XX +XXX,XX @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
          abort();
      }
 -    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
 +    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
      bytes_available = (int64_t)c * s->cluster_size;
@@ -XXX,XX +XXX,XX @@ out:
      return type;
  fail:
 -    qcow2_cache_put(s->l2_table_cache, (void **)&l2_table);
 +    qcow2_cache_put(s->l2_table_cache, (void **)&l2_slice);
      return ret;
  }
+ _filter_img_info()
 --
 .13.6

-[Qemu-devel] [PULL 52/55] qcow2: Allow configuring the L2 slice size
+[Qemu-devel] [PULL v3 03/35] block: Make bdrv_drain_invoke() recursive
-From: Alberto Garcia <berto@igalia.com>
+This change separates bdrv_drain_invoke(), which calls the BlockDriver
 drain callbacks, from bdrv_drain_recurse(). Instead, the function
 performs its own recursion now.
-Now that the code is ready to handle L2 slices we can finally add an
+One reason for this is that bdrv_drain_recurse() can be called multiple
-option to allow configuring their size.
+times by bdrv_drain_all_begin(), but the callbacks may only be called
 once. The separation is necessary to fix this bug.
-An L2 slice is the portion of an L2 table that is read by the qcow2
+The other reason is that we intend to go to a model where we call all
-cache. Until now the cache was always reading full L2 tables, and
+driver callbacks first, and only then start polling. This is not fully
-since the L2 table size is equal to the cluster size this was not very
+achieved yet with this patch, as bdrv_drain_invoke() contains a
-efficient with large clusters. Here's a more detailed explanation of
+BDRV_POLL_WHILE() loop for the block driver callbacks, which can still
-why it makes sense to have smaller cache entries in order to load L2
+call callbacks for any unrelated event. It's a step in this direction
-data:
+anyway.
-   https://lists.gnu.org/archive/html/qemu-block/2017-09/msg00635.html
+Cc: qemu-stable@nongnu.org
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  block/io.c | 14 +++++++++++---
 file changed, 11 insertions(+), 3 deletions(-)
-This patch introduces a new command-line option to the qcow2 driver
+diff --git a/block/io.c b/block/io.c
 named l2-cache-entry-size (cf. l2-cache-size). The cache entry size
 has the same restrictions as the cluster size: it must be a power of
 two and it has the same range of allowed values, with the additional
 requirement that it must not be larger than the cluster size.
 The L2 cache entry size (L2 slice size) remains equal to the cluster
 size for now by default, so this feature must be explicitly enabled.
 Although my tests show that 4KB slices consistently improve
 performance and give the best results, let's wait and make more tests
 with different cluster sizes before deciding on an optimal default.
 Now that the cache entry size is not necessarily equal to the cluster
 size we need to reflect that in the MIN_L2_CACHE_SIZE documentation.
 That minimum value is a requirement of the COW algorithm: we need to
 read two L2 slices (and not two L2 tables) in order to do COW, see
 l2_allocate() for the actual code.
 Signed-off-by: Alberto Garcia <berto@igalia.com>
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Max Reitz <mreitz@redhat.com>
 Message-id: c73e5611ff4a9ec5d20de68a6c289553a13d2354.1517840877.git.berto@igalia.com
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
  qapi/block-core.json |  6 ++++++
  block/qcow2.h        |  6 ++++--
  block/qcow2-cache.c  | 10 ++++++++--
  block/qcow2.c        | 34 +++++++++++++++++++++++++++-------
 files changed, 45 insertions(+), 11 deletions(-)
 diff --git a/qapi/block-core.json b/qapi/block-core.json
 index XXXXXXX..XXXXXXX 100644
---- a/qapi/block-core.json
+--- a/block/io.c
-+++ b/qapi/block-core.json
++++ b/block/io.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
- # @l2-cache-size:         the maximum size of the L2 table cache in
+     bdrv_wakeup(bs);
  #                         bytes (since 2.2)
  #
 +# @l2-cache-entry-size:   the size of each entry in the L2 cache in
 +#                         bytes. It must be a power of two between 512
 +#                         and the cluster size. The default value is
 +#                         the cluster size (since 2.12)
 +#
  # @refcount-cache-size:   the maximum size of the refcount block cache
  #                         in bytes (since 2.2)
  #
@@ -XXX,XX +XXX,XX @@
              '*overlap-check': 'Qcow2OverlapChecks',
              '*cache-size': 'int',
              '*l2-cache-size': 'int',
 +            '*l2-cache-entry-size': 'int',
              '*refcount-cache-size': 'int',
              '*cache-clean-interval': 'int',
              '*encrypt': 'BlockdevQcow2Encryption' } }
 diff --git a/block/qcow2.h b/block/qcow2.h
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow2.h
 +++ b/block/qcow2.h
@@ -XXX,XX +XXX,XX @@
  #define MAX_CLUSTER_BITS 21
  /* Must be at least 2 to cover COW */
 -#define MIN_L2_CACHE_SIZE 2 /* clusters */
 +#define MIN_L2_CACHE_SIZE 2 /* cache entries */
  /* Must be at least 4 to cover all cases of refcount table growth */
  #define MIN_REFCOUNT_CACHE_SIZE 4 /* clusters */
@@ -XXX,XX +XXX,XX @@
  #define QCOW2_OPT_OVERLAP_INACTIVE_L2 "overlap-check.inactive-l2"
  #define QCOW2_OPT_CACHE_SIZE "cache-size"
  #define QCOW2_OPT_L2_CACHE_SIZE "l2-cache-size"
 +#define QCOW2_OPT_L2_CACHE_ENTRY_SIZE "l2-cache-entry-size"
  #define QCOW2_OPT_REFCOUNT_CACHE_SIZE "refcount-cache-size"
  #define QCOW2_OPT_CACHE_CLEAN_INTERVAL "cache-clean-interval"
@@ -XXX,XX +XXX,XX @@ void qcow2_free_snapshots(BlockDriverState *bs);
  int qcow2_read_snapshots(BlockDriverState *bs);
  /* qcow2-cache.c functions */
 -Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables);
 +Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables,
 +                               unsigned table_size);
  int qcow2_cache_destroy(Qcow2Cache *c);
  void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table);
 diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow2-cache.c
 +++ b/block/qcow2-cache.c
@@ -XXX,XX +XXX,XX @@ void qcow2_cache_clean_unused(Qcow2Cache *c)
      c->cache_clean_lru_counter = c->lru_counter;
  }
--Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
++/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
-+Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables,
+ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
 +                               unsigned table_size)
  {
-     BDRVQcow2State *s = bs->opaque;
++    BdrvChild *child, *tmp;
-     Qcow2Cache *c;
+     BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};
-+    assert(num_tables > 0);
+     if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
-+    assert(is_power_of_2(table_size));
+@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
-+    assert(table_size >= (1 << MIN_CLUSTER_BITS));
+     data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
-+    assert(table_size <= s->cluster_size);
+     bdrv_coroutine_enter(bs, data.co);
      BDRV_POLL_WHILE(bs, !data.done);
 +
-     c = g_new0(Qcow2Cache, 1);
++    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
-     c->size = num_tables;
++        bdrv_drain_invoke(child->bs, begin);
 -    c->table_size = s->cluster_size;
 +    c->table_size = table_size;
      c->entries = g_try_new0(Qcow2CachedTable, num_tables);
      c->table_array = qemu_try_blockalign(bs->file->bs,
                                           (size_t) num_tables * c->table_size);
 diff --git a/block/qcow2.c b/block/qcow2.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow2.c
 +++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static QemuOptsList qcow2_runtime_opts = {
              .help = "Maximum L2 table cache size",
          },
          {
 +            .name = QCOW2_OPT_L2_CACHE_ENTRY_SIZE,
 +            .type = QEMU_OPT_SIZE,
 +            .help = "Size of each entry in the L2 cache",
 +        },
 +        {
              .name = QCOW2_OPT_REFCOUNT_CACHE_SIZE,
              .type = QEMU_OPT_SIZE,
              .help = "Maximum refcount block cache size",
@@ -XXX,XX +XXX,XX @@ static void qcow2_attach_aio_context(BlockDriverState *bs,
  static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
                               uint64_t *l2_cache_size,
 +                             uint64_t *l2_cache_entry_size,
                               uint64_t *refcount_cache_size, Error **errp)
  {
      BDRVQcow2State *s = bs->opaque;
@@ -XXX,XX +XXX,XX @@ static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
      *refcount_cache_size = qemu_opt_get_size(opts,
                                               QCOW2_OPT_REFCOUNT_CACHE_SIZE, 0);
 +    *l2_cache_entry_size = qemu_opt_get_size(
 +        opts, QCOW2_OPT_L2_CACHE_ENTRY_SIZE, s->cluster_size);
 +
      if (combined_cache_size_set) {
          if (l2_cache_size_set && refcount_cache_size_set) {
              error_setg(errp, QCOW2_OPT_CACHE_SIZE ", " QCOW2_OPT_L2_CACHE_SIZE
@@ -XXX,XX +XXX,XX @@ static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
                                   / DEFAULT_L2_REFCOUNT_SIZE_RATIO;
          }
      }
 +
 +    if (*l2_cache_entry_size < (1 << MIN_CLUSTER_BITS) ||
 +        *l2_cache_entry_size > s->cluster_size ||
 +        !is_power_of_2(*l2_cache_entry_size)) {
 +        error_setg(errp, "L2 cache entry size must be a power of two "
 +                   "between %d and the cluster size (%d)",
 +                   1 << MIN_CLUSTER_BITS, s->cluster_size);
 +        return;
 +    }
  }
- typedef struct Qcow2ReopenState {
+ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
-@@ -XXX,XX +XXX,XX @@ static int qcow2_update_options_prepare(BlockDriverState *bs,
+@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
-     QemuOpts *opts = NULL;
+     BdrvChild *child, *tmp;
-     const char *opt_overlap_check, *opt_overlap_check_template;
+     bool waited;
-     int overlap_check_template = 0;
--    uint64_t l2_cache_size, refcount_cache_size;
+-    /* Ensure any pending metadata writes are submitted to bs->file.  */
-+    uint64_t l2_cache_size, l2_cache_entry_size, refcount_cache_size;
+-    bdrv_drain_invoke(bs, begin);
-     int i;
+-
-     const char *encryptfmt;
+     /* Wait for drained requests to finish */
-     QDict *encryptopts = NULL;
+     waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
-@@ -XXX,XX +XXX,XX @@ static int qcow2_update_options_prepare(BlockDriverState *bs,
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
          bdrv_parent_drained_begin(bs);
      }
-     /* get L2 table/refcount block cache size from command line options */
++    bdrv_drain_invoke(bs, true);
--    read_cache_sizes(bs, opts, &l2_cache_size, &refcount_cache_size,
+     bdrv_drain_recurse(bs, true);
--                     &local_err);
+ }
-+    read_cache_sizes(bs, opts, &l2_cache_size, &l2_cache_entry_size,
-+                     &refcount_cache_size, &local_err);
+@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
      if (local_err) {
          error_propagate(errp, local_err);
          ret = -EINVAL;
          goto fail;
      }
--    l2_cache_size /= s->cluster_size;
+     bdrv_parent_drained_end(bs);
-+    l2_cache_size /= l2_cache_entry_size;
++    bdrv_drain_invoke(bs, false);
-     if (l2_cache_size < MIN_L2_CACHE_SIZE) {
+     bdrv_drain_recurse(bs, false);
-         l2_cache_size = MIN_L2_CACHE_SIZE;
+     aio_enable_external(bdrv_get_aio_context(bs));
  }
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
              aio_context_acquire(aio_context);
              for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
                  if (aio_context == bdrv_get_aio_context(bs)) {
 +                    /* FIXME Calling this multiple times is wrong */
 +                    bdrv_drain_invoke(bs, true);
                      waited |= bdrv_drain_recurse(bs, true);
                  }
              }
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
          aio_context_acquire(aio_context);
          aio_enable_external(aio_context);
          bdrv_parent_drained_end(bs);
 +        bdrv_drain_invoke(bs, false);
          bdrv_drain_recurse(bs, false);
          aio_context_release(aio_context);
      }
-@@ -XXX,XX +XXX,XX @@ static int qcow2_update_options_prepare(BlockDriverState *bs,
-         }
-     }
--    r->l2_slice_size = s->cluster_size / sizeof(uint64_t);
--    r->l2_table_cache = qcow2_cache_create(bs, l2_cache_size);
--    r->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size);
-+    r->l2_slice_size = l2_cache_entry_size / sizeof(uint64_t);
-+    r->l2_table_cache = qcow2_cache_create(bs, l2_cache_size,
-+                                           l2_cache_entry_size);
-+    r->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size,
-+                                                 s->cluster_size);
-     if (r->l2_table_cache == NULL || r->refcount_block_cache == NULL) {
-         error_setg(errp, "Could not allocate metadata caches");
-         ret = -ENOMEM;
 --
 .13.6

-[Qemu-devel] [PULL 45/55] qcow2: Prepare expand_zero_clusters_in_l1() for adding L2 slice support
+[Qemu-devel] [PULL v3 04/35] block: Call .drain_begin only once in bdrv_drain_all_begin()
-From: Alberto Garcia <berto@igalia.com>
+bdrv_drain_all_begin() used to call the .bdrv_co_drain_begin() driver
 callback inside its polling loop. This means that how many times it got
 called for each node depended on how long it had to poll the event loop.
-Adding support for L2 slices to expand_zero_clusters_in_l1() needs
+This is obviously not right and results in nodes that stay drained even
-(among other things) an extra loop that iterates over all slices of
+after bdrv_drain_all_end(), which calls .bdrv_co_drain_end() once per
-each L2 table.
+node.
-Putting all changes in one patch would make it hard to read because
+Fix bdrv_drain_all_begin() to call the callback only once, too.
 all semantic changes would be mixed with pure indentation changes.
-To make things easier this patch simply creates a new block and
+Cc: qemu-stable@nongnu.org
-changes the indentation of all lines of code inside it. Thus, all
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-modifications in this patch are cosmetic. There are no semantic
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-changes and no variables are renamed yet. The next patch will take
+---
-care of that.
+ block/io.c | 3 +--
 file changed, 1 insertion(+), 2 deletions(-)
-Signed-off-by: Alberto Garcia <berto@igalia.com>
+diff --git a/block/io.c b/block/io.c
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Max Reitz <mreitz@redhat.com>
 Message-id: c2ae9f31ed5b6e591477ad4654448badd1c89d73.1517840877.git.berto@igalia.com
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
  block/qcow2-cluster.c | 187 ++++++++++++++++++++++++++------------------------
 file changed, 96 insertions(+), 91 deletions(-)
 diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
+--- a/block/io.c
-+++ b/block/qcow2-cluster.c
++++ b/block/io.c
-@@ -XXX,XX +XXX,XX @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
+@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
-             goto fail;
+         aio_context_acquire(aio_context);
-         }
+         bdrv_parent_drained_begin(bs);
+         aio_disable_external(aio_context);
--        if (is_active_l1) {
++        bdrv_drain_invoke(bs, true);
--            /* get active L2 tables from cache */
+         aio_context_release(aio_context);
--            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
--                    (void **)&l2_table);
+         if (!g_slist_find(aio_ctxs, aio_context)) {
--        } else {
+@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
--            /* load inactive L2 tables from disk */
+             aio_context_acquire(aio_context);
--            ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE,
+             for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
--                            (void *)l2_table, s->cluster_sectors);
+                 if (aio_context == bdrv_get_aio_context(bs)) {
--        }
+-                    /* FIXME Calling this multiple times is wrong */
--        if (ret < 0) {
+-                    bdrv_drain_invoke(bs, true);
--            goto fail;
+                     waited |= bdrv_drain_recurse(bs, true);
 -        }
 -
 -        for (j = 0; j < s->l2_size; j++) {
 -            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
 -            int64_t offset = l2_entry & L2E_OFFSET_MASK;
 -            QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);
 -
 -            if (cluster_type != QCOW2_CLUSTER_ZERO_PLAIN &&
 -                cluster_type != QCOW2_CLUSTER_ZERO_ALLOC) {
 -                continue;
 +        {
 +            if (is_active_l1) {
 +                /* get active L2 tables from cache */
 +                ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
 +                                      (void **)&l2_table);
 +            } else {
 +                /* load inactive L2 tables from disk */
 +                ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE,
 +                                (void *)l2_table, s->cluster_sectors);
 +            }
 +            if (ret < 0) {
 +                goto fail;
              }
 -            if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
 -                if (!bs->backing) {
 -                    /* not backed; therefore we can simply deallocate the
 -                     * cluster */
 -                    l2_table[j] = 0;
 -                    l2_dirty = true;
 +            for (j = 0; j < s->l2_size; j++) {
 +                uint64_t l2_entry = be64_to_cpu(l2_table[j]);
 +                int64_t offset = l2_entry & L2E_OFFSET_MASK;
 +                QCow2ClusterType cluster_type =
 +                    qcow2_get_cluster_type(l2_entry);
 +
 +                if (cluster_type != QCOW2_CLUSTER_ZERO_PLAIN &&
 +                    cluster_type != QCOW2_CLUSTER_ZERO_ALLOC) {
                      continue;
                  }
 -                offset = qcow2_alloc_clusters(bs, s->cluster_size);
 -                if (offset < 0) {
 -                    ret = offset;
 -                    goto fail;
 -                }
 +                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
 +                    if (!bs->backing) {
 +                        /* not backed; therefore we can simply deallocate the
 +                         * cluster */
 +                        l2_table[j] = 0;
 +                        l2_dirty = true;
 +                        continue;
 +                    }
 +
 +                    offset = qcow2_alloc_clusters(bs, s->cluster_size);
 +                    if (offset < 0) {
 +                        ret = offset;
 +                        goto fail;
 +                    }
 -                if (l2_refcount > 1) {
 -                    /* For shared L2 tables, set the refcount accordingly (it is
 -                     * already 1 and needs to be l2_refcount) */
 -                    ret = qcow2_update_cluster_refcount(bs,
 -                            offset >> s->cluster_bits,
 +                    if (l2_refcount > 1) {
 +                        /* For shared L2 tables, set the refcount accordingly
 +                         * (it is already 1 and needs to be l2_refcount) */
 +                        ret = qcow2_update_cluster_refcount(
 +                            bs, offset >> s->cluster_bits,
                              refcount_diff(1, l2_refcount), false,
                              QCOW2_DISCARD_OTHER);
 -                    if (ret < 0) {
 -                        qcow2_free_clusters(bs, offset, s->cluster_size,
 -                                            QCOW2_DISCARD_OTHER);
 -                        goto fail;
 +                        if (ret < 0) {
 +                            qcow2_free_clusters(bs, offset, s->cluster_size,
 +                                                QCOW2_DISCARD_OTHER);
 +                            goto fail;
 +                        }
                      }
                  }
 -            }
 -            if (offset_into_cluster(s, offset)) {
 -                qcow2_signal_corruption(bs, true, -1, -1,
 -                                        "Cluster allocation offset "
 -                                        "%#" PRIx64 " unaligned (L2 offset: %#"
 -                                        PRIx64 ", L2 index: %#x)", offset,
 -                                        l2_offset, j);
 -                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
 -                    qcow2_free_clusters(bs, offset, s->cluster_size,
 -                                        QCOW2_DISCARD_ALWAYS);
 +                if (offset_into_cluster(s, offset)) {
 +                    qcow2_signal_corruption(
 +                        bs, true, -1, -1,
 +                        "Cluster allocation offset "
 +                        "%#" PRIx64 " unaligned (L2 offset: %#"
 +                        PRIx64 ", L2 index: %#x)", offset,
 +                        l2_offset, j);
 +                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
 +                        qcow2_free_clusters(bs, offset, s->cluster_size,
 +                                            QCOW2_DISCARD_ALWAYS);
 +                    }
 +                    ret = -EIO;
 +                    goto fail;
                  }
 -                ret = -EIO;
 -                goto fail;
 -            }
 -            ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
 -            if (ret < 0) {
 -                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
 -                    qcow2_free_clusters(bs, offset, s->cluster_size,
 -                                        QCOW2_DISCARD_ALWAYS);
 +                ret = qcow2_pre_write_overlap_check(bs, 0, offset,
 +                                                    s->cluster_size);
 +                if (ret < 0) {
 +                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
 +                        qcow2_free_clusters(bs, offset, s->cluster_size,
 +                                            QCOW2_DISCARD_ALWAYS);
 +                    }
 +                    goto fail;
                  }
 -                goto fail;
 -            }
 -            ret = bdrv_pwrite_zeroes(bs->file, offset, s->cluster_size, 0);
 -            if (ret < 0) {
 -                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
 -                    qcow2_free_clusters(bs, offset, s->cluster_size,
 -                                        QCOW2_DISCARD_ALWAYS);
 +                ret = bdrv_pwrite_zeroes(bs->file, offset, s->cluster_size, 0);
 +                if (ret < 0) {
 +                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
 +                        qcow2_free_clusters(bs, offset, s->cluster_size,
 +                                            QCOW2_DISCARD_ALWAYS);
 +                    }
 +                    goto fail;
                  }
 -                goto fail;
 -            }
 -            if (l2_refcount == 1) {
 -                l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
 -            } else {
 -                l2_table[j] = cpu_to_be64(offset);
 +                if (l2_refcount == 1) {
 +                    l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
 +                } else {
 +                    l2_table[j] = cpu_to_be64(offset);
 +                }
 +                l2_dirty = true;
              }
 -            l2_dirty = true;
 -        }
 -        if (is_active_l1) {
 -            if (l2_dirty) {
 -                qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
 -                qcow2_cache_depends_on_flush(s->l2_table_cache);
 -            }
 -            qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
 -        } else {
 -            if (l2_dirty) {
 -                ret = qcow2_pre_write_overlap_check(bs,
 -                        QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset,
 -                        s->cluster_size);
 -                if (ret < 0) {
 -                    goto fail;
 +            if (is_active_l1) {
 +                if (l2_dirty) {
 +                    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
 +                    qcow2_cache_depends_on_flush(s->l2_table_cache);
                  }
 +                qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
 +            } else {
 +                if (l2_dirty) {
 +                    ret = qcow2_pre_write_overlap_check(
 +                        bs, QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2,
 +                        l2_offset, s->cluster_size);
 +                    if (ret < 0) {
 +                        goto fail;
 +                    }
 -                ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE,
 -                                 (void *)l2_table, s->cluster_sectors);
 -                if (ret < 0) {
 -                    goto fail;
 +                    ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE,
 +                                     (void *)l2_table, s->cluster_sectors);
 +                    if (ret < 0) {
 +                        goto fail;
 +                    }
                  }
              }
-         }
 --
 .13.6

-[Qemu-devel] [PULL 10/55] gluster: Pull truncation from qemu_gluster_create
+[Qemu-devel] [PULL v3 05/35] test-bdrv-drain: Test BlockDriver callbacks for drain
-From: Max Reitz <mreitz@redhat.com>
+This adds a test case that the BlockDriver callbacks for drain are
 called in bdrv_drain_all_begin/end(), and that both of them are called
 exactly once.
-Pull out the truncation code from the qemu_gluster_create() function so
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-we can later reuse it in qemu_gluster_truncate().
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Reviewed-by: Eric Blake <eblake@redhat.com>
 ---
  tests/test-bdrv-drain.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++
  tests/Makefile.include  |   2 +
 files changed, 139 insertions(+)
  create mode 100644 tests/test-bdrv-drain.c
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
-Reviewed-by: Eric Blake <eblake@redhat.com>
+new file mode 100644
-Signed-off-by: Kevin Wolf <kwolf@redhat.com>
+index XXXXXXX..XXXXXXX
----
+--- /dev/null
- block/gluster.c | 74 +++++++++++++++++++++++++++++++--------------------------
++++ b/tests/test-bdrv-drain.c
-file changed, 40 insertions(+), 34 deletions(-)
+@@ -XXX,XX +XXX,XX @@
++/*
-diff --git a/block/gluster.c b/block/gluster.c
++ * Block node draining tests
-index XXXXXXX..XXXXXXX 100644
++ *
---- a/block/gluster.c
++ * Copyright (c) 2017 Kevin Wolf <kwolf@redhat.com>
-+++ b/block/gluster.c
++ *
-@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs,
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
- }
++ * of this software and associated documentation files (the "Software"), to deal
- #endif
++ * in the Software without restriction, including without limitation the rights
++ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-+static int qemu_gluster_do_truncate(struct glfs_fd *fd, int64_t offset,
++ * copies of the Software, and to permit persons to whom the Software is
-+                                    PreallocMode prealloc, Error **errp)
++ * furnished to do so, subject to the following conditions:
 + *
 + * The above copyright notice and this permission notice shall be included in
 + * all copies or substantial portions of the Software.
 + *
 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 + * THE SOFTWARE.
 + */
 +
 +#include "qemu/osdep.h"
 +#include "block/block.h"
 +#include "sysemu/block-backend.h"
 +#include "qapi/error.h"
 +
 +typedef struct BDRVTestState {
 +    int drain_count;
 +} BDRVTestState;
 +
 +static void coroutine_fn bdrv_test_co_drain_begin(BlockDriverState *bs)
 +{
-+    switch (prealloc) {
++    BDRVTestState *s = bs->opaque;
-+#ifdef CONFIG_GLUSTERFS_FALLOCATE
++    s->drain_count++;
-+    case PREALLOC_MODE_FALLOC:
++}
-+        if (glfs_fallocate(fd, 0, 0, offset)) {
++
-+            error_setg_errno(errp, errno, "Could not preallocate data");
++static void coroutine_fn bdrv_test_co_drain_end(BlockDriverState *bs)
-+            return -errno;
++{
-+        }
++    BDRVTestState *s = bs->opaque;
-+        break;
++    s->drain_count--;
-+#endif /* CONFIG_GLUSTERFS_FALLOCATE */
++}
-+#ifdef CONFIG_GLUSTERFS_ZEROFILL
++
-+    case PREALLOC_MODE_FULL:
++static void bdrv_test_close(BlockDriverState *bs)
-+        if (glfs_ftruncate(fd, offset)) {
++{
-+            error_setg_errno(errp, errno, "Could not resize file");
++    BDRVTestState *s = bs->opaque;
-+            return -errno;
++    g_assert_cmpint(s->drain_count, >, 0);
-+        }
++}
-+        if (glfs_zerofill(fd, 0, offset)) {
++
-+            error_setg_errno(errp, errno, "Could not zerofill the new area");
++static int coroutine_fn bdrv_test_co_preadv(BlockDriverState *bs,
-+            return -errno;
++                                            uint64_t offset, uint64_t bytes,
-+        }
++                                            QEMUIOVector *qiov, int flags)
-+        break;
++{
-+#endif /* CONFIG_GLUSTERFS_ZEROFILL */
++    /* We want this request to stay until the polling loop in drain waits for
-+    case PREALLOC_MODE_OFF:
++     * it to complete. We need to sleep a while as bdrv_drain_invoke() comes
-+        if (glfs_ftruncate(fd, offset)) {
++     * first and polls its result, too, but it shouldn't accidentally complete
-+            error_setg_errno(errp, errno, "Could not resize file");
++     * this request yet. */
-+            return -errno;
++    qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
 +        }
 +        break;
 +    default:
 +        error_setg(errp, "Unsupported preallocation mode: %s",
 +                   PreallocMode_str(prealloc));
 +        return -EINVAL;
 +    }
 +
 +    return 0;
 +}
 +
- static int qemu_gluster_create(const char *filename,
++static BlockDriver bdrv_test = {
-                                QemuOpts *opts, Error **errp)
++    .format_name            = "test",
- {
++    .instance_size          = sizeof(BDRVTestState),
-@@ -XXX,XX +XXX,XX @@ static int qemu_gluster_create(const char *filename,
++
-         goto out;
++    .bdrv_close             = bdrv_test_close,
-     }
++    .bdrv_co_preadv         = bdrv_test_co_preadv,
++
--    switch (prealloc) {
++    .bdrv_co_drain_begin    = bdrv_test_co_drain_begin,
--#ifdef CONFIG_GLUSTERFS_FALLOCATE
++    .bdrv_co_drain_end      = bdrv_test_co_drain_end,
--    case PREALLOC_MODE_FALLOC:
++};
--        if (glfs_fallocate(fd, 0, 0, total_size)) {
++
--            error_setg(errp, "Could not preallocate data for the new file");
++static void aio_ret_cb(void *opaque, int ret)
--            ret = -errno;
++{
--        }
++    int *aio_ret = opaque;
--        break;
++    *aio_ret = ret;
--#endif /* CONFIG_GLUSTERFS_FALLOCATE */
++}
--#ifdef CONFIG_GLUSTERFS_ZEROFILL
++
--    case PREALLOC_MODE_FULL:
++static void test_drv_cb_drain_all(void)
--        if (!glfs_ftruncate(fd, total_size)) {
++{
--            if (glfs_zerofill(fd, 0, total_size)) {
++    BlockBackend *blk;
--                error_setg(errp, "Could not zerofill the new file");
++    BlockDriverState *bs;
--                ret = -errno;
++    BDRVTestState *s;
--            }
++    BlockAIOCB *acb;
--        } else {
++    int aio_ret;
--            error_setg(errp, "Could not resize file");
++
--            ret = -errno;
++    QEMUIOVector qiov;
--        }
++    struct iovec iov = {
--        break;
++        .iov_base = NULL,
--#endif /* CONFIG_GLUSTERFS_ZEROFILL */
++        .iov_len = 0,
--    case PREALLOC_MODE_OFF:
++    };
--        if (glfs_ftruncate(fd, total_size) != 0) {
++    qemu_iovec_init_external(&qiov, &iov, 1);
--            ret = -errno;
++
--            error_setg(errp, "Could not resize file");
++    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
--        }
++    bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
--        break;
++                              &error_abort);
--    default:
++    s = bs->opaque;
--        ret = -EINVAL;
++    blk_insert_bs(blk, bs, &error_abort);
--        error_setg(errp, "Unsupported preallocation mode: %s",
++
--                   PreallocMode_str(prealloc));
++    /* Simple bdrv_drain_all_begin/end pair, check that CBs are called */
--        break;
++    g_assert_cmpint(s->drain_count, ==, 0);
--    }
++    bdrv_drain_all_begin();
-+    ret = qemu_gluster_do_truncate(fd, total_size, prealloc, errp);
++    g_assert_cmpint(s->drain_count, ==, 1);
++    bdrv_drain_all_end();
- out:
++    g_assert_cmpint(s->drain_count, ==, 0);
-     if (fd) {
++
 +    /* Now do the same while a request is pending */
 +    aio_ret = -EINPROGRESS;
 +    acb = blk_aio_preadv(blk, 0, &qiov, 0, aio_ret_cb, &aio_ret);
 +    g_assert(acb != NULL);
 +    g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
 +
 +    g_assert_cmpint(s->drain_count, ==, 0);
 +    bdrv_drain_all_begin();
 +    g_assert_cmpint(aio_ret, ==, 0);
 +    g_assert_cmpint(s->drain_count, ==, 1);
 +    bdrv_drain_all_end();
 +    g_assert_cmpint(s->drain_count, ==, 0);
 +
 +    bdrv_unref(bs);
 +    blk_unref(blk);
 +}
 +
 +int main(int argc, char **argv)
 +{
 +    bdrv_init();
 +    qemu_init_main_loop(&error_abort);
 +
 +    g_test_init(&argc, &argv, NULL);
 +
 +    g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
 +
 +    return g_test_run();
 +}
 diff --git a/tests/Makefile.include b/tests/Makefile.include
 index XXXXXXX..XXXXXXX 100644
 --- a/tests/Makefile.include
 +++ b/tests/Makefile.include
@@ -XXX,XX +XXX,XX @@ gcov-files-test-thread-pool-y = thread-pool.c
  gcov-files-test-hbitmap-y = util/hbitmap.c
  check-unit-y += tests/test-hbitmap$(EXESUF)
  gcov-files-test-hbitmap-y = blockjob.c
 +check-unit-y += tests/test-bdrv-drain$(EXESUF)
  check-unit-y += tests/test-blockjob$(EXESUF)
  check-unit-y += tests/test-blockjob-txn$(EXESUF)
  check-unit-y += tests/test-x86-cpuid$(EXESUF)
@@ -XXX,XX +XXX,XX @@ tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
  tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
  tests/test-aio-multithread$(EXESUF): tests/test-aio-multithread.o $(test-block-obj-y)
  tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
 +tests/test-bdrv-drain$(EXESUF): tests/test-bdrv-drain.o $(test-block-obj-y) $(test-util-obj-y)
  tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y)
  tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y)
  tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(test-block-obj-y)
 --
 .13.6

-[Qemu-devel] [PULL 22/55] qcow2: Remove BDS parameter from qcow2_cache_entry_mark_dirty()
+[Qemu-devel] [PULL v3 06/35] block: bdrv_drain_recurse(): Remove unused begin parameter
-From: Alberto Garcia <berto@igalia.com>
+Now that the bdrv_drain_invoke() calls are pulled up to the callers of
 bdrv_drain_recurse(), the 'begin' parameter isn't needed any more.
-This function was only using the BlockDriverState parameter to pass it
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-to qcow2_cache_get_table_idx(). This is no longer necessary so this
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-parameter can be removed.
+---
  block/io.c | 12 ++++++------
 file changed, 6 insertions(+), 6 deletions(-)
-Signed-off-by: Alberto Garcia <berto@igalia.com>
+diff --git a/block/io.c b/block/io.c
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Max Reitz <mreitz@redhat.com>
 Message-id: 5c40516a91782b083c1428b7b6a41bb9e2679bfb.1517840876.git.berto@igalia.com
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
  block/qcow2.h          |  3 +--
  block/qcow2-cache.c    |  3 +--
  block/qcow2-cluster.c  | 12 ++++++------
  block/qcow2-refcount.c | 14 ++++++--------
 files changed, 14 insertions(+), 18 deletions(-)
 diff --git a/block/qcow2.h b/block/qcow2.h
 index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2.h
+--- a/block/io.c
-+++ b/block/qcow2.h
++++ b/block/io.c
-@@ -XXX,XX +XXX,XX @@ int qcow2_read_snapshots(BlockDriverState *bs);
+@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
- Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables);
+     }
  int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c);
 -void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c,
 -     void *table);
 +void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table);
  int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c);
  int qcow2_cache_write(BlockDriverState *bs, Qcow2Cache *c);
  int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c,
 diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow2-cache.c
 +++ b/block/qcow2-cache.c
@@ -XXX,XX +XXX,XX @@ void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table)
      assert(c->entries[i].ref >= 0);
  }
--void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c,
+-static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
--     void *table)
++static bool bdrv_drain_recurse(BlockDriverState *bs)
 +void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table)
  {
-     int i = qcow2_cache_get_table_idx(c, table);
+     BdrvChild *child, *tmp;
-     assert(c->entries[i].offset != 0);
+     bool waited;
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
+@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
-index XXXXXXX..XXXXXXX 100644
+              */
---- a/block/qcow2-cluster.c
+             bdrv_ref(bs);
-+++ b/block/qcow2-cluster.c
+         }
-@@ -XXX,XX +XXX,XX @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
+-        waited |= bdrv_drain_recurse(bs, begin);
-     BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
++        waited |= bdrv_drain_recurse(bs);
+         if (in_main_loop) {
-     trace_qcow2_l2_allocate_write_l2(bs, l1_index);
+             bdrv_unref(bs);
--    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
+         }
-+    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
      ret = qcow2_cache_flush(bs, s->l2_table_cache);
      if (ret < 0) {
          goto fail;
@@ -XXX,XX +XXX,XX @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
      /* compressed clusters never have the copied flag */
      BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
 -    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
 +    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
      l2_table[l2_index] = cpu_to_be64(cluster_offset);
      qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
@@ -XXX,XX +XXX,XX @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
      if (ret < 0) {
          goto err;
      }
--    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
-+    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+     bdrv_drain_invoke(bs, true);
+-    bdrv_drain_recurse(bs, true);
-     assert(l2_index + m->nb_clusters <= s->l2_size);
++    bdrv_drain_recurse(bs);
-     for (i = 0; i < m->nb_clusters; i++) {
+ }
-@@ -XXX,XX +XXX,XX @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
-         }
+ void bdrv_drained_end(BlockDriverState *bs)
+@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
-         /* First remove L2 entries */
--        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
+     bdrv_parent_drained_end(bs);
-+        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+     bdrv_drain_invoke(bs, false);
-         if (!full_discard && s->qcow_version >= 3) {
+-    bdrv_drain_recurse(bs, false);
-             l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
++    bdrv_drain_recurse(bs);
-         } else {
+     aio_enable_external(bdrv_get_aio_context(bs));
-@@ -XXX,XX +XXX,XX @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
+ }
-             continue;
-         }
+@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
+             aio_context_acquire(aio_context);
--        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
+             for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
-+        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+                 if (aio_context == bdrv_get_aio_context(bs)) {
-         if (cluster_type == QCOW2_CLUSTER_COMPRESSED || unmap) {
+-                    waited |= bdrv_drain_recurse(bs, true);
-             l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
++                    waited |= bdrv_drain_recurse(bs);
              qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
@@ -XXX,XX +XXX,XX @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
          if (is_active_l1) {
              if (l2_dirty) {
 -                qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
 +                qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
                  qcow2_cache_depends_on_flush(s->l2_table_cache);
              }
              qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
 diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow2-refcount.c
 +++ b/block/qcow2-refcount.c
@@ -XXX,XX +XXX,XX @@ static int alloc_refcount_block(BlockDriverState *bs,
      /* Now the new refcount block needs to be written to disk */
      BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
 -    qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block);
 +    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
      ret = qcow2_cache_flush(bs, s->refcount_block_cache);
      if (ret < 0) {
          goto fail;
@@ -XXX,XX +XXX,XX @@ int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                  goto fail;
              }
              memset(refblock_data, 0, s->cluster_size);
 -            qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
 +            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                           refblock_data);
              new_table[i] = block_offset;
@@ -XXX,XX +XXX,XX @@ int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                  s->set_refcount(refblock_data, j, 1);
              }
 -            qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
 +            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                           refblock_data);
          }
@@ -XXX,XX +XXX,XX @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
          }
          old_table_index = table_index;
 -        qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
 -                                     refcount_block);
 +        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
          /* we can update the count and save it */
          block_index = cluster_index & (s->refcount_block_size - 1);
@@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                              s->refcount_block_cache);
                      }
                      l2_table[j] = cpu_to_be64(entry);
 -                    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache,
 -                                                 l2_table);
 +                    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
                  }
              }
+             aio_context_release(aio_context);
-@@ -XXX,XX +XXX,XX @@ static int qcow2_discard_refcount_block(BlockDriverState *bs,
+@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
          aio_enable_external(aio_context);
          bdrv_parent_drained_end(bs);
          bdrv_drain_invoke(bs, false);
 -        bdrv_drain_recurse(bs, false);
 +        bdrv_drain_recurse(bs);
          aio_context_release(aio_context);
      }
-     s->set_refcount(refblock, block_index, 0);
--    qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, refblock);
-+    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);
-     qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
 --
 .13.6

-[Qemu-devel] [PULL 27/55] qcow2: Remove BDS parameter from qcow2_cache_is_table_offset()
+[Qemu-devel] [PULL v3 07/35] block: Don't wait for requests in bdrv_drain*_end()
-From: Alberto Garcia <berto@igalia.com>
+The device is drained, so there is no point in waiting for requests at
 the end of the drained section. Remove the bdrv_drain_recurse() calls
 there.
-This function was only using the BlockDriverState parameter to pass it
+The bdrv_drain_recurse() calls were introduced in commit 481cad48e5e
-to qcow2_cache_get_table_addr(). This is no longer necessary so this
+in order to call the .bdrv_co_drain_end() driver callback. This is now
-parameter can be removed.
+done by a separate bdrv_drain_invoke() call.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
+Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Message-id: eb0ed90affcf302e5a954bafb5931b5215483d3a.1517840877.git.berto@igalia.com
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
- block/qcow2.h          | 3 +--
+ block/io.c | 2 --
- block/qcow2-cache.c    | 3 +--
+file changed, 2 deletions(-)
  block/qcow2-refcount.c | 6 +++---
 files changed, 5 insertions(+), 7 deletions(-)
-diff --git a/block/qcow2.h b/block/qcow2.h
+diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2.h
+--- a/block/io.c
-+++ b/block/qcow2.h
++++ b/block/io.c
-@@ -XXX,XX +XXX,XX @@ int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
+@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
- int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
-     void **table);
+     bdrv_parent_drained_end(bs);
- void qcow2_cache_put(Qcow2Cache *c, void **table);
+     bdrv_drain_invoke(bs, false);
--void *qcow2_cache_is_table_offset(BlockDriverState *bs, Qcow2Cache *c,
+-    bdrv_drain_recurse(bs);
--                                  uint64_t offset);
+     aio_enable_external(bdrv_get_aio_context(bs));
 +void *qcow2_cache_is_table_offset(Qcow2Cache *c, uint64_t offset);
  void qcow2_cache_discard(Qcow2Cache *c, void *table);
  /* qcow2-bitmap.c functions */
 diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow2-cache.c
 +++ b/block/qcow2-cache.c
@@ -XXX,XX +XXX,XX @@ void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table)
      c->entries[i].dirty = true;
  }
--void *qcow2_cache_is_table_offset(BlockDriverState *bs, Qcow2Cache *c,
+@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
--                                  uint64_t offset)
+         aio_enable_external(aio_context);
-+void *qcow2_cache_is_table_offset(Qcow2Cache *c, uint64_t offset)
+         bdrv_parent_drained_end(bs);
- {
+         bdrv_drain_invoke(bs, false);
-     int i;
+-        bdrv_drain_recurse(bs);
+         aio_context_release(aio_context);
 diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow2-refcount.c
 +++ b/block/qcow2-refcount.c
@@ -XXX,XX +XXX,XX @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
          if (refcount == 0) {
              void *table;
 -            table = qcow2_cache_is_table_offset(bs, s->refcount_block_cache,
 +            table = qcow2_cache_is_table_offset(s->refcount_block_cache,
                                                  offset);
              if (table != NULL) {
                  qcow2_cache_put(s->refcount_block_cache, &refcount_block);
                  qcow2_cache_discard(s->refcount_block_cache, table);
              }
 -            table = qcow2_cache_is_table_offset(bs, s->l2_table_cache, offset);
 +            table = qcow2_cache_is_table_offset(s->l2_table_cache, offset);
              if (table != NULL) {
                  qcow2_cache_discard(s->l2_table_cache, table);
              }
@@ -XXX,XX +XXX,XX @@ static int qcow2_discard_refcount_block(BlockDriverState *bs,
          s->free_cluster_index = cluster_index;
      }
--    refblock = qcow2_cache_is_table_offset(bs, s->refcount_block_cache,
-+    refblock = qcow2_cache_is_table_offset(s->refcount_block_cache,
-                                            discard_block_offs);
-     if (refblock) {
-         /* discard refblock from the cache if refblock is cached */
 --
 .13.6

-[Qemu-devel] [PULL 48/55] qcow2: Rename l2_table in qcow2_alloc_compressed_cluster_offset()
+[Qemu-devel] [PULL v3 08/35] block: Unify order in drain functions
-From: Alberto Garcia <berto@igalia.com>
+Drain requests are propagated to child nodes, parent nodes and directly
 to the AioContext. The order in which this happened was different
 between all combinations of drain/drain_all and begin/end.
-This function doesn't need any changes to support L2 slices, but since
+The correct order is to keep children only drained when their parents
-it's now dealing with slices instead of full tables, the l2_table
+are also drained. This means that at the start of a drained section, the
-variable is renamed for clarity.
+AioContext needs to be drained first, the parents second and only then
 the children. The correct order for the end of a drained section is the
 opposite.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
+This patch changes the three other functions to follow the example of
-Reviewed-by: Eric Blake <eblake@redhat.com>
+bdrv_drained_begin(), which is the only one that got it right.
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: 0c5d4b9bf163aa3b49ec19cc512a50d83563f2ad.1517840877.git.berto@igalia.com
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- block/qcow2-cluster.c | 16 ++++++++--------
+ block/io.c | 12 ++++++++----
-file changed, 8 insertions(+), 8 deletions(-)
+file changed, 8 insertions(+), 4 deletions(-)
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
+diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
+--- a/block/io.c
-+++ b/block/qcow2-cluster.c
++++ b/block/io.c
-@@ -XXX,XX +XXX,XX @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
+@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
- {
+         return;
      BDRVQcow2State *s = bs->opaque;
      int l2_index, ret;
 -    uint64_t *l2_table;
 +    uint64_t *l2_slice;
      int64_t cluster_offset;
      int nb_csectors;
 -    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
 +    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
      if (ret < 0) {
          return 0;
      }
-     /* Compression can't overwrite anything. Fail if the cluster was already
++    /* Stop things in parent-to-child order */
-      * allocated. */
+     if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
--    cluster_offset = be64_to_cpu(l2_table[l2_index]);
+         aio_disable_external(bdrv_get_aio_context(bs));
-+    cluster_offset = be64_to_cpu(l2_slice[l2_index]);
+         bdrv_parent_drained_begin(bs);
-     if (cluster_offset & L2E_OFFSET_MASK) {
+@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
--        qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
+         return;
 +        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
          return 0;
      }
-     cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
+-    bdrv_parent_drained_end(bs);
-     if (cluster_offset < 0) {
++    /* Re-enable things in child-to-parent order */
--        qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
+     bdrv_drain_invoke(bs, false);
-+        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
++    bdrv_parent_drained_end(bs);
-         return 0;
+     aio_enable_external(bdrv_get_aio_context(bs));
  }
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
      for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
          AioContext *aio_context = bdrv_get_aio_context(bs);
 +        /* Stop things in parent-to-child order */
          aio_context_acquire(aio_context);
 -        bdrv_parent_drained_begin(bs);
          aio_disable_external(aio_context);
 +        bdrv_parent_drained_begin(bs);
          bdrv_drain_invoke(bs, true);
          aio_context_release(aio_context);
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
      for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
          AioContext *aio_context = bdrv_get_aio_context(bs);
 +        /* Re-enable things in child-to-parent order */
          aio_context_acquire(aio_context);
 -        aio_enable_external(aio_context);
 -        bdrv_parent_drained_end(bs);
          bdrv_drain_invoke(bs, false);
 +        bdrv_parent_drained_end(bs);
 +        aio_enable_external(aio_context);
          aio_context_release(aio_context);
      }
-@@ -XXX,XX +XXX,XX @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
-     /* compressed clusters never have the copied flag */
-     BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
--    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
--    l2_table[l2_index] = cpu_to_be64(cluster_offset);
--    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
-+    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
-+    l2_slice[l2_index] = cpu_to_be64(cluster_offset);
-+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
-     return cluster_offset;
- }
 --
 .13.6

-[Qemu-devel] [PULL 46/55] qcow2: Update expand_zero_clusters_in_l1() to support L2 slices
+[Qemu-devel] [PULL v3 09/35] block: Don't acquire AioContext in hmp_qemu_io()
-From: Alberto Garcia <berto@igalia.com>
+Commit 15afd94a047 added code to acquire and release the AioContext in
 qemuio_command(). This means that the lock is taken twice now in the
 call path from hmp_qemu_io(). This causes BDRV_POLL_WHILE() to hang for
 any requests issued to nodes in a non-mainloop AioContext.
-expand_zero_clusters_in_l1() expands zero clusters as a necessary step
+Dropping the first locking from hmp_qemu_io() fixes the problem.
 to downgrade qcow2 images to a version that doesn't support metadata
 zero clusters. This function takes an L1 table (which may or may not
 be active) and iterates over all its L2 tables looking for zero
 clusters.
-Since we'll be loading L2 slices instead of full tables we need to add
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-an extra loop that iterates over all slices of each L2 table, and we
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-should also use the slice size when allocating the buffer used when
+---
-the L1 table is not active.
+ hmp.c | 6 ------
 file changed, 6 deletions(-)
-This function doesn't need any additional changes so apart from that
+diff --git a/hmp.c b/hmp.c
 this patch simply updates the variable name from l2_table to l2_slice.
 Finally, and since we have to touch the bdrv_read() / bdrv_write()
 calls anyway, this patch takes the opportunity to replace them with
 the byte-based bdrv_pread() / bdrv_pwrite().
 Signed-off-by: Alberto Garcia <berto@igalia.com>
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Max Reitz <mreitz@redhat.com>
 Message-id: 43590976f730501688096cff103f2923b72b0f32.1517840877.git.berto@igalia.com
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
  block/qcow2-cluster.c | 51 ++++++++++++++++++++++++++++-----------------------
 file changed, 28 insertions(+), 23 deletions(-)
 diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
+--- a/hmp.c
-+++ b/block/qcow2-cluster.c
++++ b/hmp.c
-@@ -XXX,XX +XXX,XX @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
+@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
  {
-     BDRVQcow2State *s = bs->opaque;
+     BlockBackend *blk;
-     bool is_active_l1 = (l1_table == s->l1_table);
+     BlockBackend *local_blk = NULL;
--    uint64_t *l2_table = NULL;
+-    AioContext *aio_context;
-+    uint64_t *l2_slice = NULL;
+     const char* device = qdict_get_str(qdict, "device");
-+    unsigned slice, slice_size2, n_slices;
+     const char* command = qdict_get_str(qdict, "command");
-     int ret;
+     Error *err = NULL;
-     int i, j;
+@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
 +    slice_size2 = s->l2_slice_size * sizeof(uint64_t);
 +    n_slices = s->cluster_size / slice_size2;
 +
      if (!is_active_l1) {
          /* inactive L2 tables require a buffer to be stored in when loading
           * them from disk */
 -        l2_table = qemu_try_blockalign(bs->file->bs, s->cluster_size);
 -        if (l2_table == NULL) {
 +        l2_slice = qemu_try_blockalign(bs->file->bs, slice_size2);
 +        if (l2_slice == NULL) {
              return -ENOMEM;
          }
      }
-     for (i = 0; i < l1_size; i++) {
+-    aio_context = blk_get_aio_context(blk);
-         uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
+-    aio_context_acquire(aio_context);
--        bool l2_dirty = false;
+-
-         uint64_t l2_refcount;
+     /*
+      * Notably absent: Proper permission management. This is sad, but it seems
-         if (!l2_offset) {
+      * almost impossible to achieve without changing the semantics and thereby
-@@ -XXX,XX +XXX,XX @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
+@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
-             goto fail;
+      */
-         }
+     qemuio_command(blk, command);
--        {
+-    aio_context_release(aio_context);
-+        for (slice = 0; slice < n_slices; slice++) {
+-
 +            uint64_t slice_offset = l2_offset + slice * slice_size2;
 +            bool l2_dirty = false;
              if (is_active_l1) {
                  /* get active L2 tables from cache */
 -                ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
 -                                      (void **)&l2_table);
 +                ret = qcow2_cache_get(bs, s->l2_table_cache, slice_offset,
 +                                      (void **)&l2_slice);
              } else {
                  /* load inactive L2 tables from disk */
 -                ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE,
 -                                (void *)l2_table, s->cluster_sectors);
 +                ret = bdrv_pread(bs->file, slice_offset, l2_slice, slice_size2);
              }
              if (ret < 0) {
                  goto fail;
              }
 -            for (j = 0; j < s->l2_size; j++) {
 -                uint64_t l2_entry = be64_to_cpu(l2_table[j]);
 +            for (j = 0; j < s->l2_slice_size; j++) {
 +                uint64_t l2_entry = be64_to_cpu(l2_slice[j]);
                  int64_t offset = l2_entry & L2E_OFFSET_MASK;
                  QCow2ClusterType cluster_type =
                      qcow2_get_cluster_type(l2_entry);
@@ -XXX,XX +XXX,XX @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                      if (!bs->backing) {
                          /* not backed; therefore we can simply deallocate the
                           * cluster */
 -                        l2_table[j] = 0;
 +                        l2_slice[j] = 0;
                          l2_dirty = true;
                          continue;
                      }
@@ -XXX,XX +XXX,XX @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                  }
                  if (offset_into_cluster(s, offset)) {
 +                    int l2_index = slice * s->l2_slice_size + j;
                      qcow2_signal_corruption(
                          bs, true, -1, -1,
                          "Cluster allocation offset "
                          "%#" PRIx64 " unaligned (L2 offset: %#"
                          PRIx64 ", L2 index: %#x)", offset,
 -                        l2_offset, j);
 +                        l2_offset, l2_index);
                      if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                          qcow2_free_clusters(bs, offset, s->cluster_size,
                                              QCOW2_DISCARD_ALWAYS);
@@ -XXX,XX +XXX,XX @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                  }
                  if (l2_refcount == 1) {
 -                    l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
 +                    l2_slice[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
                  } else {
 -                    l2_table[j] = cpu_to_be64(offset);
 +                    l2_slice[j] = cpu_to_be64(offset);
                  }
                  l2_dirty = true;
              }
              if (is_active_l1) {
                  if (l2_dirty) {
 -                    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
 +                    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
                      qcow2_cache_depends_on_flush(s->l2_table_cache);
                  }
 -                qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
 +                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
              } else {
                  if (l2_dirty) {
                      ret = qcow2_pre_write_overlap_check(
                          bs, QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2,
 -                        l2_offset, s->cluster_size);
 +                        slice_offset, slice_size2);
                      if (ret < 0) {
                          goto fail;
                      }
 -                    ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE,
 -                                     (void *)l2_table, s->cluster_sectors);
 +                    ret = bdrv_pwrite(bs->file, slice_offset,
 +                                      l2_slice, slice_size2);
                      if (ret < 0) {
                          goto fail;
                      }
@@ -XXX,XX +XXX,XX @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
      ret = 0;
  fail:
--    if (l2_table) {
+     blk_unref(local_blk);
-+    if (l2_slice) {
+     hmp_handle_error(mon, &err);
          if (!is_active_l1) {
 -            qemu_vfree(l2_table);
 +            qemu_vfree(l2_slice);
          } else {
 -            qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
 +            qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
          }
      }
      return ret;
 --
 .13.6

-[Qemu-devel] [PULL 24/55] qcow2: Remove BDS parameter from qcow2_cache_destroy()
+[Qemu-devel] [PULL v3 10/35] qcow2: get rid of qcow2_backing_read1 routine
-From: Alberto Garcia <berto@igalia.com>
+From: Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com>
-This function was never using the BlockDriverState parameter so it can
+Since bdrv_co_preadv does all necessary checks including
-be safely removed.
+reading after the end of the backing file, avoid duplication
 of verification before bdrv_co_preadv call.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
+Signed-off-by: Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com>
 Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 Message-id: 49c74fe8b3aead9056e61a85b145ce787d06262b.1517840876.git.berto@igalia.com
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
- block/qcow2.h       |  2 +-
+ block/qcow2.h |  3 ---
- block/qcow2-cache.c |  2 +-
+ block/qcow2.c | 51 ++++++++-------------------------------------------
- block/qcow2.c       | 16 ++++++++--------
+files changed, 8 insertions(+), 46 deletions(-)
 files changed, 10 insertions(+), 10 deletions(-)
 diff --git a/block/qcow2.h b/block/qcow2.h
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow2.h
 +++ b/block/qcow2.h
-@@ -XXX,XX +XXX,XX @@ int qcow2_read_snapshots(BlockDriverState *bs);
+@@ -XXX,XX +XXX,XX @@ uint32_t offset_to_reftable_index(BDRVQcow2State *s, uint64_t offset)
  /* qcow2-cache.c functions */
  Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables);
 -int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c);
 +int qcow2_cache_destroy(Qcow2Cache *c);
  void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table);
  int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c);
 diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow2-cache.c
 +++ b/block/qcow2-cache.c
@@ -XXX,XX +XXX,XX @@ Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
      return c;
  }
--int qcow2_cache_destroy(BlockDriverState *bs, Qcow2Cache *c)
+ /* qcow2.c functions */
-+int qcow2_cache_destroy(Qcow2Cache *c)
+-int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
- {
+-                  int64_t sector_num, int nb_sectors);
-     int i;
+-
+ int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,
                                       int refcount_order, bool generous_increase,
                                       uint64_t *refblock_count);
 diff --git a/block/qcow2.c b/block/qcow2.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow2.c
 +++ b/block/qcow2.c
-@@ -XXX,XX +XXX,XX @@ static void qcow2_update_options_commit(BlockDriverState *bs,
+@@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
-     int i;
+     return status;
+ }
-     if (s->l2_table_cache) {
--        qcow2_cache_destroy(bs, s->l2_table_cache);
+-/* handle reading after the end of the backing file */
-+        qcow2_cache_destroy(s->l2_table_cache);
+-int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
-     }
+-                        int64_t offset, int bytes)
-     if (s->refcount_block_cache) {
+-{
--        qcow2_cache_destroy(bs, s->refcount_block_cache);
+-    uint64_t bs_size = bs->total_sectors * BDRV_SECTOR_SIZE;
-+        qcow2_cache_destroy(s->refcount_block_cache);
+-    int n1;
-     }
+-
-     s->l2_table_cache = r->l2_table_cache;
+-    if ((offset + bytes) <= bs_size) {
-     s->refcount_block_cache = r->refcount_block_cache;
+-        return bytes;
-@@ -XXX,XX +XXX,XX @@ static void qcow2_update_options_abort(BlockDriverState *bs,
+-    }
-                                        Qcow2ReopenState *r)
+-
 -    if (offset >= bs_size) {
 -        n1 = 0;
 -    } else {
 -        n1 = bs_size - offset;
 -    }
 -
 -    qemu_iovec_memset(qiov, n1, 0, bytes - n1);
 -
 -    return n1;
 -}
 -
  static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
                                          uint64_t bytes, QEMUIOVector *qiov,
                                          int flags)
  {
-     if (r->l2_table_cache) {
+     BDRVQcow2State *s = bs->opaque;
--        qcow2_cache_destroy(bs, r->l2_table_cache);
+-    int offset_in_cluster, n1;
-+        qcow2_cache_destroy(r->l2_table_cache);
++    int offset_in_cluster;
-     }
+     int ret;
-     if (r->refcount_block_cache) {
+     unsigned int cur_bytes; /* number of bytes in current iteration */
--        qcow2_cache_destroy(bs, r->refcount_block_cache);
+     uint64_t cluster_offset = 0;
-+        qcow2_cache_destroy(r->refcount_block_cache);
+@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
-     }
+         case QCOW2_CLUSTER_UNALLOCATED:
-     qapi_free_QCryptoBlockOpenOptions(r->crypto_opts);
- }
+             if (bs->backing) {
-@@ -XXX,XX +XXX,XX @@ static int qcow2_do_open(BlockDriverState *bs, QDict *options, int flags,
+-                /* read from the base image */
-     s->l1_table = NULL;
+-                n1 = qcow2_backing_read1(bs->backing->bs, &hd_qiov,
-     cache_clean_timer_del(bs);
+-                                         offset, cur_bytes);
-     if (s->l2_table_cache) {
+-                if (n1 > 0) {
--        qcow2_cache_destroy(bs, s->l2_table_cache);
+-                    QEMUIOVector local_qiov;
-+        qcow2_cache_destroy(s->l2_table_cache);
+-
-     }
+-                    qemu_iovec_init(&local_qiov, hd_qiov.niov);
-     if (s->refcount_block_cache) {
+-                    qemu_iovec_concat(&local_qiov, &hd_qiov, 0, n1);
--        qcow2_cache_destroy(bs, s->refcount_block_cache);
+-
-+        qcow2_cache_destroy(s->refcount_block_cache);
+-                    BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
-     }
+-                    qemu_co_mutex_unlock(&s->lock);
-     qcrypto_block_free(s->crypto);
+-                    ret = bdrv_co_preadv(bs->backing, offset, n1,
-     qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
+-                                         &local_qiov, 0);
-@@ -XXX,XX +XXX,XX @@ static void qcow2_close(BlockDriverState *bs)
+-                    qemu_co_mutex_lock(&s->lock);
-     }
+-
+-                    qemu_iovec_destroy(&local_qiov);
-     cache_clean_timer_del(bs);
+-
--    qcow2_cache_destroy(bs, s->l2_table_cache);
+-                    if (ret < 0) {
--    qcow2_cache_destroy(bs, s->refcount_block_cache);
+-                        goto fail;
-+    qcow2_cache_destroy(s->l2_table_cache);
+-                    }
-+    qcow2_cache_destroy(s->refcount_block_cache);
++                BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
++                qemu_co_mutex_unlock(&s->lock);
-     qcrypto_block_free(s->crypto);
++                ret = bdrv_co_preadv(bs->backing, offset, cur_bytes,
-     s->crypto = NULL;
++                                     &hd_qiov, 0);
 +                qemu_co_mutex_lock(&s->lock);
 +                if (ret < 0) {
 +                    goto fail;
                  }
              } else {
                  /* Note: in this case, no need to wait */
 --
 .13.6

-[Qemu-devel] [PULL 16/55] block: maintain persistent disabled bitmaps
+[Qemu-devel] [PULL v3 11/35] block: Document that x-blockdev-change breaks quorum children list
-From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
+Removing a quorum child node with x-blockdev-change results in a quorum
 driver state that cannot be recreated with create options because it
 would require a list with gaps. This causes trouble in at least
 .bdrv_refresh_filename().
-To maintain load/store disabled bitmap there is new approach:
+Document this problem so that we won't accidentally mark the command
 stable without having addressed it.
- - deprecate @autoload flag of block-dirty-bitmap-add, make it ignored
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
- - store enabled bitmaps as "auto" to qcow2
+Reviewed-by: Alberto Garcia <berto@igalia.com>
  - store disabled bitmaps without "auto" flag to qcow2
  - on qcow2 open load "auto" bitmaps as enabled and others
    as disabled (except in_use bitmaps)
 Also, adjust iotests 165 and 176 appropriately.
 Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 Message-id: 20180202160752.143796-1-vsementsov@virtuozzo.com
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
- qapi/block-core.json         |  6 +++---
+ qapi/block-core.json | 4 ++++
- block/qcow2.h                |  2 +-
+file changed, 4 insertions(+)
  include/block/dirty-bitmap.h |  1 -
  block/dirty-bitmap.c         | 18 ------------------
  block/qcow2-bitmap.c         | 12 +++++++-----
  block/qcow2.c                |  2 +-
  blockdev.c                   | 10 ++--------
  qemu-doc.texi                |  7 +++++++
  tests/qemu-iotests/165       |  2 +-
  tests/qemu-iotests/176       |  2 +-
 files changed, 23 insertions(+), 39 deletions(-)
 diff --git a/qapi/block-core.json b/qapi/block-core.json
 index XXXXXXX..XXXXXXX 100644
 --- a/qapi/block-core.json
 +++ b/qapi/block-core.json
 @@ -XXX,XX +XXX,XX @@
- #              Qcow2 disks support persistent bitmaps. Default is false for
+ # does not support all kinds of operations, all kinds of children, nor
- #              block-dirty-bitmap-add. (Since: 2.10)
+ # all block drivers.
  #
--# @autoload: the bitmap will be automatically loaded when the image it is stored
++# FIXME Removing children from a quorum node means introducing gaps in the
--#            in is opened. This flag may only be specified for persistent
++# child indices. This cannot be represented in the 'children' list of
--#            bitmaps. Default is false for block-dirty-bitmap-add. (Since: 2.10)
++# BlockdevOptionsQuorum, as returned by .bdrv_refresh_filename().
-+# @autoload: ignored and deprecated since 2.12.
++#
-+#            Currently, all dirty tracking bitmaps are loaded from Qcow2 on
+ # Warning: The data in a new quorum child MUST be consistent with that of
-+#            open.
+ # the rest of the array.
  #
- # Since: 2.4
- ##
-diff --git a/block/qcow2.h b/block/qcow2.h
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2.h
-+++ b/block/qcow2.h
-@@ -XXX,XX +XXX,XX @@ void qcow2_cache_discard(BlockDriverState *bs, Qcow2Cache *c, void *table);
- int qcow2_check_bitmaps_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
-                                   void **refcount_table,
-                                   int64_t *refcount_table_size);
--bool qcow2_load_autoloading_dirty_bitmaps(BlockDriverState *bs, Error **errp);
-+bool qcow2_load_dirty_bitmaps(BlockDriverState *bs, Error **errp);
- int qcow2_reopen_bitmaps_rw(BlockDriverState *bs, Error **errp);
- void qcow2_store_persistent_dirty_bitmaps(BlockDriverState *bs, Error **errp);
- int qcow2_reopen_bitmaps_ro(BlockDriverState *bs, Error **errp);
-diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/block/dirty-bitmap.h
-+++ b/include/block/dirty-bitmap.h
-@@ -XXX,XX +XXX,XX @@ void bdrv_dirty_bitmap_deserialize_ones(BdrvDirtyBitmap *bitmap,
- void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap *bitmap);
- void bdrv_dirty_bitmap_set_readonly(BdrvDirtyBitmap *bitmap, bool value);
--void bdrv_dirty_bitmap_set_autoload(BdrvDirtyBitmap *bitmap, bool autoload);
- void bdrv_dirty_bitmap_set_persistance(BdrvDirtyBitmap *bitmap,
-                                        bool persistent);
-diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/dirty-bitmap.c
-+++ b/block/dirty-bitmap.c
-@@ -XXX,XX +XXX,XX @@ struct BdrvDirtyBitmap {
-                                    Such operations must fail and both the image
-                                    and this bitmap must remain unchanged while
-                                    this flag is set. */
--    bool autoload;              /* For persistent bitmaps: bitmap must be
--                                   autoloaded on image opening */
-     bool persistent;            /* bitmap must be saved to owner disk image */
-     QLIST_ENTRY(BdrvDirtyBitmap) list;
- };
-@@ -XXX,XX +XXX,XX @@ void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap)
-     g_free(bitmap->name);
-     bitmap->name = NULL;
-     bitmap->persistent = false;
--    bitmap->autoload = false;
- }
- /* Called with BQL taken.  */
-@@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs,
-     bitmap->successor = NULL;
-     successor->persistent = bitmap->persistent;
-     bitmap->persistent = false;
--    successor->autoload = bitmap->autoload;
--    bitmap->autoload = false;
-     bdrv_release_dirty_bitmap(bs, bitmap);
-     return successor;
-@@ -XXX,XX +XXX,XX @@ bool bdrv_has_readonly_bitmaps(BlockDriverState *bs)
- }
- /* Called with BQL taken. */
--void bdrv_dirty_bitmap_set_autoload(BdrvDirtyBitmap *bitmap, bool autoload)
--{
--    qemu_mutex_lock(bitmap->mutex);
--    bitmap->autoload = autoload;
--    qemu_mutex_unlock(bitmap->mutex);
--}
--
--bool bdrv_dirty_bitmap_get_autoload(const BdrvDirtyBitmap *bitmap)
--{
--    return bitmap->autoload;
--}
--
--/* Called with BQL taken. */
- void bdrv_dirty_bitmap_set_persistance(BdrvDirtyBitmap *bitmap, bool persistent)
- {
-     qemu_mutex_lock(bitmap->mutex);
-diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-bitmap.c
-+++ b/block/qcow2-bitmap.c
-@@ -XXX,XX +XXX,XX @@ static void set_readonly_helper(gpointer bitmap, gpointer value)
-     bdrv_dirty_bitmap_set_readonly(bitmap, (bool)value);
- }
--/* qcow2_load_autoloading_dirty_bitmaps()
-+/* qcow2_load_dirty_bitmaps()
-  * Return value is a hint for caller: true means that the Qcow2 header was
-  * updated. (false doesn't mean that the header should be updated by the
-  * caller, it just means that updating was not needed or the image cannot be
-  * written to).
-  * On failure the function returns false.
-  */
--bool qcow2_load_autoloading_dirty_bitmaps(BlockDriverState *bs, Error **errp)
-+bool qcow2_load_dirty_bitmaps(BlockDriverState *bs, Error **errp)
- {
-     BDRVQcow2State *s = bs->opaque;
-     Qcow2BitmapList *bm_list;
-@@ -XXX,XX +XXX,XX @@ bool qcow2_load_autoloading_dirty_bitmaps(BlockDriverState *bs, Error **errp)
-     }
-     QSIMPLEQ_FOREACH(bm, bm_list, entry) {
--        if ((bm->flags & BME_FLAG_AUTO) && !(bm->flags & BME_FLAG_IN_USE)) {
-+        if (!(bm->flags & BME_FLAG_IN_USE)) {
-             BdrvDirtyBitmap *bitmap = load_bitmap(bs, bm, errp);
-             if (bitmap == NULL) {
-                 goto fail;
-             }
-+            if (!(bm->flags & BME_FLAG_AUTO)) {
-+                bdrv_disable_dirty_bitmap(bitmap);
-+            }
-             bdrv_dirty_bitmap_set_persistance(bitmap, true);
--            bdrv_dirty_bitmap_set_autoload(bitmap, true);
-             bm->flags |= BME_FLAG_IN_USE;
-             created_dirty_bitmaps =
-                     g_slist_append(created_dirty_bitmaps, bitmap);
-@@ -XXX,XX +XXX,XX @@ void qcow2_store_persistent_dirty_bitmaps(BlockDriverState *bs, Error **errp)
-             bm->table.size = 0;
-             QSIMPLEQ_INSERT_TAIL(&drop_tables, tb, entry);
-         }
--        bm->flags = bdrv_dirty_bitmap_get_autoload(bitmap) ? BME_FLAG_AUTO : 0;
-+        bm->flags = bdrv_dirty_bitmap_enabled(bitmap) ? BME_FLAG_AUTO : 0;
-         bm->granularity_bits = ctz32(bdrv_dirty_bitmap_granularity(bitmap));
-         bm->dirty_bitmap = bitmap;
-     }
-diff --git a/block/qcow2.c b/block/qcow2.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2.c
-+++ b/block/qcow2.c
-@@ -XXX,XX +XXX,XX @@ static int qcow2_do_open(BlockDriverState *bs, QDict *options, int flags,
-         s->autoclear_features &= QCOW2_AUTOCLEAR_MASK;
-     }
--    if (qcow2_load_autoloading_dirty_bitmaps(bs, &local_err)) {
-+    if (qcow2_load_dirty_bitmaps(bs, &local_err)) {
-         update_header = false;
-     }
-     if (local_err != NULL) {
-diff --git a/blockdev.c b/blockdev.c
-index XXXXXXX..XXXXXXX 100644
---- a/blockdev.c
-+++ b/blockdev.c
-@@ -XXX,XX +XXX,XX @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name,
-     if (!has_persistent) {
-         persistent = false;
-     }
--    if (!has_autoload) {
--        autoload = false;
--    }
--    if (has_autoload && !persistent) {
--        error_setg(errp, "Autoload flag must be used only for persistent "
--                         "bitmaps");
--        return;
-+    if (has_autoload) {
-+        warn_report("Autoload option is deprecated and its value is ignored");
-     }
-     if (persistent &&
-@@ -XXX,XX +XXX,XX @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name,
-     }
-     bdrv_dirty_bitmap_set_persistance(bitmap, persistent);
--    bdrv_dirty_bitmap_set_autoload(bitmap, autoload);
- }
- void qmp_block_dirty_bitmap_remove(const char *node, const char *name,
-diff --git a/qemu-doc.texi b/qemu-doc.texi
-index XXXXXXX..XXXXXXX 100644
---- a/qemu-doc.texi
-+++ b/qemu-doc.texi
-@@ -XXX,XX +XXX,XX @@ used and it will be removed with no replacement.
- The ``convert -s snapshot_id_or_name'' argument is obsoleted
- by the ``convert -l snapshot_param'' argument instead.
-+@section QEMU Machine Protocol (QMP) commands
-+
-+@subsection block-dirty-bitmap-add "autoload" parameter (since 2.12.0)
-+
-+"autoload" parameter is now ignored. All bitmaps are automatically loaded
-+from qcow2 images.
-+
- @section System emulator human monitor commands
- @subsection host_net_add (since 2.10.0)
-diff --git a/tests/qemu-iotests/165 b/tests/qemu-iotests/165
-index XXXXXXX..XXXXXXX 100755
---- a/tests/qemu-iotests/165
-+++ b/tests/qemu-iotests/165
-@@ -XXX,XX +XXX,XX @@ class TestPersistentDirtyBitmap(iotests.QMPTestCase):
-     def qmpAddBitmap(self):
-         self.vm.qmp('block-dirty-bitmap-add', node='drive0',
--                    name='bitmap0', persistent=True, autoload=True)
-+                    name='bitmap0', persistent=True)
-     def test_persistent(self):
-         self.vm = self.mkVm()
-diff --git a/tests/qemu-iotests/176 b/tests/qemu-iotests/176
-index XXXXXXX..XXXXXXX 100755
---- a/tests/qemu-iotests/176
-+++ b/tests/qemu-iotests/176
-@@ -XXX,XX +XXX,XX @@ case $reason in
-      "file": { "driver": "file", "filename": "$TEST_IMG" } } }
- { "execute": "block-dirty-bitmap-add",
-   "arguments": { "node": "drive0", "name": "bitmap0",
--     "persistent": true, "autoload": true } }
-+     "persistent": true } }
- { "execute": "quit" }
- EOF
-     ;;
 --
 .13.6

-[Qemu-devel] [PULL 02/55] qemu-img.texi: Clean up parameter list
+[Qemu-devel] [PULL v3 12/35] nvme: Add tracing
-From: Fam Zheng <famz@redhat.com>
+From: Doug Gale <doug16k@gmail.com>
-Split options out of the "@table @var" section and create a "@table
+Add trace output for commands, errors, and undefined behavior.
-@option", then use whitespaces and blank lines consistently.
+Add guest error log output for undefined behavior.
 Report invalid undefined accesses to MMIO.
 Annotate unlikely error checks with unlikely.
-Suggested-by: Kevin Wolf <kwolf@redhat.com>
+Signed-off-by: Doug Gale <doug16k@gmail.com>
-Signed-off-by: Fam Zheng <famz@redhat.com>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Reviewed-by: Kashyap Chamarthy <kchamart@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- qemu-img.texi | 66 +++++++++++++++++++++++++++++++++++------------------------
+ hw/block/nvme.c       | 349 ++++++++++++++++++++++++++++++++++++++++++--------
-file changed, 39 insertions(+), 27 deletions(-)
+ hw/block/trace-events |  93 ++++++++++++++
 files changed, 390 insertions(+), 52 deletions(-)
-diff --git a/qemu-img.texi b/qemu-img.texi
+diff --git a/hw/block/nvme.c b/hw/block/nvme.c
 index XXXXXXX..XXXXXXX 100644
---- a/qemu-img.texi
+--- a/hw/block/nvme.c
-+++ b/qemu-img.texi
++++ b/hw/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ The following commands are supported:
+@@ -XXX,XX +XXX,XX @@
+ #include "qapi/visitor.h"
- Command parameters:
+ #include "sysemu/block-backend.h"
- @table @var
--@item filename
++#include "qemu/log.h"
-- is a disk image filename
++#include "trace.h"
--
+ #include "nvme.h"
--@item --object @var{objectdef}
--
++#define NVME_GUEST_ERR(trace, fmt, ...) \
--is a QEMU user creatable object definition. See the @code{qemu(1)} manual
++    do { \
--page for a description of the object properties. The most common object
++        (trace_##trace)(__VA_ARGS__); \
--type is a @code{secret}, which is used to supply passwords and/or encryption
++        qemu_log_mask(LOG_GUEST_ERROR, #trace \
--keys.
++            " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
--
++    } while (0)
--@item --image-opts
++
--
+ static void nvme_process_sq(void *opaque);
--Indicates that the source @var{filename} parameter is to be interpreted as a
--full option string, not a plain filename. This parameter is mutually
+ static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
--exclusive with the @var{-f} parameter.
+@@ -XXX,XX +XXX,XX @@ static void nvme_isr_notify(NvmeCtrl *n, NvmeCQueue *cq)
--
+ {
--@item --target-image-opts
+     if (cq->irq_enabled) {
+         if (msix_enabled(&(n->parent_obj))) {
--Indicates that the @var{output_filename} parameter(s) are to be interpreted as
++            trace_nvme_irq_msix(cq->vector);
--a full option string, not a plain filename. This parameter is mutually
+             msix_notify(&(n->parent_obj), cq->vector);
--exclusive with the @var{-O} parameters. It is currently required to also use
+         } else {
--the @var{-n} parameter to skip image creation. This restriction may be relaxed
++            trace_nvme_irq_pin();
--in a future release.
+             pci_irq_pulse(&n->parent_obj);
-+@item filename
+         }
-+is a disk image filename
++    } else {
++        trace_nvme_irq_masked();
- @item fmt
+     }
- is the disk image format. It is guessed automatically in most cases. See below
+ }
- for a description of the supported disk formats.
+@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
--@item --backing-chain
+     trans_len = MIN(len, trans_len);
--will enumerate information about backing files in a disk image chain. Refer
+     int num_prps = (len >> n->page_bits) + 1;
--below for further description.
--
+-    if (!prp1) {
- @item size
++    if (unlikely(!prp1)) {
- is the disk image size in bytes. Optional suffixes @code{k} or @code{K}
++        trace_nvme_err_invalid_prp();
- (kilobyte, 1024) @code{M} (megabyte, 1024k) and @code{G} (gigabyte, 1024M)
+         return NVME_INVALID_FIELD | NVME_DNR;
-@@ -XXX,XX +XXX,XX @@ and T (terabyte, 1024G) are supported.  @code{b} is ignored.
+     } else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
- is the destination disk image filename
+                prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
+@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
- @item output_fmt
+     }
-- is the destination format
+     len -= trans_len;
-+is the destination format
+     if (len) {
-+
+-        if (!prp2) {
- @item options
++        if (unlikely(!prp2)) {
- is a comma separated list of format specific options in a
++            trace_nvme_err_invalid_prp2_missing();
- name=value format. Use @code{-o ?} for an overview of the options supported
+             goto unmap;
- by the used format or see the format descriptions below for details.
+         }
-+
+         if (len > n->page_size) {
- @item snapshot_param
+@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
- is param used for internal snapshot, format is
+                 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
- 'snapshot.id=[ID],snapshot.name=[NAME]' or '[ID_OR_NAME]'
-+
+                 if (i == n->max_prp_ents - 1 && len > n->page_size) {
- @item snapshot_id_or_name
+-                    if (!prp_ent || prp_ent & (n->page_size - 1)) {
- is deprecated, use snapshot_param instead
++                    if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
++                        trace_nvme_err_invalid_prplist_ent(prp_ent);
-+@end table
+                         goto unmap;
-+
+                     }
-+@table @option
-+
+@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
-+@item --object @var{objectdef}
+                     prp_ent = le64_to_cpu(prp_list[i]);
-+is a QEMU user creatable object definition. See the @code{qemu(1)} manual
+                 }
-+page for a description of the object properties. The most common object
-+type is a @code{secret}, which is used to supply passwords and/or encryption
+-                if (!prp_ent || prp_ent & (n->page_size - 1)) {
-+keys.
++                if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
-+
++                    trace_nvme_err_invalid_prplist_ent(prp_ent);
-+@item --image-opts
+                     goto unmap;
-+Indicates that the source @var{filename} parameter is to be interpreted as a
+                 }
-+full option string, not a plain filename. This parameter is mutually
-+exclusive with the @var{-f} parameter.
+@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
-+
+                 i++;
-+@item --target-image-opts
+             }
-+Indicates that the @var{output_filename} parameter(s) are to be interpreted as
+         } else {
-+a full option string, not a plain filename. This parameter is mutually
+-            if (prp2 & (n->page_size - 1)) {
-+exclusive with the @var{-O} parameters. It is currently required to also use
++            if (unlikely(prp2 & (n->page_size - 1))) {
-+the @var{-n} parameter to skip image creation. This restriction may be relaxed
++                trace_nvme_err_invalid_prp2_align(prp2);
-+in a future release.
+                 goto unmap;
-+
+             }
-+@item --backing-chain
+             if (qsg->nsg) {
-+will enumerate information about backing files in a disk image chain. Refer
+@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
-+below for further description.
+     QEMUIOVector iov;
-+
+     uint16_t status = NVME_SUCCESS;
- @item -c
- indicates that target image must be compressed (qcow format only)
++    trace_nvme_dma_read(prp1, prp2);
 +
- @item -h
+     if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
- with or without a command shows help and lists the supported formats
+         return NVME_INVALID_FIELD | NVME_DNR;
-+
+     }
- @item -p
+     if (qsg.nsg > 0) {
- display progress bar (compare, convert and rebase commands only).
+-        if (dma_buf_read(ptr, len, &qsg)) {
- If the @var{-p} option is not used for a command that supports it, the
++        if (unlikely(dma_buf_read(ptr, len, &qsg))) {
- progress is reported when the process receives a @code{SIGUSR1} or
++            trace_nvme_err_invalid_dma();
- @code{SIGINFO} signal.
+             status = NVME_INVALID_FIELD | NVME_DNR;
-+
+         }
- @item -q
+         qemu_sglist_destroy(&qsg);
- Quiet mode - do not print any output (except errors). There's no progress bar
+     } else {
- in case both @var{-q} and @var{-p} options are used.
+-        if (qemu_iovec_to_buf(&iov, 0, ptr, len) != len) {
-+
++        if (unlikely(qemu_iovec_to_buf(&iov, 0, ptr, len) != len)) {
- @item -S @var{size}
++            trace_nvme_err_invalid_dma();
- indicates the consecutive number of bytes that must contain only zeros
+             status = NVME_INVALID_FIELD | NVME_DNR;
- for qemu-img to create a sparse image during conversion. This value is rounded
+         }
- down to the nearest 512 bytes. You may use the common size suffixes like
+         qemu_iovec_destroy(&iov);
- @code{k} for kilobytes.
+@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
-+
+     uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS);
- @item -t @var{cache}
+     uint32_t aio_nlb = nlb << (data_shift - BDRV_SECTOR_BITS);
- specifies the cache mode that should be used with the (destination) file. See
- the documentation of the emulator's @code{-drive cache=...} option for allowed
+-    if (slba + nlb > ns->id_ns.nsze) {
- values.
++    if (unlikely(slba + nlb > ns->id_ns.nsze)) {
-+
++        trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
- @item -T @var{src_cache}
+         return NVME_LBA_RANGE | NVME_DNR;
- specifies the cache mode that should be used with the source file(s). See
+     }
- the documentation of the emulator's @code{-drive cache=...} option for allowed
- values.
+@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
-+
+     int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
- @end table
+     enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
- Parameters to snapshot subcommand:
+-    if ((slba + nlb) > ns->id_ns.nsze) {
 +    trace_nvme_rw(is_write ? "write" : "read", nlb, data_size, slba);
 +
 +    if (unlikely((slba + nlb) > ns->id_ns.nsze)) {
          block_acct_invalid(blk_get_stats(n->conf.blk), acct);
 +        trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
          return NVME_LBA_RANGE | NVME_DNR;
      }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
      NvmeNamespace *ns;
      uint32_t nsid = le32_to_cpu(cmd->nsid);
 -    if (nsid == 0 || nsid > n->num_namespaces) {
 +    if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
 +        trace_nvme_err_invalid_ns(nsid, n->num_namespaces);
          return NVME_INVALID_NSID | NVME_DNR;
      }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
      case NVME_CMD_READ:
          return nvme_rw(n, ns, cmd, req);
      default:
 +        trace_nvme_err_invalid_opc(cmd->opcode);
          return NVME_INVALID_OPCODE | NVME_DNR;
      }
  }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
      NvmeCQueue *cq;
      uint16_t qid = le16_to_cpu(c->qid);
 -    if (!qid || nvme_check_sqid(n, qid)) {
 +    if (unlikely(!qid || nvme_check_sqid(n, qid))) {
 +        trace_nvme_err_invalid_del_sq(qid);
          return NVME_INVALID_QID | NVME_DNR;
      }
 +    trace_nvme_del_sq(qid);
 +
      sq = n->sq[qid];
      while (!QTAILQ_EMPTY(&sq->out_req_list)) {
          req = QTAILQ_FIRST(&sq->out_req_list);
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
      uint16_t qflags = le16_to_cpu(c->sq_flags);
      uint64_t prp1 = le64_to_cpu(c->prp1);
 -    if (!cqid || nvme_check_cqid(n, cqid)) {
 +    trace_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
 +
 +    if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
 +        trace_nvme_err_invalid_create_sq_cqid(cqid);
          return NVME_INVALID_CQID | NVME_DNR;
      }
 -    if (!sqid || !nvme_check_sqid(n, sqid)) {
 +    if (unlikely(!sqid || !nvme_check_sqid(n, sqid))) {
 +        trace_nvme_err_invalid_create_sq_sqid(sqid);
          return NVME_INVALID_QID | NVME_DNR;
      }
 -    if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
 +    if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
 +        trace_nvme_err_invalid_create_sq_size(qsize);
          return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
      }
 -    if (!prp1 || prp1 & (n->page_size - 1)) {
 +    if (unlikely(!prp1 || prp1 & (n->page_size - 1))) {
 +        trace_nvme_err_invalid_create_sq_addr(prp1);
          return NVME_INVALID_FIELD | NVME_DNR;
      }
 -    if (!(NVME_SQ_FLAGS_PC(qflags))) {
 +    if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
 +        trace_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
          return NVME_INVALID_FIELD | NVME_DNR;
      }
      sq = g_malloc0(sizeof(*sq));
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
      NvmeCQueue *cq;
      uint16_t qid = le16_to_cpu(c->qid);
 -    if (!qid || nvme_check_cqid(n, qid)) {
 +    if (unlikely(!qid || nvme_check_cqid(n, qid))) {
 +        trace_nvme_err_invalid_del_cq_cqid(qid);
          return NVME_INVALID_CQID | NVME_DNR;
      }
      cq = n->cq[qid];
 -    if (!QTAILQ_EMPTY(&cq->sq_list)) {
 +    if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
 +        trace_nvme_err_invalid_del_cq_notempty(qid);
          return NVME_INVALID_QUEUE_DEL;
      }
 +    trace_nvme_del_cq(qid);
      nvme_free_cq(cq, n);
      return NVME_SUCCESS;
  }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
      uint16_t qflags = le16_to_cpu(c->cq_flags);
      uint64_t prp1 = le64_to_cpu(c->prp1);
 -    if (!cqid || !nvme_check_cqid(n, cqid)) {
 +    trace_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
 +                         NVME_CQ_FLAGS_IEN(qflags) != 0);
 +
 +    if (unlikely(!cqid || !nvme_check_cqid(n, cqid))) {
 +        trace_nvme_err_invalid_create_cq_cqid(cqid);
          return NVME_INVALID_CQID | NVME_DNR;
      }
 -    if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
 +    if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
 +        trace_nvme_err_invalid_create_cq_size(qsize);
          return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
      }
 -    if (!prp1) {
 +    if (unlikely(!prp1)) {
 +        trace_nvme_err_invalid_create_cq_addr(prp1);
          return NVME_INVALID_FIELD | NVME_DNR;
      }
 -    if (vector > n->num_queues) {
 +    if (unlikely(vector > n->num_queues)) {
 +        trace_nvme_err_invalid_create_cq_vector(vector);
          return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
      }
 -    if (!(NVME_CQ_FLAGS_PC(qflags))) {
 +    if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
 +        trace_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
          return NVME_INVALID_FIELD | NVME_DNR;
      }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c)
      uint64_t prp1 = le64_to_cpu(c->prp1);
      uint64_t prp2 = le64_to_cpu(c->prp2);
 +    trace_nvme_identify_ctrl();
 +
      return nvme_dma_read_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
          prp1, prp2);
  }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c)
      uint64_t prp1 = le64_to_cpu(c->prp1);
      uint64_t prp2 = le64_to_cpu(c->prp2);
 -    if (nsid == 0 || nsid > n->num_namespaces) {
 +    trace_nvme_identify_ns(nsid);
 +
 +    if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
 +        trace_nvme_err_invalid_ns(nsid, n->num_namespaces);
          return NVME_INVALID_NSID | NVME_DNR;
      }
      ns = &n->namespaces[nsid - 1];
 +
      return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
          prp1, prp2);
  }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c)
      uint16_t ret;
      int i, j = 0;
 +    trace_nvme_identify_nslist(min_nsid);
 +
      list = g_malloc0(data_len);
      for (i = 0; i < n->num_namespaces; i++) {
          if (i < min_nsid) {
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
      case 0x02:
          return nvme_identify_nslist(n, c);
      default:
 +        trace_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
          return NVME_INVALID_FIELD | NVME_DNR;
      }
  }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
      switch (dw10) {
      case NVME_VOLATILE_WRITE_CACHE:
          result = blk_enable_write_cache(n->conf.blk);
 +        trace_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
          break;
      case NVME_NUMBER_OF_QUEUES:
          result = cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
 +        trace_nvme_getfeat_numq(result);
          break;
      default:
 +        trace_nvme_err_invalid_getfeat(dw10);
          return NVME_INVALID_FIELD | NVME_DNR;
      }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
          blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
          break;
      case NVME_NUMBER_OF_QUEUES:
 +        trace_nvme_setfeat_numq((dw11 & 0xFFFF) + 1,
 +                                ((dw11 >> 16) & 0xFFFF) + 1,
 +                                n->num_queues - 1, n->num_queues - 1);
          req->cqe.result =
              cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
          break;
      default:
 +        trace_nvme_err_invalid_setfeat(dw10);
          return NVME_INVALID_FIELD | NVME_DNR;
      }
      return NVME_SUCCESS;
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
      case NVME_ADM_CMD_GET_FEATURES:
          return nvme_get_feature(n, cmd, req);
      default:
 +        trace_nvme_err_invalid_admin_opc(cmd->opcode);
          return NVME_INVALID_OPCODE | NVME_DNR;
      }
  }
@@ -XXX,XX +XXX,XX @@ static int nvme_start_ctrl(NvmeCtrl *n)
      uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
      uint32_t page_size = 1 << page_bits;
 -    if (n->cq[0] || n->sq[0] || !n->bar.asq || !n->bar.acq ||
 -            n->bar.asq & (page_size - 1) || n->bar.acq & (page_size - 1) ||
 -            NVME_CC_MPS(n->bar.cc) < NVME_CAP_MPSMIN(n->bar.cap) ||
 -            NVME_CC_MPS(n->bar.cc) > NVME_CAP_MPSMAX(n->bar.cap) ||
 -            NVME_CC_IOCQES(n->bar.cc) < NVME_CTRL_CQES_MIN(n->id_ctrl.cqes) ||
 -            NVME_CC_IOCQES(n->bar.cc) > NVME_CTRL_CQES_MAX(n->id_ctrl.cqes) ||
 -            NVME_CC_IOSQES(n->bar.cc) < NVME_CTRL_SQES_MIN(n->id_ctrl.sqes) ||
 -            NVME_CC_IOSQES(n->bar.cc) > NVME_CTRL_SQES_MAX(n->id_ctrl.sqes) ||
 -            !NVME_AQA_ASQS(n->bar.aqa) || !NVME_AQA_ACQS(n->bar.aqa)) {
 +    if (unlikely(n->cq[0])) {
 +        trace_nvme_err_startfail_cq();
 +        return -1;
 +    }
 +    if (unlikely(n->sq[0])) {
 +        trace_nvme_err_startfail_sq();
 +        return -1;
 +    }
 +    if (unlikely(!n->bar.asq)) {
 +        trace_nvme_err_startfail_nbarasq();
 +        return -1;
 +    }
 +    if (unlikely(!n->bar.acq)) {
 +        trace_nvme_err_startfail_nbaracq();
 +        return -1;
 +    }
 +    if (unlikely(n->bar.asq & (page_size - 1))) {
 +        trace_nvme_err_startfail_asq_misaligned(n->bar.asq);
 +        return -1;
 +    }
 +    if (unlikely(n->bar.acq & (page_size - 1))) {
 +        trace_nvme_err_startfail_acq_misaligned(n->bar.acq);
 +        return -1;
 +    }
 +    if (unlikely(NVME_CC_MPS(n->bar.cc) <
 +                 NVME_CAP_MPSMIN(n->bar.cap))) {
 +        trace_nvme_err_startfail_page_too_small(
 +                    NVME_CC_MPS(n->bar.cc),
 +                    NVME_CAP_MPSMIN(n->bar.cap));
 +        return -1;
 +    }
 +    if (unlikely(NVME_CC_MPS(n->bar.cc) >
 +                 NVME_CAP_MPSMAX(n->bar.cap))) {
 +        trace_nvme_err_startfail_page_too_large(
 +                    NVME_CC_MPS(n->bar.cc),
 +                    NVME_CAP_MPSMAX(n->bar.cap));
 +        return -1;
 +    }
 +    if (unlikely(NVME_CC_IOCQES(n->bar.cc) <
 +                 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
 +        trace_nvme_err_startfail_cqent_too_small(
 +                    NVME_CC_IOCQES(n->bar.cc),
 +                    NVME_CTRL_CQES_MIN(n->bar.cap));
 +        return -1;
 +    }
 +    if (unlikely(NVME_CC_IOCQES(n->bar.cc) >
 +                 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
 +        trace_nvme_err_startfail_cqent_too_large(
 +                    NVME_CC_IOCQES(n->bar.cc),
 +                    NVME_CTRL_CQES_MAX(n->bar.cap));
 +        return -1;
 +    }
 +    if (unlikely(NVME_CC_IOSQES(n->bar.cc) <
 +                 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
 +        trace_nvme_err_startfail_sqent_too_small(
 +                    NVME_CC_IOSQES(n->bar.cc),
 +                    NVME_CTRL_SQES_MIN(n->bar.cap));
 +        return -1;
 +    }
 +    if (unlikely(NVME_CC_IOSQES(n->bar.cc) >
 +                 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
 +        trace_nvme_err_startfail_sqent_too_large(
 +                    NVME_CC_IOSQES(n->bar.cc),
 +                    NVME_CTRL_SQES_MAX(n->bar.cap));
 +        return -1;
 +    }
 +    if (unlikely(!NVME_AQA_ASQS(n->bar.aqa))) {
 +        trace_nvme_err_startfail_asqent_sz_zero();
 +        return -1;
 +    }
 +    if (unlikely(!NVME_AQA_ACQS(n->bar.aqa))) {
 +        trace_nvme_err_startfail_acqent_sz_zero();
          return -1;
      }
@@ -XXX,XX +XXX,XX @@ static int nvme_start_ctrl(NvmeCtrl *n)
  static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
      unsigned size)
  {
 +    if (unlikely(offset & (sizeof(uint32_t) - 1))) {
 +        NVME_GUEST_ERR(nvme_ub_mmiowr_misaligned32,
 +                       "MMIO write not 32-bit aligned,"
 +                       " offset=0x%"PRIx64"", offset);
 +        /* should be ignored, fall through for now */
 +    }
 +
 +    if (unlikely(size < sizeof(uint32_t))) {
 +        NVME_GUEST_ERR(nvme_ub_mmiowr_toosmall,
 +                       "MMIO write smaller than 32-bits,"
 +                       " offset=0x%"PRIx64", size=%u",
 +                       offset, size);
 +        /* should be ignored, fall through for now */
 +    }
 +
      switch (offset) {
 -    case 0xc:
 +    case 0xc:   /* INTMS */
 +        if (unlikely(msix_enabled(&(n->parent_obj)))) {
 +            NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix,
 +                           "undefined access to interrupt mask set"
 +                           " when MSI-X is enabled");
 +            /* should be ignored, fall through for now */
 +        }
          n->bar.intms |= data & 0xffffffff;
          n->bar.intmc = n->bar.intms;
 +        trace_nvme_mmio_intm_set(data & 0xffffffff,
 +                                 n->bar.intmc);
          break;
 -    case 0x10:
 +    case 0x10:  /* INTMC */
 +        if (unlikely(msix_enabled(&(n->parent_obj)))) {
 +            NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix,
 +                           "undefined access to interrupt mask clr"
 +                           " when MSI-X is enabled");
 +            /* should be ignored, fall through for now */
 +        }
          n->bar.intms &= ~(data & 0xffffffff);
          n->bar.intmc = n->bar.intms;
 +        trace_nvme_mmio_intm_clr(data & 0xffffffff,
 +                                 n->bar.intmc);
          break;
 -    case 0x14:
 +    case 0x14:  /* CC */
 +        trace_nvme_mmio_cfg(data & 0xffffffff);
          /* Windows first sends data, then sends enable bit */
          if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) &&
              !NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc))
@@ -XXX,XX +XXX,XX @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
          if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
              n->bar.cc = data;
 -            if (nvme_start_ctrl(n)) {
 +            if (unlikely(nvme_start_ctrl(n))) {
 +                trace_nvme_err_startfail();
                  n->bar.csts = NVME_CSTS_FAILED;
              } else {
 +                trace_nvme_mmio_start_success();
                  n->bar.csts = NVME_CSTS_READY;
              }
          } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
 +            trace_nvme_mmio_stopped();
              nvme_clear_ctrl(n);
              n->bar.csts &= ~NVME_CSTS_READY;
          }
          if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
 -                nvme_clear_ctrl(n);
 -                n->bar.cc = data;
 -                n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
 +            trace_nvme_mmio_shutdown_set();
 +            nvme_clear_ctrl(n);
 +            n->bar.cc = data;
 +            n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
          } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
 -                n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
 -                n->bar.cc = data;
 +            trace_nvme_mmio_shutdown_cleared();
 +            n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
 +            n->bar.cc = data;
 +        }
 +        break;
 +    case 0x1C:  /* CSTS */
 +        if (data & (1 << 4)) {
 +            NVME_GUEST_ERR(nvme_ub_mmiowr_ssreset_w1c_unsupported,
 +                           "attempted to W1C CSTS.NSSRO"
 +                           " but CAP.NSSRS is zero (not supported)");
 +        } else if (data != 0) {
 +            NVME_GUEST_ERR(nvme_ub_mmiowr_ro_csts,
 +                           "attempted to set a read only bit"
 +                           " of controller status");
 +        }
 +        break;
 +    case 0x20:  /* NSSR */
 +        if (data == 0x4E564D65) {
 +            trace_nvme_ub_mmiowr_ssreset_unsupported();
 +        } else {
 +            /* The spec says that writes of other values have no effect */
 +            return;
          }
          break;
 -    case 0x24:
 +    case 0x24:  /* AQA */
          n->bar.aqa = data & 0xffffffff;
 +        trace_nvme_mmio_aqattr(data & 0xffffffff);
          break;
 -    case 0x28:
 +    case 0x28:  /* ASQ */
          n->bar.asq = data;
 +        trace_nvme_mmio_asqaddr(data);
          break;
 -    case 0x2c:
 +    case 0x2c:  /* ASQ hi */
          n->bar.asq |= data << 32;
 +        trace_nvme_mmio_asqaddr_hi(data, n->bar.asq);
          break;
 -    case 0x30:
 +    case 0x30:  /* ACQ */
 +        trace_nvme_mmio_acqaddr(data);
          n->bar.acq = data;
          break;
 -    case 0x34:
 +    case 0x34:  /* ACQ hi */
          n->bar.acq |= data << 32;
 +        trace_nvme_mmio_acqaddr_hi(data, n->bar.acq);
          break;
 +    case 0x38:  /* CMBLOC */
 +        NVME_GUEST_ERR(nvme_ub_mmiowr_cmbloc_reserved,
 +                       "invalid write to reserved CMBLOC"
 +                       " when CMBSZ is zero, ignored");
 +        return;
 +    case 0x3C:  /* CMBSZ */
 +        NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
 +                       "invalid write to read only CMBSZ, ignored");
 +        return;
      default:
 +        NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
 +                       "invalid MMIO write,"
 +                       " offset=0x%"PRIx64", data=0x%"PRIx64"",
 +                       offset, data);
          break;
      }
  }
@@ -XXX,XX +XXX,XX @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
      uint8_t *ptr = (uint8_t *)&n->bar;
      uint64_t val = 0;
 +    if (unlikely(addr & (sizeof(uint32_t) - 1))) {
 +        NVME_GUEST_ERR(nvme_ub_mmiord_misaligned32,
 +                       "MMIO read not 32-bit aligned,"
 +                       " offset=0x%"PRIx64"", addr);
 +        /* should RAZ, fall through for now */
 +    } else if (unlikely(size < sizeof(uint32_t))) {
 +        NVME_GUEST_ERR(nvme_ub_mmiord_toosmall,
 +                       "MMIO read smaller than 32-bits,"
 +                       " offset=0x%"PRIx64"", addr);
 +        /* should RAZ, fall through for now */
 +    }
 +
      if (addr < sizeof(n->bar)) {
          memcpy(&val, ptr + addr, size);
 +    } else {
 +        NVME_GUEST_ERR(nvme_ub_mmiord_invalid_ofs,
 +                       "MMIO read beyond last register,"
 +                       " offset=0x%"PRIx64", returning 0", addr);
      }
 +
      return val;
  }
@@ -XXX,XX +XXX,XX @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
  {
      uint32_t qid;
 -    if (addr & ((1 << 2) - 1)) {
 +    if (unlikely(addr & ((1 << 2) - 1))) {
 +        NVME_GUEST_ERR(nvme_ub_db_wr_misaligned,
 +                       "doorbell write not 32-bit aligned,"
 +                       " offset=0x%"PRIx64", ignoring", addr);
          return;
      }
      if (((addr - 0x1000) >> 2) & 1) {
 +        /* Completion queue doorbell write */
 +
          uint16_t new_head = val & 0xffff;
          int start_sqs;
          NvmeCQueue *cq;
          qid = (addr - (0x1000 + (1 << 2))) >> 3;
 -        if (nvme_check_cqid(n, qid)) {
 +        if (unlikely(nvme_check_cqid(n, qid))) {
 +            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cq,
 +                           "completion queue doorbell write"
 +                           " for nonexistent queue,"
 +                           " cqid=%"PRIu32", ignoring", qid);
              return;
          }
          cq = n->cq[qid];
 -        if (new_head >= cq->size) {
 +        if (unlikely(new_head >= cq->size)) {
 +            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cqhead,
 +                           "completion queue doorbell write value"
 +                           " beyond queue size, cqid=%"PRIu32","
 +                           " new_head=%"PRIu16", ignoring",
 +                           qid, new_head);
              return;
          }
@@ -XXX,XX +XXX,XX @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
              nvme_isr_notify(n, cq);
          }
      } else {
 +        /* Submission queue doorbell write */
 +
          uint16_t new_tail = val & 0xffff;
          NvmeSQueue *sq;
          qid = (addr - 0x1000) >> 3;
 -        if (nvme_check_sqid(n, qid)) {
 +        if (unlikely(nvme_check_sqid(n, qid))) {
 +            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sq,
 +                           "submission queue doorbell write"
 +                           " for nonexistent queue,"
 +                           " sqid=%"PRIu32", ignoring", qid);
              return;
          }
          sq = n->sq[qid];
 -        if (new_tail >= sq->size) {
 +        if (unlikely(new_tail >= sq->size)) {
 +            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sqtail,
 +                           "submission queue doorbell write value"
 +                           " beyond queue size, sqid=%"PRIu32","
 +                           " new_tail=%"PRIu16", ignoring",
 +                           qid, new_tail);
              return;
          }
 diff --git a/hw/block/trace-events b/hw/block/trace-events
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/block/trace-events
 +++ b/hw/block/trace-events
@@ -XXX,XX +XXX,XX @@ virtio_blk_submit_multireq(void *vdev, void *mrb, int start, int num_reqs, uint6
  hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p LCHS %d %d %d"
  hd_geometry_guess(void *blk, uint32_t cyls, uint32_t heads, uint32_t secs, int trans) "blk %p CHS %u %u %u trans %d"
 +# hw/block/nvme.c
 +# nvme traces for successful events
 +nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u"
 +nvme_irq_pin(void) "pulsing IRQ pin"
 +nvme_irq_masked(void) "IRQ is masked"
 +nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
 +nvme_rw(char const *verb, uint32_t blk_count, uint64_t byte_count, uint64_t lba) "%s %"PRIu32" blocks (%"PRIu64" bytes) from LBA %"PRIu64""
 +nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
 +nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
 +nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
 +nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16""
 +nvme_identify_ctrl(void) "identify controller"
 +nvme_identify_ns(uint16_t ns) "identify namespace, nsid=%"PRIu16""
 +nvme_identify_nslist(uint16_t ns) "identify namespace list, nsid=%"PRIu16""
 +nvme_getfeat_vwcache(char const* result) "get feature volatile write cache, result=%s"
 +nvme_getfeat_numq(int result) "get feature number of queues, result=%d"
 +nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d"
 +nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64""
 +nvme_mmio_intm_clr(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask clr, data=0x%"PRIx64", new_mask=0x%"PRIx64""
 +nvme_mmio_cfg(uint64_t data) "wrote MMIO, config controller config=0x%"PRIx64""
 +nvme_mmio_aqattr(uint64_t data) "wrote MMIO, admin queue attributes=0x%"PRIx64""
 +nvme_mmio_asqaddr(uint64_t data) "wrote MMIO, admin submission queue address=0x%"PRIx64""
 +nvme_mmio_acqaddr(uint64_t data) "wrote MMIO, admin completion queue address=0x%"PRIx64""
 +nvme_mmio_asqaddr_hi(uint64_t data, uint64_t new_addr) "wrote MMIO, admin submission queue high half=0x%"PRIx64", new_address=0x%"PRIx64""
 +nvme_mmio_acqaddr_hi(uint64_t data, uint64_t new_addr) "wrote MMIO, admin completion queue high half=0x%"PRIx64", new_address=0x%"PRIx64""
 +nvme_mmio_start_success(void) "setting controller enable bit succeeded"
 +nvme_mmio_stopped(void) "cleared controller enable bit"
 +nvme_mmio_shutdown_set(void) "shutdown bit set"
 +nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
 +
 +# nvme traces for error conditions
 +nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
 +nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
 +nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""
 +nvme_err_invalid_prp2_missing(void) "PRP2 is null and more data to be transferred"
 +nvme_err_invalid_field(void) "invalid field"
 +nvme_err_invalid_prp(void) "invalid PRP"
 +nvme_err_invalid_sgl(void) "invalid SGL"
 +nvme_err_invalid_ns(uint32_t ns, uint32_t limit) "invalid namespace %u not within 1-%u"
 +nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8""
 +nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8""
 +nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64""
 +nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sqid=%"PRIu16""
 +nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16""
 +nvme_err_invalid_create_sq_sqid(uint16_t sqid) "failed creating submission queue, invalid sqid=%"PRIu16""
 +nvme_err_invalid_create_sq_size(uint16_t qsize) "failed creating submission queue, invalid qsize=%"PRIu16""
 +nvme_err_invalid_create_sq_addr(uint64_t addr) "failed creating submission queue, addr=0x%"PRIx64""
 +nvme_err_invalid_create_sq_qflags(uint16_t qflags) "failed creating submission queue, qflags=%"PRIu16""
 +nvme_err_invalid_del_cq_cqid(uint16_t cqid) "failed deleting completion queue, cqid=%"PRIu16""
 +nvme_err_invalid_del_cq_notempty(uint16_t cqid) "failed deleting completion queue, it is not empty, cqid=%"PRIu16""
 +nvme_err_invalid_create_cq_cqid(uint16_t cqid) "failed creating completion queue, cqid=%"PRIu16""
 +nvme_err_invalid_create_cq_size(uint16_t size) "failed creating completion queue, size=%"PRIu16""
 +nvme_err_invalid_create_cq_addr(uint64_t addr) "failed creating completion queue, addr=0x%"PRIx64""
 +nvme_err_invalid_create_cq_vector(uint16_t vector) "failed creating completion queue, vector=%"PRIu16""
 +nvme_err_invalid_create_cq_qflags(uint16_t qflags) "failed creating completion queue, qflags=%"PRIu16""
 +nvme_err_invalid_identify_cns(uint16_t cns) "identify, invalid cns=0x%"PRIx16""
 +nvme_err_invalid_getfeat(int dw10) "invalid get features, dw10=0x%"PRIx32""
 +nvme_err_invalid_setfeat(uint32_t dw10) "invalid set features, dw10=0x%"PRIx32""
 +nvme_err_startfail_cq(void) "nvme_start_ctrl failed because there are non-admin completion queues"
 +nvme_err_startfail_sq(void) "nvme_start_ctrl failed because there are non-admin submission queues"
 +nvme_err_startfail_nbarasq(void) "nvme_start_ctrl failed because the admin submission queue address is null"
 +nvme_err_startfail_nbaracq(void) "nvme_start_ctrl failed because the admin completion queue address is null"
 +nvme_err_startfail_asq_misaligned(uint64_t addr) "nvme_start_ctrl failed because the admin submission queue address is misaligned: 0x%"PRIx64""
 +nvme_err_startfail_acq_misaligned(uint64_t addr) "nvme_start_ctrl failed because the admin completion queue address is misaligned: 0x%"PRIx64""
 +nvme_err_startfail_page_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the page size is too small: log2size=%u, min=%u"
 +nvme_err_startfail_page_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the page size is too large: log2size=%u, max=%u"
 +nvme_err_startfail_cqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too small: log2size=%u, min=%u"
 +nvme_err_startfail_cqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too large: log2size=%u, max=%u"
 +nvme_err_startfail_sqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too small: log2size=%u, min=%u"
 +nvme_err_startfail_sqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too large: log2size=%u, max=%u"
 +nvme_err_startfail_asqent_sz_zero(void) "nvme_start_ctrl failed because the admin submission queue size is zero"
 +nvme_err_startfail_acqent_sz_zero(void) "nvme_start_ctrl failed because the admin completion queue size is zero"
 +nvme_err_startfail(void) "setting controller enable bit failed"
 +
 +# Traces for undefined behavior
 +nvme_ub_mmiowr_misaligned32(uint64_t offset) "MMIO write not 32-bit aligned, offset=0x%"PRIx64""
 +nvme_ub_mmiowr_toosmall(uint64_t offset, unsigned size) "MMIO write smaller than 32 bits, offset=0x%"PRIx64", size=%u"
 +nvme_ub_mmiowr_intmask_with_msix(void) "undefined access to interrupt mask set when MSI-X is enabled"
 +nvme_ub_mmiowr_ro_csts(void) "attempted to set a read only bit of controller status"
 +nvme_ub_mmiowr_ssreset_w1c_unsupported(void) "attempted to W1C CSTS.NSSRO but CAP.NSSRS is zero (not supported)"
 +nvme_ub_mmiowr_ssreset_unsupported(void) "attempted NVM subsystem reset but CAP.NSSRS is zero (not supported)"
 +nvme_ub_mmiowr_cmbloc_reserved(void) "invalid write to reserved CMBLOC when CMBSZ is zero, ignored"
 +nvme_ub_mmiowr_cmbsz_readonly(void) "invalid write to read only CMBSZ, ignored"
 +nvme_ub_mmiowr_invalid(uint64_t offset, uint64_t data) "invalid MMIO write, offset=0x%"PRIx64", data=0x%"PRIx64""
 +nvme_ub_mmiord_misaligned32(uint64_t offset) "MMIO read not 32-bit aligned, offset=0x%"PRIx64""
 +nvme_ub_mmiord_toosmall(uint64_t offset) "MMIO read smaller than 32-bits, offset=0x%"PRIx64""
 +nvme_ub_mmiord_invalid_ofs(uint64_t offset) "MMIO read beyond last register, offset=0x%"PRIx64", returning 0"
 +nvme_ub_db_wr_misaligned(uint64_t offset) "doorbell write not 32-bit aligned, offset=0x%"PRIx64", ignoring"
 +nvme_ub_db_wr_invalid_cq(uint32_t qid) "completion queue doorbell write for nonexistent queue, cqid=%"PRIu32", ignoring"
 +nvme_ub_db_wr_invalid_cqhead(uint32_t qid, uint16_t new_head) "completion queue doorbell write value beyond queue size, cqid=%"PRIu32", new_head=%"PRIu16", ignoring"
 +nvme_ub_db_wr_invalid_sq(uint32_t qid) "submission queue doorbell write for nonexistent queue, sqid=%"PRIu32", ignoring"
 +nvme_ub_db_wr_invalid_sqtail(uint32_t qid, uint16_t new_tail) "submission queue doorbell write value beyond queue size, sqid=%"PRIu32", new_tail=%"PRIu16", ignoring"
 +
  # hw/block/xen_disk.c
  xen_disk_alloc(char *name) "%s"
  xen_disk_init(char *name) "%s"
 --
 .13.6

-[Qemu-devel] [PULL 01/55] iotests: Fix CID for VMDK afl image
+[Qemu-devel] [PULL v3 13/35] block: Open backing image in force share mode for size probe
 From: Fam Zheng <famz@redhat.com>
-This reverts commit 76bf133c4 which updated the reference output, and
+Management tools create overlays of running guests with qemu-img:
 fixed the reference image, because the code path we want to exercise is
 actually the invalid image size.
-The descriptor block in the image, which includes the CID to verify, has been
+  $ qemu-img create -b /image/in/use.qcow2 -f qcow2 /overlay/image.qcow2
 invalid since the reference image was added. Since commit 9877860e7bd we report
 this error earlier than the "file too large", so 059.out mismatches.
-The binary change is generated along the operations of:
+but this doesn't work anymore due to image locking:
-  $ bunzip2 afl9.vmdk.bz2
+    qemu-img: /overlay/image.qcow2: Failed to get shared "write" lock
-  $ qemu-img create -f vmdk fix.vmdk 1G
+    Is another process using the image?
-  $ dd if=afl9.vmdk of=fix.vmdk bs=512 count=1 conv=notrunc
+    Could not open backing image to determine size.
-  $ mv fix.vmdk afl9.vmdk
+Use the force share option to allow this use case again.
   $ bzip2 afl9.vmdk
+Cc: qemu-stable@nongnu.org
 Signed-off-by: Fam Zheng <famz@redhat.com>
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- tests/qemu-iotests/059.out                     |   2 +-
+ block.c | 3 ++-
- tests/qemu-iotests/sample_images/afl9.vmdk.bz2 | Bin 178 -> 618 bytes
+file changed, 2 insertions(+), 1 deletion(-)
 files changed, 1 insertion(+), 1 deletion(-)
-diff --git a/tests/qemu-iotests/059.out b/tests/qemu-iotests/059.out
+diff --git a/block.c b/block.c
 index XXXXXXX..XXXXXXX 100644
---- a/tests/qemu-iotests/059.out
+--- a/block.c
-+++ b/tests/qemu-iotests/059.out
++++ b/block.c
-@@ -XXX,XX +XXX,XX @@ Offset          Length          Mapped to       File
+@@ -XXX,XX +XXX,XX @@ void bdrv_img_create(const char *filename, const char *fmt,
-x140000000     0x10000         0x50000         TEST_DIR/t-s003.vmdk
+         back_flags = flags;
+         back_flags &= ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
- === Testing afl image with a very large capacity ===
--qemu-img: Could not open 'TEST_DIR/afl9.IMGFMT': Could not open 'TEST_DIR/afl9.IMGFMT': Invalid argument
++        backing_options = qdict_new();
-+qemu-img: Can't get image size 'TEST_DIR/afl9.IMGFMT': File too large
+         if (backing_fmt) {
- *** done
+-            backing_options = qdict_new();
-diff --git a/tests/qemu-iotests/sample_images/afl9.vmdk.bz2 b/tests/qemu-iotests/sample_images/afl9.vmdk.bz2
+             qdict_put_str(backing_options, "driver", backing_fmt);
-index XXXXXXX..XXXXXXX 100644
+         }
-GIT binary patch
++        qdict_put_bool(backing_options, BDRV_OPT_FORCE_SHARE, true);
-literal 618
-zcmV-w0+szjT4*^jL0KkKSvgW7ssIN3|NsBH-Q9UpfAhclU70`s-*NE~5QvC~h=_=Y
+         bs = bdrv_open(full_backing, NULL, backing_options, back_flags,
-zh>D2n*q*=vygR634445h35k;?00h9835kMW00004$iPepVE{Bqk)uhJ^wfGLr=)3s
+                        &local_err);
 zhM5CR88jLh7)B;cA*K)*6GmuECPU3o4NWG5O#pg>Ak#xY8Z^<M8Z>CrMt}oD38Ns$
 z02n}M0LdjZ&}cLPqd+nPKmn$j0iXe(02%-d27nnJriN-uE+X&cz@Bj4BBfd|yV!NB
 zwqkL}nW3AI5x^jp=t%^F1pxqp)v#n#)j$zcm1xqv(!$2d*5%vF{5RPWnOV8-^tE<(
 zU~%&}Y0uNu*9Wt=yS^8PkC&gPueZO%IG;aD{l#sG`<Af;l1Pnwpi9I75FkQ`LLhd8
 z6(9f*2s+N5=%bwp80ddrD6>m4Ho*fsHXdM<jtl*zKvRiTx7Ugy1|Nl<Ns!z;1dvhy
 z=`SDHh~{u|1ZodC(_lzezQ)I*Kv2z|PZ@!SJjlVzwGdx2iu#W}dI{t+T&dDWT^LPy
 zg3NouEM=V~7GvZQS1CXy676F6mJXWGgW!KTr+E$OspGYCjWmuwa^<Bc>_(-i7fPIW
 zA+~n9iy_f)g8B2RILhd%F)dZ5f?7pFLw)@;Ncl<JE}gvMrfh{elT#3gLjY6r8xY4O
 z)UO#pv=WYptukn<DuoMH2ip%k?V^k!rjQirK^RC<Brw>3Bz9<|!xm0F{45K+gg8#n
 z4FNAJ!<X|3Vq+lyV4=xZ;>AN0<K=%c4A2ruB!4rGvWm!KFrvd4PyfZ-kxmpO4pfM$
 EfLnqQYXATM
 literal 178
 zcmV;j08RfwT4*^jL0KkKS>A08g#Z9x|HJ$H)ZJi0004xF0SE*D03g5s00IDLSQelF
 ziVX^$pfWNUJrmRhn2k52pQ;Rs0EQC;(S%|!m`2~BZ@b++;etskRJUVl!Kt)wu7?VN
 zl;%JdqX2?TgsNVJP?87M*MvL1qQnBkCES&?0@MeaN-bL4;bDzxmMm|da4fuh!=#fu
 g@i9R@5z!av{9tA<GGr!3hi~HUNT&)C8_l7xpl%OKQ2+n{
 --
 .13.6

-[Qemu-devel] [PULL 06/55] block: early check for blockers on drive-mirror
+[Qemu-devel] [PULL v3 14/35] block: Remove the obsolete -drive boot=on|off parameter
-From: Paolo Bonzini <pbonzini@redhat.com>
+From: Thomas Huth <thuth@redhat.com>
-Even if an op blocker is present for BLOCK_OP_TYPE_MIRROR_SOURCE,
+It's not working anymore since QEMU v1.3.0 - time to remove it now.
 it is checked a bit late and the result is that the target is
 created even if drive-mirror subsequently fails.  Add an early
 check to avoid this.
-Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Thomas Huth <thuth@redhat.com>
-Reviewed-by: Fam Zheng <famz@redhat.com>
+Reviewed-by: John Snow <jsnow@redhat.com>
-Reviewed-by: Alberto Garcia <berto@igalia.com>
+Reviewed-by: Markus Armbruster <armbru@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- blockdev.c | 5 +++++
+ blockdev.c    | 11 -----------
-file changed, 5 insertions(+)
+ qemu-doc.texi |  6 ------
 files changed, 17 deletions(-)
 diff --git a/blockdev.c b/blockdev.c
 index XXXXXXX..XXXXXXX 100644
 --- a/blockdev.c
 +++ b/blockdev.c
-@@ -XXX,XX +XXX,XX @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
+@@ -XXX,XX +XXX,XX @@ QemuOptsList qemu_legacy_drive_opts = {
-         return;
+             .type = QEMU_OPT_STRING,
              .help = "chs translation (auto, lba, none)",
          },{
 -            .name = "boot",
 -            .type = QEMU_OPT_BOOL,
 -            .help = "(deprecated, ignored)",
 -        },{
              .name = "addr",
              .type = QEMU_OPT_STRING,
              .help = "pci address (virtio only)",
@@ -XXX,XX +XXX,XX @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type)
          goto fail;
      }
-+    /* Early check to avoid creating target */
+-    /* Deprecated option boot=[on|off] */
-+    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_MIRROR_SOURCE, errp)) {
+-    if (qemu_opt_get(legacy_opts, "boot") != NULL) {
-+        return;
+-        fprintf(stderr, "qemu-kvm: boot=on|off is deprecated and will be "
-+    }
+-                "ignored. Future versions will reject this parameter. Please "
-+
+-                "update your scripts.\n");
-     aio_context = bdrv_get_aio_context(bs);
+-    }
-     aio_context_acquire(aio_context);
+-
+     /* Other deprecated options */
      if (!qtest_enabled()) {
          for (i = 0; i < ARRAY_SIZE(deprecated); i++) {
 diff --git a/qemu-doc.texi b/qemu-doc.texi
 index XXXXXXX..XXXXXXX 100644
 --- a/qemu-doc.texi
 +++ b/qemu-doc.texi
@@ -XXX,XX +XXX,XX @@ deprecated.
  @section System emulator command line arguments
 -@subsection -drive boot=on|off (since 1.3.0)
 -
 -The ``boot=on|off'' option to the ``-drive'' argument is
 -ignored. Applications should use the ``bootindex=N'' parameter
 -to set an absolute ordering between devices instead.
 -
  @subsection -tdf (since 1.3.0)
  The ``-tdf'' argument is ignored. The behaviour implemented
 --
 .13.6

-[Qemu-devel] [PULL 05/55] qcow2: Use g_try_realloc() in qcow2_expand_zero_clusters()
+[Qemu-devel] [PULL v3 15/35] block: Remove the deprecated -hdachs option
-From: Alberto Garcia <berto@igalia.com>
+From: Thomas Huth <thuth@redhat.com>
-g_realloc() aborts the program if it fails to allocate the required
+It's been marked as deprecated since QEMU v2.10.0, and so far nobody
-amount of memory. We want to detect that scenario and return an error
+complained that we should keep it, so let's remove this legacy option
-instead, so let's use g_try_realloc().
+now to simplify the code quite a bit.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
+Signed-off-by: Thomas Huth <thuth@redhat.com>
 Reviewed-by: John Snow <jsnow@redhat.com>
 Reviewed-by: Markus Armbruster <armbru@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- block/qcow2-cluster.c | 10 +++++++++-
+ vl.c            | 86 ++-------------------------------------------------------
-file changed, 9 insertions(+), 1 deletion(-)
+ qemu-doc.texi   |  8 ------
+ qemu-options.hx | 19 ++-----------
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
+files changed, 4 insertions(+), 109 deletions(-)
 diff --git a/vl.c b/vl.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
+--- a/vl.c
-+++ b/block/qcow2-cluster.c
++++ b/vl.c
-@@ -XXX,XX +XXX,XX @@ int qcow2_expand_zero_clusters(BlockDriverState *bs,
+@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
-         int l1_sectors = DIV_ROUND_UP(s->snapshots[i].l1_size *
+     const char *boot_order = NULL;
-                                       sizeof(uint64_t), BDRV_SECTOR_SIZE);
+     const char *boot_once = NULL;
+     DisplayState *ds;
--        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);
+-    int cyls, heads, secs, translation;
-+        uint64_t *new_l1_table =
+     QemuOpts *opts, *machine_opts;
-+            g_try_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);
+-    QemuOpts *hda_opts = NULL, *icount_opts = NULL, *accel_opts = NULL;
-+
++    QemuOpts *icount_opts = NULL, *accel_opts = NULL;
-+        if (!new_l1_table) {
+     QemuOptsList *olist;
-+            ret = -ENOMEM;
+     int optind;
-+            goto fail;
+     const char *optarg;
-+        }
+@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
-+
-+        l1_table = new_l1_table;
+     cpu_model = NULL;
+     snapshot = 0;
-         ret = bdrv_read(bs->file,
+-    cyls = heads = secs = 0;
-                         s->snapshots[i].l1_table_offset / BDRV_SECTOR_SIZE,
+-    translation = BIOS_ATA_TRANSLATION_AUTO;
      nb_nics = 0;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
          if (optind >= argc)
              break;
          if (argv[optind][0] != '-') {
 -            hda_opts = drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);
 +            drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);
          } else {
              const QEMUOption *popt;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
                  cpu_model = optarg;
                  break;
              case QEMU_OPTION_hda:
 -                {
 -                    char buf[256];
 -                    if (cyls == 0)
 -                        snprintf(buf, sizeof(buf), "%s", HD_OPTS);
 -                    else
 -                        snprintf(buf, sizeof(buf),
 -                                 "%s,cyls=%d,heads=%d,secs=%d%s",
 -                                 HD_OPTS , cyls, heads, secs,
 -                                 translation == BIOS_ATA_TRANSLATION_LBA ?
 -                                 ",trans=lba" :
 -                                 translation == BIOS_ATA_TRANSLATION_NONE ?
 -                                 ",trans=none" : "");
 -                    drive_add(IF_DEFAULT, 0, optarg, buf);
 -                    break;
 -                }
              case QEMU_OPTION_hdb:
              case QEMU_OPTION_hdc:
              case QEMU_OPTION_hdd:
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
              case QEMU_OPTION_snapshot:
                  snapshot = 1;
                  break;
 -            case QEMU_OPTION_hdachs:
 -                {
 -                    const char *p;
 -                    p = optarg;
 -                    cyls = strtol(p, (char **)&p, 0);
 -                    if (cyls < 1 || cyls > 16383)
 -                        goto chs_fail;
 -                    if (*p != ',')
 -                        goto chs_fail;
 -                    p++;
 -                    heads = strtol(p, (char **)&p, 0);
 -                    if (heads < 1 || heads > 16)
 -                        goto chs_fail;
 -                    if (*p != ',')
 -                        goto chs_fail;
 -                    p++;
 -                    secs = strtol(p, (char **)&p, 0);
 -                    if (secs < 1 || secs > 63)
 -                        goto chs_fail;
 -                    if (*p == ',') {
 -                        p++;
 -                        if (!strcmp(p, "large")) {
 -                            translation = BIOS_ATA_TRANSLATION_LARGE;
 -                        } else if (!strcmp(p, "rechs")) {
 -                            translation = BIOS_ATA_TRANSLATION_RECHS;
 -                        } else if (!strcmp(p, "none")) {
 -                            translation = BIOS_ATA_TRANSLATION_NONE;
 -                        } else if (!strcmp(p, "lba")) {
 -                            translation = BIOS_ATA_TRANSLATION_LBA;
 -                        } else if (!strcmp(p, "auto")) {
 -                            translation = BIOS_ATA_TRANSLATION_AUTO;
 -                        } else {
 -                            goto chs_fail;
 -                        }
 -                    } else if (*p != '\0') {
 -                    chs_fail:
 -                        error_report("invalid physical CHS format");
 -                        exit(1);
 -                    }
 -                    if (hda_opts != NULL) {
 -                        qemu_opt_set_number(hda_opts, "cyls", cyls,
 -                                            &error_abort);
 -                        qemu_opt_set_number(hda_opts, "heads", heads,
 -                                            &error_abort);
 -                        qemu_opt_set_number(hda_opts, "secs", secs,
 -                                            &error_abort);
 -                        if (translation == BIOS_ATA_TRANSLATION_LARGE) {
 -                            qemu_opt_set(hda_opts, "trans", "large",
 -                                         &error_abort);
 -                        } else if (translation == BIOS_ATA_TRANSLATION_RECHS) {
 -                            qemu_opt_set(hda_opts, "trans", "rechs",
 -                                         &error_abort);
 -                        } else if (translation == BIOS_ATA_TRANSLATION_LBA) {
 -                            qemu_opt_set(hda_opts, "trans", "lba",
 -                                         &error_abort);
 -                        } else if (translation == BIOS_ATA_TRANSLATION_NONE) {
 -                            qemu_opt_set(hda_opts, "trans", "none",
 -                                         &error_abort);
 -                        }
 -                    }
 -                }
 -                error_report("'-hdachs' is deprecated, please use '-device"
 -                             " ide-hd,cyls=c,heads=h,secs=s,...' instead");
 -                break;
              case QEMU_OPTION_numa:
                  opts = qemu_opts_parse_noisily(qemu_find_opts("numa"),
                                                 optarg, true);
 diff --git a/qemu-doc.texi b/qemu-doc.texi
 index XXXXXXX..XXXXXXX 100644
 --- a/qemu-doc.texi
 +++ b/qemu-doc.texi
@@ -XXX,XX +XXX,XX @@ The ``--net dump'' argument is now replaced with the
  ``-object filter-dump'' argument which works in combination
  with the modern ``-netdev`` backends instead.
 -@subsection -hdachs (since 2.10.0)
 -
 -The ``-hdachs'' argument is now a synonym for setting
 -the ``cyls'', ``heads'', ``secs'', and ``trans'' properties
 -on the ``ide-hd'' device using the ``-device'' argument.
 -The new syntax allows different settings to be provided
 -per disk.
 -
  @subsection -usbdevice (since 2.10.0)
  The ``-usbdevice DEV'' argument is now a synonym for setting
 diff --git a/qemu-options.hx b/qemu-options.hx
 index XXXXXXX..XXXXXXX 100644
 --- a/qemu-options.hx
 +++ b/qemu-options.hx
@@ -XXX,XX +XXX,XX @@ of available connectors of a given interface type.
  @item media=@var{media}
  This option defines the type of the media: disk or cdrom.
  @item cyls=@var{c},heads=@var{h},secs=@var{s}[,trans=@var{t}]
 -These options have the same definition as they have in @option{-hdachs}.
 -These parameters are deprecated, use the corresponding parameters
 +Force disk physical geometry and the optional BIOS translation (trans=none or
 +lba). These parameters are deprecated, use the corresponding parameters
  of @code{-device} instead.
  @item snapshot=@var{snapshot}
  @var{snapshot} is "on" or "off" and controls snapshot mode for the given drive
@@ -XXX,XX +XXX,XX @@ the raw disk image you use is not written back. You can however force
  the write back by pressing @key{C-a s} (@pxref{disk_images}).
  ETEXI
 -DEF("hdachs", HAS_ARG, QEMU_OPTION_hdachs, \
 -    "-hdachs c,h,s[,t]\n" \
 -    "                force hard disk 0 physical geometry and the optional BIOS\n" \
 -    "                translation (t=none or lba) (usually QEMU can guess them)\n",
 -    QEMU_ARCH_ALL)
 -STEXI
 -@item -hdachs @var{c},@var{h},@var{s},[,@var{t}]
 -@findex -hdachs
 -Force hard disk 0 physical geometry (1 <= @var{c} <= 16383, 1 <=
 -@var{h} <= 16, 1 <= @var{s} <= 63) and optionally force the BIOS
 -translation mode (@var{t}=none, lba or auto). Usually QEMU can guess
 -all those parameters. This option is deprecated, please use
 -@code{-device ide-hd,cyls=c,heads=h,secs=s,...} instead.
 -ETEXI
 -
  DEF("fsdev", HAS_ARG, QEMU_OPTION_fsdev,
      "-fsdev fsdriver,id=id[,path=path,][security_model={mapped-xattr|mapped-file|passthrough|none}]\n"
      " [,writeout=immediate][,readonly][,socket=socket|sock_fd=sock_fd][,fmode=fmode][,dmode=dmode]\n"
 --
 .13.6

-[Qemu-devel] [PULL 07/55] iotests: Use virtio-blk in 155
+[Qemu-devel] [PULL v3 16/35] block: Mention -drive cyls/heads/secs/trans/serial/addr in deprecation chapter
-From: Max Reitz <mreitz@redhat.com>
+From: Thomas Huth <thuth@redhat.com>
-Only a few select machine types support floppy drives and there is
+Looks like we forgot to announce the deprecation of these options in
-actually nothing preventing us from using virtio here, so let's do it.
+the corresponding chapter of the qemu-doc text, so let's do that now.
-Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Signed-off-by: Thomas Huth <thuth@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Reviewed-by: John Snow <jsnow@redhat.com>
-Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Reviewed-by: Markus Armbruster <armbru@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- tests/qemu-iotests/155 | 14 +++++++++-----
+ qemu-doc.texi | 15 +++++++++++++++
-file changed, 9 insertions(+), 5 deletions(-)
+file changed, 15 insertions(+)
-diff --git a/tests/qemu-iotests/155 b/tests/qemu-iotests/155
+diff --git a/qemu-doc.texi b/qemu-doc.texi
-index XXXXXXX..XXXXXXX 100755
+index XXXXXXX..XXXXXXX 100644
---- a/tests/qemu-iotests/155
+--- a/qemu-doc.texi
-+++ b/tests/qemu-iotests/155
++++ b/qemu-doc.texi
-@@ -XXX,XX +XXX,XX @@ class BaseClass(iotests.QMPTestCase):
+@@ -XXX,XX +XXX,XX @@ longer be directly supported in QEMU.
-                     'file': {'driver': 'file',
+ The ``-drive if=scsi'' argument is replaced by the the
-                              'filename': source_img}}
+ ``-device BUS-TYPE'' argument combined with ``-drive if=none''.
-         self.vm.add_blockdev(self.qmp_to_opts(blockdev))
--        self.vm.add_device('floppy,id=qdev0,drive=source')
++@subsection -drive cyls=...,heads=...,secs=...,trans=... (since 2.10.0)
-+        self.vm.add_device('virtio-blk,id=qdev0,drive=source')
++
-         self.vm.launch()
++The drive geometry arguments are replaced by the geometry arguments
++that can be specified with the ``-device'' parameter.
-         self.assertIntactSourceBackingChain()
++
-@@ -XXX,XX +XXX,XX @@ class MirrorBaseClass(BaseClass):
++@subsection -drive serial=... (since 2.10.0)
-     def testFull(self):
++
-         self.runMirror('full')
++The drive serial argument is replaced by the serial argument
++that can be specified with the ``-device'' parameter.
--        node = self.findBlockNode('target', 'qdev0')
++
-+        node = self.findBlockNode('target',
++@subsection -drive addr=... (since 2.10.0)
-+                                  '/machine/peripheral/qdev0/virtio-backend')
++
-         self.assertCorrectBackingImage(node, None)
++The drive addr argument is replaced by the addr argument
-         self.assertIntactSourceBackingChain()
++that can be specified with the ``-device'' parameter.
++
-     def testTop(self):
+ @subsection -net dump (since 2.10.0)
-         self.runMirror('top')
+ The ``--net dump'' argument is now replaced with the
 -        node = self.findBlockNode('target', 'qdev0')
 +        node = self.findBlockNode('target',
 +                                  '/machine/peripheral/qdev0/virtio-backend')
          self.assertCorrectBackingImage(node, back2_img)
          self.assertIntactSourceBackingChain()
      def testNone(self):
          self.runMirror('none')
 -        node = self.findBlockNode('target', 'qdev0')
 +        node = self.findBlockNode('target',
 +                                  '/machine/peripheral/qdev0/virtio-backend')
          self.assertCorrectBackingImage(node, source_img)
          self.assertIntactSourceBackingChain()
@@ -XXX,XX +XXX,XX @@ class TestCommit(BaseClass):
          self.vm.event_wait('BLOCK_JOB_COMPLETED')
 -        node = self.findBlockNode(None, 'qdev0')
 +        node = self.findBlockNode(None,
 +                                  '/machine/peripheral/qdev0/virtio-backend')
          self.assert_qmp(node, 'image' + '/backing-image' * 0 + '/filename',
                          back1_img)
          self.assert_qmp(node, 'image' + '/backing-image' * 1 + '/filename',
 --
 .13.6

-[Qemu-devel] [PULL 03/55] qemu-img: Document --force-share / -U
+[Qemu-devel] [PULL v3 17/35] block: Remove unused bdrv_requests_pending
 From: Fam Zheng <famz@redhat.com>
 Signed-off-by: Fam Zheng <famz@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Reviewed-by: Kashyap Chamarthy <kchamart@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- qemu-img.texi | 8 ++++++++
+ include/block/block_int.h |  1 -
-file changed, 8 insertions(+)
+ block/io.c                | 18 ------------------
 files changed, 19 deletions(-)
-diff --git a/qemu-img.texi b/qemu-img.texi
+diff --git a/include/block/block_int.h b/include/block/block_int.h
 index XXXXXXX..XXXXXXX 100644
---- a/qemu-img.texi
+--- a/include/block/block_int.h
-+++ b/qemu-img.texi
++++ b/include/block/block_int.h
-@@ -XXX,XX +XXX,XX @@ exclusive with the @var{-O} parameters. It is currently required to also use
+@@ -XXX,XX +XXX,XX @@ bool blk_dev_is_tray_open(BlockBackend *blk);
- the @var{-n} parameter to skip image creation. This restriction may be relaxed
+ bool blk_dev_is_medium_locked(BlockBackend *blk);
- in a future release.
+ void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes);
-+@item --force-share (-U)
+-bool bdrv_requests_pending(BlockDriverState *bs);
-+If specified, @code{qemu-img} will open the image in shared mode, allowing
-+other QEMU processes to open it in write mode. For example, this can be used to
+ void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
-+get the image information (with 'info' subcommand) when the image is used by a
+ void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in);
-+running guest.  Note that this could produce inconsistent results because of
+diff --git a/block/io.c b/block/io.c
-+concurrent metadata changes, etc. This option is only allowed when opening
+index XXXXXXX..XXXXXXX 100644
-+images in read-only mode.
+--- a/block/io.c
-+
++++ b/block/io.c
- @item --backing-chain
+@@ -XXX,XX +XXX,XX @@ void bdrv_disable_copy_on_read(BlockDriverState *bs)
- will enumerate information about backing files in a disk image chain. Refer
+     assert(old >= 1);
- below for further description.
+ }
 -/* Check if any requests are in-flight (including throttled requests) */
 -bool bdrv_requests_pending(BlockDriverState *bs)
 -{
 -    BdrvChild *child;
 -
 -    if (atomic_read(&bs->in_flight)) {
 -        return true;
 -    }
 -
 -    QLIST_FOREACH(child, &bs->children, next) {
 -        if (bdrv_requests_pending(child->bs)) {
 -            return true;
 -        }
 -    }
 -
 -    return false;
 -}
 -
  typedef struct {
      Coroutine *co;
      BlockDriverState *bs;
 --
 .13.6

-[Qemu-devel] [PULL 55/55] iotests: Add l2-cache-entry-size to iotest 137
+[Qemu-devel] [PULL v3 18/35] block: Assert drain_all is only called from main AioContext
-From: Alberto Garcia <berto@igalia.com>
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 ---
  block/io.c | 6 ++++++
 file changed, 6 insertions(+)
-This test tries reopening a qcow2 image with valid and invalid
+diff --git a/block/io.c b/block/io.c
 options. This patch adds l2-cache-entry-size to the set.
 Signed-off-by: Alberto Garcia <berto@igalia.com>
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Max Reitz <mreitz@redhat.com>
 Message-id: 3d3b7d2dbfc020deaef60fb58739b0801eb9517c.1517840877.git.berto@igalia.com
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
  tests/qemu-iotests/137     | 5 +++++
  tests/qemu-iotests/137.out | 2 ++
 files changed, 7 insertions(+)
 diff --git a/tests/qemu-iotests/137 b/tests/qemu-iotests/137
 index XXXXXXX..XXXXXXX 100755
 --- a/tests/qemu-iotests/137
 +++ b/tests/qemu-iotests/137
@@ -XXX,XX +XXX,XX @@ $QEMU_IO \
      -c "reopen -o overlap-check.inactive-l2=off" \
      -c "reopen -o cache-size=1M" \
      -c "reopen -o l2-cache-size=512k" \
 +    -c "reopen -o l2-cache-entry-size=512" \
 +    -c "reopen -o l2-cache-entry-size=4k" \
 +    -c "reopen -o l2-cache-entry-size=64k" \
      -c "reopen -o refcount-cache-size=128k" \
      -c "reopen -o cache-clean-interval=5" \
      -c "reopen -o cache-clean-interval=0" \
@@ -XXX,XX +XXX,XX @@ $QEMU_IO \
      -c "reopen -o cache-size=1M,l2-cache-size=2M" \
      -c "reopen -o cache-size=1M,refcount-cache-size=2M" \
      -c "reopen -o l2-cache-size=256T" \
 +    -c "reopen -o l2-cache-entry-size=33k" \
 +    -c "reopen -o l2-cache-entry-size=128k" \
      -c "reopen -o refcount-cache-size=256T" \
      -c "reopen -o overlap-check=constant,overlap-check.template=all" \
      -c "reopen -o overlap-check=blubb" \
 diff --git a/tests/qemu-iotests/137.out b/tests/qemu-iotests/137.out
 index XXXXXXX..XXXXXXX 100644
---- a/tests/qemu-iotests/137.out
+--- a/block/io.c
-+++ b/tests/qemu-iotests/137.out
++++ b/block/io.c
-@@ -XXX,XX +XXX,XX @@ cache-size, l2-cache-size and refcount-cache-size may not be set the same time
+@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
- l2-cache-size may not exceed cache-size
+     BdrvNextIterator it;
- refcount-cache-size may not exceed cache-size
+     GSList *aio_ctxs = NULL, *ctx;
- L2 cache size too big
-+L2 cache entry size must be a power of two between 512 and the cluster size (65536)
++    /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
-+L2 cache entry size must be a power of two between 512 and the cluster size (65536)
++     * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
- L2 cache size too big
++     * nodes in several different AioContexts, so make sure we're in the main
- Conflicting values for qcow2 options 'overlap-check' ('constant') and 'overlap-check.template' ('all')
++     * context. */
- Unsupported value 'blubb' for qcow2 option 'overlap-check'. Allowed are any of the following: none, constant, cached, all
++    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 +
      block_job_pause_all();
      for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 --
 .13.6

-[Qemu-devel] [PULL 21/55] qcow2: Remove BDS parameter from qcow2_cache_table_release()
+[Qemu-devel] [PULL v3 19/35] block: Make bdrv_drain() driver callbacks non-recursive
-From: Alberto Garcia <berto@igalia.com>
+bdrv_drained_begin() doesn't increase bs->quiesce_counter recursively
 and also doesn't notify other parent nodes of children, which both means
 that the child nodes are not actually drained, and bdrv_drained_begin()
 is providing useful functionality only on a single node.
-This function was only using the BlockDriverState parameter to get the
+To keep things consistent, we also shouldn't call the block driver
-cache table size (since it was equal to the cluster size). This is no
+callbacks recursively.
 longer necessary so this parameter can be removed.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
+A proper recursive drain version that provides an actually working
-Reviewed-by: Eric Blake <eblake@redhat.com>
+drained section for child nodes will be introduced later.
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: 7c1b262344375d52544525f85bbbf0548d5ba575.1517840876.git.berto@igalia.com
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
 ---
- block/qcow2-cache.c | 9 ++++-----
+ block/io.c | 16 +++++++++-------
-file changed, 4 insertions(+), 5 deletions(-)
+file changed, 9 insertions(+), 7 deletions(-)
-diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
+diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cache.c
+--- a/block/io.c
-+++ b/block/qcow2-cache.c
++++ b/block/io.c
-@@ -XXX,XX +XXX,XX @@ static inline const char *qcow2_cache_get_name(BDRVQcow2State *s, Qcow2Cache *c)
+@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
  }
  /* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
 -static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
 +static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, bool recursive)
  {
      BdrvChild *child, *tmp;
      BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
      bdrv_coroutine_enter(bs, data.co);
      BDRV_POLL_WHILE(bs, !data.done);
 -    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
 -        bdrv_drain_invoke(child->bs, begin);
 +    if (recursive) {
 +        QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
 +            bdrv_drain_invoke(child->bs, begin, true);
 +        }
      }
  }
--static void qcow2_cache_table_release(BlockDriverState *bs, Qcow2Cache *c,
+@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
--                                      int i, int num_tables)
+         bdrv_parent_drained_begin(bs);
 +static void qcow2_cache_table_release(Qcow2Cache *c, int i, int num_tables)
  {
  /* Using MADV_DONTNEED to discard memory is a Linux-specific feature */
  #ifdef CONFIG_LINUX
@@ -XXX,XX +XXX,XX @@ void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c)
          }
          if (to_clean > 0) {
 -            qcow2_cache_table_release(bs, c, i - to_clean, to_clean);
 +            qcow2_cache_table_release(c, i - to_clean, to_clean);
          }
      }
-@@ -XXX,XX +XXX,XX @@ int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c)
+-    bdrv_drain_invoke(bs, true);
-         c->entries[i].lru_counter = 0;
++    bdrv_drain_invoke(bs, true, false);
      bdrv_drain_recurse(bs);
  }
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
      }
--    qcow2_cache_table_release(bs, c, 0, c->size);
+     /* Re-enable things in child-to-parent order */
-+    qcow2_cache_table_release(c, 0, c->size);
+-    bdrv_drain_invoke(bs, false);
++    bdrv_drain_invoke(bs, false, false);
-     c->lru_counter = 0;
+     bdrv_parent_drained_end(bs);
+     aio_enable_external(bdrv_get_aio_context(bs));
@@ -XXX,XX +XXX,XX @@ void qcow2_cache_discard(BlockDriverState *bs, Qcow2Cache *c, void *table)
      c->entries[i].lru_counter = 0;
      c->entries[i].dirty = false;
 -    qcow2_cache_table_release(bs, c, i, 1);
 +    qcow2_cache_table_release(c, i, 1);
  }
+@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
+         aio_context_acquire(aio_context);
+         aio_disable_external(aio_context);
+         bdrv_parent_drained_begin(bs);
+-        bdrv_drain_invoke(bs, true);
++        bdrv_drain_invoke(bs, true, true);
+         aio_context_release(aio_context);
+         if (!g_slist_find(aio_ctxs, aio_context)) {
+@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
+         /* Re-enable things in child-to-parent order */
+         aio_context_acquire(aio_context);
+-        bdrv_drain_invoke(bs, false);
++        bdrv_drain_invoke(bs, false, true);
+         bdrv_parent_drained_end(bs);
+         aio_enable_external(aio_context);
+         aio_context_release(aio_context);
 --
 .13.6

-[Qemu-devel] [PULL 30/55] qcow2: Add offset_to_l2_slice_index()
+[Qemu-devel] [PULL v3 20/35] test-bdrv-drain: Test callback for bdrv_drain
-From: Alberto Garcia <berto@igalia.com>
+The existing test is for bdrv_drain_all_begin/end() only. Generalise the
 test case so that it can be run for the other variants as well. At the
 moment this is only bdrv_drain_begin/end(), but in a while, we'll add
 another one.
-Similar to offset_to_l2_index(), this function takes a guest offset
+Also, add a backing file to the test node to test whether the operations
-and returns the index in the L2 slice that contains its L2 entry.
+work recursively.
-An L2 slice has currently the same size as an L2 table (one cluster),
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-so both functions return the same value for now.
+---
  tests/test-bdrv-drain.c | 69 ++++++++++++++++++++++++++++++++++++++++++++-----
 file changed, 62 insertions(+), 7 deletions(-)
-Signed-off-by: Alberto Garcia <berto@igalia.com>
+diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Max Reitz <mreitz@redhat.com>
 Message-id: a1c45c5c5a76146dd1712d8d1e7b409ad539c718.1517840877.git.berto@igalia.com
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
  block/qcow2.h | 5 +++++
 file changed, 5 insertions(+)
 diff --git a/block/qcow2.h b/block/qcow2.h
 index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2.h
+--- a/tests/test-bdrv-drain.c
-+++ b/block/qcow2.h
++++ b/tests/test-bdrv-drain.c
-@@ -XXX,XX +XXX,XX @@ static inline int offset_to_l2_index(BDRVQcow2State *s, int64_t offset)
+@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_test = {
-     return (offset >> s->cluster_bits) & (s->l2_size - 1);
      .bdrv_co_drain_begin    = bdrv_test_co_drain_begin,
      .bdrv_co_drain_end      = bdrv_test_co_drain_end,
 +
 +    .bdrv_child_perm        = bdrv_format_default_perms,
  };
  static void aio_ret_cb(void *opaque, int ret)
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
      *aio_ret = ret;
  }
-+static inline int offset_to_l2_slice_index(BDRVQcow2State *s, int64_t offset)
+-static void test_drv_cb_drain_all(void)
 +enum drain_type {
 +    BDRV_DRAIN_ALL,
 +    BDRV_DRAIN,
 +};
 +
 +static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
 +{
-+    return (offset >> s->cluster_bits) & (s->l2_slice_size - 1);
++    switch (drain_type) {
 +    case BDRV_DRAIN_ALL:        bdrv_drain_all_begin(); break;
 +    case BDRV_DRAIN:            bdrv_drained_begin(bs); break;
 +    default:                    g_assert_not_reached();
 +    }
 +}
 +
- static inline int64_t align_offset(int64_t offset, int n)
++static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs)
 +{
 +    switch (drain_type) {
 +    case BDRV_DRAIN_ALL:        bdrv_drain_all_end(); break;
 +    case BDRV_DRAIN:            bdrv_drained_end(bs); break;
 +    default:                    g_assert_not_reached();
 +    }
 +}
 +
 +static void test_drv_cb_common(enum drain_type drain_type, bool recursive)
  {
-     offset = (offset + n - 1) & ~(n - 1);
+     BlockBackend *blk;
 -    BlockDriverState *bs;
 -    BDRVTestState *s;
 +    BlockDriverState *bs, *backing;
 +    BDRVTestState *s, *backing_s;
      BlockAIOCB *acb;
      int aio_ret;
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_all(void)
      s = bs->opaque;
      blk_insert_bs(blk, bs, &error_abort);
 +    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
 +    backing_s = backing->opaque;
 +    bdrv_set_backing_hd(bs, backing, &error_abort);
 +
      /* Simple bdrv_drain_all_begin/end pair, check that CBs are called */
      g_assert_cmpint(s->drain_count, ==, 0);
 -    bdrv_drain_all_begin();
 +    g_assert_cmpint(backing_s->drain_count, ==, 0);
 +
 +    do_drain_begin(drain_type, bs);
 +
      g_assert_cmpint(s->drain_count, ==, 1);
 -    bdrv_drain_all_end();
 +    g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
 +
 +    do_drain_end(drain_type, bs);
 +
      g_assert_cmpint(s->drain_count, ==, 0);
 +    g_assert_cmpint(backing_s->drain_count, ==, 0);
      /* Now do the same while a request is pending */
      aio_ret = -EINPROGRESS;
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_all(void)
      g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
      g_assert_cmpint(s->drain_count, ==, 0);
 -    bdrv_drain_all_begin();
 +    g_assert_cmpint(backing_s->drain_count, ==, 0);
 +
 +    do_drain_begin(drain_type, bs);
 +
      g_assert_cmpint(aio_ret, ==, 0);
      g_assert_cmpint(s->drain_count, ==, 1);
 -    bdrv_drain_all_end();
 +    g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
 +
 +    do_drain_end(drain_type, bs);
 +
      g_assert_cmpint(s->drain_count, ==, 0);
 +    g_assert_cmpint(backing_s->drain_count, ==, 0);
 +    bdrv_unref(backing);
      bdrv_unref(bs);
      blk_unref(blk);
  }
 +static void test_drv_cb_drain_all(void)
 +{
 +    test_drv_cb_common(BDRV_DRAIN_ALL, true);
 +}
 +
 +static void test_drv_cb_drain(void)
 +{
 +    test_drv_cb_common(BDRV_DRAIN, false);
 +}
 +
  int main(int argc, char **argv)
  {
      bdrv_init();
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
      g_test_init(&argc, &argv, NULL);
      g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
 +    g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
      return g_test_run();
  }
 --
 .13.6

-[Qemu-devel] [PULL 25/55] qcow2: Remove BDS parameter from qcow2_cache_clean_unused()
+[Qemu-devel] [PULL v3 21/35] test-bdrv-drain: Test bs->quiesce_counter
-From: Alberto Garcia <berto@igalia.com>
+This is currently only working correctly for bdrv_drain(), not for
 bdrv_drain_all(). Leave a comment for the drain_all case, we'll address
 it later.
-This function was only using the BlockDriverState parameter to pass it
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-to qcow2_cache_table_release(). This is no longer necessary so this
+---
-parameter can be removed.
+ tests/test-bdrv-drain.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 file changed, 45 insertions(+)
-Signed-off-by: Alberto Garcia <berto@igalia.com>
+diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Max Reitz <mreitz@redhat.com>
 Message-id: b74f17591af52f201de0ea3a3b2dd0a81932334d.1517840876.git.berto@igalia.com
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
  block/qcow2.h       | 2 +-
  block/qcow2-cache.c | 2 +-
  block/qcow2.c       | 4 ++--
 files changed, 4 insertions(+), 4 deletions(-)
 diff --git a/block/qcow2.h b/block/qcow2.h
 index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2.h
+--- a/tests/test-bdrv-drain.c
-+++ b/block/qcow2.h
++++ b/tests/test-bdrv-drain.c
-@@ -XXX,XX +XXX,XX @@ int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c,
+@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain(void)
-     Qcow2Cache *dependency);
+     test_drv_cb_common(BDRV_DRAIN, false);
  void qcow2_cache_depends_on_flush(Qcow2Cache *c);
 -void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c);
 +void qcow2_cache_clean_unused(Qcow2Cache *c);
  int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c);
  int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
 diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow2-cache.c
 +++ b/block/qcow2-cache.c
@@ -XXX,XX +XXX,XX @@ static inline bool can_clean_entry(Qcow2Cache *c, int i)
          t->lru_counter <= c->cache_clean_lru_counter;
  }
--void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c)
++static void test_quiesce_common(enum drain_type drain_type, bool recursive)
-+void qcow2_cache_clean_unused(Qcow2Cache *c)
++{
 +    BlockBackend *blk;
 +    BlockDriverState *bs, *backing;
 +
 +    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
 +    bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
 +                              &error_abort);
 +    blk_insert_bs(blk, bs, &error_abort);
 +
 +    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
 +    bdrv_set_backing_hd(bs, backing, &error_abort);
 +
 +    g_assert_cmpint(bs->quiesce_counter, ==, 0);
 +    g_assert_cmpint(backing->quiesce_counter, ==, 0);
 +
 +    do_drain_begin(drain_type, bs);
 +
 +    g_assert_cmpint(bs->quiesce_counter, ==, 1);
 +    g_assert_cmpint(backing->quiesce_counter, ==, !!recursive);
 +
 +    do_drain_end(drain_type, bs);
 +
 +    g_assert_cmpint(bs->quiesce_counter, ==, 0);
 +    g_assert_cmpint(backing->quiesce_counter, ==, 0);
 +
 +    bdrv_unref(backing);
 +    bdrv_unref(bs);
 +    blk_unref(blk);
 +}
 +
 +static void test_quiesce_drain_all(void)
 +{
 +    // XXX drain_all doesn't quiesce
 +    //test_quiesce_common(BDRV_DRAIN_ALL, true);
 +}
 +
 +static void test_quiesce_drain(void)
 +{
 +    test_quiesce_common(BDRV_DRAIN, false);
 +}
 +
  int main(int argc, char **argv)
  {
-     int i = 0;
+     bdrv_init();
-     while (i < c->size) {
+@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
-diff --git a/block/qcow2.c b/block/qcow2.c
+     g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
-index XXXXXXX..XXXXXXX 100644
+     g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
---- a/block/qcow2.c
-+++ b/block/qcow2.c
++    g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
-@@ -XXX,XX +XXX,XX @@ static void cache_clean_timer_cb(void *opaque)
++    g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
- {
++
-     BlockDriverState *bs = opaque;
+     return g_test_run();
      BDRVQcow2State *s = bs->opaque;
 -    qcow2_cache_clean_unused(bs, s->l2_table_cache);
 -    qcow2_cache_clean_unused(bs, s->refcount_block_cache);
 +    qcow2_cache_clean_unused(s->l2_table_cache);
 +    qcow2_cache_clean_unused(s->refcount_block_cache);
      timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
                (int64_t) s->cache_clean_interval * 1000);
  }
 --
 .13.6

-[Qemu-devel] [PULL 18/55] qcow2: Add table size field to Qcow2Cache
+[Qemu-devel] [PULL v3 22/35] blockjob: Pause job on draining any job BDS
-From: Alberto Garcia <berto@igalia.com>
+Block jobs already paused themselves when their main BlockBackend
 entered a drained section. This is not good enough: We also want to
 pause a block job and may not submit new requests if, for example, the
 mirror target node should be drained.
-The table size in the qcow2 cache is currently equal to the cluster
+This implements .drained_begin/end callbacks in child_job in order to
-size. This doesn't allow us to use the cache memory efficiently,
+consider all block nodes related to the job, and removes the
-particularly with large cluster sizes, so we need to be able to have
+BlockBackend callbacks which are unnecessary now because the root of the
-smaller cache tables that are independent from the cluster size. This
+job main BlockBackend is always referenced with a child_job, too.
 patch adds a new field to Qcow2Cache that we can use instead of the
 cluster size.
-The current table size is still being initialized to the cluster size,
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-so there are no semantic changes yet, but this patch will allow us to
+---
-prepare the rest of the code and simplify a few function calls.
+ blockjob.c | 22 +++++++++-------------
 file changed, 9 insertions(+), 13 deletions(-)
-Signed-off-by: Alberto Garcia <berto@igalia.com>
+diff --git a/blockjob.c b/blockjob.c
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Max Reitz <mreitz@redhat.com>
 Message-id: 67a1bf9e55f417005c567bead95a018dc34bc687.1517840876.git.berto@igalia.com
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
  block/qcow2-cache.c | 29 ++++++++++++++---------------
 file changed, 14 insertions(+), 15 deletions(-)
 diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cache.c
+--- a/blockjob.c
-+++ b/block/qcow2-cache.c
++++ b/blockjob.c
-@@ -XXX,XX +XXX,XX @@ struct Qcow2Cache {
+@@ -XXX,XX +XXX,XX @@ static char *child_job_get_parent_desc(BdrvChild *c)
-     Qcow2CachedTable       *entries;
+                            job->id);
-     struct Qcow2Cache      *depends;
+ }
-     int                     size;
-+    int                     table_size;
+-static const BdrvChildRole child_job = {
-     bool                    depends_on_flush;
+-    .get_parent_desc    = child_job_get_parent_desc,
-     void                   *table_array;
+-    .stay_at_node       = true,
-     uint64_t                lru_counter;
+-};
-@@ -XXX,XX +XXX,XX @@ struct Qcow2Cache {
+-
- static inline void *qcow2_cache_get_table_addr(BlockDriverState *bs,
+-static void block_job_drained_begin(void *opaque)
-                     Qcow2Cache *c, int table)
++static void child_job_drained_begin(BdrvChild *c)
  {
--    BDRVQcow2State *s = bs->opaque;
+-    BlockJob *job = opaque;
--    return (uint8_t *) c->table_array + (size_t) table * s->cluster_size;
++    BlockJob *job = c->opaque;
-+    return (uint8_t *) c->table_array + (size_t) table * c->table_size;
+     block_job_pause(job);
  }
- static inline int qcow2_cache_get_table_idx(BlockDriverState *bs,
+-static void block_job_drained_end(void *opaque)
-                   Qcow2Cache *c, void *table)
++static void child_job_drained_end(BdrvChild *c)
  {
--    BDRVQcow2State *s = bs->opaque;
+-    BlockJob *job = opaque;
-     ptrdiff_t table_offset = (uint8_t *) table - (uint8_t *) c->table_array;
++    BlockJob *job = c->opaque;
--    int idx = table_offset / s->cluster_size;
+     block_job_resume(job);
 -    assert(idx >= 0 && idx < c->size && table_offset % s->cluster_size == 0);
 +    int idx = table_offset / c->table_size;
 +    assert(idx >= 0 && idx < c->size && table_offset % c->table_size == 0);
      return idx;
  }
-@@ -XXX,XX +XXX,XX @@ static void qcow2_cache_table_release(BlockDriverState *bs, Qcow2Cache *c,
+-static const BlockDevOps block_job_dev_ops = {
- {
+-    .drained_begin = block_job_drained_begin,
- /* Using MADV_DONTNEED to discard memory is a Linux-specific feature */
+-    .drained_end = block_job_drained_end,
- #ifdef CONFIG_LINUX
++static const BdrvChildRole child_job = {
--    BDRVQcow2State *s = bs->opaque;
++    .get_parent_desc    = child_job_get_parent_desc,
-     void *t = qcow2_cache_get_table_addr(bs, c, i);
++    .drained_begin      = child_job_drained_begin,
-     int align = getpagesize();
++    .drained_end        = child_job_drained_end,
--    size_t mem_size = (size_t) s->cluster_size * num_tables;
++    .stay_at_node       = true,
-+    size_t mem_size = (size_t) c->table_size * num_tables;
+ };
-     size_t offset = QEMU_ALIGN_UP((uintptr_t) t, align) - (uintptr_t) t;
-     size_t length = QEMU_ALIGN_DOWN(mem_size - offset, align);
+ void block_job_remove_all_bdrv(BlockJob *job)
-     if (mem_size > offset && length > 0) {
+@@ -XXX,XX +XXX,XX @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
-@@ -XXX,XX +XXX,XX @@ Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
+     block_job_add_bdrv(job, "main node", bs, 0, BLK_PERM_ALL, &error_abort);
+     bs->job = job;
-     c = g_new0(Qcow2Cache, 1);
-     c->size = num_tables;
+-    blk_set_dev_ops(blk, &block_job_dev_ops, job);
-+    c->table_size = s->cluster_size;
+     bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);
-     c->entries = g_try_new0(Qcow2CachedTable, num_tables);
-     c->table_array = qemu_try_blockalign(bs->file->bs,
+     QLIST_INSERT_HEAD(&block_jobs, job, job_list);
 -                                         (size_t) num_tables * s->cluster_size);
 +                                         (size_t) num_tables * c->table_size);
      if (!c->entries || !c->table_array) {
          qemu_vfree(c->table_array);
@@ -XXX,XX +XXX,XX @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
      if (c == s->refcount_block_cache) {
          ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_BLOCK,
 -                c->entries[i].offset, s->cluster_size);
 +                c->entries[i].offset, c->table_size);
      } else if (c == s->l2_table_cache) {
          ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
 -                c->entries[i].offset, s->cluster_size);
 +                c->entries[i].offset, c->table_size);
      } else {
          ret = qcow2_pre_write_overlap_check(bs, 0,
 -                c->entries[i].offset, s->cluster_size);
 +                c->entries[i].offset, c->table_size);
      }
      if (ret < 0) {
@@ -XXX,XX +XXX,XX @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
      }
      ret = bdrv_pwrite(bs->file, c->entries[i].offset,
 -                      qcow2_cache_get_table_addr(bs, c, i), s->cluster_size);
 +                      qcow2_cache_get_table_addr(bs, c, i), c->table_size);
      if (ret < 0) {
          return ret;
      }
@@ -XXX,XX +XXX,XX @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
      trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache,
                            offset, read_from_disk);
 -    if (offset_into_cluster(s, offset)) {
 +    if (!QEMU_IS_ALIGNED(offset, c->table_size)) {
          qcow2_signal_corruption(bs, true, -1, -1, "Cannot get entry from %s "
                                  "cache: Offset %#" PRIx64 " is unaligned",
                                  qcow2_cache_get_name(s, c), offset);
@@ -XXX,XX +XXX,XX @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
      }
      /* Check if the table is already cached */
 -    i = lookup_index = (offset / s->cluster_size * 4) % c->size;
 +    i = lookup_index = (offset / c->table_size * 4) % c->size;
      do {
          const Qcow2CachedTable *t = &c->entries[i];
          if (t->offset == offset) {
@@ -XXX,XX +XXX,XX @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
          ret = bdrv_pread(bs->file, offset,
                           qcow2_cache_get_table_addr(bs, c, i),
 -                         s->cluster_size);
 +                         c->table_size);
          if (ret < 0) {
              return ret;
          }
 --
 .13.6

-[Qemu-devel] [PULL 11/55] gluster: Query current size in do_truncate()
+[Qemu-devel] [PULL v3 23/35] test-bdrv-drain: Test drain vs. block jobs
-From: Max Reitz <mreitz@redhat.com>
+Block jobs must be paused if any of the involved nodes are drained.
-Instead of expecting the current size to be 0, query it and allocate
-only the area [current_size, offset) if preallocation is requested.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- block/gluster.c | 21 +++++++++++++++++++--
+ tests/test-bdrv-drain.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++
-file changed, 19 insertions(+), 2 deletions(-)
+file changed, 121 insertions(+)
-diff --git a/block/gluster.c b/block/gluster.c
+diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/gluster.c
+--- a/tests/test-bdrv-drain.c
-+++ b/block/gluster.c
++++ b/tests/test-bdrv-drain.c
-@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs,
+@@ -XXX,XX +XXX,XX @@
- static int qemu_gluster_do_truncate(struct glfs_fd *fd, int64_t offset,
-                                     PreallocMode prealloc, Error **errp)
+ #include "qemu/osdep.h"
- {
+ #include "block/block.h"
-+    int64_t current_length;
++#include "block/blockjob_int.h"
  #include "sysemu/block-backend.h"
  #include "qapi/error.h"
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
      test_quiesce_common(BDRV_DRAIN, false);
  }
 +
-+    current_length = glfs_lseek(fd, 0, SEEK_END);
++typedef struct TestBlockJob {
-+    if (current_length < 0) {
++    BlockJob common;
-+        error_setg_errno(errp, errno, "Failed to determine current size");
++    bool should_complete;
-+        return -errno;
++} TestBlockJob;
 +
 +static void test_job_completed(BlockJob *job, void *opaque)
 +{
 +    block_job_completed(job, 0);
 +}
 +
 +static void coroutine_fn test_job_start(void *opaque)
 +{
 +    TestBlockJob *s = opaque;
 +
 +    while (!s->should_complete) {
 +        block_job_sleep_ns(&s->common, 100000);
 +    }
 +
-+    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
++    block_job_defer_to_main_loop(&s->common, test_job_completed, NULL);
-+        error_setg(errp, "Cannot use preallocation for shrinking files");
++}
-+        return -ENOTSUP;
++
 +static void test_job_complete(BlockJob *job, Error **errp)
 +{
 +    TestBlockJob *s = container_of(job, TestBlockJob, common);
 +    s->should_complete = true;
 +}
 +
 +BlockJobDriver test_job_driver = {
 +    .instance_size  = sizeof(TestBlockJob),
 +    .start          = test_job_start,
 +    .complete       = test_job_complete,
 +};
 +
 +static void test_blockjob_common(enum drain_type drain_type)
 +{
 +    BlockBackend *blk_src, *blk_target;
 +    BlockDriverState *src, *target;
 +    BlockJob *job;
 +    int ret;
 +
 +    src = bdrv_new_open_driver(&bdrv_test, "source", BDRV_O_RDWR,
 +                               &error_abort);
 +    blk_src = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
 +    blk_insert_bs(blk_src, src, &error_abort);
 +
 +    target = bdrv_new_open_driver(&bdrv_test, "target", BDRV_O_RDWR,
 +                                  &error_abort);
 +    blk_target = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
 +    blk_insert_bs(blk_target, target, &error_abort);
 +
 +    job = block_job_create("job0", &test_job_driver, src, 0, BLK_PERM_ALL, 0,
 +                           0, NULL, NULL, &error_abort);
 +    block_job_add_bdrv(job, "target", target, 0, BLK_PERM_ALL, &error_abort);
 +    block_job_start(job);
 +
 +    g_assert_cmpint(job->pause_count, ==, 0);
 +    g_assert_false(job->paused);
 +    g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
 +
 +    do_drain_begin(drain_type, src);
 +
 +    if (drain_type == BDRV_DRAIN_ALL) {
 +        /* bdrv_drain_all() drains both src and target, and involves an
 +         * additional block_job_pause_all() */
 +        g_assert_cmpint(job->pause_count, ==, 3);
 +    } else {
 +        g_assert_cmpint(job->pause_count, ==, 1);
 +    }
++    /* XXX We don't wait until the job is actually paused. Is this okay? */
++    /* g_assert_true(job->paused); */
++    g_assert_false(job->busy); /* The job is paused */
 +
-+    if (current_length == offset) {
++    do_drain_end(drain_type, src);
-+        return 0;
++
 +    g_assert_cmpint(job->pause_count, ==, 0);
 +    g_assert_false(job->paused);
 +    g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
 +
 +    do_drain_begin(drain_type, target);
 +
 +    if (drain_type == BDRV_DRAIN_ALL) {
 +        /* bdrv_drain_all() drains both src and target, and involves an
 +         * additional block_job_pause_all() */
 +        g_assert_cmpint(job->pause_count, ==, 3);
 +    } else {
 +        g_assert_cmpint(job->pause_count, ==, 1);
 +    }
++    /* XXX We don't wait until the job is actually paused. Is this okay? */
++    /* g_assert_true(job->paused); */
++    g_assert_false(job->busy); /* The job is paused */
 +
-     switch (prealloc) {
++    do_drain_end(drain_type, target);
- #ifdef CONFIG_GLUSTERFS_FALLOCATE
++
-     case PREALLOC_MODE_FALLOC:
++    g_assert_cmpint(job->pause_count, ==, 0);
--        if (glfs_fallocate(fd, 0, 0, offset)) {
++    g_assert_false(job->paused);
-+        if (glfs_fallocate(fd, 0, current_length, offset - current_length)) {
++    g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
-             error_setg_errno(errp, errno, "Could not preallocate data");
++
-             return -errno;
++    ret = block_job_complete_sync(job, &error_abort);
-         }
++    g_assert_cmpint(ret, ==, 0);
-@@ -XXX,XX +XXX,XX @@ static int qemu_gluster_do_truncate(struct glfs_fd *fd, int64_t offset,
++
-             error_setg_errno(errp, errno, "Could not resize file");
++    blk_unref(blk_src);
-             return -errno;
++    blk_unref(blk_target);
-         }
++    bdrv_unref(src);
--        if (glfs_zerofill(fd, 0, offset)) {
++    bdrv_unref(target);
-+        if (glfs_zerofill(fd, current_length, offset - current_length)) {
++}
-             error_setg_errno(errp, errno, "Could not zerofill the new area");
++
-             return -errno;
++static void test_blockjob_drain_all(void)
-         }
++{
 +    test_blockjob_common(BDRV_DRAIN_ALL);
 +}
 +
 +static void test_blockjob_drain(void)
 +{
 +    test_blockjob_common(BDRV_DRAIN);
 +}
 +
  int main(int argc, char **argv)
  {
      bdrv_init();
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
      g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
      g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
 +    g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
 +    g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
 +
      return g_test_run();
  }
 --
 .13.6

-[Qemu-devel] [PULL 14/55] sheepdog: Pass old and new size to sd_prealloc()
+[Qemu-devel] [PULL v3 24/35] block: Don't block_job_pause_all() in bdrv_drain_all()
-From: Max Reitz <mreitz@redhat.com>
+Block jobs are already paused using the BdrvChildRole drain callbacks,
 so we don't need an additional block_job_pause_all() call.
-sd_prealloc() will now preallocate the area [old_size, new_size).  As
-before, it rounds to buf_size and may thus overshoot and preallocate
-areas that were not requested to be preallocated.  For image creation,
-this is no change in behavior.  For truncation, this is in accordance
-with the documentation for preallocated truncation.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- block/sheepdog.c | 16 +++++-----------
+ block/io.c              |  4 ----
-file changed, 5 insertions(+), 11 deletions(-)
+ tests/test-bdrv-drain.c | 10 ++++------
 files changed, 4 insertions(+), 10 deletions(-)
-diff --git a/block/sheepdog.c b/block/sheepdog.c
+diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/sheepdog.c
+--- a/block/io.c
-+++ b/block/sheepdog.c
++++ b/block/io.c
-@@ -XXX,XX +XXX,XX @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
+@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
-     return 0;
+      * context. */
      assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 -    block_job_pause_all();
 -
      for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
          AioContext *aio_context = bdrv_get_aio_context(bs);
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
          aio_enable_external(aio_context);
          aio_context_release(aio_context);
      }
 -
 -    block_job_resume_all();
  }
--static int sd_prealloc(BlockDriverState *bs, Error **errp)
+ void bdrv_drain_all(void)
-+static int sd_prealloc(BlockDriverState *bs, int64_t old_size, int64_t new_size,
+diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
-+                       Error **errp)
+index XXXXXXX..XXXXXXX 100644
- {
+--- a/tests/test-bdrv-drain.c
-     BlockBackend *blk = NULL;
++++ b/tests/test-bdrv-drain.c
-     BDRVSheepdogState *base = bs->opaque;
+@@ -XXX,XX +XXX,XX @@ static void test_blockjob_common(enum drain_type drain_type)
-     unsigned long buf_size;
+     do_drain_begin(drain_type, src);
-     uint32_t idx, max_idx;
-     uint32_t object_size;
+     if (drain_type == BDRV_DRAIN_ALL) {
--    int64_t vdi_size;
+-        /* bdrv_drain_all() drains both src and target, and involves an
-     void *buf = NULL;
+-         * additional block_job_pause_all() */
-     int ret;
+-        g_assert_cmpint(job->pause_count, ==, 3);
++        /* bdrv_drain_all() drains both src and target */
-@@ -XXX,XX +XXX,XX @@ static int sd_prealloc(BlockDriverState *bs, Error **errp)
++        g_assert_cmpint(job->pause_count, ==, 2);
+     } else {
-     blk_set_allow_write_beyond_eof(blk, true);
+         g_assert_cmpint(job->pause_count, ==, 1);
+     }
--    vdi_size = blk_getlength(blk);
+@@ -XXX,XX +XXX,XX @@ static void test_blockjob_common(enum drain_type drain_type)
--    if (vdi_size < 0) {
+     do_drain_begin(drain_type, target);
--        ret = vdi_size;
--        goto out;
+     if (drain_type == BDRV_DRAIN_ALL) {
--    }
+-        /* bdrv_drain_all() drains both src and target, and involves an
--
+-         * additional block_job_pause_all() */
-     object_size = (UINT32_C(1) << base->inode.block_size_shift);
+-        g_assert_cmpint(job->pause_count, ==, 3);
-     buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
++        /* bdrv_drain_all() drains both src and target */
-     buf = g_malloc0(buf_size);
++        g_assert_cmpint(job->pause_count, ==, 2);
+     } else {
--    max_idx = DIV_ROUND_UP(vdi_size, buf_size);
+         g_assert_cmpint(job->pause_count, ==, 1);
 +    max_idx = DIV_ROUND_UP(new_size, buf_size);
 -    for (idx = 0; idx < max_idx; idx++) {
 +    for (idx = old_size / buf_size; idx < max_idx; idx++) {
          /*
           * The created image can be a cloned image, so we need to read
           * a data from the source image.
@@ -XXX,XX +XXX,XX @@ static int sd_create(const char *filename, QemuOpts *opts,
              goto out;
          }
 -        ret = sd_prealloc(bs, errp);
 +        ret = sd_prealloc(bs, 0, s->inode.vdi_size, errp);
          bdrv_unref(bs);
      }
 --
 .13.6

-[Qemu-devel] [PULL 49/55] qcow2: Rename l2_table in count_contiguous_clusters()
+[Qemu-devel] [PULL v3 25/35] block: Nested drain_end must still call callbacks
-From: Alberto Garcia <berto@igalia.com>
+bdrv_do_drained_begin() restricts the call of parent callbacks and
 aio_disable_external() to the outermost drain section, but the block
 driver callbacks are always called. bdrv_do_drained_end() must match
 this behaviour, otherwise nodes stay drained even if begin/end calls
 were balanced.
-This function doesn't need any changes to support L2 slices, but since
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-it's now dealing with slices instead of full tables, the l2_table
+---
-variable is renamed for clarity.
+ block/io.c | 12 +++++++-----
 file changed, 7 insertions(+), 5 deletions(-)
-Signed-off-by: Alberto Garcia <berto@igalia.com>
+diff --git a/block/io.c b/block/io.c
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Max Reitz <mreitz@redhat.com>
 Message-id: 812b0c3505bb1687e51285dccf1a94f0cecb1f74.1517840877.git.berto@igalia.com
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
  block/qcow2-cluster.c | 8 ++++----
 file changed, 4 insertions(+), 4 deletions(-)
 diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
+--- a/block/io.c
-+++ b/block/qcow2-cluster.c
++++ b/block/io.c
-@@ -XXX,XX +XXX,XX @@ fail:
+@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
  void bdrv_drained_end(BlockDriverState *bs)
  {
 +    int old_quiesce_counter;
 +
      if (qemu_in_coroutine()) {
          bdrv_co_yield_to_drain(bs, false);
          return;
      }
      assert(bs->quiesce_counter > 0);
 -    if (atomic_fetch_dec(&bs->quiesce_counter) > 1) {
 -        return;
 -    }
 +    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
      /* Re-enable things in child-to-parent order */
      bdrv_drain_invoke(bs, false, false);
 -    bdrv_parent_drained_end(bs);
 -    aio_enable_external(bdrv_get_aio_context(bs));
 +    if (old_quiesce_counter == 1) {
 +        bdrv_parent_drained_end(bs);
 +        aio_enable_external(bdrv_get_aio_context(bs));
 +    }
  }
  /*
-- * Checks how many clusters in a given L2 table are contiguous in the image
-+ * Checks how many clusters in a given L2 slice are contiguous in the image
-  * file. As soon as one of the flags in the bitmask stop_flags changes compared
-  * to the first cluster, the search is stopped and the cluster is not counted
-  * as contiguous. (This allows it, for example, to stop at the first compressed
-  * cluster which may require a different handling)
-  */
- static int count_contiguous_clusters(int nb_clusters, int cluster_size,
--        uint64_t *l2_table, uint64_t stop_flags)
-+        uint64_t *l2_slice, uint64_t stop_flags)
- {
-     int i;
-     QCow2ClusterType first_cluster_type;
-     uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED;
--    uint64_t first_entry = be64_to_cpu(l2_table[0]);
-+    uint64_t first_entry = be64_to_cpu(l2_slice[0]);
-     uint64_t offset = first_entry & mask;
-     if (!offset) {
-@@ -XXX,XX +XXX,XX @@ static int count_contiguous_clusters(int nb_clusters, int cluster_size,
-            first_cluster_type == QCOW2_CLUSTER_ZERO_ALLOC);
-     for (i = 0; i < nb_clusters; i++) {
--        uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
-+        uint64_t l2_entry = be64_to_cpu(l2_slice[i]) & mask;
-         if (offset + (uint64_t) i * cluster_size != l2_entry) {
-             break;
-         }
 --
 .13.6

-[Qemu-devel] [PULL 08/55] qemu-io: fix EOF Ctrl-D handling in qemu-io readline code
+[Qemu-devel] [PULL v3 26/35] test-bdrv-drain: Test nested drain sections
-From: "Daniel P. Berrange" <berrange@redhat.com>
-qemu-io puts the TTY into non-canonical mode, which means no EOF processing is
-done and thus getchar() will never return the EOF constant. Instead we have to
-query the TTY attributes to determine the configured EOF character (usually
-Ctrl-D / 0x4), and then explicitly check for that value. This fixes the
-regression that prevented Ctrl-D from triggering an exit of qemu-io that has
-existed since readline was first added in
-  commit 0cf17e181798063c3824c8200ba46f25f54faa1a
-  Author: Stefan Hajnoczi <stefanha@redhat.com>
-  Date:   Thu Nov 14 11:54:17 2013 +0100
-    qemu-io: use readline.c
-It also ensures that a newline is printed when exiting, to complete the
-line output by the "qemu-io> " prompt.
-Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- qemu-io.c | 27 ++++++++++++++++++++++++++-
+ tests/test-bdrv-drain.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++
-file changed, 26 insertions(+), 1 deletion(-)
+file changed, 57 insertions(+)
-diff --git a/qemu-io.c b/qemu-io.c
+diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
 index XXXXXXX..XXXXXXX 100644
---- a/qemu-io.c
+--- a/tests/test-bdrv-drain.c
-+++ b/qemu-io.c
++++ b/tests/test-bdrv-drain.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
- #include "qemu/osdep.h"
+ enum drain_type {
- #include <getopt.h>
+     BDRV_DRAIN_ALL,
- #include <libgen.h>
+     BDRV_DRAIN,
-+#ifndef _WIN32
++    DRAIN_TYPE_MAX,
-+#include <termios.h>
+ };
-+#endif
+ static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
- #include "qapi/error.h"
+@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
- #include "qemu-io.h"
+     test_quiesce_common(BDRV_DRAIN, false);
-@@ -XXX,XX +XXX,XX @@ static bool imageOpts;
+ }
- static ReadLineState *readline_state;
++static void test_nested(void)
++{
-+static int ttyEOF;
++    BlockBackend *blk;
 +    BlockDriverState *bs, *backing;
 +    BDRVTestState *s, *backing_s;
 +    enum drain_type outer, inner;
 +
-+static int get_eof_char(void)
++    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
-+{
++    bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
-+#ifdef _WIN32
++                              &error_abort);
-+    return 0x4; /* Ctrl-D */
++    s = bs->opaque;
-+#else
++    blk_insert_bs(blk, bs, &error_abort);
-+    struct termios tty;
++
-+    if (tcgetattr(STDIN_FILENO, &tty) != 0) {
++    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
-+        if (errno == ENOTTY) {
++    backing_s = backing->opaque;
-+            return 0x0; /* just expect read() == 0 */
++    bdrv_set_backing_hd(bs, backing, &error_abort);
-+        } else {
++
-+            return 0x4; /* Ctrl-D */
++    for (outer = 0; outer < DRAIN_TYPE_MAX; outer++) {
 +        for (inner = 0; inner < DRAIN_TYPE_MAX; inner++) {
 +            /* XXX bdrv_drain_all() doesn't increase the quiesce_counter */
 +            int bs_quiesce      = (outer != BDRV_DRAIN_ALL) +
 +                                  (inner != BDRV_DRAIN_ALL);
 +            int backing_quiesce = 0;
 +            int backing_cb_cnt  = (outer != BDRV_DRAIN) +
 +                                  (inner != BDRV_DRAIN);
 +
 +            g_assert_cmpint(bs->quiesce_counter, ==, 0);
 +            g_assert_cmpint(backing->quiesce_counter, ==, 0);
 +            g_assert_cmpint(s->drain_count, ==, 0);
 +            g_assert_cmpint(backing_s->drain_count, ==, 0);
 +
 +            do_drain_begin(outer, bs);
 +            do_drain_begin(inner, bs);
 +
 +            g_assert_cmpint(bs->quiesce_counter, ==, bs_quiesce);
 +            g_assert_cmpint(backing->quiesce_counter, ==, backing_quiesce);
 +            g_assert_cmpint(s->drain_count, ==, 2);
 +            g_assert_cmpint(backing_s->drain_count, ==, backing_cb_cnt);
 +
 +            do_drain_end(inner, bs);
 +            do_drain_end(outer, bs);
 +
 +            g_assert_cmpint(bs->quiesce_counter, ==, 0);
 +            g_assert_cmpint(backing->quiesce_counter, ==, 0);
 +            g_assert_cmpint(s->drain_count, ==, 0);
 +            g_assert_cmpint(backing_s->drain_count, ==, 0);
 +        }
 +    }
 +
-+    return tty.c_cc[VEOF];
++    bdrv_unref(backing);
-+#endif
++    bdrv_unref(bs);
 +    blk_unref(blk);
 +}
 +
- static int close_f(BlockBackend *blk, int argc, char **argv)
- {
+ typedef struct TestBlockJob {
-     blk_unref(qemuio_blk);
+     BlockJob common;
@@ -XXX,XX +XXX,XX @@ static char *fetchline_readline(void)
      readline_start(readline_state, get_prompt(), 0, readline_func, &line);
      while (!line) {
          int ch = getchar();
 -        if (ch == EOF) {
 +        if (ttyEOF != 0x0 && ch == ttyEOF) {
 +            printf("\n");
              break;
          }
          readline_handle_byte(readline_state, ch);
 @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
-     qemuio_add_command(&close_cmd);
+     g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
+     g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
-     if (isatty(STDIN_FILENO)) {
-+        ttyEOF = get_eof_char();
++    g_test_add_func("/bdrv-drain/nested", test_nested);
-         readline_state = readline_init(readline_printf_func,
++
-                                        readline_flush_func,
+     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
-                                        NULL,
+     g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
 --
 .13.6

-[Qemu-devel] [PULL 04/55] docs: Document share-rw property more thoroughly
+[Qemu-devel] [PULL v3 27/35] block: Don't notify parents in drain call chain
-From: Fam Zheng <famz@redhat.com>
+This is in preparation for subtree drains, i.e. drained sections that
+affect not only a single node, but recursively all child nodes, too.
-Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
-Signed-off-by: Fam Zheng <famz@redhat.com>
+Calling the parent callbacks for drain is pointless when we just came
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+from that parent node recursively and leads to multiple increases of
-Reviewed-by: Kashyap Chamarthy <kchamart@redhat.com>
+bs->quiesce_counter in a single drain call. Don't do it.
 In order for this to work correctly, the parent callback must be called
 for every bdrv_drained_begin/end() call, not only for the outermost one:
 If we have a node N with two parents A and B, recursive draining of A
 should cause the quiesce_counter of B to increase because its child N is
 drained independently of B. If now B is recursively drained, too, A must
 increase its quiesce_counter because N is drained independently of A
 only now, even if N is going from quiesce_counter 1 to 2.
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- docs/qemu-block-drivers.texi | 10 ++++++++++
+ include/block/block.h |  4 ++--
-file changed, 10 insertions(+)
+ block.c               | 13 +++++++++----
+ block/io.c            | 47 ++++++++++++++++++++++++++++++++++-------------
-diff --git a/docs/qemu-block-drivers.texi b/docs/qemu-block-drivers.texi
+files changed, 45 insertions(+), 19 deletions(-)
 diff --git a/include/block/block.h b/include/block/block.h
 index XXXXXXX..XXXXXXX 100644
---- a/docs/qemu-block-drivers.texi
+--- a/include/block/block.h
-+++ b/docs/qemu-block-drivers.texi
++++ b/include/block/block.h
-@@ -XXX,XX +XXX,XX @@ QEMU transparently handles lock handover during shared storage migration.  For
+@@ -XXX,XX +XXX,XX @@ void bdrv_io_unplug(BlockDriverState *bs);
- shared virtual disk images between multiple VMs, the "share-rw" device option
+  * Begin a quiesced section of all users of @bs. This is part of
- should be used.
+  * bdrv_drained_begin.
+  */
-+By default, the guest has exclusive write access to its disk image. If the
+-void bdrv_parent_drained_begin(BlockDriverState *bs);
-+guest can safely share the disk image with other writers the @code{-device
++void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore);
-+...,share-rw=on} parameter can be used.  This is only safe if the guest is
-+running software, such as a cluster file system, that coordinates disk accesses
+ /**
-+to avoid corruption.
+  * bdrv_parent_drained_end:
@@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_begin(BlockDriverState *bs);
   * End a quiesced section of all users of @bs. This is part of
   * bdrv_drained_end.
   */
 -void bdrv_parent_drained_end(BlockDriverState *bs);
 +void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);
  /**
   * bdrv_drained_begin:
 diff --git a/block.c b/block.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block.c
 +++ b/block.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
                                        BlockDriverState *new_bs)
  {
      BlockDriverState *old_bs = child->bs;
 +    int i;
      if (old_bs && new_bs) {
          assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
      }
      if (old_bs) {
          if (old_bs->quiesce_counter && child->role->drained_end) {
 -            child->role->drained_end(child);
 +            for (i = 0; i < old_bs->quiesce_counter; i++) {
 +                child->role->drained_end(child);
 +            }
          }
          if (child->role->detach) {
              child->role->detach(child);
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
      if (new_bs) {
          QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
          if (new_bs->quiesce_counter && child->role->drained_begin) {
 -            child->role->drained_begin(child);
 +            for (i = 0; i < new_bs->quiesce_counter; i++) {
 +                child->role->drained_begin(child);
 +            }
          }
          if (child->role->attach) {
@@ -XXX,XX +XXX,XX @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
      AioContext *ctx = bdrv_get_aio_context(bs);
      aio_disable_external(ctx);
 -    bdrv_parent_drained_begin(bs);
 +    bdrv_parent_drained_begin(bs, NULL);
      bdrv_drain(bs); /* ensure there are no in-flight requests */
      while (aio_poll(ctx, false)) {
@@ -XXX,XX +XXX,XX @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
       */
      aio_context_acquire(new_context);
      bdrv_attach_aio_context(bs, new_context);
 -    bdrv_parent_drained_end(bs);
 +    bdrv_parent_drained_end(bs, NULL);
      aio_enable_external(ctx);
      aio_context_release(new_context);
  }
 diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/io.c
 +++ b/block/io.c
@@ -XXX,XX +XXX,XX @@
  static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
      int64_t offset, int bytes, BdrvRequestFlags flags);
 -void bdrv_parent_drained_begin(BlockDriverState *bs)
 +void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
  {
      BdrvChild *c, *next;
      QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
 +        if (c == ignore) {
 +            continue;
 +        }
          if (c->role->drained_begin) {
              c->role->drained_begin(c);
          }
      }
  }
 -void bdrv_parent_drained_end(BlockDriverState *bs)
 +void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
  {
      BdrvChild *c, *next;
      QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
 +        if (c == ignore) {
 +            continue;
 +        }
          if (c->role->drained_end) {
              c->role->drained_end(c);
          }
@@ -XXX,XX +XXX,XX @@ typedef struct {
      BlockDriverState *bs;
      bool done;
      bool begin;
 +    BdrvChild *parent;
  } BdrvCoDrainData;
  static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs)
      return waited;
  }
 +static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent);
 +static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent);
 +
-+Note that share-rw=on only declares the guest's ability to share the disk.
+ static void bdrv_co_drain_bh_cb(void *opaque)
-+Some QEMU features, such as image file formats, require exclusive write access
+ {
-+to the disk image and this is unaffected by the share-rw=on option.
+     BdrvCoDrainData *data = opaque;
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
      bdrv_dec_in_flight(bs);
      if (data->begin) {
 -        bdrv_drained_begin(bs);
 +        bdrv_do_drained_begin(bs, data->parent);
      } else {
 -        bdrv_drained_end(bs);
 +        bdrv_do_drained_end(bs, data->parent);
      }
      data->done = true;
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
  }
  static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
 -                                                bool begin)
 +                                                bool begin, BdrvChild *parent)
  {
      BdrvCoDrainData data;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
          .bs = bs,
          .done = false,
          .begin = begin,
 +        .parent = parent,
      };
      bdrv_inc_in_flight(bs);
      aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
      assert(data.done);
  }
 -void bdrv_drained_begin(BlockDriverState *bs)
 +static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent)
  {
      if (qemu_in_coroutine()) {
 -        bdrv_co_yield_to_drain(bs, true);
 +        bdrv_co_yield_to_drain(bs, true, parent);
          return;
      }
      /* Stop things in parent-to-child order */
      if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
          aio_disable_external(bdrv_get_aio_context(bs));
 -        bdrv_parent_drained_begin(bs);
      }
 +    bdrv_parent_drained_begin(bs, parent);
      bdrv_drain_invoke(bs, true, false);
      bdrv_drain_recurse(bs);
  }
 -void bdrv_drained_end(BlockDriverState *bs)
 +void bdrv_drained_begin(BlockDriverState *bs)
 +{
 +    bdrv_do_drained_begin(bs, NULL);
 +}
 +
- Alternatively, locking can be fully disabled by "locking=off" block device
++static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
- option. In the command line, the option is usually in the form of
+ {
- "file.locking=off" as the protocol driver is normally placed as a "file" child
+     int old_quiesce_counter;
      if (qemu_in_coroutine()) {
 -        bdrv_co_yield_to_drain(bs, false);
 +        bdrv_co_yield_to_drain(bs, false, parent);
          return;
      }
      assert(bs->quiesce_counter > 0);
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
      /* Re-enable things in child-to-parent order */
      bdrv_drain_invoke(bs, false, false);
 +    bdrv_parent_drained_end(bs, parent);
      if (old_quiesce_counter == 1) {
 -        bdrv_parent_drained_end(bs);
          aio_enable_external(bdrv_get_aio_context(bs));
      }
  }
 +void bdrv_drained_end(BlockDriverState *bs)
 +{
 +    bdrv_do_drained_end(bs, NULL);
 +}
 +
  /*
   * Wait for pending requests to complete on a single BlockDriverState subtree,
   * and suspend block driver's internal I/O until next request arrives.
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
          /* Stop things in parent-to-child order */
          aio_context_acquire(aio_context);
          aio_disable_external(aio_context);
 -        bdrv_parent_drained_begin(bs);
 +        bdrv_parent_drained_begin(bs, NULL);
          bdrv_drain_invoke(bs, true, true);
          aio_context_release(aio_context);
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
          /* Re-enable things in child-to-parent order */
          aio_context_acquire(aio_context);
          bdrv_drain_invoke(bs, false, true);
 -        bdrv_parent_drained_end(bs);
 +        bdrv_parent_drained_end(bs, NULL);
          aio_enable_external(aio_context);
          aio_context_release(aio_context);
      }
 --
 .13.6

-[Qemu-devel] [PULL 15/55] sheepdog: Allow fully preallocated truncation
+[Qemu-devel] [PULL v3 28/35] block: Add bdrv_subtree_drained_begin/end()
-From: Max Reitz <mreitz@redhat.com>
+bdrv_drained_begin() waits for the completion of requests in the whole
 subtree, but it only actually keeps its immediate bs parameter quiesced
 until bdrv_drained_end().
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Add a version that keeps the whole subtree drained. As of this commit,
-Reviewed-by: Eric Blake <eblake@redhat.com>
+graph changes cannot be allowed during a subtree drained section, but
 this will be fixed soon.
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- block/sheepdog.c | 15 ++++++++++++---
+ include/block/block.h | 13 +++++++++++++
-file changed, 12 insertions(+), 3 deletions(-)
+ block/io.c            | 54 ++++++++++++++++++++++++++++++++++++++++-----------
 files changed, 56 insertions(+), 11 deletions(-)
-diff --git a/block/sheepdog.c b/block/sheepdog.c
+diff --git a/include/block/block.h b/include/block/block.h
 index XXXXXXX..XXXXXXX 100644
---- a/block/sheepdog.c
+--- a/include/block/block.h
-+++ b/block/sheepdog.c
++++ b/include/block/block.h
-@@ -XXX,XX +XXX,XX @@ static int sd_truncate(BlockDriverState *bs, int64_t offset,
+@@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);
-     int ret, fd;
+ void bdrv_drained_begin(BlockDriverState *bs);
-     unsigned int datalen;
-     uint64_t max_vdi_size;
+ /**
-+    int64_t old_size = s->inode.vdi_size;
++ * Like bdrv_drained_begin, but recursively begins a quiesced section for
++ * exclusive access to all child nodes as well.
--    if (prealloc != PREALLOC_MODE_OFF) {
++ *
-+    if (prealloc != PREALLOC_MODE_OFF && prealloc != PREALLOC_MODE_FULL) {
++ * Graph changes are not allowed during a subtree drain section.
-         error_setg(errp, "Unsupported preallocation mode '%s'",
++ */
-                    PreallocMode_str(prealloc));
++void bdrv_subtree_drained_begin(BlockDriverState *bs);
-         return -ENOTSUP;
++
 +/**
   * bdrv_drained_end:
   *
   * End a quiescent section started by bdrv_drained_begin().
   */
  void bdrv_drained_end(BlockDriverState *bs);
 +/**
 + * End a quiescent section started by bdrv_subtree_drained_begin().
 + */
 +void bdrv_subtree_drained_end(BlockDriverState *bs);
 +
  void bdrv_add_child(BlockDriverState *parent, BlockDriverState *child,
                      Error **errp);
  void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp);
 diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/io.c
 +++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ typedef struct {
      BlockDriverState *bs;
      bool done;
      bool begin;
 +    bool recursive;
      BdrvChild *parent;
  } BdrvCoDrainData;
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs)
      return waited;
  }
 -static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent);
 -static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent);
 +static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
 +                                  BdrvChild *parent);
 +static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
 +                                BdrvChild *parent);
  static void bdrv_co_drain_bh_cb(void *opaque)
  {
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
      bdrv_dec_in_flight(bs);
      if (data->begin) {
 -        bdrv_do_drained_begin(bs, data->parent);
 +        bdrv_do_drained_begin(bs, data->recursive, data->parent);
      } else {
 -        bdrv_do_drained_end(bs, data->parent);
 +        bdrv_do_drained_end(bs, data->recursive, data->parent);
      }
-     max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
+     data->done = true;
--    if (offset < s->inode.vdi_size) {
+@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
-+    if (offset < old_size) {
+ }
-         error_setg(errp, "shrinking is not supported");
-         return -EINVAL;
+ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
-     } else if (offset > max_vdi_size) {
+-                                                bool begin, BdrvChild *parent)
-@@ -XXX,XX +XXX,XX @@ static int sd_truncate(BlockDriverState *bs, int64_t offset,
++                                                bool begin, bool recursive,
++                                                BdrvChild *parent)
-     if (ret < 0) {
+ {
-         error_setg_errno(errp, -ret, "failed to update an inode");
+     BdrvCoDrainData data;
-+        return ret;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
          .bs = bs,
          .done = false,
          .begin = begin,
 +        .recursive = recursive,
          .parent = parent,
      };
      bdrv_inc_in_flight(bs);
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
      assert(data.done);
  }
 -static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent)
 +static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
 +                                  BdrvChild *parent)
  {
 +    BdrvChild *child, *next;
 +
      if (qemu_in_coroutine()) {
 -        bdrv_co_yield_to_drain(bs, true, parent);
 +        bdrv_co_yield_to_drain(bs, true, recursive, parent);
          return;
      }
--    return ret;
+@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent)
-+    if (prealloc == PREALLOC_MODE_FULL) {
+     bdrv_parent_drained_begin(bs, parent);
-+        ret = sd_prealloc(bs, old_size, offset, errp);
+     bdrv_drain_invoke(bs, true, false);
-+        if (ret < 0) {
+     bdrv_drain_recurse(bs);
-+            return ret;
++
 +    if (recursive) {
 +        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
 +            bdrv_do_drained_begin(child->bs, true, child);
 +        }
 +    }
+ }
+ void bdrv_drained_begin(BlockDriverState *bs)
+ {
+-    bdrv_do_drained_begin(bs, NULL);
++    bdrv_do_drained_begin(bs, false, NULL);
++}
 +
-+    return 0;
++void bdrv_subtree_drained_begin(BlockDriverState *bs)
 +{
 +    bdrv_do_drained_begin(bs, true, NULL);
  }
+-static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
++static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
++                                BdrvChild *parent)
+ {
++    BdrvChild *child, *next;
+     int old_quiesce_counter;
+     if (qemu_in_coroutine()) {
+-        bdrv_co_yield_to_drain(bs, false, parent);
++        bdrv_co_yield_to_drain(bs, false, recursive, parent);
+         return;
+     }
+     assert(bs->quiesce_counter > 0);
+@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
+     if (old_quiesce_counter == 1) {
+         aio_enable_external(bdrv_get_aio_context(bs));
+     }
++
++    if (recursive) {
++        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
++            bdrv_do_drained_end(child->bs, true, child);
++        }
++    }
+ }
+ void bdrv_drained_end(BlockDriverState *bs)
+ {
+-    bdrv_do_drained_end(bs, NULL);
++    bdrv_do_drained_end(bs, false, NULL);
++}
++
++void bdrv_subtree_drained_end(BlockDriverState *bs)
++{
++    bdrv_do_drained_end(bs, true, NULL);
+ }
  /*
 --
 .13.6

-[Qemu-devel] [PULL 12/55] gluster: Add preallocated truncation
+[Qemu-devel] [PULL v3 29/35] test-bdrv-drain: Tests for bdrv_subtree_drain
-From: Max Reitz <mreitz@redhat.com>
+Add a subtree drain version to the existing test cases.
-By using qemu_do_cluster_truncate() in qemu_cluster_truncate(), we now
-automatically have preallocated truncation.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- block/gluster.c | 17 +----------------
+ tests/test-bdrv-drain.c | 27 ++++++++++++++++++++++++++-
-file changed, 1 insertion(+), 16 deletions(-)
+file changed, 26 insertions(+), 1 deletion(-)
-diff --git a/block/gluster.c b/block/gluster.c
+diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/gluster.c
+--- a/tests/test-bdrv-drain.c
-+++ b/block/gluster.c
++++ b/tests/test-bdrv-drain.c
-@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
+@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
- static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset,
+ enum drain_type {
-                                  PreallocMode prealloc, Error **errp)
+     BDRV_DRAIN_ALL,
      BDRV_DRAIN,
 +    BDRV_SUBTREE_DRAIN,
      DRAIN_TYPE_MAX,
  };
@@ -XXX,XX +XXX,XX @@ static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
      switch (drain_type) {
      case BDRV_DRAIN_ALL:        bdrv_drain_all_begin(); break;
      case BDRV_DRAIN:            bdrv_drained_begin(bs); break;
 +    case BDRV_SUBTREE_DRAIN:    bdrv_subtree_drained_begin(bs); break;
      default:                    g_assert_not_reached();
      }
  }
@@ -XXX,XX +XXX,XX @@ static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs)
      switch (drain_type) {
      case BDRV_DRAIN_ALL:        bdrv_drain_all_end(); break;
      case BDRV_DRAIN:            bdrv_drained_end(bs); break;
 +    case BDRV_SUBTREE_DRAIN:    bdrv_subtree_drained_end(bs); break;
      default:                    g_assert_not_reached();
      }
  }
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain(void)
      test_drv_cb_common(BDRV_DRAIN, false);
  }
 +static void test_drv_cb_drain_subtree(void)
 +{
 +    test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
 +}
 +
  static void test_quiesce_common(enum drain_type drain_type, bool recursive)
  {
--    int ret;
+     BlockBackend *blk;
-     BDRVGlusterState *s = bs->opaque;
+@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
--
+     test_quiesce_common(BDRV_DRAIN, false);
 -    if (prealloc != PREALLOC_MODE_OFF) {
 -        error_setg(errp, "Unsupported preallocation mode '%s'",
 -                   PreallocMode_str(prealloc));
 -        return -ENOTSUP;
 -    }
 -
 -    ret = glfs_ftruncate(s->fd, offset);
 -    if (ret < 0) {
 -        ret = -errno;
 -        error_setg_errno(errp, -ret, "Failed to truncate file");
 -        return ret;
 -    }
 -
 -    return 0;
 +    return qemu_gluster_do_truncate(s->fd, offset, prealloc, errp);
  }
- static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs,
++static void test_quiesce_drain_subtree(void)
 +{
 +    test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
 +}
 +
  static void test_nested(void)
  {
      BlockBackend *blk;
@@ -XXX,XX +XXX,XX @@ static void test_nested(void)
              /* XXX bdrv_drain_all() doesn't increase the quiesce_counter */
              int bs_quiesce      = (outer != BDRV_DRAIN_ALL) +
                                    (inner != BDRV_DRAIN_ALL);
 -            int backing_quiesce = 0;
 +            int backing_quiesce = (outer == BDRV_SUBTREE_DRAIN) +
 +                                  (inner == BDRV_SUBTREE_DRAIN);
              int backing_cb_cnt  = (outer != BDRV_DRAIN) +
                                    (inner != BDRV_DRAIN);
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_drain(void)
      test_blockjob_common(BDRV_DRAIN);
  }
 +static void test_blockjob_drain_subtree(void)
 +{
 +    test_blockjob_common(BDRV_SUBTREE_DRAIN);
 +}
 +
  int main(int argc, char **argv)
  {
      bdrv_init();
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
      g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
      g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
 +    g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
 +                    test_drv_cb_drain_subtree);
      g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
      g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
 +    g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
 +                    test_quiesce_drain_subtree);
      g_test_add_func("/bdrv-drain/nested", test_nested);
      g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
      g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
 +    g_test_add_func("/bdrv-drain/blockjob/drain_subtree",
 +                    test_blockjob_drain_subtree);
      return g_test_run();
  }
 --
 .13.6

-[Qemu-devel] [PULL 53/55] iotests: Test valid values of l2-cache-entry-size
+[Qemu-devel] [PULL v3 30/35] test-bdrv-drain: Test behaviour in coroutine context
-From: Alberto Garcia <berto@igalia.com>
+If bdrv_do_drained_begin/end() are called in coroutine context, they
 first use a BH to get out of the coroutine context. Call some existing
 tests again from a coroutine to cover this code path.
-The l2-cache-entry-size setting can only contain values that are
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-powers of two between 512 and the cluster size.
+---
  tests/test-bdrv-drain.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++
 file changed, 59 insertions(+)
-Signed-off-by: Alberto Garcia <berto@igalia.com>
+diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
-Reviewed-by: Eric Blake <eblake@redhat.com>
+index XXXXXXX..XXXXXXX 100644
-Reviewed-by: Max Reitz <mreitz@redhat.com>
+--- a/tests/test-bdrv-drain.c
-Message-id: bd3547b670b8d0af11480c760991a22bcae5b48c.1517840877.git.berto@igalia.com
++++ b/tests/test-bdrv-drain.c
-[mreitz: Changed non-power-of-two test value from 300 to 4242]
+@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+     *aio_ret = ret;
----
+ }
- tests/qemu-iotests/103     | 17 +++++++++++++++++
- tests/qemu-iotests/103.out |  3 +++
++typedef struct CallInCoroutineData {
-files changed, 20 insertions(+)
++    void (*entry)(void);
++    bool done;
-diff --git a/tests/qemu-iotests/103 b/tests/qemu-iotests/103
++} CallInCoroutineData;
 index XXXXXXX..XXXXXXX 100755
 --- a/tests/qemu-iotests/103
 +++ b/tests/qemu-iotests/103
@@ -XXX,XX +XXX,XX @@ $QEMU_IO -c "open -o cache-size=1M,refcount-cache-size=2M $TEST_IMG" 2>&1 \
  $QEMU_IO -c "open -o cache-size=0,l2-cache-size=0,refcount-cache-size=0 $TEST_IMG" \
 >&1 | _filter_testdir | _filter_imgfmt
 +# Invalid cache entry sizes
 +$QEMU_IO -c "open -o l2-cache-entry-size=256 $TEST_IMG" \
 +    2>&1 | _filter_testdir | _filter_imgfmt
 +$QEMU_IO -c "open -o l2-cache-entry-size=4242 $TEST_IMG" \
 +    2>&1 | _filter_testdir | _filter_imgfmt
 +$QEMU_IO -c "open -o l2-cache-entry-size=128k $TEST_IMG" \
 +    2>&1 | _filter_testdir | _filter_imgfmt
 +
- echo
++static coroutine_fn void call_in_coroutine_entry(void *opaque)
- echo '=== Testing valid option combinations ==='
++{
- echo
++    CallInCoroutineData *data = opaque;
-@@ -XXX,XX +XXX,XX @@ $QEMU_IO -c "open -o l2-cache-size=1M,refcount-cache-size=0.25M $TEST_IMG" \
++
-          -c 'read -P 42 0 64k' \
++    data->entry();
-     | _filter_qemu_io
++    data->done = true;
++}
-+# Valid cache entry sizes
++
-+$QEMU_IO -c "open -o l2-cache-entry-size=512 $TEST_IMG" \
++static void call_in_coroutine(void (*entry)(void))
-+    2>&1 | _filter_testdir | _filter_imgfmt
++{
-+$QEMU_IO -c "open -o l2-cache-entry-size=16k $TEST_IMG" \
++    Coroutine *co;
-+    2>&1 | _filter_testdir | _filter_imgfmt
++    CallInCoroutineData data = {
-+$QEMU_IO -c "open -o l2-cache-entry-size=64k $TEST_IMG" \
++        .entry  = entry,
-+    2>&1 | _filter_testdir | _filter_imgfmt
++        .done   = false,
 +    };
 +
 +    co = qemu_coroutine_create(call_in_coroutine_entry, &data);
 +    qemu_coroutine_enter(co);
 +    while (!data.done) {
 +        aio_poll(qemu_get_aio_context(), true);
 +    }
 +}
 +
  enum drain_type {
      BDRV_DRAIN_ALL,
      BDRV_DRAIN,
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_subtree(void)
      test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
  }
 +static void test_drv_cb_co_drain(void)
 +{
 +    call_in_coroutine(test_drv_cb_drain);
 +}
 +
 +static void test_drv_cb_co_drain_subtree(void)
 +{
 +    call_in_coroutine(test_drv_cb_drain_subtree);
 +}
 +
  static void test_quiesce_common(enum drain_type drain_type, bool recursive)
  {
      BlockBackend *blk;
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain_subtree(void)
      test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
  }
 +static void test_quiesce_co_drain(void)
 +{
 +    call_in_coroutine(test_quiesce_drain);
 +}
 +
 +static void test_quiesce_co_drain_subtree(void)
 +{
 +    call_in_coroutine(test_quiesce_drain_subtree);
 +}
 +
  static void test_nested(void)
  {
      BlockBackend *blk;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
      g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
                      test_drv_cb_drain_subtree);
 +    // XXX bdrv_drain_all() doesn't work in coroutine context
 +    g_test_add_func("/bdrv-drain/driver-cb/co/drain", test_drv_cb_co_drain);
 +    g_test_add_func("/bdrv-drain/driver-cb/co/drain_subtree",
 +                    test_drv_cb_co_drain_subtree);
 +
 +
- echo
+     g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
- echo '=== Testing minimal L2 cache and COW ==='
+     g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
- echo
+     g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
-diff --git a/tests/qemu-iotests/103.out b/tests/qemu-iotests/103.out
+                     test_quiesce_drain_subtree);
-index XXXXXXX..XXXXXXX 100644
---- a/tests/qemu-iotests/103.out
++    // XXX bdrv_drain_all() doesn't work in coroutine context
-+++ b/tests/qemu-iotests/103.out
++    g_test_add_func("/bdrv-drain/quiesce/co/drain", test_quiesce_co_drain);
-@@ -XXX,XX +XXX,XX @@ can't open device TEST_DIR/t.IMGFMT: cache-size, l2-cache-size and refcount-cach
++    g_test_add_func("/bdrv-drain/quiesce/co/drain_subtree",
- can't open device TEST_DIR/t.IMGFMT: l2-cache-size may not exceed cache-size
++                    test_quiesce_co_drain_subtree);
- can't open device TEST_DIR/t.IMGFMT: refcount-cache-size may not exceed cache-size
++
- can't open device TEST_DIR/t.IMGFMT: cache-size, l2-cache-size and refcount-cache-size may not be set the same time
+     g_test_add_func("/bdrv-drain/nested", test_nested);
-+can't open device TEST_DIR/t.IMGFMT: L2 cache entry size must be a power of two between 512 and the cluster size (65536)
-+can't open device TEST_DIR/t.IMGFMT: L2 cache entry size must be a power of two between 512 and the cluster size (65536)
+     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
 +can't open device TEST_DIR/t.IMGFMT: L2 cache entry size must be a power of two between 512 and the cluster size (65536)
  === Testing valid option combinations ===
 --
 .13.6

-[Qemu-devel] [PULL 28/55] qcow2: Add offset_to_l1_index()
+[Qemu-devel] [PULL v3 31/35] test-bdrv-drain: Recursive draining with multiple parents
-From: Alberto Garcia <berto@igalia.com>
+Test that drain sections are correctly propagated through the graph.
-Similar to offset_to_l2_index(), this function returns the index in
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-the L1 table for a given guest offset. This is only used in a couple
+---
-of places and it's not a particularly complex calculation, but it
+ tests/test-bdrv-drain.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++
-makes the code a bit more readable.
+file changed, 74 insertions(+)
-Although in the qcow2_get_cluster_offset() case the old code was
+diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
 taking advantage of the l1_bits variable, we're going to get rid of
 the other uses of l1_bits in a later patch anyway, so it doesn't make
 sense to keep it just for this.
 Signed-off-by: Alberto Garcia <berto@igalia.com>
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Max Reitz <mreitz@redhat.com>
 Message-id: a5f626fed526b7459a0425fad06d823d18df8522.1517840877.git.berto@igalia.com
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
  block/qcow2.h         | 5 +++++
  block/qcow2-cluster.c | 4 ++--
 files changed, 7 insertions(+), 2 deletions(-)
 diff --git a/block/qcow2.h b/block/qcow2.h
 index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2.h
+--- a/tests/test-bdrv-drain.c
-+++ b/block/qcow2.h
++++ b/tests/test-bdrv-drain.c
-@@ -XXX,XX +XXX,XX @@ static inline int64_t size_to_l1(BDRVQcow2State *s, int64_t size)
+@@ -XXX,XX +XXX,XX @@ static void test_nested(void)
-     return (size + (1ULL << shift) - 1) >> shift;
+     blk_unref(blk);
  }
-+static inline int offset_to_l1_index(BDRVQcow2State *s, uint64_t offset)
++static void test_multiparent(void)
 +{
-+    return offset >> (s->l2_bits + s->cluster_bits);
++    BlockBackend *blk_a, *blk_b;
 +    BlockDriverState *bs_a, *bs_b, *backing;
 +    BDRVTestState *a_s, *b_s, *backing_s;
 +
 +    blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
 +    bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
 +                                &error_abort);
 +    a_s = bs_a->opaque;
 +    blk_insert_bs(blk_a, bs_a, &error_abort);
 +
 +    blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
 +    bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
 +                                &error_abort);
 +    b_s = bs_b->opaque;
 +    blk_insert_bs(blk_b, bs_b, &error_abort);
 +
 +    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
 +    backing_s = backing->opaque;
 +    bdrv_set_backing_hd(bs_a, backing, &error_abort);
 +    bdrv_set_backing_hd(bs_b, backing, &error_abort);
 +
 +    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
 +    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
 +    g_assert_cmpint(backing->quiesce_counter, ==, 0);
 +    g_assert_cmpint(a_s->drain_count, ==, 0);
 +    g_assert_cmpint(b_s->drain_count, ==, 0);
 +    g_assert_cmpint(backing_s->drain_count, ==, 0);
 +
 +    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
 +
 +    g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
 +    g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
 +    g_assert_cmpint(backing->quiesce_counter, ==, 1);
 +    g_assert_cmpint(a_s->drain_count, ==, 1);
 +    g_assert_cmpint(b_s->drain_count, ==, 1);
 +    g_assert_cmpint(backing_s->drain_count, ==, 1);
 +
 +    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
 +
 +    g_assert_cmpint(bs_a->quiesce_counter, ==, 2);
 +    g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
 +    g_assert_cmpint(backing->quiesce_counter, ==, 2);
 +    g_assert_cmpint(a_s->drain_count, ==, 2);
 +    g_assert_cmpint(b_s->drain_count, ==, 2);
 +    g_assert_cmpint(backing_s->drain_count, ==, 2);
 +
 +    do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
 +
 +    g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
 +    g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
 +    g_assert_cmpint(backing->quiesce_counter, ==, 1);
 +    g_assert_cmpint(a_s->drain_count, ==, 1);
 +    g_assert_cmpint(b_s->drain_count, ==, 1);
 +    g_assert_cmpint(backing_s->drain_count, ==, 1);
 +
 +    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
 +
 +    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
 +    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
 +    g_assert_cmpint(backing->quiesce_counter, ==, 0);
 +    g_assert_cmpint(a_s->drain_count, ==, 0);
 +    g_assert_cmpint(b_s->drain_count, ==, 0);
 +    g_assert_cmpint(backing_s->drain_count, ==, 0);
 +
 +    bdrv_unref(backing);
 +    bdrv_unref(bs_a);
 +    bdrv_unref(bs_b);
 +    blk_unref(blk_a);
 +    blk_unref(blk_b);
 +}
 +
- static inline int offset_to_l2_index(BDRVQcow2State *s, int64_t offset)
- {
+ typedef struct TestBlockJob {
-     return (offset >> s->cluster_bits) & (s->l2_size - 1);
+     BlockJob common;
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
+@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
-index XXXXXXX..XXXXXXX 100644
+                     test_quiesce_co_drain_subtree);
---- a/block/qcow2-cluster.c
-+++ b/block/qcow2-cluster.c
+     g_test_add_func("/bdrv-drain/nested", test_nested);
-@@ -XXX,XX +XXX,XX @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
++    g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
-     /* seek to the l2 offset in the l1 table */
+     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
+     g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
 -    l1_index = offset >> l1_bits;
 +    l1_index = offset_to_l1_index(s, offset);
      if (l1_index >= s->l1_size) {
          type = QCOW2_CLUSTER_UNALLOCATED;
          goto out;
@@ -XXX,XX +XXX,XX @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
      /* seek to the l2 offset in the l1 table */
 -    l1_index = offset >> (s->l2_bits + s->cluster_bits);
 +    l1_index = offset_to_l1_index(s, offset);
      if (l1_index >= s->l1_size) {
          ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
          if (ret < 0) {
 --
 .13.6

-[Qemu-devel] [PULL 40/55] qcow2: Update discard_single_l2() to support L2 slices
+[Qemu-devel] [PULL v3 32/35] block: Allow graph changes in subtree drained section
-From: Alberto Garcia <berto@igalia.com>
+We need to remember how many of the drain sections that a node is in
+were recursive (i.e. subtree drain rather than node drain), so that they
-discard_single_l2() limits the number of clusters to be discarded
+can be correctly applied when children are added or removed during the
-to the amount that fits inside an L2 table. Since we'll be loading
+drained section.
-L2 slices instead of full tables we need to update that limit. The
-function is renamed to discard_in_l2_slice() for clarity.
+With this change, it is safe to modify the graph even inside a
+bdrv_subtree_drained_begin/end() section.
-Apart from that, this function doesn't need any additional changes, so
-this patch simply updates the variable name from l2_table to l2_slice.
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 Signed-off-by: Alberto Garcia <berto@igalia.com>
 Message-id: 1cb44a5b68be5334cb01b97a3db3a3c5a43396e5.1517840877.git.berto@igalia.com
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
- block/qcow2-cluster.c | 32 ++++++++++++++++----------------
+ include/block/block.h     |  2 --
-file changed, 16 insertions(+), 16 deletions(-)
+ include/block/block_int.h |  5 +++++
+ block.c                   | 32 +++++++++++++++++++++++++++++---
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
+ block/io.c                | 28 ++++++++++++++++++++++++----
-index XXXXXXX..XXXXXXX 100644
+files changed, 58 insertions(+), 9 deletions(-)
---- a/block/qcow2-cluster.c
-+++ b/block/qcow2-cluster.c
+diff --git a/include/block/block.h b/include/block/block.h
-@@ -XXX,XX +XXX,XX @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
+index XXXXXXX..XXXXXXX 100644
 --- a/include/block/block.h
 +++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs);
  /**
   * Like bdrv_drained_begin, but recursively begins a quiesced section for
   * exclusive access to all child nodes as well.
 - *
 - * Graph changes are not allowed during a subtree drain section.
   */
  void bdrv_subtree_drained_begin(BlockDriverState *bs);
 diff --git a/include/block/block_int.h b/include/block/block_int.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/block/block_int.h
 +++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
      /* Accessed with atomic ops.  */
      int quiesce_counter;
 +    int recursive_quiesce_counter;
 +
      unsigned int write_gen;               /* Current data generation */
      /* Protected by reqs_lock.  */
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
      int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
      BdrvRequestFlags flags);
 +void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
 +void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);
 +
  int get_tmp_filename(char *filename, int size);
  BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
                              const char *filename);
 diff --git a/block.c b/block.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block.c
 +++ b/block.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_child_cb_drained_end(BdrvChild *child)
      bdrv_drained_end(bs);
  }
 +static void bdrv_child_cb_attach(BdrvChild *child)
 +{
 +    BlockDriverState *bs = child->opaque;
 +    bdrv_apply_subtree_drain(child, bs);
 +}
 +
 +static void bdrv_child_cb_detach(BdrvChild *child)
 +{
 +    BlockDriverState *bs = child->opaque;
 +    bdrv_unapply_subtree_drain(child, bs);
 +}
 +
  static int bdrv_child_cb_inactivate(BdrvChild *child)
  {
      BlockDriverState *bs = child->opaque;
@@ -XXX,XX +XXX,XX @@ const BdrvChildRole child_file = {
      .inherit_options = bdrv_inherited_options,
      .drained_begin   = bdrv_child_cb_drained_begin,
      .drained_end     = bdrv_child_cb_drained_end,
 +    .attach          = bdrv_child_cb_attach,
 +    .detach          = bdrv_child_cb_detach,
      .inactivate      = bdrv_child_cb_inactivate,
  };
@@ -XXX,XX +XXX,XX @@ const BdrvChildRole child_format = {
      .inherit_options = bdrv_inherited_fmt_options,
      .drained_begin   = bdrv_child_cb_drained_begin,
      .drained_end     = bdrv_child_cb_drained_end,
 +    .attach          = bdrv_child_cb_attach,
 +    .detach          = bdrv_child_cb_detach,
      .inactivate      = bdrv_child_cb_inactivate,
  };
@@ -XXX,XX +XXX,XX @@ static void bdrv_backing_attach(BdrvChild *c)
                      parent->backing_blocker);
      bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET,
                      parent->backing_blocker);
 +
 +    bdrv_child_cb_attach(c);
  }
  static void bdrv_backing_detach(BdrvChild *c)
@@ -XXX,XX +XXX,XX @@ static void bdrv_backing_detach(BdrvChild *c)
      bdrv_op_unblock_all(c->bs, parent->backing_blocker);
      error_free(parent->backing_blocker);
      parent->backing_blocker = NULL;
 +
 +    bdrv_child_cb_detach(c);
  }
  /*
-  * This discards as many clusters of nb_clusters as possible at once (i.e.
+@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
-- * all clusters in the same L2 table) and returns the number of discarded
+         assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
-+ * all clusters in the same L2 slice) and returns the number of discarded
+     }
-  * clusters.
+     if (old_bs) {
-  */
++        /* Detach first so that the recursive drain sections coming from @child
--static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
++         * are already gone and we only end the drain sections that came from
--                             uint64_t nb_clusters, enum qcow2_discard_type type,
++         * elsewhere. */
--                             bool full_discard)
++        if (child->role->detach) {
-+static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
++            child->role->detach(child);
-+                               uint64_t nb_clusters,
++        }
-+                               enum qcow2_discard_type type, bool full_discard)
+         if (old_bs->quiesce_counter && child->role->drained_end) {
              for (i = 0; i < old_bs->quiesce_counter; i++) {
                  child->role->drained_end(child);
              }
          }
 -        if (child->role->detach) {
 -            child->role->detach(child);
 -        }
          QLIST_REMOVE(child, next_parent);
      }
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
              }
          }
 +        /* Attach only after starting new drained sections, so that recursive
 +         * drain sections coming from @child don't get an extra .drained_begin
 +         * callback. */
          if (child->role->attach) {
              child->role->attach(child);
          }
 diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/io.c
 +++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
      assert(data.done);
  }
 -static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
 -                                  BdrvChild *parent)
 +void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
 +                           BdrvChild *parent)
  {
-     BDRVQcow2State *s = bs->opaque;
+     BdrvChild *child, *next;
--    uint64_t *l2_table;
-+    uint64_t *l2_slice;
+@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
-     int l2_index;
+     bdrv_drain_recurse(bs);
-     int ret;
-     int i;
+     if (recursive) {
++        bs->recursive_quiesce_counter++;
--    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+         QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
-+    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
+             bdrv_do_drained_begin(child->bs, true, child);
-     if (ret < 0) {
+         }
-         return ret;
+@@ -XXX,XX +XXX,XX @@ void bdrv_subtree_drained_begin(BlockDriverState *bs)
      bdrv_do_drained_begin(bs, true, NULL);
  }
 -static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
 -                                BdrvChild *parent)
 +void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
 +                         BdrvChild *parent)
  {
      BdrvChild *child, *next;
      int old_quiesce_counter;
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
      }
--    /* Limit nb_clusters to one L2 table */
+     if (recursive) {
--    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
++        bs->recursive_quiesce_counter--;
-+    /* Limit nb_clusters to one L2 slice */
+         QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
-+    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
+             bdrv_do_drained_end(child->bs, true, child);
-     assert(nb_clusters <= INT_MAX);
+         }
+@@ -XXX,XX +XXX,XX @@ void bdrv_subtree_drained_end(BlockDriverState *bs)
-     for (i = 0; i < nb_clusters; i++) {
+     bdrv_do_drained_end(bs, true, NULL);
-         uint64_t old_l2_entry;
+ }
--        old_l2_entry = be64_to_cpu(l2_table[l2_index + i]);
++void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
-+        old_l2_entry = be64_to_cpu(l2_slice[l2_index + i]);
++{
++    int i;
-         /*
++
-          * If full_discard is false, make sure that a discarded area reads back
++    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
-@@ -XXX,XX +XXX,XX @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
++        bdrv_do_drained_begin(child->bs, true, child);
-         }
++    }
++}
-         /* First remove L2 entries */
++
--        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
++void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
-+        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
++{
-         if (!full_discard && s->qcow_version >= 3) {
++    int i;
--            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
++
-+            l2_slice[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
++    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
-         } else {
++        bdrv_do_drained_end(child->bs, true, child);
--            l2_table[l2_index + i] = cpu_to_be64(0);
++    }
-+            l2_slice[l2_index + i] = cpu_to_be64(0);
++}
-         }
++
+ /*
-         /* Then decrease the refcount */
+  * Wait for pending requests to complete on a single BlockDriverState subtree,
-         qcow2_free_any_clusters(bs, old_l2_entry, 1, type);
+  * and suspend block driver's internal I/O until next request arrives.
      }
 -    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
 +    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
      return nb_clusters;
  }
@@ -XXX,XX +XXX,XX @@ int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
      s->cache_discards = true;
 -    /* Each L2 table is handled by its own loop iteration */
 +    /* Each L2 slice is handled by its own loop iteration */
      while (nb_clusters > 0) {
 -        cleared = discard_single_l2(bs, offset, nb_clusters, type,
 -                                    full_discard);
 +        cleared = discard_in_l2_slice(bs, offset, nb_clusters, type,
 +                                      full_discard);
          if (cleared < 0) {
              ret = cleared;
              goto fail;
 --
 .13.6

-[Qemu-devel] [PULL 13/55] sheepdog: Make sd_prealloc() take a BDS
+[Qemu-devel] [PULL v3 33/35] test-bdrv-drain: Test graph changes in drained section
-From: Max Reitz <mreitz@redhat.com>
-We want to use this function in sd_truncate() later on, so taking a
-filename is not exactly ideal.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- block/sheepdog.c | 29 +++++++++++++++++++++--------
+ tests/test-bdrv-drain.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++
-file changed, 21 insertions(+), 8 deletions(-)
+file changed, 80 insertions(+)
-diff --git a/block/sheepdog.c b/block/sheepdog.c
+diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/sheepdog.c
+--- a/tests/test-bdrv-drain.c
-+++ b/block/sheepdog.c
++++ b/tests/test-bdrv-drain.c
-@@ -XXX,XX +XXX,XX @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
+@@ -XXX,XX +XXX,XX @@ static void test_multiparent(void)
-     return 0;
+     blk_unref(blk_b);
  }
--static int sd_prealloc(const char *filename, Error **errp)
++static void test_graph_change(void)
-+static int sd_prealloc(BlockDriverState *bs, Error **errp)
++{
- {
++    BlockBackend *blk_a, *blk_b;
-     BlockBackend *blk = NULL;
++    BlockDriverState *bs_a, *bs_b, *backing;
--    BDRVSheepdogState *base = NULL;
++    BDRVTestState *a_s, *b_s, *backing_s;
 +    BDRVSheepdogState *base = bs->opaque;
      unsigned long buf_size;
      uint32_t idx, max_idx;
      uint32_t object_size;
@@ -XXX,XX +XXX,XX @@ static int sd_prealloc(const char *filename, Error **errp)
      void *buf = NULL;
      int ret;
 -    blk = blk_new_open(filename, NULL, NULL,
 -                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
 -    if (blk == NULL) {
 -        ret = -EIO;
 +    blk = blk_new(BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE | BLK_PERM_RESIZE,
 +                  BLK_PERM_ALL);
 +
-+    ret = blk_insert_bs(blk, bs, errp);
++    blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
-+    if (ret < 0) {
++    bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
-         goto out_with_err_set;
++                                &error_abort);
-     }
++    a_s = bs_a->opaque;
++    blk_insert_bs(blk_a, bs_a, &error_abort);
@@ -XXX,XX +XXX,XX @@ static int sd_prealloc(const char *filename, Error **errp)
          goto out;
      }
 -    base = blk_bs(blk)->opaque;
      object_size = (UINT32_C(1) << base->inode.block_size_shift);
      buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
      buf = g_malloc0(buf_size);
@@ -XXX,XX +XXX,XX @@ static int sd_create(const char *filename, QemuOpts *opts,
      }
      if (prealloc) {
 -        ret = sd_prealloc(filename, errp);
 +        BlockDriverState *bs;
 +        QDict *opts;
 +
-+        opts = qdict_new();
++    blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
-+        qdict_put_str(opts, "driver", "sheepdog");
++    bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
-+        bs = bdrv_open(filename, NULL, opts, BDRV_O_PROTOCOL | BDRV_O_RDWR,
++                                &error_abort);
-+                       errp);
++    b_s = bs_b->opaque;
-+        if (!bs) {
++    blk_insert_bs(blk_b, bs_b, &error_abort);
 +            goto out;
 +        }
 +
-+        ret = sd_prealloc(bs, errp);
++    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
 +    backing_s = backing->opaque;
 +    bdrv_set_backing_hd(bs_a, backing, &error_abort);
 +
-+        bdrv_unref(bs);
++    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
-     }
++    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
- out:
++    g_assert_cmpint(backing->quiesce_counter, ==, 0);
-     g_free(backing_file);
++    g_assert_cmpint(a_s->drain_count, ==, 0);
 +    g_assert_cmpint(b_s->drain_count, ==, 0);
 +    g_assert_cmpint(backing_s->drain_count, ==, 0);
 +
 +    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
 +    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
 +    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
 +    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
 +    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
 +
 +    bdrv_set_backing_hd(bs_b, backing, &error_abort);
 +    g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
 +    g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
 +    g_assert_cmpint(backing->quiesce_counter, ==, 5);
 +    g_assert_cmpint(a_s->drain_count, ==, 5);
 +    g_assert_cmpint(b_s->drain_count, ==, 5);
 +    g_assert_cmpint(backing_s->drain_count, ==, 5);
 +
 +    bdrv_set_backing_hd(bs_b, NULL, &error_abort);
 +    g_assert_cmpint(bs_a->quiesce_counter, ==, 3);
 +    g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
 +    g_assert_cmpint(backing->quiesce_counter, ==, 3);
 +    g_assert_cmpint(a_s->drain_count, ==, 3);
 +    g_assert_cmpint(b_s->drain_count, ==, 2);
 +    g_assert_cmpint(backing_s->drain_count, ==, 3);
 +
 +    bdrv_set_backing_hd(bs_b, backing, &error_abort);
 +    g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
 +    g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
 +    g_assert_cmpint(backing->quiesce_counter, ==, 5);
 +    g_assert_cmpint(a_s->drain_count, ==, 5);
 +    g_assert_cmpint(b_s->drain_count, ==, 5);
 +    g_assert_cmpint(backing_s->drain_count, ==, 5);
 +
 +    do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
 +    do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
 +    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
 +    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
 +    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
 +
 +    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
 +    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
 +    g_assert_cmpint(backing->quiesce_counter, ==, 0);
 +    g_assert_cmpint(a_s->drain_count, ==, 0);
 +    g_assert_cmpint(b_s->drain_count, ==, 0);
 +    g_assert_cmpint(backing_s->drain_count, ==, 0);
 +
 +    bdrv_unref(backing);
 +    bdrv_unref(bs_a);
 +    bdrv_unref(bs_b);
 +    blk_unref(blk_a);
 +    blk_unref(blk_b);
 +}
 +
  typedef struct TestBlockJob {
      BlockJob common;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
      g_test_add_func("/bdrv-drain/nested", test_nested);
      g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
 +    g_test_add_func("/bdrv-drain/graph-change", test_graph_change);
      g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
      g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
 --
 .13.6

-[Qemu-devel] [PULL 17/55] qcow2: Fix documentation of get_cluster_table()
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-This function has not been returning the offset of the L2 table since
-commit 3948d1d4876065160583e79533bf604481063833
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: b498733b6706a859a03678d74ecbd26aeba129aa.1517840876.git.berto@igalia.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2-cluster.c | 3 +--
-file changed, 1 insertion(+), 2 deletions(-)
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
-+++ b/block/qcow2-cluster.c
-@@ -XXX,XX +XXX,XX @@ fail:
-  * for a given disk offset, load (and allocate if needed)
-  * the l2 table.
-  *
-- * the l2 table offset in the qcow2 file and the cluster index
-- * in the l2 table are given to the caller.
-+ * the cluster index in the l2 table is given to the caller.
-  *
-  * Returns 0 on success, -errno in failure case
-  */
---
-.13.6

-[Qemu-devel] [PULL 19/55] qcow2: Remove BDS parameter from qcow2_cache_get_table_addr()
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-This function was only using the BlockDriverState parameter to get the
-cache table size (since it was equal to the cluster size). This is no
-longer necessary so this parameter can be removed.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: e1f943a9e89e1deb876f45de1bb22419ccdb6ad3.1517840876.git.berto@igalia.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2-cache.c | 13 ++++++-------
-file changed, 6 insertions(+), 7 deletions(-)
-diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cache.c
-+++ b/block/qcow2-cache.c
-@@ -XXX,XX +XXX,XX @@ struct Qcow2Cache {
-     uint64_t                cache_clean_lru_counter;
- };
--static inline void *qcow2_cache_get_table_addr(BlockDriverState *bs,
--                    Qcow2Cache *c, int table)
-+static inline void *qcow2_cache_get_table_addr(Qcow2Cache *c, int table)
- {
-     return (uint8_t *) c->table_array + (size_t) table * c->table_size;
- }
-@@ -XXX,XX +XXX,XX @@ static void qcow2_cache_table_release(BlockDriverState *bs, Qcow2Cache *c,
- {
- /* Using MADV_DONTNEED to discard memory is a Linux-specific feature */
- #ifdef CONFIG_LINUX
--    void *t = qcow2_cache_get_table_addr(bs, c, i);
-+    void *t = qcow2_cache_get_table_addr(c, i);
-     int align = getpagesize();
-     size_t mem_size = (size_t) c->table_size * num_tables;
-     size_t offset = QEMU_ALIGN_UP((uintptr_t) t, align) - (uintptr_t) t;
-@@ -XXX,XX +XXX,XX @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
-     }
-     ret = bdrv_pwrite(bs->file, c->entries[i].offset,
--                      qcow2_cache_get_table_addr(bs, c, i), c->table_size);
-+                      qcow2_cache_get_table_addr(c, i), c->table_size);
-     if (ret < 0) {
-         return ret;
-     }
-@@ -XXX,XX +XXX,XX @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
-         }
-         ret = bdrv_pread(bs->file, offset,
--                         qcow2_cache_get_table_addr(bs, c, i),
-+                         qcow2_cache_get_table_addr(c, i),
-                          c->table_size);
-         if (ret < 0) {
-             return ret;
-@@ -XXX,XX +XXX,XX @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
-     /* And return the right table */
- found:
-     c->entries[i].ref++;
--    *table = qcow2_cache_get_table_addr(bs, c, i);
-+    *table = qcow2_cache_get_table_addr(c, i);
-     trace_qcow2_cache_get_done(qemu_coroutine_self(),
-                                c == s->l2_table_cache, i);
-@@ -XXX,XX +XXX,XX @@ void *qcow2_cache_is_table_offset(BlockDriverState *bs, Qcow2Cache *c,
-     for (i = 0; i < c->size; i++) {
-         if (c->entries[i].offset == offset) {
--            return qcow2_cache_get_table_addr(bs, c, i);
-+            return qcow2_cache_get_table_addr(c, i);
-         }
-     }
-     return NULL;
---
-.13.6

-[Qemu-devel] [PULL 20/55] qcow2: Remove BDS parameter from qcow2_cache_get_table_idx()
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-This function was only using the BlockDriverState parameter to get the
-cache table size (since it was equal to the cluster size). This is no
-longer necessary so this parameter can be removed.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: da3575d47c9a181a2cfd4715e53dd84a2c651017.1517840876.git.berto@igalia.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2-cache.c | 9 ++++-----
-file changed, 4 insertions(+), 5 deletions(-)
-diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cache.c
-+++ b/block/qcow2-cache.c
-@@ -XXX,XX +XXX,XX @@ static inline void *qcow2_cache_get_table_addr(Qcow2Cache *c, int table)
-     return (uint8_t *) c->table_array + (size_t) table * c->table_size;
- }
--static inline int qcow2_cache_get_table_idx(BlockDriverState *bs,
--                  Qcow2Cache *c, void *table)
-+static inline int qcow2_cache_get_table_idx(Qcow2Cache *c, void *table)
- {
-     ptrdiff_t table_offset = (uint8_t *) table - (uint8_t *) c->table_array;
-     int idx = table_offset / c->table_size;
-@@ -XXX,XX +XXX,XX @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
- void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table)
- {
--    int i = qcow2_cache_get_table_idx(bs, c, *table);
-+    int i = qcow2_cache_get_table_idx(c, *table);
-     c->entries[i].ref--;
-     *table = NULL;
-@@ -XXX,XX +XXX,XX @@ void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table)
- void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c,
-      void *table)
- {
--    int i = qcow2_cache_get_table_idx(bs, c, table);
-+    int i = qcow2_cache_get_table_idx(c, table);
-     assert(c->entries[i].offset != 0);
-     c->entries[i].dirty = true;
- }
-@@ -XXX,XX +XXX,XX @@ void *qcow2_cache_is_table_offset(BlockDriverState *bs, Qcow2Cache *c,
- void qcow2_cache_discard(BlockDriverState *bs, Qcow2Cache *c, void *table)
- {
--    int i = qcow2_cache_get_table_idx(bs, c, table);
-+    int i = qcow2_cache_get_table_idx(c, table);
-     assert(c->entries[i].ref == 0);
---
-.13.6

-[Qemu-devel] [PULL 41/55] qcow2: Update zero_single_l2() to support L2 slices
+[Qemu-devel] [PULL v3 34/35] commit: Simplify reopen of base
-From: Alberto Garcia <berto@igalia.com>
+Since commit bde70715, base is the only node that is reopened in
 commit_start(). This means that the code, which still involves an
 explicit BlockReopenQueue, can now be simplified by using bdrv_reopen().
-zero_single_l2() limits the number of clusters to be zeroed to the
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-amount that fits inside an L2 table. Since we'll be loading L2 slices
+Reviewed-by: Fam Zheng <famz@redhat.com>
-instead of full tables we need to update that limit. The function is
+---
-renamed to zero_in_l2_slice() for clarity.
+ block/commit.c | 8 +-------
 file changed, 1 insertion(+), 7 deletions(-)
-Apart from that, this function doesn't need any additional changes, so
+diff --git a/block/commit.c b/block/commit.c
 this patch simply updates the variable name from l2_table to l2_slice.
 Signed-off-by: Alberto Garcia <berto@igalia.com>
 Message-id: ebc16e7e79fa6969d8975ef487d679794de4fbcc.1517840877.git.berto@igalia.com
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
  block/qcow2-cluster.c | 28 ++++++++++++++--------------
 file changed, 14 insertions(+), 14 deletions(-)
 diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
+--- a/block/commit.c
-+++ b/block/qcow2-cluster.c
++++ b/block/commit.c
-@@ -XXX,XX +XXX,XX @@ fail:
+@@ -XXX,XX +XXX,XX @@ void commit_start(const char *job_id, BlockDriverState *bs,
+                   const char *filter_node_name, Error **errp)
  /*
   * This zeroes as many clusters of nb_clusters as possible at once (i.e.
 - * all clusters in the same L2 table) and returns the number of zeroed
 + * all clusters in the same L2 slice) and returns the number of zeroed
   * clusters.
   */
 -static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
 -                          uint64_t nb_clusters, int flags)
 +static int zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
 +                            uint64_t nb_clusters, int flags)
  {
-     BDRVQcow2State *s = bs->opaque;
+     CommitBlockJob *s;
--    uint64_t *l2_table;
+-    BlockReopenQueue *reopen_queue = NULL;
-+    uint64_t *l2_slice;
+     int orig_base_flags;
-     int l2_index;
+     BlockDriverState *iter;
-     int ret;
+     BlockDriverState *commit_top_bs = NULL;
-     int i;
+@@ -XXX,XX +XXX,XX @@ void commit_start(const char *job_id, BlockDriverState *bs,
-     bool unmap = !!(flags & BDRV_REQ_MAY_UNMAP);
+     /* convert base to r/w, if necessary */
+     orig_base_flags = bdrv_get_flags(base);
--    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+     if (!(orig_base_flags & BDRV_O_RDWR)) {
-+    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
+-        reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
-     if (ret < 0) {
+-                                         orig_base_flags | BDRV_O_RDWR);
-         return ret;
+-    }
-     }
+-
+-    if (reopen_queue) {
--    /* Limit nb_clusters to one L2 table */
+-        bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, &local_err);
--    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
++        bdrv_reopen(base, orig_base_flags | BDRV_O_RDWR, &local_err);
-+    /* Limit nb_clusters to one L2 slice */
+         if (local_err != NULL) {
-+    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
+             error_propagate(errp, local_err);
      assert(nb_clusters <= INT_MAX);
      for (i = 0; i < nb_clusters; i++) {
          uint64_t old_offset;
          QCow2ClusterType cluster_type;
 -        old_offset = be64_to_cpu(l2_table[l2_index + i]);
 +        old_offset = be64_to_cpu(l2_slice[l2_index + i]);
          /*
           * Minimize L2 changes if the cluster already reads back as
@@ -XXX,XX +XXX,XX @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
              continue;
          }
 -        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
 +        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
          if (cluster_type == QCOW2_CLUSTER_COMPRESSED || unmap) {
 -            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
 +            l2_slice[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
              qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
          } else {
 -            l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
 +            l2_slice[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
          }
      }
 -    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
 +    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
      return nb_clusters;
  }
@@ -XXX,XX +XXX,XX @@ int qcow2_cluster_zeroize(BlockDriverState *bs, uint64_t offset,
          return -ENOTSUP;
      }
 -    /* Each L2 table is handled by its own loop iteration */
 +    /* Each L2 slice is handled by its own loop iteration */
      nb_clusters = size_to_clusters(s, bytes);
      s->cache_discards = true;
      while (nb_clusters > 0) {
 -        cleared = zero_single_l2(bs, offset, nb_clusters, flags);
 +        cleared = zero_in_l2_slice(bs, offset, nb_clusters, flags);
          if (cleared < 0) {
              ret = cleared;
              goto fail;
 --
 .13.6

-[Qemu-devel] [PULL 23/55] qcow2: Remove BDS parameter from qcow2_cache_put()
+[Qemu-devel] [PULL v3 35/35] block: Keep nodes drained between reopen_queue/multiple
-From: Alberto Garcia <berto@igalia.com>
+The bdrv_reopen*() implementation doesn't like it if the graph is
 changed between queuing nodes for reopen and actually reopening them
 (one of the reasons is that queuing can be recursive).
-This function was only using the BlockDriverState parameter to pass it
+So instead of draining the device only in bdrv_reopen_multiple(),
-to qcow2_cache_get_table_idx(). This is no longer necessary so this
+require that callers already drained all affected nodes, and assert this
-parameter can be removed.
+in bdrv_reopen_queue().
-Signed-off-by: Alberto Garcia <berto@igalia.com>
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
 Reviewed-by: Max Reitz <mreitz@redhat.com>
 Message-id: 6f98155489054a457563da77cdad1a66ebb3e896.1517840876.git.berto@igalia.com
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
- block/qcow2.h          |  2 +-
+ block.c             | 23 ++++++++++++++++-------
- block/qcow2-cache.c    |  2 +-
+ block/replication.c |  6 ++++++
- block/qcow2-cluster.c  | 28 ++++++++++++++--------------
+ qemu-io-cmds.c      |  3 +++
- block/qcow2-refcount.c | 30 +++++++++++++++---------------
+files changed, 25 insertions(+), 7 deletions(-)
 files changed, 31 insertions(+), 31 deletions(-)
-diff --git a/block/qcow2.h b/block/qcow2.h
+diff --git a/block.c b/block.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2.h
+--- a/block.c
-+++ b/block/qcow2.h
++++ b/block.c
-@@ -XXX,XX +XXX,XX @@ int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
+@@ -XXX,XX +XXX,XX @@ BlockDriverState *bdrv_open(const char *filename, const char *reference,
-     void **table);
+  * returns a pointer to bs_queue, which is either the newly allocated
- int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
+  * bs_queue, or the existing bs_queue being used.
-     void **table);
+  *
--void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
++ * bs must be drained between bdrv_reopen_queue() and bdrv_reopen_multiple().
-+void qcow2_cache_put(Qcow2Cache *c, void **table);
+  */
- void *qcow2_cache_is_table_offset(BlockDriverState *bs, Qcow2Cache *c,
+ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
-                                   uint64_t offset);
+                                                  BlockDriverState *bs,
- void qcow2_cache_discard(BlockDriverState *bs, Qcow2Cache *c, void *table);
+@@ -XXX,XX +XXX,XX @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
-diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
+     BdrvChild *child;
-index XXXXXXX..XXXXXXX 100644
+     QDict *old_options, *explicit_options;
---- a/block/qcow2-cache.c
-+++ b/block/qcow2-cache.c
++    /* Make sure that the caller remembered to use a drained section. This is
-@@ -XXX,XX +XXX,XX @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
++     * important to avoid graph changes between the recursive queuing here and
-     return qcow2_cache_do_get(bs, c, offset, table, false);
++     * bdrv_reopen_multiple(). */
- }
++    assert(bs->quiesce_counter > 0);
++
--void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table)
+     if (bs_queue == NULL) {
-+void qcow2_cache_put(Qcow2Cache *c, void **table)
+         bs_queue = g_new0(BlockReopenQueue, 1);
          QSIMPLEQ_INIT(bs_queue);
@@ -XXX,XX +XXX,XX @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
   * If all devices prepare successfully, then the changes are committed
   * to all devices.
   *
 + * All affected nodes must be drained between bdrv_reopen_queue() and
 + * bdrv_reopen_multiple().
   */
  int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **errp)
  {
-     int i = qcow2_cache_get_table_idx(c, *table);
+@@ -XXX,XX +XXX,XX @@ int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **er
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
+     assert(bs_queue != NULL);
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
+-    aio_context_release(ctx);
-+++ b/block/qcow2-cluster.c
+-    bdrv_drain_all_begin();
-@@ -XXX,XX +XXX,XX @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
+-    aio_context_acquire(ctx);
+-
-         memcpy(l2_table, old_table, s->cluster_size);
+     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
++        assert(bs_entry->state.bs->quiesce_counter > 0);
--        qcow2_cache_put(bs, s->l2_table_cache, (void **) &old_table);
+         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
-+        qcow2_cache_put(s->l2_table_cache, (void **) &old_table);
+             error_propagate(errp, local_err);
              goto cleanup;
@@ -XXX,XX +XXX,XX @@ cleanup:
      }
+     g_free(bs_queue);
-     /* write the l2 table to the file */
-@@ -XXX,XX +XXX,XX @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
+-    bdrv_drain_all_end();
- fail:
+-
      trace_qcow2_l2_allocate_done(bs, l1_index, ret);
      if (l2_table != NULL) {
 -        qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
 +        qcow2_cache_put(s->l2_table_cache, (void **) table);
      }
      s->l1_table[l1_index] = old_l2_offset;
      if (l2_offset > 0) {
@@ -XXX,XX +XXX,XX @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
          abort();
      }
 -    qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
 +    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
      bytes_available = (int64_t)c * s->cluster_size;
@@ -XXX,XX +XXX,XX @@ out:
      return type;
  fail:
 -    qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
 +    qcow2_cache_put(s->l2_table_cache, (void **)&l2_table);
      return ret;
  }
-@@ -XXX,XX +XXX,XX @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
+@@ -XXX,XX +XXX,XX @@ int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
-      * allocated. */
+ {
-     cluster_offset = be64_to_cpu(l2_table[l2_index]);
+     int ret = -1;
-     if (cluster_offset & L2E_OFFSET_MASK) {
+     Error *local_err = NULL;
--        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+-    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);
-+        qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
++    BlockReopenQueue *queue;
-         return 0;
 +    bdrv_subtree_drained_begin(bs);
 +
 +    queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);
      ret = bdrv_reopen_multiple(bdrv_get_aio_context(bs), queue, &local_err);
      if (local_err != NULL) {
          error_propagate(errp, local_err);
      }
++
-     cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
++    bdrv_subtree_drained_end(bs);
-     if (cluster_offset < 0) {
++
 -        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
 +        qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
          return 0;
      }
@@ -XXX,XX +XXX,XX @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
      BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
      qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
      l2_table[l2_index] = cpu_to_be64(cluster_offset);
 -    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
 +    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
      return cluster_offset;
  }
@@ -XXX,XX +XXX,XX @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
       }
 -    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
 +    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
      /*
       * If this was a COW, we need to decrease the refcount of the old cluster.
@@ -XXX,XX +XXX,XX @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
      /* Cleanup */
  out:
 -    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
 +    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
      /* Only return a host offset if we actually made progress. Otherwise we
       * would make requirements for handle_alloc() that it can't fulfill */
@@ -XXX,XX +XXX,XX @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
          keep_old_clusters = true;
      }
 -    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
 +    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
      if (!alloc_cluster_offset) {
          /* Allocate, if necessary at a given offset in the image file */
@@ -XXX,XX +XXX,XX @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
          qcow2_free_any_clusters(bs, old_l2_entry, 1, type);
      }
 -    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
 +    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
      return nb_clusters;
  }
@@ -XXX,XX +XXX,XX @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
          }
      }
 -    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
 +    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
      return nb_clusters;
  }
@@ -XXX,XX +XXX,XX @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                  qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
                  qcow2_cache_depends_on_flush(s->l2_table_cache);
              }
 -            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
 +            qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
          } else {
              if (l2_dirty) {
                  ret = qcow2_pre_write_overlap_check(bs,
@@ -XXX,XX +XXX,XX @@ fail:
          if (!is_active_l1) {
              qemu_vfree(l2_table);
          } else {
 -            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
 +            qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
          }
      }
      return ret;
 diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow2-refcount.c
 +++ b/block/qcow2-refcount.c
@@ -XXX,XX +XXX,XX @@ int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
      block_index = cluster_index & (s->refcount_block_size - 1);
      *refcount = s->get_refcount(refcount_block, block_index);
 -    qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
 +    qcow2_cache_put(s->refcount_block_cache, &refcount_block);
      return 0;
  }
@@ -XXX,XX +XXX,XX @@ static int alloc_refcount_block(BlockDriverState *bs,
          return -EAGAIN;
      }
 -    qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
 +    qcow2_cache_put(s->refcount_block_cache, refcount_block);
      /*
       * If we come here, we need to grow the refcount table. Again, a new
@@ -XXX,XX +XXX,XX @@ static int alloc_refcount_block(BlockDriverState *bs,
  fail:
      if (*refcount_block != NULL) {
 -        qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
 +        qcow2_cache_put(s->refcount_block_cache, refcount_block);
      }
      return ret;
  }
-@@ -XXX,XX +XXX,XX @@ int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
-                                          refblock_data);
+diff --git a/block/replication.c b/block/replication.c
-         }
+index XXXXXXX..XXXXXXX 100644
+--- a/block/replication.c
--        qcow2_cache_put(bs, s->refcount_block_cache, &refblock_data);
++++ b/block/replication.c
-+        qcow2_cache_put(s->refcount_block_cache, &refblock_data);
+@@ -XXX,XX +XXX,XX @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
          new_secondary_flags = s->orig_secondary_flags;
      }
-     assert(block_offset == table_offset);
++    bdrv_subtree_drained_begin(s->hidden_disk->bs);
-@@ -XXX,XX +XXX,XX @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
++    bdrv_subtree_drained_begin(s->secondary_disk->bs);
-         /* Load the refcount block and allocate it if needed */
++
-         if (table_index != old_table_index) {
+     if (orig_hidden_flags != new_hidden_flags) {
-             if (refcount_block) {
+         reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs, NULL,
--                qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
+                                          new_hidden_flags);
-+                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
+@@ -XXX,XX +XXX,XX @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
-             }
+                              reopen_queue, &local_err);
-             ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
+         error_propagate(errp, local_err);
              if (ret < 0) {
@@ -XXX,XX +XXX,XX @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
              table = qcow2_cache_is_table_offset(bs, s->refcount_block_cache,
                                                  offset);
              if (table != NULL) {
 -                qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
 +                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
                  qcow2_cache_discard(bs, s->refcount_block_cache, table);
              }
@@ -XXX,XX +XXX,XX @@ fail:
      /* Write last changed block to disk */
      if (refcount_block) {
 -        qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
 +        qcow2_cache_put(s->refcount_block_cache, &refcount_block);
      }
++
-     /*
++    bdrv_subtree_drained_end(s->hidden_disk->bs);
-@@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
++    bdrv_subtree_drained_end(s->secondary_disk->bs);
-                 }
+ }
-             }
+ static void backup_job_cleanup(BlockDriverState *bs)
--            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
-+            qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
+index XXXXXXX..XXXXXXX 100644
+--- a/qemu-io-cmds.c
-             if (addend != 0) {
++++ b/qemu-io-cmds.c
-                 ret = qcow2_update_cluster_refcount(bs, l2_offset >>
+@@ -XXX,XX +XXX,XX @@ static int reopen_f(BlockBackend *blk, int argc, char **argv)
-@@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
+     opts = qopts ? qemu_opts_to_qdict(qopts, NULL) : NULL;
-     ret = bdrv_flush(bs);
+     qemu_opts_reset(&reopen_opts);
- fail:
-     if (l2_table) {
++    bdrv_subtree_drained_begin(bs);
--        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+     brq = bdrv_reopen_queue(NULL, bs, opts, flags);
-+        qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
+     bdrv_reopen_multiple(bdrv_get_aio_context(bs), brq, &local_err);
-     }
++    bdrv_subtree_drained_end(bs);
++
-     s->cache_discards = false;
+     if (local_err) {
-@@ -XXX,XX +XXX,XX @@ static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
+         error_report_err(local_err);
-                                     new_reftable_size, new_refblock,
+     } else {
                                      new_refblock_empty, allocated, errp);
                      if (ret < 0) {
 -                        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
 +                        qcow2_cache_put(s->refcount_block_cache, &refblock);
                          return ret;
                      }
@@ -XXX,XX +XXX,XX @@ static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
                  if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
                      uint64_t offset;
 -                    qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
 +                    qcow2_cache_put(s->refcount_block_cache, &refblock);
                      offset = ((reftable_index << s->refcount_block_bits)
                                + refblock_index) << s->cluster_bits;
@@ -XXX,XX +XXX,XX @@ static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
                  new_refblock_empty = new_refblock_empty && refcount == 0;
              }
 -            qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
 +            qcow2_cache_put(s->refcount_block_cache, &refblock);
          } else {
              /* No refblock means every refcount is 0 */
              for (refblock_index = 0; refblock_index < s->refcount_block_size;
@@ -XXX,XX +XXX,XX @@ static int qcow2_discard_refcount_block(BlockDriverState *bs,
                                  offset_to_reftable_index(s, discard_block_offs),
                                  discard_block_offs,
                                  s->get_refcount(refblock, block_index));
 -        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
 +        qcow2_cache_put(s->refcount_block_cache, &refblock);
          return -EINVAL;
      }
      s->set_refcount(refblock, block_index, 0);
      qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);
 -    qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
 +    qcow2_cache_put(s->refcount_block_cache, &refblock);
      if (cluster_index < s->free_cluster_index) {
          s->free_cluster_index = cluster_index;
@@ -XXX,XX +XXX,XX @@ int qcow2_shrink_reftable(BlockDriverState *bs)
          } else {
              unused_block = buffer_is_zero(refblock, s->cluster_size);
          }
 -        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
 +        qcow2_cache_put(s->refcount_block_cache, &refblock);
          reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
      }
 --
 .13.6

-[Qemu-devel] [PULL 26/55] qcow2: Remove BDS parameter from qcow2_cache_discard()
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-This function was only using the BlockDriverState parameter to pass it
-to qcow2_cache_get_table_idx() and qcow2_cache_table_release(). This
-is no longer necessary so this parameter can be removed.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: 9724f7e38e763ad3be32627c6b7fe8df9edb1476.1517840877.git.berto@igalia.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2.h          | 2 +-
- block/qcow2-cache.c    | 2 +-
- block/qcow2-refcount.c | 6 +++---
- 3 files changed, 5 insertions(+), 5 deletions(-)
-diff --git a/block/qcow2.h b/block/qcow2.h
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2.h
-+++ b/block/qcow2.h
-@@ -XXX,XX +XXX,XX @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
- void qcow2_cache_put(Qcow2Cache *c, void **table);
- void *qcow2_cache_is_table_offset(BlockDriverState *bs, Qcow2Cache *c,
-                                   uint64_t offset);
--void qcow2_cache_discard(BlockDriverState *bs, Qcow2Cache *c, void *table);
-+void qcow2_cache_discard(Qcow2Cache *c, void *table);
- /* qcow2-bitmap.c functions */
- int qcow2_check_bitmaps_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
-diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cache.c
-+++ b/block/qcow2-cache.c
-@@ -XXX,XX +XXX,XX @@ void *qcow2_cache_is_table_offset(BlockDriverState *bs, Qcow2Cache *c,
-     return NULL;
- }
--void qcow2_cache_discard(BlockDriverState *bs, Qcow2Cache *c, void *table)
-+void qcow2_cache_discard(Qcow2Cache *c, void *table)
- {
-     int i = qcow2_cache_get_table_idx(c, table);
-diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-refcount.c
-+++ b/block/qcow2-refcount.c
-@@ -XXX,XX +XXX,XX @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
-                                                 offset);
-             if (table != NULL) {
-                 qcow2_cache_put(s->refcount_block_cache, &refcount_block);
--                qcow2_cache_discard(bs, s->refcount_block_cache, table);
-+                qcow2_cache_discard(s->refcount_block_cache, table);
-             }
-             table = qcow2_cache_is_table_offset(bs, s->l2_table_cache, offset);
-             if (table != NULL) {
--                qcow2_cache_discard(bs, s->l2_table_cache, table);
-+                qcow2_cache_discard(s->l2_table_cache, table);
-             }
-             if (s->discard_passthrough[type]) {
-@@ -XXX,XX +XXX,XX @@ static int qcow2_discard_refcount_block(BlockDriverState *bs,
-                                            discard_block_offs);
-     if (refblock) {
-         /* discard refblock from the cache if refblock is cached */
--        qcow2_cache_discard(bs, s->refcount_block_cache, refblock);
-+        qcow2_cache_discard(s->refcount_block_cache, refblock);
-     }
-     update_refcount_discard(bs, discard_block_offs, s->cluster_size);
---
-.13.6

-[Qemu-devel] [PULL 29/55] qcow2: Add l2_slice_size field to BDRVQcow2State
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-The BDRVQcow2State structure contains an l2_size field, which stores
-the number of 64-bit entries in an L2 table.
-For efficiency reasons we want to be able to load slices instead of
-full L2 tables, so we need to know how many entries an L2 slice can
-hold.
-An L2 slice is the portion of an L2 table that is loaded by the qcow2
-cache. At the moment that cache can only load complete tables,
-therefore an L2 slice has the same size as an L2 table (one cluster)
-and l2_size == l2_slice_size.
-Later we'll allow smaller slices, but until then we have to use this
-new l2_slice_size field to make the rest of the code ready for that.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: adb048595f9fb5dfb110c802a8b3c3be3b937f37.1517840877.git.berto@igalia.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2.h | 1 +
- block/qcow2.c | 3 +++
- 2 files changed, 4 insertions(+)
-diff --git a/block/qcow2.h b/block/qcow2.h
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2.h
-+++ b/block/qcow2.h
-@@ -XXX,XX +XXX,XX @@ typedef struct BDRVQcow2State {
-     int cluster_bits;
-     int cluster_size;
-     int cluster_sectors;
-+    int l2_slice_size;
-     int l2_bits;
-     int l2_size;
-     int l1_size;
-diff --git a/block/qcow2.c b/block/qcow2.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2.c
-+++ b/block/qcow2.c
-@@ -XXX,XX +XXX,XX @@ static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
- typedef struct Qcow2ReopenState {
-     Qcow2Cache *l2_table_cache;
-     Qcow2Cache *refcount_block_cache;
-+    int l2_slice_size; /* Number of entries in a slice of the L2 table */
-     bool use_lazy_refcounts;
-     int overlap_check;
-     bool discard_passthrough[QCOW2_DISCARD_MAX];
-@@ -XXX,XX +XXX,XX @@ static int qcow2_update_options_prepare(BlockDriverState *bs,
-         }
-     }
-+    r->l2_slice_size = s->cluster_size / sizeof(uint64_t);
-     r->l2_table_cache = qcow2_cache_create(bs, l2_cache_size);
-     r->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size);
-     if (r->l2_table_cache == NULL || r->refcount_block_cache == NULL) {
-@@ -XXX,XX +XXX,XX @@ static void qcow2_update_options_commit(BlockDriverState *bs,
-     }
-     s->l2_table_cache = r->l2_table_cache;
-     s->refcount_block_cache = r->refcount_block_cache;
-+    s->l2_slice_size = r->l2_slice_size;
-     s->overlap_check = r->overlap_check;
-     s->use_lazy_refcounts = r->use_lazy_refcounts;
---
-.13.6

-[Qemu-devel] [PULL 31/55] qcow2: Update l2_load() to support L2 slices
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-Each entry in the qcow2 L2 cache stores a full L2 table (which uses a
-complete cluster in the qcow2 image). A cluster is usually too large
-to be used efficiently as the size for a cache entry, so we want to
-decouple both values by allowing smaller cache entries. Therefore the
-qcow2 L2 cache will no longer return full L2 tables but slices
-instead.
-This patch updates l2_load() so it can handle L2 slices correctly.
-Apart from the offset of the L2 table (which we already had) we also
-need the guest offset in order to calculate which one of the slices
-we need.
-An L2 slice has currently the same size as an L2 table (one cluster),
-so for now this function will load exactly the same data as before.
-This patch also removes a stale comment about the return value being
-a pointer to the L2 table. This function returns an error code since
-c17e9821c474d5fcdebdc82ed2fc096777d611.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: b830aa1fc5b6f8e3cb331d006853fe22facca847.1517840877.git.berto@igalia.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2-cluster.c | 28 +++++++++++++++++-----------
- 1 file changed, 17 insertions(+), 11 deletions(-)
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
-+++ b/block/qcow2-cluster.c
-@@ -XXX,XX +XXX,XX @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
- /*
-  * l2_load
-  *
-- * Loads a L2 table into memory. If the table is in the cache, the cache
-- * is used; otherwise the L2 table is loaded from the image file.
-+ * @bs: The BlockDriverState
-+ * @offset: A guest offset, used to calculate what slice of the L2
-+ *          table to load.
-+ * @l2_offset: Offset to the L2 table in the image file.
-+ * @l2_slice: Location to store the pointer to the L2 slice.
-  *
-- * Returns a pointer to the L2 table on success, or NULL if the read from
-- * the image file failed.
-+ * Loads a L2 slice into memory (L2 slices are the parts of L2 tables
-+ * that are loaded by the qcow2 cache). If the slice is in the cache,
-+ * the cache is used; otherwise the L2 slice is loaded from the image
-+ * file.
-  */
--
--static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
--    uint64_t **l2_table)
-+static int l2_load(BlockDriverState *bs, uint64_t offset,
-+                   uint64_t l2_offset, uint64_t **l2_slice)
- {
-     BDRVQcow2State *s = bs->opaque;
-+    int start_of_slice = sizeof(uint64_t) *
-+        (offset_to_l2_index(s, offset) - offset_to_l2_slice_index(s, offset));
--    return qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
--                           (void **)l2_table);
-+    return qcow2_cache_get(bs, s->l2_table_cache, l2_offset + start_of_slice,
-+                           (void **)l2_slice);
- }
- /*
-@@ -XXX,XX +XXX,XX @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
-     /* load the l2 table in memory */
--    ret = l2_load(bs, l2_offset, &l2_table);
-+    ret = l2_load(bs, offset, l2_offset, &l2_table);
-     if (ret < 0) {
-         return ret;
-     }
-@@ -XXX,XX +XXX,XX @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
-     if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) {
-         /* load the l2 table in memory */
--        ret = l2_load(bs, l2_offset, &l2_table);
-+        ret = l2_load(bs, offset, l2_offset, &l2_table);
-         if (ret < 0) {
-             return ret;
-         }
---
-.13.6

-[Qemu-devel] [PULL 32/55] qcow2: Prepare l2_allocate() for adding L2 slice support
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-Adding support for L2 slices to l2_allocate() needs (among other
-things) an extra loop that iterates over all slices of a new L2 table.
-Putting all changes in one patch would make it hard to read because
-all semantic changes would be mixed with pure indentation changes.
-To make things easier this patch simply creates a new block and
-changes the indentation of all lines of code inside it. Thus, all
-modifications in this patch are cosmetic. There are no semantic
-changes and no variables are renamed yet. The next patch will take
-care of that.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: d0d7dca8520db304524f52f49d8157595a707a35.1517840877.git.berto@igalia.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2-cluster.c | 53 ++++++++++++++++++++++++++++-----------------------
- 1 file changed, 29 insertions(+), 24 deletions(-)
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
-+++ b/block/qcow2-cluster.c
-@@ -XXX,XX +XXX,XX @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
-     /* allocate a new entry in the l2 cache */
-     trace_qcow2_l2_allocate_get_empty(bs, l1_index);
--    ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table);
--    if (ret < 0) {
--        goto fail;
--    }
-+    {
-+        ret = qcow2_cache_get_empty(bs, s->l2_table_cache,
-+                                    l2_offset,
-+                                    (void **) table);
-+        if (ret < 0) {
-+            goto fail;
-+        }
--    l2_table = *table;
-+        l2_table = *table;
--    if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
--        /* if there was no old l2 table, clear the new table */
--        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
--    } else {
--        uint64_t* old_table;
-+        if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
-+            /* if there was no old l2 table, clear the new table */
-+            memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
-+        } else {
-+            uint64_t *old_table;
--        /* if there was an old l2 table, read it from the disk */
--        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
--        ret = qcow2_cache_get(bs, s->l2_table_cache,
--            old_l2_offset & L1E_OFFSET_MASK,
--            (void**) &old_table);
--        if (ret < 0) {
--            goto fail;
-+            /* if there was an old l2 table, read it from the disk */
-+            BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
-+            ret = qcow2_cache_get(bs, s->l2_table_cache,
-+                                  old_l2_offset & L1E_OFFSET_MASK,
-+                                  (void **) &old_table);
-+            if (ret < 0) {
-+                goto fail;
-+            }
-+
-+            memcpy(l2_table, old_table, s->cluster_size);
-+
-+            qcow2_cache_put(s->l2_table_cache, (void **) &old_table);
-         }
--        memcpy(l2_table, old_table, s->cluster_size);
-+        /* write the l2 table to the file */
-+        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
--        qcow2_cache_put(s->l2_table_cache, (void **) &old_table);
-+        trace_qcow2_l2_allocate_write_l2(bs, l1_index);
-+        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
-     }
--    /* write the l2 table to the file */
--    BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
--
--    trace_qcow2_l2_allocate_write_l2(bs, l1_index);
--    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
-     ret = qcow2_cache_flush(bs, s->l2_table_cache);
-     if (ret < 0) {
-         goto fail;
---
-.13.6

-[Qemu-devel] [PULL 33/55] qcow2: Update l2_allocate() to support L2 slices
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-This patch updates l2_allocate() to support the qcow2 cache returning
-L2 slices instead of full L2 tables.
-The old code simply gets an L2 table from the cache and initializes it
-with zeroes or with the contents of an existing table. With a cache
-that returns slices instead of tables the idea remains the same, but
-the code must now iterate over all the slices that are contained in an
-L2 table.
-Since now we're operating with slices the function can no longer
-return the newly-allocated table, so it's up to the caller to retrieve
-the appropriate L2 slice after calling l2_allocate() (note that with
-this patch the caller is still loading full L2 tables, but we'll deal
-with that in a separate patch).
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Message-id: 20fc0415bf0e011e29f6487ec86eb06a11f37445.1517840877.git.berto@igalia.com
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2-cluster.c | 56 +++++++++++++++++++++++++++++++--------------------
- 1 file changed, 34 insertions(+), 22 deletions(-)
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
-+++ b/block/qcow2-cluster.c
-@@ -XXX,XX +XXX,XX @@ int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
-  *
-  */
--static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
-+static int l2_allocate(BlockDriverState *bs, int l1_index)
- {
-     BDRVQcow2State *s = bs->opaque;
-     uint64_t old_l2_offset;
--    uint64_t *l2_table = NULL;
-+    uint64_t *l2_slice = NULL;
-+    unsigned slice, slice_size2, n_slices;
-     int64_t l2_offset;
-     int ret;
-@@ -XXX,XX +XXX,XX @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
-     /* allocate a new entry in the l2 cache */
-+    slice_size2 = s->l2_slice_size * sizeof(uint64_t);
-+    n_slices = s->cluster_size / slice_size2;
-+
-     trace_qcow2_l2_allocate_get_empty(bs, l1_index);
--    {
-+    for (slice = 0; slice < n_slices; slice++) {
-         ret = qcow2_cache_get_empty(bs, s->l2_table_cache,
--                                    l2_offset,
--                                    (void **) table);
-+                                    l2_offset + slice * slice_size2,
-+                                    (void **) &l2_slice);
-         if (ret < 0) {
-             goto fail;
-         }
--        l2_table = *table;
--
-         if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
--            /* if there was no old l2 table, clear the new table */
--            memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
-+            /* if there was no old l2 table, clear the new slice */
-+            memset(l2_slice, 0, slice_size2);
-         } else {
--            uint64_t *old_table;
-+            uint64_t *old_slice;
-+            uint64_t old_l2_slice_offset =
-+                (old_l2_offset & L1E_OFFSET_MASK) + slice * slice_size2;
--            /* if there was an old l2 table, read it from the disk */
-+            /* if there was an old l2 table, read a slice from the disk */
-             BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
--            ret = qcow2_cache_get(bs, s->l2_table_cache,
--                                  old_l2_offset & L1E_OFFSET_MASK,
--                                  (void **) &old_table);
-+            ret = qcow2_cache_get(bs, s->l2_table_cache, old_l2_slice_offset,
-+                                  (void **) &old_slice);
-             if (ret < 0) {
-                 goto fail;
-             }
--            memcpy(l2_table, old_table, s->cluster_size);
-+            memcpy(l2_slice, old_slice, slice_size2);
--            qcow2_cache_put(s->l2_table_cache, (void **) &old_table);
-+            qcow2_cache_put(s->l2_table_cache, (void **) &old_slice);
-         }
--        /* write the l2 table to the file */
-+        /* write the l2 slice to the file */
-         BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
-         trace_qcow2_l2_allocate_write_l2(bs, l1_index);
--        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
-+        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
-+        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
-     }
-     ret = qcow2_cache_flush(bs, s->l2_table_cache);
-@@ -XXX,XX +XXX,XX @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
-         goto fail;
-     }
--    *table = l2_table;
-     trace_qcow2_l2_allocate_done(bs, l1_index, 0);
-     return 0;
- fail:
-     trace_qcow2_l2_allocate_done(bs, l1_index, ret);
--    if (l2_table != NULL) {
--        qcow2_cache_put(s->l2_table_cache, (void **) table);
-+    if (l2_slice != NULL) {
-+        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
-     }
-     s->l1_table[l1_index] = old_l2_offset;
-     if (l2_offset > 0) {
-@@ -XXX,XX +XXX,XX @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
-         }
-     } else {
-         /* First allocate a new L2 table (and do COW if needed) */
--        ret = l2_allocate(bs, l1_index, &l2_table);
-+        ret = l2_allocate(bs, l1_index);
-         if (ret < 0) {
-             return ret;
-         }
-@@ -XXX,XX +XXX,XX @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
-             qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
-                                 QCOW2_DISCARD_OTHER);
-         }
-+
-+        /* Get the offset of the newly-allocated l2 table */
-+        l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
-+        assert(offset_into_cluster(s, l2_offset) == 0);
-+        /* Load the l2 table in memory */
-+        ret = l2_load(bs, offset, l2_offset, &l2_table);
-+        if (ret < 0) {
-+            return ret;
-+        }
-     }
-     /* find the cluster offset for the given disk offset */
---
-.13.6

-[Qemu-devel] [PULL 34/55] qcow2: Refactor get_cluster_table()
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-After the previous patch we're now always using l2_load() in
-get_cluster_table() regardless of whether a new L2 table has to be
-allocated or not.
-This patch refactors that part of the code to use one single l2_load()
-call.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: ce31758c4a1fadccea7a6ccb93951eb01d95fd4c.1517840877.git.berto@igalia.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2-cluster.c | 21 +++++++--------------
-file changed, 7 insertions(+), 14 deletions(-)
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
-+++ b/block/qcow2-cluster.c
-@@ -XXX,XX +XXX,XX @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
-         return -EIO;
-     }
--    /* seek the l2 table of the given l2 offset */
--
--    if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) {
--        /* load the l2 table in memory */
--        ret = l2_load(bs, offset, l2_offset, &l2_table);
--        if (ret < 0) {
--            return ret;
--        }
--    } else {
-+    if (!(s->l1_table[l1_index] & QCOW_OFLAG_COPIED)) {
-         /* First allocate a new L2 table (and do COW if needed) */
-         ret = l2_allocate(bs, l1_index);
-         if (ret < 0) {
-@@ -XXX,XX +XXX,XX @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
-         /* Get the offset of the newly-allocated l2 table */
-         l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
-         assert(offset_into_cluster(s, l2_offset) == 0);
--        /* Load the l2 table in memory */
--        ret = l2_load(bs, offset, l2_offset, &l2_table);
--        if (ret < 0) {
--            return ret;
--        }
-+    }
-+
-+    /* load the l2 table in memory */
-+    ret = l2_load(bs, offset, l2_offset, &l2_table);
-+    if (ret < 0) {
-+        return ret;
-     }
-     /* find the cluster offset for the given disk offset */
---
-.13.6

-[Qemu-devel] [PULL 35/55] qcow2: Update get_cluster_table() to support L2 slices
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-This patch updates get_cluster_table() to return L2 slices instead of
-full L2 tables.
-The code itself needs almost no changes, it only needs to call
-offset_to_l2_slice_index() instead of offset_to_l2_index(). This patch
-also renames all the relevant variables and the documentation.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: 64cf064c0021ba315d3f3032da0f95db1b615f33.1517840877.git.berto@igalia.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2-cluster.c | 16 ++++++++--------
-file changed, 8 insertions(+), 8 deletions(-)
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
-+++ b/block/qcow2-cluster.c
-@@ -XXX,XX +XXX,XX @@ fail:
-  * get_cluster_table
-  *
-  * for a given disk offset, load (and allocate if needed)
-- * the l2 table.
-+ * the appropriate slice of its l2 table.
-  *
-- * the cluster index in the l2 table is given to the caller.
-+ * the cluster index in the l2 slice is given to the caller.
-  *
-  * Returns 0 on success, -errno in failure case
-  */
- static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
--                             uint64_t **new_l2_table,
-+                             uint64_t **new_l2_slice,
-                              int *new_l2_index)
- {
-     BDRVQcow2State *s = bs->opaque;
-     unsigned int l2_index;
-     uint64_t l1_index, l2_offset;
--    uint64_t *l2_table = NULL;
-+    uint64_t *l2_slice = NULL;
-     int ret;
-     /* seek to the l2 offset in the l1 table */
-@@ -XXX,XX +XXX,XX @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
-         assert(offset_into_cluster(s, l2_offset) == 0);
-     }
--    /* load the l2 table in memory */
--    ret = l2_load(bs, offset, l2_offset, &l2_table);
-+    /* load the l2 slice in memory */
-+    ret = l2_load(bs, offset, l2_offset, &l2_slice);
-     if (ret < 0) {
-         return ret;
-     }
-     /* find the cluster offset for the given disk offset */
--    l2_index = offset_to_l2_index(s, offset);
-+    l2_index = offset_to_l2_slice_index(s, offset);
--    *new_l2_table = l2_table;
-+    *new_l2_slice = l2_slice;
-     *new_l2_index = l2_index;
-     return 0;
---
-.13.6

-[Qemu-devel] [PULL 37/55] qcow2: Update qcow2_alloc_cluster_link_l2() to support L2 slices
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-There's a loop in this function that iterates over the L2 entries in a
-table, so now we need to assert that it remains within the limits of
-an L2 slice.
-Apart from that, this function doesn't need any additional changes, so
-this patch simply updates the variable name from l2_table to l2_slice.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: f9846a1c2efc51938e877e2a25852d9ab14797ff.1517840877.git.berto@igalia.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2-cluster.c | 16 ++++++++--------
-file changed, 8 insertions(+), 8 deletions(-)
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
-+++ b/block/qcow2-cluster.c
-@@ -XXX,XX +XXX,XX @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
- {
-     BDRVQcow2State *s = bs->opaque;
-     int i, j = 0, l2_index, ret;
--    uint64_t *old_cluster, *l2_table;
-+    uint64_t *old_cluster, *l2_slice;
-     uint64_t cluster_offset = m->alloc_offset;
-     trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
-@@ -XXX,XX +XXX,XX @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
-                                    s->refcount_block_cache);
-     }
--    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
-+    ret = get_cluster_table(bs, m->offset, &l2_slice, &l2_index);
-     if (ret < 0) {
-         goto err;
-     }
--    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
-+    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
--    assert(l2_index + m->nb_clusters <= s->l2_size);
-+    assert(l2_index + m->nb_clusters <= s->l2_slice_size);
-     for (i = 0; i < m->nb_clusters; i++) {
-         /* if two concurrent writes happen to the same unallocated cluster
-          * each write allocates separate cluster and writes data concurrently.
-@@ -XXX,XX +XXX,XX @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
-          * cluster the second one has to do RMW (which is done above by
-          * perform_cow()), update l2 table with its cluster pointer and free
-          * old cluster. This is what this loop does */
--        if (l2_table[l2_index + i] != 0) {
--            old_cluster[j++] = l2_table[l2_index + i];
-+        if (l2_slice[l2_index + i] != 0) {
-+            old_cluster[j++] = l2_slice[l2_index + i];
-         }
--        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
-+        l2_slice[l2_index + i] = cpu_to_be64((cluster_offset +
-                     (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
-      }
--    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
-+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
-     /*
-      * If this was a COW, we need to decrease the refcount of the old cluster.
---
-.13.6

-[Qemu-devel] [PULL 38/55] qcow2: Update handle_copied() to support L2 slices
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-handle_copied() loads an L2 table and limits the number of checked
-clusters to the amount that fits inside that table. Since we'll be
-loading L2 slices instead of full tables we need to update that limit.
-Apart from that, this function doesn't need any additional changes, so
-this patch simply updates the variable name from l2_table to l2_slice.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: 541ac001a7d6b86bab2392554bee53c2b312148c.1517840877.git.berto@igalia.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2-cluster.c | 16 ++++++++--------
-file changed, 8 insertions(+), 8 deletions(-)
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
-+++ b/block/qcow2-cluster.c
-@@ -XXX,XX +XXX,XX @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
-     BDRVQcow2State *s = bs->opaque;
-     int l2_index;
-     uint64_t cluster_offset;
--    uint64_t *l2_table;
-+    uint64_t *l2_slice;
-     uint64_t nb_clusters;
-     unsigned int keep_clusters;
-     int ret;
-@@ -XXX,XX +XXX,XX @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
-                                 == offset_into_cluster(s, *host_offset));
-     /*
--     * Calculate the number of clusters to look for. We stop at L2 table
-+     * Calculate the number of clusters to look for. We stop at L2 slice
-      * boundaries to keep things simple.
-      */
-     nb_clusters =
-         size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
--    l2_index = offset_to_l2_index(s, guest_offset);
--    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
-+    l2_index = offset_to_l2_slice_index(s, guest_offset);
-+    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
-     assert(nb_clusters <= INT_MAX);
-     /* Find L2 entry for the first involved cluster */
--    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
-+    ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
-     if (ret < 0) {
-         return ret;
-     }
--    cluster_offset = be64_to_cpu(l2_table[l2_index]);
-+    cluster_offset = be64_to_cpu(l2_slice[l2_index]);
-     /* Check how many clusters are already allocated and don't need COW */
-     if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
-@@ -XXX,XX +XXX,XX @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
-         /* We keep all QCOW_OFLAG_COPIED clusters */
-         keep_clusters =
-             count_contiguous_clusters(nb_clusters, s->cluster_size,
--                                      &l2_table[l2_index],
-+                                      &l2_slice[l2_index],
-                                       QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
-         assert(keep_clusters <= nb_clusters);
-@@ -XXX,XX +XXX,XX @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
-     /* Cleanup */
- out:
--    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
-+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
-     /* Only return a host offset if we actually made progress. Otherwise we
-      * would make requirements for handle_alloc() that it can't fulfill */
---
-.13.6

-[Qemu-devel] [PULL 39/55] qcow2: Update handle_alloc() to support L2 slices
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-handle_alloc() loads an L2 table and limits the number of checked
-clusters to the amount that fits inside that table. Since we'll be
-loading L2 slices instead of full tables we need to update that limit.
-Apart from that, this function doesn't need any additional changes, so
-this patch simply updates the variable name from l2_table to l2_slice.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: b243299c7136f7014c5af51665431ddbf5e99afd.1517840877.git.berto@igalia.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2-cluster.c | 18 +++++++++---------
-file changed, 9 insertions(+), 9 deletions(-)
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
-+++ b/block/qcow2-cluster.c
-@@ -XXX,XX +XXX,XX @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
- {
-     BDRVQcow2State *s = bs->opaque;
-     int l2_index;
--    uint64_t *l2_table;
-+    uint64_t *l2_slice;
-     uint64_t entry;
-     uint64_t nb_clusters;
-     int ret;
-@@ -XXX,XX +XXX,XX @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
-     assert(*bytes > 0);
-     /*
--     * Calculate the number of clusters to look for. We stop at L2 table
-+     * Calculate the number of clusters to look for. We stop at L2 slice
-      * boundaries to keep things simple.
-      */
-     nb_clusters =
-         size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
--    l2_index = offset_to_l2_index(s, guest_offset);
--    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
-+    l2_index = offset_to_l2_slice_index(s, guest_offset);
-+    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
-     assert(nb_clusters <= INT_MAX);
-     /* Find L2 entry for the first involved cluster */
--    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
-+    ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
-     if (ret < 0) {
-         return ret;
-     }
--    entry = be64_to_cpu(l2_table[l2_index]);
-+    entry = be64_to_cpu(l2_slice[l2_index]);
-     /* For the moment, overwrite compressed clusters one by one */
-     if (entry & QCOW_OFLAG_COMPRESSED) {
-         nb_clusters = 1;
-     } else {
--        nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index);
-+        nb_clusters = count_cow_clusters(s, nb_clusters, l2_slice, l2_index);
-     }
-     /* This function is only called when there were no non-COW clusters, so if
-@@ -XXX,XX +XXX,XX @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
-          * nb_clusters already to a range of COW clusters */
-         preallocated_nb_clusters =
-             count_contiguous_clusters(nb_clusters, s->cluster_size,
--                                      &l2_table[l2_index], QCOW_OFLAG_COPIED);
-+                                      &l2_slice[l2_index], QCOW_OFLAG_COPIED);
-         assert(preallocated_nb_clusters > 0);
-         nb_clusters = preallocated_nb_clusters;
-@@ -XXX,XX +XXX,XX @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
-         keep_old_clusters = true;
-     }
--    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
-+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
-     if (!alloc_cluster_offset) {
-         /* Allocate, if necessary at a given offset in the image file */
---
-.13.6

-[Qemu-devel] [PULL 42/55] qcow2: Prepare qcow2_update_snapshot_refcount() for adding L2 slice support
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-Adding support for L2 slices to qcow2_update_snapshot_refcount() needs
-(among other things) an extra loop that iterates over all slices of
-each L2 table.
-Putting all changes in one patch would make it hard to read because
-all semantic changes would be mixed with pure indentation changes.
-To make things easier this patch simply creates a new block and
-changes the indentation of all lines of code inside it. Thus, all
-modifications in this patch are cosmetic. There are no semantic
-changes and no variables are renamed yet. The next patch will take
-care of that.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: 8ffaa5e55bd51121f80e498f4045b64902a94293.1517840877.git.berto@igalia.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2-refcount.c | 144 +++++++++++++++++++++++++------------------------
-file changed, 75 insertions(+), 69 deletions(-)
-diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-refcount.c
-+++ b/block/qcow2-refcount.c
-@@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
-                 goto fail;
-             }
--            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
--                (void**) &l2_table);
--            if (ret < 0) {
--                goto fail;
--            }
-+            {
-+                ret = qcow2_cache_get(bs, s->l2_table_cache,
-+                                      l2_offset,
-+                                      (void **) &l2_table);
-+                if (ret < 0) {
-+                    goto fail;
-+                }
--            for (j = 0; j < s->l2_size; j++) {
--                uint64_t cluster_index;
--                uint64_t offset;
--
--                entry = be64_to_cpu(l2_table[j]);
--                old_entry = entry;
--                entry &= ~QCOW_OFLAG_COPIED;
--                offset = entry & L2E_OFFSET_MASK;
--
--                switch (qcow2_get_cluster_type(entry)) {
--                case QCOW2_CLUSTER_COMPRESSED:
--                    nb_csectors = ((entry >> s->csize_shift) &
--                                   s->csize_mask) + 1;
--                    if (addend != 0) {
--                        ret = update_refcount(bs,
--                                (entry & s->cluster_offset_mask) & ~511,
-+                for (j = 0; j < s->l2_size; j++) {
-+                    uint64_t cluster_index;
-+                    uint64_t offset;
-+
-+                    entry = be64_to_cpu(l2_table[j]);
-+                    old_entry = entry;
-+                    entry &= ~QCOW_OFLAG_COPIED;
-+                    offset = entry & L2E_OFFSET_MASK;
-+
-+                    switch (qcow2_get_cluster_type(entry)) {
-+                    case QCOW2_CLUSTER_COMPRESSED:
-+                        nb_csectors = ((entry >> s->csize_shift) &
-+                                       s->csize_mask) + 1;
-+                        if (addend != 0) {
-+                            ret = update_refcount(
-+                                bs, (entry & s->cluster_offset_mask) & ~511,
-                                 nb_csectors * 512, abs(addend), addend < 0,
-                                 QCOW2_DISCARD_SNAPSHOT);
--                        if (ret < 0) {
-+                            if (ret < 0) {
-+                                goto fail;
-+                            }
-+                        }
-+                        /* compressed clusters are never modified */
-+                        refcount = 2;
-+                        break;
-+
-+                    case QCOW2_CLUSTER_NORMAL:
-+                    case QCOW2_CLUSTER_ZERO_ALLOC:
-+                        if (offset_into_cluster(s, offset)) {
-+                            qcow2_signal_corruption(
-+                                bs, true, -1, -1, "Cluster "
-+                                "allocation offset %#" PRIx64
-+                                " unaligned (L2 offset: %#"
-+                                PRIx64 ", L2 index: %#x)",
-+                                offset, l2_offset, j);
-+                            ret = -EIO;
-                             goto fail;
-                         }
--                    }
--                    /* compressed clusters are never modified */
--                    refcount = 2;
--                    break;
--
--                case QCOW2_CLUSTER_NORMAL:
--                case QCOW2_CLUSTER_ZERO_ALLOC:
--                    if (offset_into_cluster(s, offset)) {
--                        qcow2_signal_corruption(bs, true, -1, -1, "Cluster "
--                                                "allocation offset %#" PRIx64
--                                                " unaligned (L2 offset: %#"
--                                                PRIx64 ", L2 index: %#x)",
--                                                offset, l2_offset, j);
--                        ret = -EIO;
--                        goto fail;
--                    }
--                    cluster_index = offset >> s->cluster_bits;
--                    assert(cluster_index);
--                    if (addend != 0) {
--                        ret = qcow2_update_cluster_refcount(bs,
--                                    cluster_index, abs(addend), addend < 0,
--                                    QCOW2_DISCARD_SNAPSHOT);
-+                        cluster_index = offset >> s->cluster_bits;
-+                        assert(cluster_index);
-+                        if (addend != 0) {
-+                            ret = qcow2_update_cluster_refcount(
-+                                bs, cluster_index, abs(addend), addend < 0,
-+                                QCOW2_DISCARD_SNAPSHOT);
-+                            if (ret < 0) {
-+                                goto fail;
-+                            }
-+                        }
-+
-+                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
-                         if (ret < 0) {
-                             goto fail;
-                         }
--                    }
-+                        break;
--                    ret = qcow2_get_refcount(bs, cluster_index, &refcount);
--                    if (ret < 0) {
--                        goto fail;
--                    }
--                    break;
--
--                case QCOW2_CLUSTER_ZERO_PLAIN:
--                case QCOW2_CLUSTER_UNALLOCATED:
--                    refcount = 0;
--                    break;
-+                    case QCOW2_CLUSTER_ZERO_PLAIN:
-+                    case QCOW2_CLUSTER_UNALLOCATED:
-+                        refcount = 0;
-+                        break;
--                default:
--                    abort();
--                }
-+                    default:
-+                        abort();
-+                    }
--                if (refcount == 1) {
--                    entry |= QCOW_OFLAG_COPIED;
--                }
--                if (entry != old_entry) {
--                    if (addend > 0) {
--                        qcow2_cache_set_dependency(bs, s->l2_table_cache,
--                            s->refcount_block_cache);
-+                    if (refcount == 1) {
-+                        entry |= QCOW_OFLAG_COPIED;
-+                    }
-+                    if (entry != old_entry) {
-+                        if (addend > 0) {
-+                            qcow2_cache_set_dependency(bs, s->l2_table_cache,
-+                                                       s->refcount_block_cache);
-+                        }
-+                        l2_table[j] = cpu_to_be64(entry);
-+                        qcow2_cache_entry_mark_dirty(s->l2_table_cache,
-+                                                     l2_table);
-                     }
--                    l2_table[j] = cpu_to_be64(entry);
--                    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
-                 }
--            }
--            qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
-+                qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
-+
-+            }
-             if (addend != 0) {
-                 ret = qcow2_update_cluster_refcount(bs, l2_offset >>
---
-.13.6

-[Qemu-devel] [PULL 43/55] qcow2: Update qcow2_update_snapshot_refcount() to support L2 slices
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-qcow2_update_snapshot_refcount() increases the refcount of all
-clusters of a given snapshot. In order to do that it needs to load all
-its L2 tables and iterate over their entries. Since we'll be loading
-L2 slices instead of full tables we need to add an extra loop that
-iterates over all slices of each L2 table.
-This function doesn't need any additional changes so apart from that
-this patch simply updates the variable name from l2_table to l2_slice.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Message-id: 5f4db199b9637f4833b58487135124d70add8cf0.1517840877.git.berto@igalia.com
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2-refcount.c | 32 ++++++++++++++++++--------------
-file changed, 18 insertions(+), 14 deletions(-)
-diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-refcount.c
-+++ b/block/qcow2-refcount.c
-@@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
-     int64_t l1_table_offset, int l1_size, int addend)
- {
-     BDRVQcow2State *s = bs->opaque;
--    uint64_t *l1_table, *l2_table, l2_offset, entry, l1_size2, refcount;
-+    uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
-     bool l1_allocated = false;
-     int64_t old_entry, old_l2_offset;
-+    unsigned slice, slice_size2, n_slices;
-     int i, j, l1_modified = 0, nb_csectors;
-     int ret;
-     assert(addend >= -1 && addend <= 1);
--    l2_table = NULL;
-+    l2_slice = NULL;
-     l1_table = NULL;
-     l1_size2 = l1_size * sizeof(uint64_t);
-+    slice_size2 = s->l2_slice_size * sizeof(uint64_t);
-+    n_slices = s->cluster_size / slice_size2;
-     s->cache_discards = true;
-@@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
-                 goto fail;
-             }
--            {
-+            for (slice = 0; slice < n_slices; slice++) {
-                 ret = qcow2_cache_get(bs, s->l2_table_cache,
--                                      l2_offset,
--                                      (void **) &l2_table);
-+                                      l2_offset + slice * slice_size2,
-+                                      (void **) &l2_slice);
-                 if (ret < 0) {
-                     goto fail;
-                 }
--                for (j = 0; j < s->l2_size; j++) {
-+                for (j = 0; j < s->l2_slice_size; j++) {
-                     uint64_t cluster_index;
-                     uint64_t offset;
--                    entry = be64_to_cpu(l2_table[j]);
-+                    entry = be64_to_cpu(l2_slice[j]);
-                     old_entry = entry;
-                     entry &= ~QCOW_OFLAG_COPIED;
-                     offset = entry & L2E_OFFSET_MASK;
-@@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
-                     case QCOW2_CLUSTER_NORMAL:
-                     case QCOW2_CLUSTER_ZERO_ALLOC:
-                         if (offset_into_cluster(s, offset)) {
-+                            /* Here l2_index means table (not slice) index */
-+                            int l2_index = slice * s->l2_slice_size + j;
-                             qcow2_signal_corruption(
-                                 bs, true, -1, -1, "Cluster "
-                                 "allocation offset %#" PRIx64
-                                 " unaligned (L2 offset: %#"
-                                 PRIx64 ", L2 index: %#x)",
--                                offset, l2_offset, j);
-+                                offset, l2_offset, l2_index);
-                             ret = -EIO;
-                             goto fail;
-                         }
-@@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
-                             qcow2_cache_set_dependency(bs, s->l2_table_cache,
-                                                        s->refcount_block_cache);
-                         }
--                        l2_table[j] = cpu_to_be64(entry);
-+                        l2_slice[j] = cpu_to_be64(entry);
-                         qcow2_cache_entry_mark_dirty(s->l2_table_cache,
--                                                     l2_table);
-+                                                     l2_slice);
-                     }
-                 }
--                qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
--
-+                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
-             }
-             if (addend != 0) {
-@@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
-     ret = bdrv_flush(bs);
- fail:
--    if (l2_table) {
--        qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
-+    if (l2_slice) {
-+        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
-     }
-     s->cache_discards = false;
---
-.13.6

-[Qemu-devel] [PULL 44/55] qcow2: Read refcount before L2 table in expand_zero_clusters_in_l1()
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-At the moment it doesn't really make a difference whether we call
-qcow2_get_refcount() before or after reading the L2 table, but if we
-want to support L2 slices we'll need to read the refcount first.
-This patch simply changes the order of those two operations to prepare
-for that. The patch with the actual semantic changes will be easier to
-read because of this.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: 947a91d934053a2dbfef979aeb9568f57ef57c5d.1517840877.git.berto@igalia.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2-cluster.c | 12 ++++++------
-file changed, 6 insertions(+), 6 deletions(-)
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
-+++ b/block/qcow2-cluster.c
-@@ -XXX,XX +XXX,XX @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
-             goto fail;
-         }
-+        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
-+                                 &l2_refcount);
-+        if (ret < 0) {
-+            goto fail;
-+        }
-+
-         if (is_active_l1) {
-             /* get active L2 tables from cache */
-             ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
-@@ -XXX,XX +XXX,XX @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
-             goto fail;
-         }
--        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
--                                 &l2_refcount);
--        if (ret < 0) {
--            goto fail;
--        }
--
-         for (j = 0; j < s->l2_size; j++) {
-             uint64_t l2_entry = be64_to_cpu(l2_table[j]);
-             int64_t offset = l2_entry & L2E_OFFSET_MASK;
---
-.13.6

-[Qemu-devel] [PULL 47/55] qcow2: Update qcow2_truncate() to support L2 slices
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-The qcow2_truncate() code is mostly independent from whether
-we're using L2 slices or full L2 tables, but in full and
-falloc preallocation modes new L2 tables are allocated using
-qcow2_alloc_cluster_link_l2().  Therefore the code needs to be
-modified to ensure that all nb_clusters that are processed in each
-call can be allocated with just one L2 slice.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Message-id: 1fd7d272b5e7b66254a090b74cf2bed1cc334c0e.1517840877.git.berto@igalia.com
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2.c | 6 +++---
-file changed, 3 insertions(+), 3 deletions(-)
-diff --git a/block/qcow2.c b/block/qcow2.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2.c
-+++ b/block/qcow2.c
-@@ -XXX,XX +XXX,XX @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset,
-         host_offset = allocation_start;
-         guest_offset = old_length;
-         while (nb_new_data_clusters) {
--            int64_t guest_cluster = guest_offset >> s->cluster_bits;
--            int64_t nb_clusters = MIN(nb_new_data_clusters,
--                                      s->l2_size - guest_cluster % s->l2_size);
-+            int64_t nb_clusters = MIN(
-+                nb_new_data_clusters,
-+                s->l2_slice_size - offset_to_l2_slice_index(s, guest_offset));
-             QCowL2Meta allocation = {
-                 .offset       = guest_offset,
-                 .alloc_offset = host_offset,
---
-.13.6

-[Qemu-devel] [PULL 50/55] qcow2: Rename l2_table in count_contiguous_clusters_unallocated()
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-This function doesn't need any changes to support L2 slices, but since
-it's now dealing with slices instead of full tables, the l2_table
-variable is renamed for clarity.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: 78bcc54bc632574dd0b900a77a00a1b6ffc359e6.1517840877.git.berto@igalia.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2-cluster.c | 6 +++---
- 1 file changed, 3 insertions(+), 3 deletions(-)
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
-+++ b/block/qcow2-cluster.c
-@@ -XXX,XX +XXX,XX @@ static int count_contiguous_clusters(int nb_clusters, int cluster_size,
- /*
-  * Checks how many consecutive unallocated clusters in a given L2
-- * table have the same cluster type.
-+ * slice have the same cluster type.
-  */
- static int count_contiguous_clusters_unallocated(int nb_clusters,
--                                                 uint64_t *l2_table,
-+                                                 uint64_t *l2_slice,
-                                                  QCow2ClusterType wanted_type)
- {
-     int i;
-@@ -XXX,XX +XXX,XX @@ static int count_contiguous_clusters_unallocated(int nb_clusters,
-     assert(wanted_type == QCOW2_CLUSTER_ZERO_PLAIN ||
-            wanted_type == QCOW2_CLUSTER_UNALLOCATED);
-     for (i = 0; i < nb_clusters; i++) {
--        uint64_t entry = be64_to_cpu(l2_table[i]);
-+        uint64_t entry = be64_to_cpu(l2_slice[i]);
-         QCow2ClusterType type = qcow2_get_cluster_type(entry);
-         if (type != wanted_type) {
---
-2.13.6

-[Qemu-devel] [PULL 51/55] qcow2: Rename l2_table in count_cow_clusters()
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-This function doesn't need any changes to support L2 slices, but since
-it's now dealing with slices instead of full tables, the l2_table
-variable is renamed for clarity.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: 6107001fc79e6739242f1de7d191375e4f130aac.1517840877.git.berto@igalia.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2-cluster.c | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
-+++ b/block/qcow2-cluster.c
-@@ -XXX,XX +XXX,XX @@ err:
-  * which must copy from the backing file)
-  */
- static int count_cow_clusters(BDRVQcow2State *s, int nb_clusters,
--    uint64_t *l2_table, int l2_index)
-+    uint64_t *l2_slice, int l2_index)
- {
-     int i;
-     for (i = 0; i < nb_clusters; i++) {
--        uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]);
-+        uint64_t l2_entry = be64_to_cpu(l2_slice[l2_index + i]);
-         QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);
-         switch(cluster_type) {
---
-2.13.6

-[Qemu-devel] [PULL 54/55] iotests: Test downgrading an image using a small L2 slice size
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-expand_zero_clusters_in_l1() is used when downgrading qcow2 images
-from v3 to v2 (compat=0.10). This is one of the functions that needed
-more changes to support L2 slices, so this patch extends iotest 061 to
-test downgrading a qcow2 image using a smaller slice size.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: 3e5662dce5e4926c8fabbad4c0b9142b2a506dd4.1517840877.git.berto@igalia.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- tests/qemu-iotests/061     | 16 ++++++++++++
- tests/qemu-iotests/061.out | 61 ++++++++++++++++++++++++++++++++++++++++++++++
- 2 files changed, 77 insertions(+)
-diff --git a/tests/qemu-iotests/061 b/tests/qemu-iotests/061
-index XXXXXXX..XXXXXXX 100755
---- a/tests/qemu-iotests/061
-+++ b/tests/qemu-iotests/061
-@@ -XXX,XX +XXX,XX @@ $QEMU_IO -c "read -P 0 0 128k" "$TEST_IMG" | _filter_qemu_io
- _check_test_img
- echo
-+echo "=== Testing version downgrade with zero expansion and 4K cache entries ==="
-+echo
-+IMGOPTS="compat=1.1,lazy_refcounts=on" _make_test_img 64M
-+$QEMU_IO -c "write -z 0 128k" "$TEST_IMG" | _filter_qemu_io
-+$QEMU_IO -c "write -z 32M 128k" "$TEST_IMG" | _filter_qemu_io
-+$QEMU_IO -c map "$TEST_IMG" | _filter_qemu_io
-+$PYTHON qcow2.py "$TEST_IMG" dump-header
-+$QEMU_IMG amend -o "compat=0.10" --image-opts \
-+          driver=qcow2,file.filename=$TEST_IMG,l2-cache-entry-size=4096
-+$PYTHON qcow2.py "$TEST_IMG" dump-header
-+$QEMU_IO -c "read -P 0 0 128k" "$TEST_IMG" | _filter_qemu_io
-+$QEMU_IO -c "read -P 0 32M 128k" "$TEST_IMG" | _filter_qemu_io
-+$QEMU_IO -c map "$TEST_IMG" | _filter_qemu_io
-+_check_test_img
-+
-+echo
- echo "=== Testing dirty version downgrade ==="
- echo
- IMGOPTS="compat=1.1,lazy_refcounts=on" _make_test_img 64M
-diff --git a/tests/qemu-iotests/061.out b/tests/qemu-iotests/061.out
-index XXXXXXX..XXXXXXX 100644
---- a/tests/qemu-iotests/061.out
-+++ b/tests/qemu-iotests/061.out
-@@ -XXX,XX +XXX,XX @@ read 131072/131072 bytes at offset 0
- 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
- No errors were found on the image.
-+=== Testing version downgrade with zero expansion and 4K cache entries ===
-+
-+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
-+wrote 131072/131072 bytes at offset 0
-+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+wrote 131072/131072 bytes at offset 33554432
-+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+128 KiB (0x20000) bytes     allocated at offset 0 bytes (0x0)
-+31.875 MiB (0x1fe0000) bytes not allocated at offset 128 KiB (0x20000)
-+128 KiB (0x20000) bytes     allocated at offset 32 MiB (0x2000000)
-+31.875 MiB (0x1fe0000) bytes not allocated at offset 32.125 MiB (0x2020000)
-+magic                     0x514649fb
-+version                   3
-+backing_file_offset       0x0
-+backing_file_size         0x0
-+cluster_bits              16
-+size                      67108864
-+crypt_method              0
-+l1_size                   1
-+l1_table_offset           0x30000
-+refcount_table_offset     0x10000
-+refcount_table_clusters   1
-+nb_snapshots              0
-+snapshot_offset           0x0
-+incompatible_features     0x0
-+compatible_features       0x1
-+autoclear_features        0x0
-+refcount_order            4
-+header_length             104
-+
-+Header extension:
-+magic                     0x6803f857
-+length                    144
-+data                      <binary>
-+
-+magic                     0x514649fb
-+version                   2
-+backing_file_offset       0x0
-+backing_file_size         0x0
-+cluster_bits              16
-+size                      67108864
-+crypt_method              0
-+l1_size                   1
-+l1_table_offset           0x30000
-+refcount_table_offset     0x10000
-+refcount_table_clusters   1
-+nb_snapshots              0
-+snapshot_offset           0x0
-+incompatible_features     0x0
-+compatible_features       0x0
-+autoclear_features        0x0
-+refcount_order            4
-+header_length             72
-+
-+read 131072/131072 bytes at offset 0
-+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+read 131072/131072 bytes at offset 33554432
-+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+64 MiB (0x4000000) bytes not allocated at offset 0 bytes (0x0)
-+No errors were found on the image.
-+
- === Testing dirty version downgrade ===
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
---
-2.13.6

The following changes since commit fb68096da3d35e64c88cd610c1fa42766c58e92a:

Revert "tests: use memfd in vhost-user-test" (2018-02-13 09:51:52 +0000)

are available in the git repository at:

git://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to 0a4dc980e6c935e9be745ce3ee1a4c71629ecd00:

Merge remote-tracking branch 'mreitz/tags/pull-block-2018-02-13' into queue-block (2018-02-13 17:01:13 +0100)

----------------------------------------------------------------
Block layer patches

----------------------------------------------------------------
Alberto Garcia (40):
      qcow2: Use g_try_realloc() in qcow2_expand_zero_clusters()
      qcow2: Fix documentation of get_cluster_table()
      qcow2: Add table size field to Qcow2Cache
      qcow2: Remove BDS parameter from qcow2_cache_get_table_addr()
      qcow2: Remove BDS parameter from qcow2_cache_get_table_idx()
      qcow2: Remove BDS parameter from qcow2_cache_table_release()
      qcow2: Remove BDS parameter from qcow2_cache_entry_mark_dirty()
      qcow2: Remove BDS parameter from qcow2_cache_put()
      qcow2: Remove BDS parameter from qcow2_cache_destroy()
      qcow2: Remove BDS parameter from qcow2_cache_clean_unused()
      qcow2: Remove BDS parameter from qcow2_cache_discard()
      qcow2: Remove BDS parameter from qcow2_cache_is_table_offset()
      qcow2: Add offset_to_l1_index()
      qcow2: Add l2_slice_size field to BDRVQcow2State
      qcow2: Add offset_to_l2_slice_index()
      qcow2: Update l2_load() to support L2 slices
      qcow2: Prepare l2_allocate() for adding L2 slice support
      qcow2: Update l2_allocate() to support L2 slices
      qcow2: Refactor get_cluster_table()
      qcow2: Update get_cluster_table() to support L2 slices
      qcow2: Update qcow2_get_cluster_offset() to support L2 slices
      qcow2: Update qcow2_alloc_cluster_link_l2() to support L2 slices
      qcow2: Update handle_copied() to support L2 slices
      qcow2: Update handle_alloc() to support L2 slices
      qcow2: Update discard_single_l2() to support L2 slices
      qcow2: Update zero_single_l2() to support L2 slices
      qcow2: Prepare qcow2_update_snapshot_refcount() for adding L2 slice support
      qcow2: Update qcow2_update_snapshot_refcount() to support L2 slices
      qcow2: Read refcount before L2 table in expand_zero_clusters_in_l1()
      qcow2: Prepare expand_zero_clusters_in_l1() for adding L2 slice support
      qcow2: Update expand_zero_clusters_in_l1() to support L2 slices
      qcow2: Update qcow2_truncate() to support L2 slices
      qcow2: Rename l2_table in qcow2_alloc_compressed_cluster_offset()
      qcow2: Rename l2_table in count_contiguous_clusters()
      qcow2: Rename l2_table in count_contiguous_clusters_unallocated()
      qcow2: Rename l2_table in count_cow_clusters()
      qcow2: Allow configuring the L2 slice size
      iotests: Test valid values of l2-cache-entry-size
      iotests: Test downgrading an image using a small L2 slice size
      iotests: Add l2-cache-entry-size to iotest 137

Daniel P. Berrangé (1):
      qemu-io: fix EOF Ctrl-D handling in qemu-io readline code

Fam Zheng (4):
      iotests: Fix CID for VMDK afl image
      qemu-img.texi: Clean up parameter list
      qemu-img: Document --force-share / -U
      docs: Document share-rw property more thoroughly

Kevin Wolf (1):
      Merge remote-tracking branch 'mreitz/tags/pull-block-2018-02-13' into queue-block

Max Reitz (8):
      iotests: Use virtio-blk in 155
      gluster: Move glfs_close() to create's clean-up
      gluster: Pull truncation from qemu_gluster_create
      gluster: Query current size in do_truncate()
      gluster: Add preallocated truncation
      sheepdog: Make sd_prealloc() take a BDS
      sheepdog: Pass old and new size to sd_prealloc()
      sheepdog: Allow fully preallocated truncation

Paolo Bonzini (1):
      block: early check for blockers on drive-mirror

Vladimir Sementsov-Ogievskiy (1):
      block: maintain persistent disabled bitmaps

From: Fam Zheng <famz@redhat.com>

This reverts commit 76bf133c4 which updated the reference output, and
fixed the reference image, because the code path we want to exercise is
actually the invalid image size.

The descriptor block in the image, which includes the CID to verify, has been
invalid since the reference image was added. Since commit 9877860e7bd we report
this error earlier than the "file too large", so 059.out mismatches.

The binary change was generated with the following operations:

  $ bunzip2 afl9.vmdk.bz2
  $ qemu-img create -f vmdk fix.vmdk 1G
  $ dd if=afl9.vmdk of=fix.vmdk bs=512 count=1 conv=notrunc
  $ mv fix.vmdk afl9.vmdk
  $ bzip2 afl9.vmdk

Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/059.out                     |   2 +-
 tests/qemu-iotests/sample_images/afl9.vmdk.bz2 | Bin 178 -> 618 bytes
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/059.out b/tests/qemu-iotests/059.out
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/059.out
+++ b/tests/qemu-iotests/059.out
@@ -XXX,XX +XXX,XX @@ Offset          Length          Mapped to       File
 0x140000000     0x10000         0x50000         TEST_DIR/t-s003.vmdk
 
 === Testing afl image with a very large capacity ===
-qemu-img: Could not open 'TEST_DIR/afl9.IMGFMT': Could not open 'TEST_DIR/afl9.IMGFMT': Invalid argument
+qemu-img: Can't get image size 'TEST_DIR/afl9.IMGFMT': File too large
 *** done
diff --git a/tests/qemu-iotests/sample_images/afl9.vmdk.bz2 b/tests/qemu-iotests/sample_images/afl9.vmdk.bz2
index XXXXXXX..XXXXXXX 100644
GIT binary patch
literal 618
zcmV-w0+szjT4*^jL0KkKSvgW7ssIN3|NsBH-Q9UpfAhclU70`s-*NE~5QvC~h=_=Y
zh>D2n*q*=vygR634445h35k;?00h9835kMW00004$iPepVE{Bqk)uhJ^wfGLr=)3s
zhM5CR88jLh7)B;cA*K)*6GmuECPU3o4NWG5O#pg>Ak#xY8Z^<M8Z>CrMt}oD38Ns$
z02n}M0LdjZ&}cLPqd+nPKmn$j0iXe(02%-d27nnJriN-uE+X&cz@Bj4BBfd|yV!NB
zwqkL}nW3AI5x^jp=t%^F1pxqp)v#n#)j$zcm1xqv(!$2d*5%vF{5RPWnOV8-^tE<(
zU~%&}Y0uNu*9Wt=yS^8PkC&gPueZO%IG;aD{l#sG`<Af;l1Pnwpi9I75FkQ`LLhd8
z6(9f*2s+N5=%bwp80ddrD6>m4Ho*fsHXdM<jtl*zKvRiTx7Ugy1|Nl<Ns!z;1dvhy
z=`SDHh~{u|1ZodC(_lzezQ)I*Kv2z|PZ@!SJjlVzwGdx2iu#W}dI{t+T&dDWT^LPy
zg3NouEM=V~7GvZQS1CXy676F6mJXWGgW!KTr+E$OspGYCjWmuwa^<Bc>_(-i7fPIW
zA+~n9iy_f)g8B2RILhd%F)dZ5f?7pFLw)@;Ncl<JE}gvMrfh{elT#3gLjY6r8xY4O
z)UO#pv=WYptukn<DuoMH2ip%k?V^k!rjQirK^RC<Brw>3Bz9<|!xm0F{45K+gg8#n
z4FNAJ!<X|3Vq+lyV4=xZ;>AN0<K=%c4A2ruB!4rGvWm!KFrvd4PyfZ-kxmpO4pfM$
EfLnqQYXATM

literal 178
zcmV;j08RfwT4*^jL0KkKS>A08g#Z9x|HJ$H)ZJi0004xF0SE*D03g5s00IDLSQelF
ziVX^$pfWNUJrmRhn2k52pQ;Rs0EQC;(S%|!m`2~BZ@b++;etskRJUVl!Kt)wu7?VN
zl;%JdqX2?TgsNVJP?87M*MvL1qQnBkCES&?0@MeaN-bL4;bDzxmMm|da4fuh!=#fu
g@i9R@5z!av{9tA<GGr!3hi~HUNT&)C8_l7xpl%OKQ2+n{

-- 
2.13.6

From: Fam Zheng <famz@redhat.com>

Split options out of the "@table @var" section and create a "@table
@option", then use whitespace and blank lines consistently.

Suggested-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Kashyap Chamarthy <kchamart@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qemu-img.texi | 66 +++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 39 insertions(+), 27 deletions(-)

diff --git a/qemu-img.texi b/qemu-img.texi
index XXXXXXX..XXXXXXX 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -XXX,XX +XXX,XX @@ The following commands are supported:
 
 Command parameters:
 @table @var
-@item filename
- is a disk image filename
-
-@item --object @var{objectdef}
-
-is a QEMU user creatable object definition. See the @code{qemu(1)} manual
-page for a description of the object properties. The most common object
-type is a @code{secret}, which is used to supply passwords and/or encryption
-keys.
-
-@item --image-opts
-
-Indicates that the source @var{filename} parameter is to be interpreted as a
-full option string, not a plain filename. This parameter is mutually
-exclusive with the @var{-f} parameter.
-
-@item --target-image-opts
 
-Indicates that the @var{output_filename} parameter(s) are to be interpreted as
-a full option string, not a plain filename. This parameter is mutually
-exclusive with the @var{-O} parameters. It is currently required to also use
-the @var{-n} parameter to skip image creation. This restriction may be relaxed
-in a future release.
+@item filename
+is a disk image filename
 
 @item fmt
 is the disk image format. It is guessed automatically in most cases. See below
 for a description of the supported disk formats.
 
-@item --backing-chain
-will enumerate information about backing files in a disk image chain. Refer
-below for further description.
-
 @item size
 is the disk image size in bytes. Optional suffixes @code{k} or @code{K}
 (kilobyte, 1024) @code{M} (megabyte, 1024k) and @code{G} (gigabyte, 1024M)
@@ -XXX,XX +XXX,XX @@ and T (terabyte, 1024G) are supported.  @code{b} is ignored.
 is the destination disk image filename
 
 @item output_fmt
- is the destination format
+is the destination format
+
 @item options
 is a comma separated list of format specific options in a
 name=value format. Use @code{-o ?} for an overview of the options supported
 by the used format or see the format descriptions below for details.
+
 @item snapshot_param
 is param used for internal snapshot, format is
 'snapshot.id=[ID],snapshot.name=[NAME]' or '[ID_OR_NAME]'
+
 @item snapshot_id_or_name
 is deprecated, use snapshot_param instead
 
+@end table
+
+@table @option
+
+@item --object @var{objectdef}
+is a QEMU user creatable object definition. See the @code{qemu(1)} manual
+page for a description of the object properties. The most common object
+type is a @code{secret}, which is used to supply passwords and/or encryption
+keys.
+
+@item --image-opts
+Indicates that the source @var{filename} parameter is to be interpreted as a
+full option string, not a plain filename. This parameter is mutually
+exclusive with the @var{-f} parameter.
+
+@item --target-image-opts
+Indicates that the @var{output_filename} parameter(s) are to be interpreted as
+a full option string, not a plain filename. This parameter is mutually
+exclusive with the @var{-O} parameters. It is currently required to also use
+the @var{-n} parameter to skip image creation. This restriction may be relaxed
+in a future release.
+
+@item --backing-chain
+will enumerate information about backing files in a disk image chain. Refer
+below for further description.
+
 @item -c
 indicates that target image must be compressed (qcow format only)
+
 @item -h
 with or without a command shows help and lists the supported formats
+
 @item -p
 display progress bar (compare, convert and rebase commands only).
 If the @var{-p} option is not used for a command that supports it, the
 progress is reported when the process receives a @code{SIGUSR1} or
 @code{SIGINFO} signal.
+
 @item -q
 Quiet mode - do not print any output (except errors). There's no progress bar
 in case both @var{-q} and @var{-p} options are used.
+
 @item -S @var{size}
 indicates the consecutive number of bytes that must contain only zeros
 for qemu-img to create a sparse image during conversion. This value is rounded
 down to the nearest 512 bytes. You may use the common size suffixes like
 @code{k} for kilobytes.
+
 @item -t @var{cache}
 specifies the cache mode that should be used with the (destination) file. See
 the documentation of the emulator's @code{-drive cache=...} option for allowed
 values.
+
 @item -T @var{src_cache}
 specifies the cache mode that should be used with the source file(s). See
 the documentation of the emulator's @code{-drive cache=...} option for allowed
 values.
+
 @end table
 
 Parameters to snapshot subcommand:
-- 
2.13.6

From: Fam Zheng <famz@redhat.com>

Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Kashyap Chamarthy <kchamart@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qemu-img.texi | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/qemu-img.texi b/qemu-img.texi
index XXXXXXX..XXXXXXX 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -XXX,XX +XXX,XX @@ exclusive with the @var{-O} parameters. It is currently required to also use
 the @var{-n} parameter to skip image creation. This restriction may be relaxed
 in a future release.
 
+@item --force-share (-U)
+If specified, @code{qemu-img} will open the image in shared mode, allowing
+other QEMU processes to open it in write mode. For example, this can be used to
+get the image information (with 'info' subcommand) when the image is used by a
+running guest.  Note that this could produce inconsistent results because of
+concurrent metadata changes, etc. This option is only allowed when opening
+images in read-only mode.
+
 @item --backing-chain
 will enumerate information about backing files in a disk image chain. Refer
 below for further description.
-- 
2.13.6

From: Fam Zheng <famz@redhat.com>

Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Kashyap Chamarthy <kchamart@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 docs/qemu-block-drivers.texi | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/docs/qemu-block-drivers.texi b/docs/qemu-block-drivers.texi
index XXXXXXX..XXXXXXX 100644
--- a/docs/qemu-block-drivers.texi
+++ b/docs/qemu-block-drivers.texi
@@ -XXX,XX +XXX,XX @@ QEMU transparently handles lock handover during shared storage migration.  For
 shared virtual disk images between multiple VMs, the "share-rw" device option
 should be used.
 
+By default, the guest has exclusive write access to its disk image. If the
+guest can safely share the disk image with other writers the @code{-device
+...,share-rw=on} parameter can be used.  This is only safe if the guest is
+running software, such as a cluster file system, that coordinates disk accesses
+to avoid corruption.
+
+Note that share-rw=on only declares the guest's ability to share the disk.
+Some QEMU features, such as image file formats, require exclusive write access
+to the disk image and this is unaffected by the share-rw=on option.
+
 Alternatively, locking can be fully disabled by "locking=off" block device
 option. In the command line, the option is usually in the form of
 "file.locking=off" as the protocol driver is normally placed as a "file" child
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

g_realloc() aborts the program if it fails to allocate the required
amount of memory. We want to detect that scenario and return an error
instead, so let's use g_try_realloc().

Signed-off-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-cluster.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ int qcow2_expand_zero_clusters(BlockDriverState *bs,
         int l1_sectors = DIV_ROUND_UP(s->snapshots[i].l1_size *
                                       sizeof(uint64_t), BDRV_SECTOR_SIZE);
 
-        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);
+        uint64_t *new_l1_table =
+            g_try_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);
+
+        if (!new_l1_table) {
+            ret = -ENOMEM;
+            goto fail;
+        }
+
+        l1_table = new_l1_table;
 
         ret = bdrv_read(bs->file,
                         s->snapshots[i].l1_table_offset / BDRV_SECTOR_SIZE,
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

Only a few select machine types support floppy drives and there is
actually nothing preventing us from using virtio here, so let's do it.

Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/155 | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/tests/qemu-iotests/155 b/tests/qemu-iotests/155
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/155
+++ b/tests/qemu-iotests/155
@@ -XXX,XX +XXX,XX @@ class BaseClass(iotests.QMPTestCase):
                     'file': {'driver': 'file',
                              'filename': source_img}}
         self.vm.add_blockdev(self.qmp_to_opts(blockdev))
-        self.vm.add_device('floppy,id=qdev0,drive=source')
+        self.vm.add_device('virtio-blk,id=qdev0,drive=source')
         self.vm.launch()
 
         self.assertIntactSourceBackingChain()
@@ -XXX,XX +XXX,XX @@ class MirrorBaseClass(BaseClass):
     def testFull(self):
         self.runMirror('full')
 
-        node = self.findBlockNode('target', 'qdev0')
+        node = self.findBlockNode('target',
+                                  '/machine/peripheral/qdev0/virtio-backend')
         self.assertCorrectBackingImage(node, None)
         self.assertIntactSourceBackingChain()
 
     def testTop(self):
         self.runMirror('top')
 
-        node = self.findBlockNode('target', 'qdev0')
+        node = self.findBlockNode('target',
+                                  '/machine/peripheral/qdev0/virtio-backend')
         self.assertCorrectBackingImage(node, back2_img)
         self.assertIntactSourceBackingChain()
 
     def testNone(self):
         self.runMirror('none')
 
-        node = self.findBlockNode('target', 'qdev0')
+        node = self.findBlockNode('target',
+                                  '/machine/peripheral/qdev0/virtio-backend')
         self.assertCorrectBackingImage(node, source_img)
         self.assertIntactSourceBackingChain()
 
@@ -XXX,XX +XXX,XX @@ class TestCommit(BaseClass):
 
         self.vm.event_wait('BLOCK_JOB_COMPLETED')
 
-        node = self.findBlockNode(None, 'qdev0')
+        node = self.findBlockNode(None,
+                                  '/machine/peripheral/qdev0/virtio-backend')
         self.assert_qmp(node, 'image' + '/backing-image' * 0 + '/filename',
                         back1_img)
         self.assert_qmp(node, 'image' + '/backing-image' * 1 + '/filename',
-- 
2.13.6

From: "Daniel P. Berrange" <berrange@redhat.com>

qemu-io puts the TTY into non-canonical mode, which means no EOF processing is
done and thus getchar() will never return the EOF constant. Instead we have to
query the TTY attributes to determine the configured EOF character (usually
Ctrl-D / 0x4), and then explicitly check for that value. This fixes the
regression that prevented Ctrl-D from triggering an exit of qemu-io that has
existed since readline was first added in

  commit 0cf17e181798063c3824c8200ba46f25f54faa1a
  Author: Stefan Hajnoczi <stefanha@redhat.com>
  Date:   Thu Nov 14 11:54:17 2013 +0100

      qemu-io: use readline.c

It also ensures that a newline is printed when exiting, to complete the
line output by the "qemu-io> " prompt.

Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qemu-io.c | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/qemu-io.c b/qemu-io.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include <getopt.h>
 #include <libgen.h>
+#ifndef _WIN32
+#include <termios.h>
+#endif
 
 #include "qapi/error.h"
 #include "qemu-io.h"
@@ -XXX,XX +XXX,XX @@ static bool imageOpts;
 
 static ReadLineState *readline_state;
 
+static int ttyEOF;
+
+static int get_eof_char(void)
+{
+#ifdef _WIN32
+    return 0x4; /* Ctrl-D */
+#else
+    struct termios tty;
+    if (tcgetattr(STDIN_FILENO, &tty) != 0) {
+        if (errno == ENOTTY) {
+            return 0x0; /* just expect read() == 0 */
+        } else {
+            return 0x4; /* Ctrl-D */
+        }
+    }
+
+    return tty.c_cc[VEOF];
+#endif
+}
+
 static int close_f(BlockBackend *blk, int argc, char **argv)
 {
     blk_unref(qemuio_blk);
@@ -XXX,XX +XXX,XX @@ static char *fetchline_readline(void)
     readline_start(readline_state, get_prompt(), 0, readline_func, &line);
     while (!line) {
         int ch = getchar();
-        if (ch == EOF) {
+        if (ttyEOF != 0x0 && ch == ttyEOF) {
+            printf("\n");
             break;
         }
         readline_handle_byte(readline_state, ch);
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     qemuio_add_command(&close_cmd);
 
     if (isatty(STDIN_FILENO)) {
+        ttyEOF = get_eof_char();
         readline_state = readline_init(readline_printf_func,
                                        readline_flush_func,
                                        NULL,
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

glfs_close() is a classical clean-up operation, as can be seen by the
fact that it is executed even if the truncation before it failed.
Also, moving it to clean-up makes it more clear that if it fails, we do
not want it to overwrite the current ret value if that signifies an
error already.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/gluster.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/block/gluster.c b/block/gluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -XXX,XX +XXX,XX @@ static int qemu_gluster_create(const char *filename,
 {
     BlockdevOptionsGluster *gconf;
     struct glfs *glfs;
-    struct glfs_fd *fd;
+    struct glfs_fd *fd = NULL;
     int ret = 0;
     PreallocMode prealloc;
     int64_t total_size = 0;
@@ -XXX,XX +XXX,XX @@ static int qemu_gluster_create(const char *filename,
         break;
     }
 
-    if (glfs_close(fd) != 0) {
-        ret = -errno;
-    }
 out:
+    if (fd) {
+        if (glfs_close(fd) != 0 && ret == 0) {
+            ret = -errno;
+        }
+    }
     qapi_free_BlockdevOptionsGluster(gconf);
     glfs_clear_preopened(glfs);
     return ret;
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

Pull out the truncation code from the qemu_gluster_create() function so
we can later reuse it in qemu_gluster_truncate().

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/gluster.c | 74 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 40 insertions(+), 34 deletions(-)

diff --git a/block/gluster.c b/block/gluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs,
 }
 #endif
 
+static int qemu_gluster_do_truncate(struct glfs_fd *fd, int64_t offset,
+                                    PreallocMode prealloc, Error **errp)
+{
+    switch (prealloc) {
+#ifdef CONFIG_GLUSTERFS_FALLOCATE
+    case PREALLOC_MODE_FALLOC:
+        if (glfs_fallocate(fd, 0, 0, offset)) {
+            error_setg_errno(errp, errno, "Could not preallocate data");
+            return -errno;
+        }
+        break;
+#endif /* CONFIG_GLUSTERFS_FALLOCATE */
+#ifdef CONFIG_GLUSTERFS_ZEROFILL
+    case PREALLOC_MODE_FULL:
+        if (glfs_ftruncate(fd, offset)) {
+            error_setg_errno(errp, errno, "Could not resize file");
+            return -errno;
+        }
+        if (glfs_zerofill(fd, 0, offset)) {
+            error_setg_errno(errp, errno, "Could not zerofill the new area");
+            return -errno;
+        }
+        break;
+#endif /* CONFIG_GLUSTERFS_ZEROFILL */
+    case PREALLOC_MODE_OFF:
+        if (glfs_ftruncate(fd, offset)) {
+            error_setg_errno(errp, errno, "Could not resize file");
+            return -errno;
+        }
+        break;
+    default:
+        error_setg(errp, "Unsupported preallocation mode: %s",
+                   PreallocMode_str(prealloc));
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
 static int qemu_gluster_create(const char *filename,
                                QemuOpts *opts, Error **errp)
 {
@@ -XXX,XX +XXX,XX @@ static int qemu_gluster_create(const char *filename,
         goto out;
     }
 
-    switch (prealloc) {
-#ifdef CONFIG_GLUSTERFS_FALLOCATE
-    case PREALLOC_MODE_FALLOC:
-        if (glfs_fallocate(fd, 0, 0, total_size)) {
-            error_setg(errp, "Could not preallocate data for the new file");
-            ret = -errno;
-        }
-        break;
-#endif /* CONFIG_GLUSTERFS_FALLOCATE */
-#ifdef CONFIG_GLUSTERFS_ZEROFILL
-    case PREALLOC_MODE_FULL:
-        if (!glfs_ftruncate(fd, total_size)) {
-            if (glfs_zerofill(fd, 0, total_size)) {
-                error_setg(errp, "Could not zerofill the new file");
-                ret = -errno;
-            }
-        } else {
-            error_setg(errp, "Could not resize file");
-            ret = -errno;
-        }
-        break;
-#endif /* CONFIG_GLUSTERFS_ZEROFILL */
-    case PREALLOC_MODE_OFF:
-        if (glfs_ftruncate(fd, total_size) != 0) {
-            ret = -errno;
-            error_setg(errp, "Could not resize file");
-        }
-        break;
-    default:
-        ret = -EINVAL;
-        error_setg(errp, "Unsupported preallocation mode: %s",
-                   PreallocMode_str(prealloc));
-        break;
-    }
+    ret = qemu_gluster_do_truncate(fd, total_size, prealloc, errp);
 
 out:
     if (fd) {
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

Instead of expecting the current size to be 0, query it and allocate
only the area [current_size, offset) if preallocation is requested.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/gluster.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

From: Max Reitz <mreitz@redhat.com>

By using qemu_gluster_do_truncate() in qemu_gluster_truncate(), we now
automatically have preallocated truncation.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/gluster.c | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

From: Max Reitz <mreitz@redhat.com>

We want to use this function in sd_truncate() later on, so taking a
filename is not exactly ideal.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/sheepdog.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index XXXXXXX..XXXXXXX 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
     return 0;
 }
 
-static int sd_prealloc(const char *filename, Error **errp)
+static int sd_prealloc(BlockDriverState *bs, Error **errp)
 {
     BlockBackend *blk = NULL;
-    BDRVSheepdogState *base = NULL;
+    BDRVSheepdogState *base = bs->opaque;
     unsigned long buf_size;
     uint32_t idx, max_idx;
     uint32_t object_size;
@@ -XXX,XX +XXX,XX @@ static int sd_prealloc(const char *filename, Error **errp)
     void *buf = NULL;
     int ret;
 
-    blk = blk_new_open(filename, NULL, NULL,
-                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
-    if (blk == NULL) {
-        ret = -EIO;
+    blk = blk_new(BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE | BLK_PERM_RESIZE,
+                  BLK_PERM_ALL);
+
+    ret = blk_insert_bs(blk, bs, errp);
+    if (ret < 0) {
         goto out_with_err_set;
     }
 
@@ -XXX,XX +XXX,XX @@ static int sd_prealloc(const char *filename, Error **errp)
         goto out;
     }
 
-    base = blk_bs(blk)->opaque;
     object_size = (UINT32_C(1) << base->inode.block_size_shift);
     buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
     buf = g_malloc0(buf_size);
@@ -XXX,XX +XXX,XX @@ static int sd_create(const char *filename, QemuOpts *opts,
     }
 
     if (prealloc) {
-        ret = sd_prealloc(filename, errp);
+        BlockDriverState *bs;
+        QDict *opts;
+
+        opts = qdict_new();
+        qdict_put_str(opts, "driver", "sheepdog");
+        bs = bdrv_open(filename, NULL, opts, BDRV_O_PROTOCOL | BDRV_O_RDWR,
+                       errp);
+        if (!bs) {
+            goto out;
+        }
+
+        ret = sd_prealloc(bs, errp);
+
+        bdrv_unref(bs);
     }
 out:
     g_free(backing_file);
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

sd_prealloc() will now preallocate the area [old_size, new_size).  As
before, it rounds to buf_size and may thus overshoot and preallocate
areas that were not requested to be preallocated.  For image creation,
this is no change in behavior.  For truncation, this is in accordance
with the documentation for preallocated truncation.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/sheepdog.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index XXXXXXX..XXXXXXX 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
     return 0;
 }
 
-static int sd_prealloc(BlockDriverState *bs, Error **errp)
+static int sd_prealloc(BlockDriverState *bs, int64_t old_size, int64_t new_size,
+                       Error **errp)
 {
     BlockBackend *blk = NULL;
     BDRVSheepdogState *base = bs->opaque;
     unsigned long buf_size;
     uint32_t idx, max_idx;
     uint32_t object_size;
-    int64_t vdi_size;
     void *buf = NULL;
     int ret;
 
@@ -XXX,XX +XXX,XX @@ static int sd_prealloc(BlockDriverState *bs, Error **errp)
 
     blk_set_allow_write_beyond_eof(blk, true);
 
-    vdi_size = blk_getlength(blk);
-    if (vdi_size < 0) {
-        ret = vdi_size;
-        goto out;
-    }
-
     object_size = (UINT32_C(1) << base->inode.block_size_shift);
     buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
     buf = g_malloc0(buf_size);
 
-    max_idx = DIV_ROUND_UP(vdi_size, buf_size);
+    max_idx = DIV_ROUND_UP(new_size, buf_size);
 
-    for (idx = 0; idx < max_idx; idx++) {
+    for (idx = old_size / buf_size; idx < max_idx; idx++) {
         /*
          * The created image can be a cloned image, so we need to read
          * a data from the source image.
@@ -XXX,XX +XXX,XX @@ static int sd_create(const char *filename, QemuOpts *opts,
             goto out;
         }
 
-        ret = sd_prealloc(bs, errp);
+        ret = sd_prealloc(bs, 0, s->inode.vdi_size, errp);
 
         bdrv_unref(bs);
     }
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/sheepdog.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index XXXXXXX..XXXXXXX 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static int sd_truncate(BlockDriverState *bs, int64_t offset,
     int ret, fd;
     unsigned int datalen;
     uint64_t max_vdi_size;
+    int64_t old_size = s->inode.vdi_size;
 
-    if (prealloc != PREALLOC_MODE_OFF) {
+    if (prealloc != PREALLOC_MODE_OFF && prealloc != PREALLOC_MODE_FULL) {
         error_setg(errp, "Unsupported preallocation mode '%s'",
                    PreallocMode_str(prealloc));
         return -ENOTSUP;
     }
 
     max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
-    if (offset < s->inode.vdi_size) {
+    if (offset < old_size) {
         error_setg(errp, "shrinking is not supported");
         return -EINVAL;
     } else if (offset > max_vdi_size) {
@@ -XXX,XX +XXX,XX @@ static int sd_truncate(BlockDriverState *bs, int64_t offset,
 
     if (ret < 0) {
         error_setg_errno(errp, -ret, "failed to update an inode");
+        return ret;
     }
 
-    return ret;
+    if (prealloc == PREALLOC_MODE_FULL) {
+        ret = sd_prealloc(bs, old_size, offset, errp);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    return 0;
 }
 
 /*
-- 
2.13.6

From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>

To support loading and storing disabled bitmaps, there is a new approach:

 - deprecate the @autoload flag of block-dirty-bitmap-add and ignore it
 - store enabled bitmaps with the "auto" flag in qcow2
 - store disabled bitmaps without the "auto" flag in qcow2
 - on qcow2 open, load "auto" bitmaps as enabled and all others
   as disabled (except in_use bitmaps)

Also, adjust iotests 165 and 176 appropriately.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20180202160752.143796-1-vsementsov@virtuozzo.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qapi/block-core.json         |  6 +++---
 block/qcow2.h                |  2 +-
 include/block/dirty-bitmap.h |  1 -
 block/dirty-bitmap.c         | 18 ------------------
 block/qcow2-bitmap.c         | 12 +++++++-----
 block/qcow2.c                |  2 +-
 blockdev.c                   | 10 ++--------
 qemu-doc.texi                |  7 +++++++
 tests/qemu-iotests/165       |  2 +-
 tests/qemu-iotests/176       |  2 +-
 10 files changed, 23 insertions(+), 39 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -XXX,XX +XXX,XX @@
 #              Qcow2 disks support persistent bitmaps. Default is false for
 #              block-dirty-bitmap-add. (Since: 2.10)
 #
-# @autoload: the bitmap will be automatically loaded when the image it is stored
-#            in is opened. This flag may only be specified for persistent
-#            bitmaps. Default is false for block-dirty-bitmap-add. (Since: 2.10)
+# @autoload: ignored and deprecated since 2.12.
+#            Currently, all dirty tracking bitmaps are loaded from Qcow2 on
+#            open.
 #
 # Since: 2.4
 ##
diff --git a/block/qcow2.h b/block/qcow2.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -XXX,XX +XXX,XX @@ void qcow2_cache_discard(BlockDriverState *bs, Qcow2Cache *c, void *table);
 int qcow2_check_bitmaps_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                                   void **refcount_table,
                                   int64_t *refcount_table_size);
-bool qcow2_load_autoloading_dirty_bitmaps(BlockDriverState *bs, Error **errp);
+bool qcow2_load_dirty_bitmaps(BlockDriverState *bs, Error **errp);
 int qcow2_reopen_bitmaps_rw(BlockDriverState *bs, Error **errp);
 void qcow2_store_persistent_dirty_bitmaps(BlockDriverState *bs, Error **errp);
 int qcow2_reopen_bitmaps_ro(BlockDriverState *bs, Error **errp);
diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -XXX,XX +XXX,XX @@ void bdrv_dirty_bitmap_deserialize_ones(BdrvDirtyBitmap *bitmap,
 void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap *bitmap);
 
 void bdrv_dirty_bitmap_set_readonly(BdrvDirtyBitmap *bitmap, bool value);
-void bdrv_dirty_bitmap_set_autoload(BdrvDirtyBitmap *bitmap, bool autoload);
 void bdrv_dirty_bitmap_set_persistance(BdrvDirtyBitmap *bitmap,
                                        bool persistent);
 
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index XXXXXXX..XXXXXXX 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -XXX,XX +XXX,XX @@ struct BdrvDirtyBitmap {
                                    Such operations must fail and both the image
                                    and this bitmap must remain unchanged while
                                    this flag is set. */
-    bool autoload;              /* For persistent bitmaps: bitmap must be
-                                   autoloaded on image opening */
     bool persistent;            /* bitmap must be saved to owner disk image */
     QLIST_ENTRY(BdrvDirtyBitmap) list;
 };
@@ -XXX,XX +XXX,XX @@ void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap)
     g_free(bitmap->name);
     bitmap->name = NULL;
     bitmap->persistent = false;
-    bitmap->autoload = false;
 }
 
 /* Called with BQL taken.  */
@@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs,
     bitmap->successor = NULL;
     successor->persistent = bitmap->persistent;
     bitmap->persistent = false;
-    successor->autoload = bitmap->autoload;
-    bitmap->autoload = false;
     bdrv_release_dirty_bitmap(bs, bitmap);
 
     return successor;
@@ -XXX,XX +XXX,XX @@ bool bdrv_has_readonly_bitmaps(BlockDriverState *bs)
 }
 
 /* Called with BQL taken. */
-void bdrv_dirty_bitmap_set_autoload(BdrvDirtyBitmap *bitmap, bool autoload)
-{
-    qemu_mutex_lock(bitmap->mutex);
-    bitmap->autoload = autoload;
-    qemu_mutex_unlock(bitmap->mutex);
-}
-
-bool bdrv_dirty_bitmap_get_autoload(const BdrvDirtyBitmap *bitmap)
-{
-    return bitmap->autoload;
-}
-
-/* Called with BQL taken. */
 void bdrv_dirty_bitmap_set_persistance(BdrvDirtyBitmap *bitmap, bool persistent)
 {
     qemu_mutex_lock(bitmap->mutex);
diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-bitmap.c
+++ b/block/qcow2-bitmap.c
@@ -XXX,XX +XXX,XX @@ static void set_readonly_helper(gpointer bitmap, gpointer value)
     bdrv_dirty_bitmap_set_readonly(bitmap, (bool)value);
 }
 
-/* qcow2_load_autoloading_dirty_bitmaps()
+/* qcow2_load_dirty_bitmaps()
  * Return value is a hint for caller: true means that the Qcow2 header was
  * updated. (false doesn't mean that the header should be updated by the
  * caller, it just means that updating was not needed or the image cannot be
  * written to).
  * On failure the function returns false.
  */
-bool qcow2_load_autoloading_dirty_bitmaps(BlockDriverState *bs, Error **errp)
+bool qcow2_load_dirty_bitmaps(BlockDriverState *bs, Error **errp)
 {
     BDRVQcow2State *s = bs->opaque;
     Qcow2BitmapList *bm_list;
@@ -XXX,XX +XXX,XX @@ bool qcow2_load_autoloading_dirty_bitmaps(BlockDriverState *bs, Error **errp)
     }
 
     QSIMPLEQ_FOREACH(bm, bm_list, entry) {
-        if ((bm->flags & BME_FLAG_AUTO) && !(bm->flags & BME_FLAG_IN_USE)) {
+        if (!(bm->flags & BME_FLAG_IN_USE)) {
             BdrvDirtyBitmap *bitmap = load_bitmap(bs, bm, errp);
             if (bitmap == NULL) {
                 goto fail;
             }
 
+            if (!(bm->flags & BME_FLAG_AUTO)) {
+                bdrv_disable_dirty_bitmap(bitmap);
+            }
             bdrv_dirty_bitmap_set_persistance(bitmap, true);
-            bdrv_dirty_bitmap_set_autoload(bitmap, true);
             bm->flags |= BME_FLAG_IN_USE;
             created_dirty_bitmaps =
                     g_slist_append(created_dirty_bitmaps, bitmap);
@@ -XXX,XX +XXX,XX @@ void qcow2_store_persistent_dirty_bitmaps(BlockDriverState *bs, Error **errp)
             bm->table.size = 0;
             QSIMPLEQ_INSERT_TAIL(&drop_tables, tb, entry);
         }
-        bm->flags = bdrv_dirty_bitmap_get_autoload(bitmap) ? BME_FLAG_AUTO : 0;
+        bm->flags = bdrv_dirty_bitmap_enabled(bitmap) ? BME_FLAG_AUTO : 0;
         bm->granularity_bits = ctz32(bdrv_dirty_bitmap_granularity(bitmap));
         bm->dirty_bitmap = bitmap;
     }
diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static int qcow2_do_open(BlockDriverState *bs, QDict *options, int flags,
         s->autoclear_features &= QCOW2_AUTOCLEAR_MASK;
     }
 
-    if (qcow2_load_autoloading_dirty_bitmaps(bs, &local_err)) {
+    if (qcow2_load_dirty_bitmaps(bs, &local_err)) {
         update_header = false;
     }
     if (local_err != NULL) {
diff --git a/blockdev.c b/blockdev.c
index XXXXXXX..XXXXXXX 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -XXX,XX +XXX,XX @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name,
     if (!has_persistent) {
         persistent = false;
     }
-    if (!has_autoload) {
-        autoload = false;
-    }
 
-    if (has_autoload && !persistent) {
-        error_setg(errp, "Autoload flag must be used only for persistent "
-                         "bitmaps");
-        return;
+    if (has_autoload) {
+        warn_report("Autoload option is deprecated and its value is ignored");
     }
 
     if (persistent &&
@@ -XXX,XX +XXX,XX @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name,
     }
 
     bdrv_dirty_bitmap_set_persistance(bitmap, persistent);
-    bdrv_dirty_bitmap_set_autoload(bitmap, autoload);
 }
 
 void qmp_block_dirty_bitmap_remove(const char *node, const char *name,
diff --git a/qemu-doc.texi b/qemu-doc.texi
index XXXXXXX..XXXXXXX 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -XXX,XX +XXX,XX @@ used and it will be removed with no replacement.
 The ``convert -s snapshot_id_or_name'' argument is obsoleted
 by the ``convert -l snapshot_param'' argument instead.
 
+@section QEMU Machine Protocol (QMP) commands
+
+@subsection block-dirty-bitmap-add "autoload" parameter (since 2.12.0)
+
+"autoload" parameter is now ignored. All bitmaps are automatically loaded
+from qcow2 images.
+
 @section System emulator human monitor commands
 
 @subsection host_net_add (since 2.10.0)
diff --git a/tests/qemu-iotests/165 b/tests/qemu-iotests/165
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/165
+++ b/tests/qemu-iotests/165
@@ -XXX,XX +XXX,XX @@ class TestPersistentDirtyBitmap(iotests.QMPTestCase):
 
     def qmpAddBitmap(self):
         self.vm.qmp('block-dirty-bitmap-add', node='drive0',
-                    name='bitmap0', persistent=True, autoload=True)
+                    name='bitmap0', persistent=True)
 
     def test_persistent(self):
         self.vm = self.mkVm()
diff --git a/tests/qemu-iotests/176 b/tests/qemu-iotests/176
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/176
+++ b/tests/qemu-iotests/176
@@ -XXX,XX +XXX,XX @@ case $reason in
      "file": { "driver": "file", "filename": "$TEST_IMG" } } }
 { "execute": "block-dirty-bitmap-add",
   "arguments": { "node": "drive0", "name": "bitmap0",
-     "persistent": true, "autoload": true } }
+     "persistent": true } }
 { "execute": "quit" }
 EOF
 	;;
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

This function has not been returning the offset of the L2 table since
commit 3948d1d4876065160583e79533bf604481063833.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: b498733b6706a859a03678d74ecbd26aeba129aa.1517840876.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ fail:
  * for a given disk offset, load (and allocate if needed)
  * the l2 table.
  *
- * the l2 table offset in the qcow2 file and the cluster index
- * in the l2 table are given to the caller.
+ * the cluster index in the l2 table is given to the caller.
  *
  * Returns 0 on success, -errno in failure case
  */
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

The table size in the qcow2 cache is currently equal to the cluster
size. This doesn't allow us to use the cache memory efficiently,
particularly with large cluster sizes, so we need to be able to have
smaller cache tables that are independent from the cluster size. This
patch adds a new field to Qcow2Cache that we can use instead of the
cluster size.

The current table size is still being initialized to the cluster size,
so there are no semantic changes yet, but this patch will allow us to
prepare the rest of the code and simplify a few function calls.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 67a1bf9e55f417005c567bead95a018dc34bc687.1517840876.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cache.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -XXX,XX +XXX,XX @@ struct Qcow2Cache {
     Qcow2CachedTable       *entries;
     struct Qcow2Cache      *depends;
     int                     size;
+    int                     table_size;
     bool                    depends_on_flush;
     void                   *table_array;
     uint64_t                lru_counter;
@@ -XXX,XX +XXX,XX @@ struct Qcow2Cache {
 static inline void *qcow2_cache_get_table_addr(BlockDriverState *bs,
                     Qcow2Cache *c, int table)
 {
-    BDRVQcow2State *s = bs->opaque;
-    return (uint8_t *) c->table_array + (size_t) table * s->cluster_size;
+    return (uint8_t *) c->table_array + (size_t) table * c->table_size;
 }
 
 static inline int qcow2_cache_get_table_idx(BlockDriverState *bs,
                   Qcow2Cache *c, void *table)
 {
-    BDRVQcow2State *s = bs->opaque;
     ptrdiff_t table_offset = (uint8_t *) table - (uint8_t *) c->table_array;
-    int idx = table_offset / s->cluster_size;
-    assert(idx >= 0 && idx < c->size && table_offset % s->cluster_size == 0);
+    int idx = table_offset / c->table_size;
+    assert(idx >= 0 && idx < c->size && table_offset % c->table_size == 0);
     return idx;
 }
 
@@ -XXX,XX +XXX,XX @@ static void qcow2_cache_table_release(BlockDriverState *bs, Qcow2Cache *c,
 {
 /* Using MADV_DONTNEED to discard memory is a Linux-specific feature */
 #ifdef CONFIG_LINUX
-    BDRVQcow2State *s = bs->opaque;
     void *t = qcow2_cache_get_table_addr(bs, c, i);
     int align = getpagesize();
-    size_t mem_size = (size_t) s->cluster_size * num_tables;
+    size_t mem_size = (size_t) c->table_size * num_tables;
     size_t offset = QEMU_ALIGN_UP((uintptr_t) t, align) - (uintptr_t) t;
     size_t length = QEMU_ALIGN_DOWN(mem_size - offset, align);
     if (mem_size > offset && length > 0) {
@@ -XXX,XX +XXX,XX @@ Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
 
     c = g_new0(Qcow2Cache, 1);
     c->size = num_tables;
+    c->table_size = s->cluster_size;
     c->entries = g_try_new0(Qcow2CachedTable, num_tables);
     c->table_array = qemu_try_blockalign(bs->file->bs,
-                                         (size_t) num_tables * s->cluster_size);
+                                         (size_t) num_tables * c->table_size);
 
     if (!c->entries || !c->table_array) {
         qemu_vfree(c->table_array);
@@ -XXX,XX +XXX,XX @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
 
     if (c == s->refcount_block_cache) {
         ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_BLOCK,
-                c->entries[i].offset, s->cluster_size);
+                c->entries[i].offset, c->table_size);
     } else if (c == s->l2_table_cache) {
         ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
-                c->entries[i].offset, s->cluster_size);
+                c->entries[i].offset, c->table_size);
     } else {
         ret = qcow2_pre_write_overlap_check(bs, 0,
-                c->entries[i].offset, s->cluster_size);
+                c->entries[i].offset, c->table_size);
     }
 
     if (ret < 0) {
@@ -XXX,XX +XXX,XX @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
     }
 
     ret = bdrv_pwrite(bs->file, c->entries[i].offset,
-                      qcow2_cache_get_table_addr(bs, c, i), s->cluster_size);
+                      qcow2_cache_get_table_addr(bs, c, i), c->table_size);
     if (ret < 0) {
         return ret;
     }
@@ -XXX,XX +XXX,XX @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
     trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache,
                           offset, read_from_disk);
 
-    if (offset_into_cluster(s, offset)) {
+    if (!QEMU_IS_ALIGNED(offset, c->table_size)) {
         qcow2_signal_corruption(bs, true, -1, -1, "Cannot get entry from %s "
                                 "cache: Offset %#" PRIx64 " is unaligned",
                                 qcow2_cache_get_name(s, c), offset);
@@ -XXX,XX +XXX,XX @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
     }
 
     /* Check if the table is already cached */
-    i = lookup_index = (offset / s->cluster_size * 4) % c->size;
+    i = lookup_index = (offset / c->table_size * 4) % c->size;
     do {
         const Qcow2CachedTable *t = &c->entries[i];
         if (t->offset == offset) {
@@ -XXX,XX +XXX,XX @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
 
         ret = bdrv_pread(bs->file, offset,
                          qcow2_cache_get_table_addr(bs, c, i),
-                         s->cluster_size);
+                         c->table_size);
         if (ret < 0) {
             return ret;
         }
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

This function was only using the BlockDriverState parameter to get the
cache table size (since it was equal to the cluster size). This is no
longer necessary so this parameter can be removed.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: e1f943a9e89e1deb876f45de1bb22419ccdb6ad3.1517840876.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cache.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -XXX,XX +XXX,XX @@ struct Qcow2Cache {
     uint64_t                cache_clean_lru_counter;
 };
 
-static inline void *qcow2_cache_get_table_addr(BlockDriverState *bs,
-                    Qcow2Cache *c, int table)
+static inline void *qcow2_cache_get_table_addr(Qcow2Cache *c, int table)
 {
     return (uint8_t *) c->table_array + (size_t) table * c->table_size;
 }
@@ -XXX,XX +XXX,XX @@ static void qcow2_cache_table_release(BlockDriverState *bs, Qcow2Cache *c,
 {
 /* Using MADV_DONTNEED to discard memory is a Linux-specific feature */
 #ifdef CONFIG_LINUX
-    void *t = qcow2_cache_get_table_addr(bs, c, i);
+    void *t = qcow2_cache_get_table_addr(c, i);
     int align = getpagesize();
     size_t mem_size = (size_t) c->table_size * num_tables;
     size_t offset = QEMU_ALIGN_UP((uintptr_t) t, align) - (uintptr_t) t;
@@ -XXX,XX +XXX,XX @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
     }
 
     ret = bdrv_pwrite(bs->file, c->entries[i].offset,
-                      qcow2_cache_get_table_addr(bs, c, i), c->table_size);
+                      qcow2_cache_get_table_addr(c, i), c->table_size);
     if (ret < 0) {
         return ret;
     }
@@ -XXX,XX +XXX,XX @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
         }
 
         ret = bdrv_pread(bs->file, offset,
-                         qcow2_cache_get_table_addr(bs, c, i),
+                         qcow2_cache_get_table_addr(c, i),
                          c->table_size);
         if (ret < 0) {
             return ret;
@@ -XXX,XX +XXX,XX @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
     /* And return the right table */
 found:
     c->entries[i].ref++;
-    *table = qcow2_cache_get_table_addr(bs, c, i);
+    *table = qcow2_cache_get_table_addr(c, i);
 
     trace_qcow2_cache_get_done(qemu_coroutine_self(),
                                c == s->l2_table_cache, i);
@@ -XXX,XX +XXX,XX @@ void *qcow2_cache_is_table_offset(BlockDriverState *bs, Qcow2Cache *c,
 
     for (i = 0; i < c->size; i++) {
         if (c->entries[i].offset == offset) {
-            return qcow2_cache_get_table_addr(bs, c, i);
+            return qcow2_cache_get_table_addr(c, i);
         }
     }
     return NULL;
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

This function was only using the BlockDriverState parameter to get the
cache table size (since it was equal to the cluster size). This is no
longer necessary so this parameter can be removed.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: da3575d47c9a181a2cfd4715e53dd84a2c651017.1517840876.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cache.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -XXX,XX +XXX,XX @@ static inline void *qcow2_cache_get_table_addr(Qcow2Cache *c, int table)
     return (uint8_t *) c->table_array + (size_t) table * c->table_size;
 }
 
-static inline int qcow2_cache_get_table_idx(BlockDriverState *bs,
-                  Qcow2Cache *c, void *table)
+static inline int qcow2_cache_get_table_idx(Qcow2Cache *c, void *table)
 {
     ptrdiff_t table_offset = (uint8_t *) table - (uint8_t *) c->table_array;
     int idx = table_offset / c->table_size;
@@ -XXX,XX +XXX,XX @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
 
 void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table)
 {
-    int i = qcow2_cache_get_table_idx(bs, c, *table);
+    int i = qcow2_cache_get_table_idx(c, *table);
 
     c->entries[i].ref--;
     *table = NULL;
@@ -XXX,XX +XXX,XX @@ void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table)
 void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c,
      void *table)
 {
-    int i = qcow2_cache_get_table_idx(bs, c, table);
+    int i = qcow2_cache_get_table_idx(c, table);
     assert(c->entries[i].offset != 0);
     c->entries[i].dirty = true;
 }
@@ -XXX,XX +XXX,XX @@ void *qcow2_cache_is_table_offset(BlockDriverState *bs, Qcow2Cache *c,
 
 void qcow2_cache_discard(BlockDriverState *bs, Qcow2Cache *c, void *table)
 {
-    int i = qcow2_cache_get_table_idx(bs, c, table);
+    int i = qcow2_cache_get_table_idx(c, table);
 
     assert(c->entries[i].ref == 0);
 
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

This function was only using the BlockDriverState parameter to get the
cache table size (since it was equal to the cluster size). This is no
longer necessary so this parameter can be removed.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 7c1b262344375d52544525f85bbbf0548d5ba575.1517840876.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cache.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -XXX,XX +XXX,XX @@ static inline const char *qcow2_cache_get_name(BDRVQcow2State *s, Qcow2Cache *c)
     }
 }
 
-static void qcow2_cache_table_release(BlockDriverState *bs, Qcow2Cache *c,
-                                      int i, int num_tables)
+static void qcow2_cache_table_release(Qcow2Cache *c, int i, int num_tables)
 {
 /* Using MADV_DONTNEED to discard memory is a Linux-specific feature */
 #ifdef CONFIG_LINUX
@@ -XXX,XX +XXX,XX @@ void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c)
         }
 
         if (to_clean > 0) {
-            qcow2_cache_table_release(bs, c, i - to_clean, to_clean);
+            qcow2_cache_table_release(c, i - to_clean, to_clean);
         }
     }
 
@@ -XXX,XX +XXX,XX @@ int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c)
         c->entries[i].lru_counter = 0;
     }
 
-    qcow2_cache_table_release(bs, c, 0, c->size);
+    qcow2_cache_table_release(c, 0, c->size);
 
     c->lru_counter = 0;
 
@@ -XXX,XX +XXX,XX @@ void qcow2_cache_discard(BlockDriverState *bs, Qcow2Cache *c, void *table)
     c->entries[i].lru_counter = 0;
     c->entries[i].dirty = false;
 
-    qcow2_cache_table_release(bs, c, i, 1);
+    qcow2_cache_table_release(c, i, 1);
 }
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

This function was only using the BlockDriverState parameter to pass it
to qcow2_cache_get_table_idx(). This is no longer necessary so this
parameter can be removed.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 5c40516a91782b083c1428b7b6a41bb9e2679bfb.1517840876.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2.h          |  3 +--
 block/qcow2-cache.c    |  3 +--
 block/qcow2-cluster.c  | 12 ++++++------
 block/qcow2-refcount.c | 14 ++++++--------
 4 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -XXX,XX +XXX,XX @@ int qcow2_read_snapshots(BlockDriverState *bs);
 Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables);
 int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c);
 
-void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c,
-     void *table);
+void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table);
 int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c);
 int qcow2_cache_write(BlockDriverState *bs, Qcow2Cache *c);
 int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c,
diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -XXX,XX +XXX,XX @@ void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table)
     assert(c->entries[i].ref >= 0);
 }
 
-void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c,
-     void *table)
+void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table)
 {
     int i = qcow2_cache_get_table_idx(c, table);
     assert(c->entries[i].offset != 0);
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
     BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
 
     trace_qcow2_l2_allocate_write_l2(bs, l1_index);
-    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
+    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
     ret = qcow2_cache_flush(bs, s->l2_table_cache);
     if (ret < 0) {
         goto fail;
@@ -XXX,XX +XXX,XX @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
     /* compressed clusters never have the copied flag */
 
     BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
-    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
+    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
     l2_table[l2_index] = cpu_to_be64(cluster_offset);
     qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
 
@@ -XXX,XX +XXX,XX @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
     if (ret < 0) {
         goto err;
     }
-    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
+    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
 
     assert(l2_index + m->nb_clusters <= s->l2_size);
     for (i = 0; i < m->nb_clusters; i++) {
@@ -XXX,XX +XXX,XX @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
         }
 
         /* First remove L2 entries */
-        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
+        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
         if (!full_discard && s->qcow_version >= 3) {
             l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
         } else {
@@ -XXX,XX +XXX,XX @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
             continue;
         }
 
-        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
+        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
         if (cluster_type == QCOW2_CLUSTER_COMPRESSED || unmap) {
             l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
             qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
@@ -XXX,XX +XXX,XX @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
 
         if (is_active_l1) {
             if (l2_dirty) {
-                qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
+                qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
                 qcow2_cache_depends_on_flush(s->l2_table_cache);
             }
             qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -XXX,XX +XXX,XX @@ static int alloc_refcount_block(BlockDriverState *bs,
 
     /* Now the new refcount block needs to be written to disk */
     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
-    qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block);
+    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
     if (ret < 0) {
         goto fail;
@@ -XXX,XX +XXX,XX @@ int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                 goto fail;
             }
             memset(refblock_data, 0, s->cluster_size);
-            qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
+            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                          refblock_data);
 
             new_table[i] = block_offset;
@@ -XXX,XX +XXX,XX @@ int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                 s->set_refcount(refblock_data, j, 1);
             }
 
-            qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
+            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                          refblock_data);
         }
 
@@ -XXX,XX +XXX,XX @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
         }
         old_table_index = table_index;
 
-        qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
-                                     refcount_block);
+        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
 
         /* we can update the count and save it */
         block_index = cluster_index & (s->refcount_block_size - 1);
@@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                             s->refcount_block_cache);
                     }
                     l2_table[j] = cpu_to_be64(entry);
-                    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache,
-                                                 l2_table);
+                    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
                 }
             }
 
@@ -XXX,XX +XXX,XX @@ static int qcow2_discard_refcount_block(BlockDriverState *bs,
     }
     s->set_refcount(refblock, block_index, 0);
 
-    qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, refblock);
+    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);
 
     qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
 
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

This function was only using the BlockDriverState parameter to pass it
to qcow2_cache_get_table_idx(). This is no longer necessary so this
parameter can be removed.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 6f98155489054a457563da77cdad1a66ebb3e896.1517840876.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2.h          |  2 +-
 block/qcow2-cache.c    |  2 +-
 block/qcow2-cluster.c  | 28 ++++++++++++++--------------
 block/qcow2-refcount.c | 30 +++++++++++++++---------------
 4 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -XXX,XX +XXX,XX @@ int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
     void **table);
 int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
     void **table);
-void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
+void qcow2_cache_put(Qcow2Cache *c, void **table);
 void *qcow2_cache_is_table_offset(BlockDriverState *bs, Qcow2Cache *c,
                                   uint64_t offset);
 void qcow2_cache_discard(BlockDriverState *bs, Qcow2Cache *c, void *table);
diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -XXX,XX +XXX,XX @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
     return qcow2_cache_do_get(bs, c, offset, table, false);
 }
 
-void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table)
+void qcow2_cache_put(Qcow2Cache *c, void **table)
 {
     int i = qcow2_cache_get_table_idx(c, *table);
 
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
 
         memcpy(l2_table, old_table, s->cluster_size);
 
-        qcow2_cache_put(bs, s->l2_table_cache, (void **) &old_table);
+        qcow2_cache_put(s->l2_table_cache, (void **) &old_table);
     }
 
     /* write the l2 table to the file */
@@ -XXX,XX +XXX,XX @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
 fail:
     trace_qcow2_l2_allocate_done(bs, l1_index, ret);
     if (l2_table != NULL) {
-        qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
+        qcow2_cache_put(s->l2_table_cache, (void **) table);
     }
     s->l1_table[l1_index] = old_l2_offset;
     if (l2_offset > 0) {
@@ -XXX,XX +XXX,XX @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
         abort();
     }
 
-    qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
 
     bytes_available = (int64_t)c * s->cluster_size;
 
@@ -XXX,XX +XXX,XX @@ out:
     return type;
 
 fail:
-    qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **)&l2_table);
     return ret;
 }
 
@@ -XXX,XX +XXX,XX @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
      * allocated. */
     cluster_offset = be64_to_cpu(l2_table[l2_index]);
     if (cluster_offset & L2E_OFFSET_MASK) {
-        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+        qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
         return 0;
     }
 
     cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
     if (cluster_offset < 0) {
-        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+        qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
         return 0;
     }
 
@@ -XXX,XX +XXX,XX @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
     BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
     qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
     l2_table[l2_index] = cpu_to_be64(cluster_offset);
-    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
 
     return cluster_offset;
 }
@@ -XXX,XX +XXX,XX @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
      }
 
 
-    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
 
     /*
      * If this was a COW, we need to decrease the refcount of the old cluster.
@@ -XXX,XX +XXX,XX @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
 
     /* Cleanup */
 out:
-    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
 
     /* Only return a host offset if we actually made progress. Otherwise we
      * would make requirements for handle_alloc() that it can't fulfill */
@@ -XXX,XX +XXX,XX @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
         keep_old_clusters = true;
     }
 
-    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
 
     if (!alloc_cluster_offset) {
         /* Allocate, if necessary at a given offset in the image file */
@@ -XXX,XX +XXX,XX @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
         qcow2_free_any_clusters(bs, old_l2_entry, 1, type);
     }
 
-    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
 
     return nb_clusters;
 }
@@ -XXX,XX +XXX,XX @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
         }
     }
 
-    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
 
     return nb_clusters;
 }
@@ -XXX,XX +XXX,XX @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                 qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
                 qcow2_cache_depends_on_flush(s->l2_table_cache);
             }
-            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+            qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
         } else {
             if (l2_dirty) {
                 ret = qcow2_pre_write_overlap_check(bs,
@@ -XXX,XX +XXX,XX @@ fail:
         if (!is_active_l1) {
             qemu_vfree(l2_table);
         } else {
-            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+            qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
         }
     }
     return ret;
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -XXX,XX +XXX,XX @@ int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
     block_index = cluster_index & (s->refcount_block_size - 1);
     *refcount = s->get_refcount(refcount_block, block_index);
 
-    qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
+    qcow2_cache_put(s->refcount_block_cache, &refcount_block);
 
     return 0;
 }
@@ -XXX,XX +XXX,XX @@ static int alloc_refcount_block(BlockDriverState *bs,
         return -EAGAIN;
     }
 
-    qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
+    qcow2_cache_put(s->refcount_block_cache, refcount_block);
 
     /*
      * If we come here, we need to grow the refcount table. Again, a new
@@ -XXX,XX +XXX,XX @@ static int alloc_refcount_block(BlockDriverState *bs,
 
 fail:
     if (*refcount_block != NULL) {
-        qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
+        qcow2_cache_put(s->refcount_block_cache, refcount_block);
     }
     return ret;
 }
@@ -XXX,XX +XXX,XX @@ int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                                          refblock_data);
         }
 
-        qcow2_cache_put(bs, s->refcount_block_cache, &refblock_data);
+        qcow2_cache_put(s->refcount_block_cache, &refblock_data);
     }
 
     assert(block_offset == table_offset);
@@ -XXX,XX +XXX,XX @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
         /* Load the refcount block and allocate it if needed */
         if (table_index != old_table_index) {
             if (refcount_block) {
-                qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
+                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
             }
             ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
             if (ret < 0) {
@@ -XXX,XX +XXX,XX @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
             table = qcow2_cache_is_table_offset(bs, s->refcount_block_cache,
                                                 offset);
             if (table != NULL) {
-                qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
+                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
                 qcow2_cache_discard(bs, s->refcount_block_cache, table);
             }
 
@@ -XXX,XX +XXX,XX @@ fail:
 
     /* Write last changed block to disk */
     if (refcount_block) {
-        qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
+        qcow2_cache_put(s->refcount_block_cache, &refcount_block);
     }
 
     /*
@@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                 }
             }
 
-            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+            qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
 
             if (addend != 0) {
                 ret = qcow2_update_cluster_refcount(bs, l2_offset >>
@@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     ret = bdrv_flush(bs);
 fail:
     if (l2_table) {
-        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+        qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
     }
 
     s->cache_discards = false;
@@ -XXX,XX +XXX,XX @@ static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
                                     new_reftable_size, new_refblock,
                                     new_refblock_empty, allocated, errp);
                     if (ret < 0) {
-                        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+                        qcow2_cache_put(s->refcount_block_cache, &refblock);
                         return ret;
                     }
 
@@ -XXX,XX +XXX,XX @@ static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
                 if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
                     uint64_t offset;
 
-                    qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+                    qcow2_cache_put(s->refcount_block_cache, &refblock);
 
                     offset = ((reftable_index << s->refcount_block_bits)
                               + refblock_index) << s->cluster_bits;
@@ -XXX,XX +XXX,XX @@ static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
                 new_refblock_empty = new_refblock_empty && refcount == 0;
             }
 
-            qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+            qcow2_cache_put(s->refcount_block_cache, &refblock);
         } else {
             /* No refblock means every refcount is 0 */
             for (refblock_index = 0; refblock_index < s->refcount_block_size;
@@ -XXX,XX +XXX,XX @@ static int qcow2_discard_refcount_block(BlockDriverState *bs,
                                 offset_to_reftable_index(s, discard_block_offs),
                                 discard_block_offs,
                                 s->get_refcount(refblock, block_index));
-        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+        qcow2_cache_put(s->refcount_block_cache, &refblock);
         return -EINVAL;
     }
     s->set_refcount(refblock, block_index, 0);
 
     qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);
 
-    qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+    qcow2_cache_put(s->refcount_block_cache, &refblock);
 
     if (cluster_index < s->free_cluster_index) {
         s->free_cluster_index = cluster_index;
@@ -XXX,XX +XXX,XX @@ int qcow2_shrink_reftable(BlockDriverState *bs)
         } else {
             unused_block = buffer_is_zero(refblock, s->cluster_size);
         }
-        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+        qcow2_cache_put(s->refcount_block_cache, &refblock);
 
         reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
     }
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

This function was never using the BlockDriverState parameter so it can
be safely removed.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 49c74fe8b3aead9056e61a85b145ce787d06262b.1517840876.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2.h       |  2 +-
 block/qcow2-cache.c |  2 +-
 block/qcow2.c       | 16 ++++++++--------
 3 files changed, 10 insertions(+), 10 deletions(-)

From: Alberto Garcia <berto@igalia.com>

This function was only using the BlockDriverState parameter to pass it
to qcow2_cache_table_release(). This is no longer necessary so this
parameter can be removed.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: b74f17591af52f201de0ea3a3b2dd0a81932334d.1517840876.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2.h       | 2 +-
 block/qcow2-cache.c | 2 +-
 block/qcow2.c       | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -XXX,XX +XXX,XX @@ int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c,
     Qcow2Cache *dependency);
 void qcow2_cache_depends_on_flush(Qcow2Cache *c);
 
-void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c);
+void qcow2_cache_clean_unused(Qcow2Cache *c);
 int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c);
 
 int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -XXX,XX +XXX,XX @@ static inline bool can_clean_entry(Qcow2Cache *c, int i)
         t->lru_counter <= c->cache_clean_lru_counter;
 }
 
-void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c)
+void qcow2_cache_clean_unused(Qcow2Cache *c)
 {
     int i = 0;
     while (i < c->size) {
diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static void cache_clean_timer_cb(void *opaque)
 {
     BlockDriverState *bs = opaque;
     BDRVQcow2State *s = bs->opaque;
-    qcow2_cache_clean_unused(bs, s->l2_table_cache);
-    qcow2_cache_clean_unused(bs, s->refcount_block_cache);
+    qcow2_cache_clean_unused(s->l2_table_cache);
+    qcow2_cache_clean_unused(s->refcount_block_cache);
     timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
               (int64_t) s->cache_clean_interval * 1000);
 }
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

This function was only using the BlockDriverState parameter to pass it
to qcow2_cache_get_table_idx() and qcow2_cache_table_release(). This
is no longer necessary so this parameter can be removed.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 9724f7e38e763ad3be32627c6b7fe8df9edb1476.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2.h          | 2 +-
 block/qcow2-cache.c    | 2 +-
 block/qcow2-refcount.c | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -XXX,XX +XXX,XX @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
 void qcow2_cache_put(Qcow2Cache *c, void **table);
 void *qcow2_cache_is_table_offset(BlockDriverState *bs, Qcow2Cache *c,
                                   uint64_t offset);
-void qcow2_cache_discard(BlockDriverState *bs, Qcow2Cache *c, void *table);
+void qcow2_cache_discard(Qcow2Cache *c, void *table);
 
 /* qcow2-bitmap.c functions */
 int qcow2_check_bitmaps_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -XXX,XX +XXX,XX @@ void *qcow2_cache_is_table_offset(BlockDriverState *bs, Qcow2Cache *c,
     return NULL;
 }
 
-void qcow2_cache_discard(BlockDriverState *bs, Qcow2Cache *c, void *table)
+void qcow2_cache_discard(Qcow2Cache *c, void *table)
 {
     int i = qcow2_cache_get_table_idx(c, table);
 
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -XXX,XX +XXX,XX @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                                                 offset);
             if (table != NULL) {
                 qcow2_cache_put(s->refcount_block_cache, &refcount_block);
-                qcow2_cache_discard(bs, s->refcount_block_cache, table);
+                qcow2_cache_discard(s->refcount_block_cache, table);
             }
 
             table = qcow2_cache_is_table_offset(bs, s->l2_table_cache, offset);
             if (table != NULL) {
-                qcow2_cache_discard(bs, s->l2_table_cache, table);
+                qcow2_cache_discard(s->l2_table_cache, table);
             }
 
             if (s->discard_passthrough[type]) {
@@ -XXX,XX +XXX,XX @@ static int qcow2_discard_refcount_block(BlockDriverState *bs,
                                            discard_block_offs);
     if (refblock) {
         /* discard refblock from the cache if refblock is cached */
-        qcow2_cache_discard(bs, s->refcount_block_cache, refblock);
+        qcow2_cache_discard(s->refcount_block_cache, refblock);
     }
     update_refcount_discard(bs, discard_block_offs, s->cluster_size);
 
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

This function was only using the BlockDriverState parameter to pass it
to qcow2_cache_get_table_addr(). This is no longer necessary so this
parameter can be removed.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: eb0ed90affcf302e5a954bafb5931b5215483d3a.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2.h          | 3 +--
 block/qcow2-cache.c    | 3 +--
 block/qcow2-refcount.c | 6 +++---
 3 files changed, 5 insertions(+), 7 deletions(-)

From: Alberto Garcia <berto@igalia.com>

Similar to offset_to_l2_index(), this function returns the index in
the L1 table for a given guest offset. This is only used in a couple
of places and it's not a particularly complex calculation, but it
makes the code a bit more readable.

Although in the qcow2_get_cluster_offset() case the old code was
taking advantage of the l1_bits variable, we're going to get rid of
the other uses of l1_bits in a later patch anyway, so it doesn't make
sense to keep it just for this.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: a5f626fed526b7459a0425fad06d823d18df8522.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2.h         | 5 +++++
 block/qcow2-cluster.c | 4 ++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -XXX,XX +XXX,XX @@ static inline int64_t size_to_l1(BDRVQcow2State *s, int64_t size)
     return (size + (1ULL << shift) - 1) >> shift;
 }
 
+static inline int offset_to_l1_index(BDRVQcow2State *s, uint64_t offset)
+{
+    return offset >> (s->l2_bits + s->cluster_bits);
+}
+
 static inline int offset_to_l2_index(BDRVQcow2State *s, int64_t offset)
 {
     return (offset >> s->cluster_bits) & (s->l2_size - 1);
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
 
     /* seek to the l2 offset in the l1 table */
 
-    l1_index = offset >> l1_bits;
+    l1_index = offset_to_l1_index(s, offset);
     if (l1_index >= s->l1_size) {
         type = QCOW2_CLUSTER_UNALLOCATED;
         goto out;
@@ -XXX,XX +XXX,XX @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
 
     /* seek to the l2 offset in the l1 table */
 
-    l1_index = offset >> (s->l2_bits + s->cluster_bits);
+    l1_index = offset_to_l1_index(s, offset);
     if (l1_index >= s->l1_size) {
         ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
         if (ret < 0) {
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

The BDRVQcow2State structure contains an l2_size field, which stores
the number of 64-bit entries in an L2 table.

For efficiency reasons we want to be able to load slices instead of
full L2 tables, so we need to know how many entries an L2 slice can
hold.

An L2 slice is the portion of an L2 table that is loaded by the qcow2
cache. At the moment that cache can only load complete tables,
therefore an L2 slice has the same size as an L2 table (one cluster)
and l2_size == l2_slice_size.

Later we'll allow smaller slices, but until then we have to use this
new l2_slice_size field to make the rest of the code ready for that.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: adb048595f9fb5dfb110c802a8b3c3be3b937f37.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2.h | 1 +
 block/qcow2.c | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/block/qcow2.h b/block/qcow2.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -XXX,XX +XXX,XX @@ typedef struct BDRVQcow2State {
     int cluster_bits;
     int cluster_size;
     int cluster_sectors;
+    int l2_slice_size;
     int l2_bits;
     int l2_size;
     int l1_size;
diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
 typedef struct Qcow2ReopenState {
     Qcow2Cache *l2_table_cache;
     Qcow2Cache *refcount_block_cache;
+    int l2_slice_size; /* Number of entries in a slice of the L2 table */
     bool use_lazy_refcounts;
     int overlap_check;
     bool discard_passthrough[QCOW2_DISCARD_MAX];
@@ -XXX,XX +XXX,XX @@ static int qcow2_update_options_prepare(BlockDriverState *bs,
         }
     }
 
+    r->l2_slice_size = s->cluster_size / sizeof(uint64_t);
     r->l2_table_cache = qcow2_cache_create(bs, l2_cache_size);
     r->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size);
     if (r->l2_table_cache == NULL || r->refcount_block_cache == NULL) {
@@ -XXX,XX +XXX,XX @@ static void qcow2_update_options_commit(BlockDriverState *bs,
     }
     s->l2_table_cache = r->l2_table_cache;
     s->refcount_block_cache = r->refcount_block_cache;
+    s->l2_slice_size = r->l2_slice_size;
 
     s->overlap_check = r->overlap_check;
     s->use_lazy_refcounts = r->use_lazy_refcounts;
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

Similar to offset_to_l2_index(), offset_to_l2_slice_index() takes a
guest offset and returns the index of its L2 entry within the L2 slice
that contains it.

An L2 slice currently has the same size as an L2 table (one cluster),
so both functions return the same value for now.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: a1c45c5c5a76146dd1712d8d1e7b409ad539c718.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/block/qcow2.h b/block/qcow2.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -XXX,XX +XXX,XX @@ static inline int offset_to_l2_index(BDRVQcow2State *s, int64_t offset)
     return (offset >> s->cluster_bits) & (s->l2_size - 1);
 }
 
+static inline int offset_to_l2_slice_index(BDRVQcow2State *s, int64_t offset)
+{
+    return (offset >> s->cluster_bits) & (s->l2_slice_size - 1);
+}
+
 static inline int64_t align_offset(int64_t offset, int n)
 {
     offset = (offset + n - 1) & ~(n - 1);
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

Each entry in the qcow2 L2 cache stores a full L2 table (which uses a
complete cluster in the qcow2 image). A cluster is usually too large
to be used efficiently as the size for a cache entry, so we want to
decouple both values by allowing smaller cache entries. Therefore the
qcow2 L2 cache will no longer return full L2 tables but slices
instead.

This patch updates l2_load() so it can handle L2 slices correctly.
Apart from the offset of the L2 table (which we already had) we also
need the guest offset in order to calculate which one of the slices
we need.

An L2 slice currently has the same size as an L2 table (one cluster),
so for now this function will load exactly the same data as before.

This patch also removes a stale comment about the return value being
a pointer to the L2 table. This function has returned an error code
since commit 55c17e9821c474d5fcdebdc82ed2fc096777d611.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: b830aa1fc5b6f8e3cb331d006853fe22facca847.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
 /*
  * l2_load
  *
- * Loads a L2 table into memory. If the table is in the cache, the cache
- * is used; otherwise the L2 table is loaded from the image file.
+ * @bs: The BlockDriverState
+ * @offset: A guest offset, used to calculate what slice of the L2
+ *          table to load.
+ * @l2_offset: Offset to the L2 table in the image file.
+ * @l2_slice: Location to store the pointer to the L2 slice.
  *
- * Returns a pointer to the L2 table on success, or NULL if the read from
- * the image file failed.
+ * Loads a L2 slice into memory (L2 slices are the parts of L2 tables
+ * that are loaded by the qcow2 cache). If the slice is in the cache,
+ * the cache is used; otherwise the L2 slice is loaded from the image
+ * file.
  */
-
-static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
-    uint64_t **l2_table)
+static int l2_load(BlockDriverState *bs, uint64_t offset,
+                   uint64_t l2_offset, uint64_t **l2_slice)
 {
     BDRVQcow2State *s = bs->opaque;
+    int start_of_slice = sizeof(uint64_t) *
+        (offset_to_l2_index(s, offset) - offset_to_l2_slice_index(s, offset));
 
-    return qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
-                           (void **)l2_table);
+    return qcow2_cache_get(bs, s->l2_table_cache, l2_offset + start_of_slice,
+                           (void **)l2_slice);
 }
 
 /*
@@ -XXX,XX +XXX,XX @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
 
     /* load the l2 table in memory */
 
-    ret = l2_load(bs, l2_offset, &l2_table);
+    ret = l2_load(bs, offset, l2_offset, &l2_table);
     if (ret < 0) {
         return ret;
     }
@@ -XXX,XX +XXX,XX @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
 
     if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) {
         /* load the l2 table in memory */
-        ret = l2_load(bs, l2_offset, &l2_table);
+        ret = l2_load(bs, offset, l2_offset, &l2_table);
         if (ret < 0) {
             return ret;
         }
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

Adding support for L2 slices to l2_allocate() needs (among other
things) an extra loop that iterates over all slices of a new L2 table.

Putting all changes in one patch would make it hard to read because
all semantic changes would be mixed with pure indentation changes.

To make things easier this patch simply creates a new block and
changes the indentation of all lines of code inside it. Thus, all
modifications in this patch are cosmetic. There are no semantic
changes and no variables are renamed yet. The next patch will take
care of that.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: d0d7dca8520db304524f52f49d8157595a707a35.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 53 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
     /* allocate a new entry in the l2 cache */
 
     trace_qcow2_l2_allocate_get_empty(bs, l1_index);
-    ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table);
-    if (ret < 0) {
-        goto fail;
-    }
+    {
+        ret = qcow2_cache_get_empty(bs, s->l2_table_cache,
+                                    l2_offset,
+                                    (void **) table);
+        if (ret < 0) {
+            goto fail;
+        }
 
-    l2_table = *table;
+        l2_table = *table;
 
-    if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
-        /* if there was no old l2 table, clear the new table */
-        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
-    } else {
-        uint64_t* old_table;
+        if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
+            /* if there was no old l2 table, clear the new table */
+            memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+        } else {
+            uint64_t *old_table;
 
-        /* if there was an old l2 table, read it from the disk */
-        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
-        ret = qcow2_cache_get(bs, s->l2_table_cache,
-            old_l2_offset & L1E_OFFSET_MASK,
-            (void**) &old_table);
-        if (ret < 0) {
-            goto fail;
+            /* if there was an old l2 table, read it from the disk */
+            BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
+            ret = qcow2_cache_get(bs, s->l2_table_cache,
+                                  old_l2_offset & L1E_OFFSET_MASK,
+                                  (void **) &old_table);
+            if (ret < 0) {
+                goto fail;
+            }
+
+            memcpy(l2_table, old_table, s->cluster_size);
+
+            qcow2_cache_put(s->l2_table_cache, (void **) &old_table);
         }
 
-        memcpy(l2_table, old_table, s->cluster_size);
+        /* write the l2 table to the file */
+        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
 
-        qcow2_cache_put(s->l2_table_cache, (void **) &old_table);
+        trace_qcow2_l2_allocate_write_l2(bs, l1_index);
+        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
     }
 
-    /* write the l2 table to the file */
-    BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
-
-    trace_qcow2_l2_allocate_write_l2(bs, l1_index);
-    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
     ret = qcow2_cache_flush(bs, s->l2_table_cache);
     if (ret < 0) {
         goto fail;
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

This patch updates l2_allocate() to support the qcow2 cache returning
L2 slices instead of full L2 tables.

The old code simply gets an L2 table from the cache and initializes it
with zeroes or with the contents of an existing table. With a cache
that returns slices instead of tables the idea remains the same, but
the code must now iterate over all the slices that are contained in an
L2 table.

Since we're now operating with slices, the function can no longer
return the newly-allocated table, so it's up to the caller to retrieve
the appropriate L2 slice after calling l2_allocate() (note that with
this patch the caller is still loading full L2 tables, but we'll deal
with that in a separate patch).

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20fc0415bf0e011e29f6487ec86eb06a11f37445.1517840877.git.berto@igalia.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 56 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
  *
  */
 
-static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
+static int l2_allocate(BlockDriverState *bs, int l1_index)
 {
     BDRVQcow2State *s = bs->opaque;
     uint64_t old_l2_offset;
-    uint64_t *l2_table = NULL;
+    uint64_t *l2_slice = NULL;
+    unsigned slice, slice_size2, n_slices;
     int64_t l2_offset;
     int ret;
 
@@ -XXX,XX +XXX,XX @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
 
     /* allocate a new entry in the l2 cache */
 
+    slice_size2 = s->l2_slice_size * sizeof(uint64_t);
+    n_slices = s->cluster_size / slice_size2;
+
     trace_qcow2_l2_allocate_get_empty(bs, l1_index);
-    {
+    for (slice = 0; slice < n_slices; slice++) {
         ret = qcow2_cache_get_empty(bs, s->l2_table_cache,
-                                    l2_offset,
-                                    (void **) table);
+                                    l2_offset + slice * slice_size2,
+                                    (void **) &l2_slice);
         if (ret < 0) {
             goto fail;
         }
 
-        l2_table = *table;
-
         if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
-            /* if there was no old l2 table, clear the new table */
-            memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+            /* if there was no old l2 table, clear the new slice */
+            memset(l2_slice, 0, slice_size2);
         } else {
-            uint64_t *old_table;
+            uint64_t *old_slice;
+            uint64_t old_l2_slice_offset =
+                (old_l2_offset & L1E_OFFSET_MASK) + slice * slice_size2;
 
-            /* if there was an old l2 table, read it from the disk */
+            /* if there was an old l2 table, read a slice from the disk */
             BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
-            ret = qcow2_cache_get(bs, s->l2_table_cache,
-                                  old_l2_offset & L1E_OFFSET_MASK,
-                                  (void **) &old_table);
+            ret = qcow2_cache_get(bs, s->l2_table_cache, old_l2_slice_offset,
+                                  (void **) &old_slice);
             if (ret < 0) {
                 goto fail;
             }
 
-            memcpy(l2_table, old_table, s->cluster_size);
+            memcpy(l2_slice, old_slice, slice_size2);
 
-            qcow2_cache_put(s->l2_table_cache, (void **) &old_table);
+            qcow2_cache_put(s->l2_table_cache, (void **) &old_slice);
         }
 
-        /* write the l2 table to the file */
+        /* write the l2 slice to the file */
         BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
 
         trace_qcow2_l2_allocate_write_l2(bs, l1_index);
-        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
+        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
     }
 
     ret = qcow2_cache_flush(bs, s->l2_table_cache);
@@ -XXX,XX +XXX,XX @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
         goto fail;
     }
 
-    *table = l2_table;
     trace_qcow2_l2_allocate_done(bs, l1_index, 0);
     return 0;
 
 fail:
     trace_qcow2_l2_allocate_done(bs, l1_index, ret);
-    if (l2_table != NULL) {
-        qcow2_cache_put(s->l2_table_cache, (void **) table);
+    if (l2_slice != NULL) {
+        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
     }
     s->l1_table[l1_index] = old_l2_offset;
     if (l2_offset > 0) {
@@ -XXX,XX +XXX,XX @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
         }
     } else {
         /* First allocate a new L2 table (and do COW if needed) */
-        ret = l2_allocate(bs, l1_index, &l2_table);
+        ret = l2_allocate(bs, l1_index);
         if (ret < 0) {
             return ret;
         }
@@ -XXX,XX +XXX,XX @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
             qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
                                 QCOW2_DISCARD_OTHER);
         }
+
+        /* Get the offset of the newly-allocated l2 table */
+        l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
+        assert(offset_into_cluster(s, l2_offset) == 0);
+        /* Load the l2 table in memory */
+        ret = l2_load(bs, offset, l2_offset, &l2_table);
+        if (ret < 0) {
+            return ret;
+        }
     }
 
     /* find the cluster offset for the given disk offset */
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

After the previous patch we're now always using l2_load() in
get_cluster_table() regardless of whether a new L2 table has to be
allocated or not.

This patch refactors that part of the code to use one single l2_load()
call.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: ce31758c4a1fadccea7a6ccb93951eb01d95fd4c.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
         return -EIO;
     }
 
-    /* seek the l2 table of the given l2 offset */
-
-    if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) {
-        /* load the l2 table in memory */
-        ret = l2_load(bs, offset, l2_offset, &l2_table);
-        if (ret < 0) {
-            return ret;
-        }
-    } else {
+    if (!(s->l1_table[l1_index] & QCOW_OFLAG_COPIED)) {
         /* First allocate a new L2 table (and do COW if needed) */
         ret = l2_allocate(bs, l1_index);
         if (ret < 0) {
@@ -XXX,XX +XXX,XX @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
         /* Get the offset of the newly-allocated l2 table */
         l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
         assert(offset_into_cluster(s, l2_offset) == 0);
-        /* Load the l2 table in memory */
-        ret = l2_load(bs, offset, l2_offset, &l2_table);
-        if (ret < 0) {
-            return ret;
-        }
+    }
+
+    /* load the l2 table in memory */
+    ret = l2_load(bs, offset, l2_offset, &l2_table);
+    if (ret < 0) {
+        return ret;
     }
 
     /* find the cluster offset for the given disk offset */
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

This patch updates get_cluster_table() to return L2 slices instead of
full L2 tables.

The code itself needs almost no changes, it only needs to call
offset_to_l2_slice_index() instead of offset_to_l2_index(). This patch
also renames all the relevant variables and the documentation.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 64cf064c0021ba315d3f3032da0f95db1b615f33.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ fail:
  * get_cluster_table
  *
  * for a given disk offset, load (and allocate if needed)
- * the l2 table.
+ * the appropriate slice of its l2 table.
  *
- * the cluster index in the l2 table is given to the caller.
+ * the cluster index in the l2 slice is given to the caller.
  *
  * Returns 0 on success, -errno in failure case
  */
 static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
-                             uint64_t **new_l2_table,
+                             uint64_t **new_l2_slice,
                              int *new_l2_index)
 {
     BDRVQcow2State *s = bs->opaque;
     unsigned int l2_index;
     uint64_t l1_index, l2_offset;
-    uint64_t *l2_table = NULL;
+    uint64_t *l2_slice = NULL;
     int ret;
 
     /* seek to the l2 offset in the l1 table */
@@ -XXX,XX +XXX,XX @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
         assert(offset_into_cluster(s, l2_offset) == 0);
     }
 
-    /* load the l2 table in memory */
-    ret = l2_load(bs, offset, l2_offset, &l2_table);
+    /* load the l2 slice in memory */
+    ret = l2_load(bs, offset, l2_offset, &l2_slice);
     if (ret < 0) {
         return ret;
     }
 
     /* find the cluster offset for the given disk offset */
 
-    l2_index = offset_to_l2_index(s, offset);
+    l2_index = offset_to_l2_slice_index(s, offset);
 
-    *new_l2_table = l2_table;
+    *new_l2_slice = l2_slice;
     *new_l2_index = l2_index;
 
     return 0;
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

qcow2_get_cluster_offset() checks how many contiguous bytes are
available at a given offset. The returned number of bytes is limited
by the amount that can be addressed without having to load more than
one L2 table.

Since we'll be loading L2 slices instead of full tables this patch
changes the limit accordingly using the size of the L2 slice for the
calculations instead of the full table size.

One consequence of this is that, with small L2 slices, operations such
as 'qemu-img map' will need to iterate in more steps because each
qcow2_get_cluster_offset() call will potentially return a smaller
number of bytes. However, the code is already prepared for that, so
this doesn't break semantics.

The l2_table variable is also renamed to l2_slice to reflect this, and
offset_to_l2_index() is replaced with offset_to_l2_slice_index().

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 6b602260acb33da56ed6af9611731cb7acd110eb.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
 {
     BDRVQcow2State *s = bs->opaque;
     unsigned int l2_index;
-    uint64_t l1_index, l2_offset, *l2_table;
-    int l1_bits, c;
+    uint64_t l1_index, l2_offset, *l2_slice;
+    int c;
     unsigned int offset_in_cluster;
     uint64_t bytes_available, bytes_needed, nb_clusters;
     QCow2ClusterType type;
@@ -XXX,XX +XXX,XX @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
     offset_in_cluster = offset_into_cluster(s, offset);
     bytes_needed = (uint64_t) *bytes + offset_in_cluster;
 
-    l1_bits = s->l2_bits + s->cluster_bits;
-
     /* compute how many bytes there are between the start of the cluster
-     * containing offset and the end of the l1 entry */
-    bytes_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1))
-                    + offset_in_cluster;
+     * containing offset and the end of the l2 slice that contains
+     * the entry pointing to it */
+    bytes_available =
+        ((uint64_t) (s->l2_slice_size - offset_to_l2_slice_index(s, offset)))
+        << s->cluster_bits;
 
     if (bytes_needed > bytes_available) {
         bytes_needed = bytes_available;
@@ -XXX,XX +XXX,XX @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
         return -EIO;
     }
 
-    /* load the l2 table in memory */
+    /* load the l2 slice in memory */
 
-    ret = l2_load(bs, offset, l2_offset, &l2_table);
+    ret = l2_load(bs, offset, l2_offset, &l2_slice);
     if (ret < 0) {
         return ret;
     }
 
     /* find the cluster offset for the given disk offset */
 
-    l2_index = offset_to_l2_index(s, offset);
-    *cluster_offset = be64_to_cpu(l2_table[l2_index]);
+    l2_index = offset_to_l2_slice_index(s, offset);
+    *cluster_offset = be64_to_cpu(l2_slice[l2_index]);
 
     nb_clusters = size_to_clusters(s, bytes_needed);
     /* bytes_needed <= *bytes + offset_in_cluster, both of which are unsigned
@@ -XXX,XX +XXX,XX @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
     case QCOW2_CLUSTER_UNALLOCATED:
         /* how many empty clusters ? */
         c = count_contiguous_clusters_unallocated(nb_clusters,
-                                                  &l2_table[l2_index], type);
+                                                  &l2_slice[l2_index], type);
         *cluster_offset = 0;
         break;
     case QCOW2_CLUSTER_ZERO_ALLOC:
     case QCOW2_CLUSTER_NORMAL:
         /* how many allocated clusters ? */
         c = count_contiguous_clusters(nb_clusters, s->cluster_size,
-                                      &l2_table[l2_index], QCOW_OFLAG_ZERO);
+                                      &l2_slice[l2_index], QCOW_OFLAG_ZERO);
         *cluster_offset &= L2E_OFFSET_MASK;
         if (offset_into_cluster(s, *cluster_offset)) {
             qcow2_signal_corruption(bs, true, -1, -1,
@@ -XXX,XX +XXX,XX @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
         abort();
     }
 
-    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
 
     bytes_available = (int64_t)c * s->cluster_size;
 
@@ -XXX,XX +XXX,XX @@ out:
     return type;
 
 fail:
-    qcow2_cache_put(s->l2_table_cache, (void **)&l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **)&l2_slice);
     return ret;
 }
 
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

There's a loop in qcow2_alloc_cluster_link_l2() that iterates over the
L2 entries in a table, so now we need to assert that it remains within
the limits of an L2 slice.

Apart from that, this function doesn't need any additional changes, so
this patch simply updates the variable name from l2_table to l2_slice.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: f9846a1c2efc51938e877e2a25852d9ab14797ff.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
 {
     BDRVQcow2State *s = bs->opaque;
     int i, j = 0, l2_index, ret;
-    uint64_t *old_cluster, *l2_table;
+    uint64_t *old_cluster, *l2_slice;
     uint64_t cluster_offset = m->alloc_offset;
 
     trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
@@ -XXX,XX +XXX,XX @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
                                    s->refcount_block_cache);
     }
 
-    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
+    ret = get_cluster_table(bs, m->offset, &l2_slice, &l2_index);
     if (ret < 0) {
         goto err;
     }
-    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
 
-    assert(l2_index + m->nb_clusters <= s->l2_size);
+    assert(l2_index + m->nb_clusters <= s->l2_slice_size);
     for (i = 0; i < m->nb_clusters; i++) {
         /* if two concurrent writes happen to the same unallocated cluster
          * each write allocates separate cluster and writes data concurrently.
@@ -XXX,XX +XXX,XX @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
          * cluster the second one has to do RMW (which is done above by
          * perform_cow()), update l2 table with its cluster pointer and free
          * old cluster. This is what this loop does */
-        if (l2_table[l2_index + i] != 0) {
-            old_cluster[j++] = l2_table[l2_index + i];
+        if (l2_slice[l2_index + i] != 0) {
+            old_cluster[j++] = l2_slice[l2_index + i];
         }
 
-        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
+        l2_slice[l2_index + i] = cpu_to_be64((cluster_offset +
                     (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
      }
 
 
-    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
 
     /*
      * If this was a COW, we need to decrease the refcount of the old cluster.
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

handle_copied() loads an L2 table and limits the number of checked
clusters to the amount that fits inside that table. Since we'll be
loading L2 slices instead of full tables we need to update that limit.

Apart from that, this function doesn't need any additional changes, so
this patch simply updates the variable name from l2_table to l2_slice.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 541ac001a7d6b86bab2392554bee53c2b312148c.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
     BDRVQcow2State *s = bs->opaque;
     int l2_index;
     uint64_t cluster_offset;
-    uint64_t *l2_table;
+    uint64_t *l2_slice;
     uint64_t nb_clusters;
     unsigned int keep_clusters;
     int ret;
@@ -XXX,XX +XXX,XX @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
                                 == offset_into_cluster(s, *host_offset));
 
     /*
-     * Calculate the number of clusters to look for. We stop at L2 table
+     * Calculate the number of clusters to look for. We stop at L2 slice
      * boundaries to keep things simple.
      */
     nb_clusters =
         size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
 
-    l2_index = offset_to_l2_index(s, guest_offset);
-    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+    l2_index = offset_to_l2_slice_index(s, guest_offset);
+    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
     assert(nb_clusters <= INT_MAX);
 
     /* Find L2 entry for the first involved cluster */
-    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
+    ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
     if (ret < 0) {
         return ret;
     }
 
-    cluster_offset = be64_to_cpu(l2_table[l2_index]);
+    cluster_offset = be64_to_cpu(l2_slice[l2_index]);
 
     /* Check how many clusters are already allocated and don't need COW */
     if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
@@ -XXX,XX +XXX,XX @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
         /* We keep all QCOW_OFLAG_COPIED clusters */
         keep_clusters =
             count_contiguous_clusters(nb_clusters, s->cluster_size,
-                                      &l2_table[l2_index],
+                                      &l2_slice[l2_index],
                                       QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
         assert(keep_clusters <= nb_clusters);
 
@@ -XXX,XX +XXX,XX @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
 
     /* Cleanup */
 out:
-    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
 
     /* Only return a host offset if we actually made progress. Otherwise we
      * would make requirements for handle_alloc() that it can't fulfill */
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

handle_alloc() loads an L2 table and limits the number of checked
clusters to the amount that fits inside that table. Since we'll be
loading L2 slices instead of full tables we need to update that limit.

Apart from that, this function doesn't need any additional changes, so
this patch simply updates the variable name from l2_table to l2_slice.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: b243299c7136f7014c5af51665431ddbf5e99afd.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
 {
     BDRVQcow2State *s = bs->opaque;
     int l2_index;
-    uint64_t *l2_table;
+    uint64_t *l2_slice;
     uint64_t entry;
     uint64_t nb_clusters;
     int ret;
@@ -XXX,XX +XXX,XX @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
     assert(*bytes > 0);
 
     /*
-     * Calculate the number of clusters to look for. We stop at L2 table
+     * Calculate the number of clusters to look for. We stop at L2 slice
      * boundaries to keep things simple.
      */
     nb_clusters =
         size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
 
-    l2_index = offset_to_l2_index(s, guest_offset);
-    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+    l2_index = offset_to_l2_slice_index(s, guest_offset);
+    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
     assert(nb_clusters <= INT_MAX);
 
     /* Find L2 entry for the first involved cluster */
-    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
+    ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
     if (ret < 0) {
         return ret;
     }
 
-    entry = be64_to_cpu(l2_table[l2_index]);
+    entry = be64_to_cpu(l2_slice[l2_index]);
 
     /* For the moment, overwrite compressed clusters one by one */
     if (entry & QCOW_OFLAG_COMPRESSED) {
         nb_clusters = 1;
     } else {
-        nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index);
+        nb_clusters = count_cow_clusters(s, nb_clusters, l2_slice, l2_index);
     }
 
     /* This function is only called when there were no non-COW clusters, so if
@@ -XXX,XX +XXX,XX @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
          * nb_clusters already to a range of COW clusters */
         preallocated_nb_clusters =
             count_contiguous_clusters(nb_clusters, s->cluster_size,
-                                      &l2_table[l2_index], QCOW_OFLAG_COPIED);
+                                      &l2_slice[l2_index], QCOW_OFLAG_COPIED);
         assert(preallocated_nb_clusters > 0);
 
         nb_clusters = preallocated_nb_clusters;
@@ -XXX,XX +XXX,XX @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
         keep_old_clusters = true;
     }
 
-    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
 
     if (!alloc_cluster_offset) {
         /* Allocate, if necessary at a given offset in the image file */
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

discard_single_l2() limits the number of clusters to be discarded
to the amount that fits inside an L2 table. Since we'll be loading
L2 slices instead of full tables we need to update that limit. The
function is renamed to discard_in_l2_slice() for clarity.

Apart from that, this function doesn't need any additional changes, so
this patch simply updates the variable name from l2_table to l2_slice.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Message-id: 1cb44a5b68be5334cb01b97a3db3a3c5a43396e5.1517840877.git.berto@igalia.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
 
 /*
  * This discards as many clusters of nb_clusters as possible at once (i.e.
- * all clusters in the same L2 table) and returns the number of discarded
+ * all clusters in the same L2 slice) and returns the number of discarded
  * clusters.
  */
-static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
-                             uint64_t nb_clusters, enum qcow2_discard_type type,
-                             bool full_discard)
+static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
+                               uint64_t nb_clusters,
+                               enum qcow2_discard_type type, bool full_discard)
 {
     BDRVQcow2State *s = bs->opaque;
-    uint64_t *l2_table;
+    uint64_t *l2_slice;
     int l2_index;
     int ret;
     int i;
 
-    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
     if (ret < 0) {
         return ret;
     }
 
-    /* Limit nb_clusters to one L2 table */
-    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+    /* Limit nb_clusters to one L2 slice */
+    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
     assert(nb_clusters <= INT_MAX);
 
     for (i = 0; i < nb_clusters; i++) {
         uint64_t old_l2_entry;
 
-        old_l2_entry = be64_to_cpu(l2_table[l2_index + i]);
+        old_l2_entry = be64_to_cpu(l2_slice[l2_index + i]);
 
         /*
          * If full_discard is false, make sure that a discarded area reads back
@@ -XXX,XX +XXX,XX @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
         }
 
         /* First remove L2 entries */
-        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
         if (!full_discard && s->qcow_version >= 3) {
-            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
+            l2_slice[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
         } else {
-            l2_table[l2_index + i] = cpu_to_be64(0);
+            l2_slice[l2_index + i] = cpu_to_be64(0);
         }
 
         /* Then decrease the refcount */
         qcow2_free_any_clusters(bs, old_l2_entry, 1, type);
     }
 
-    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
 
     return nb_clusters;
 }
@@ -XXX,XX +XXX,XX @@ int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
 
     s->cache_discards = true;
 
-    /* Each L2 table is handled by its own loop iteration */
+    /* Each L2 slice is handled by its own loop iteration */
     while (nb_clusters > 0) {
-        cleared = discard_single_l2(bs, offset, nb_clusters, type,
-                                    full_discard);
+        cleared = discard_in_l2_slice(bs, offset, nb_clusters, type,
+                                      full_discard);
         if (cleared < 0) {
             ret = cleared;
             goto fail;
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

zero_single_l2() limits the number of clusters to be zeroed to the
amount that fits inside an L2 table. Since we'll be loading L2 slices
instead of full tables we need to update that limit. The function is
renamed to zero_in_l2_slice() for clarity.

Apart from that, this function doesn't need any additional changes, so
this patch simply updates the variable name from l2_table to l2_slice.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Message-id: ebc16e7e79fa6969d8975ef487d679794de4fbcc.1517840877.git.berto@igalia.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ fail:
 
 /*
  * This zeroes as many clusters of nb_clusters as possible at once (i.e.
- * all clusters in the same L2 table) and returns the number of zeroed
+ * all clusters in the same L2 slice) and returns the number of zeroed
  * clusters.
  */
-static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
-                          uint64_t nb_clusters, int flags)
+static int zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
+                            uint64_t nb_clusters, int flags)
 {
     BDRVQcow2State *s = bs->opaque;
-    uint64_t *l2_table;
+    uint64_t *l2_slice;
     int l2_index;
     int ret;
     int i;
     bool unmap = !!(flags & BDRV_REQ_MAY_UNMAP);
 
-    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
     if (ret < 0) {
         return ret;
     }
 
-    /* Limit nb_clusters to one L2 table */
-    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+    /* Limit nb_clusters to one L2 slice */
+    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
     assert(nb_clusters <= INT_MAX);
 
     for (i = 0; i < nb_clusters; i++) {
         uint64_t old_offset;
         QCow2ClusterType cluster_type;
 
-        old_offset = be64_to_cpu(l2_table[l2_index + i]);
+        old_offset = be64_to_cpu(l2_slice[l2_index + i]);
 
         /*
          * Minimize L2 changes if the cluster already reads back as
@@ -XXX,XX +XXX,XX @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
             continue;
         }
 
-        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
         if (cluster_type == QCOW2_CLUSTER_COMPRESSED || unmap) {
-            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
+            l2_slice[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
             qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
         } else {
-            l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
+            l2_slice[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
         }
     }
 
-    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
 
     return nb_clusters;
 }
@@ -XXX,XX +XXX,XX @@ int qcow2_cluster_zeroize(BlockDriverState *bs, uint64_t offset,
         return -ENOTSUP;
     }
 
-    /* Each L2 table is handled by its own loop iteration */
+    /* Each L2 slice is handled by its own loop iteration */
     nb_clusters = size_to_clusters(s, bytes);
 
     s->cache_discards = true;
 
     while (nb_clusters > 0) {
-        cleared = zero_single_l2(bs, offset, nb_clusters, flags);
+        cleared = zero_in_l2_slice(bs, offset, nb_clusters, flags);
         if (cleared < 0) {
             ret = cleared;
             goto fail;
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

Adding support for L2 slices to qcow2_update_snapshot_refcount() needs
(among other things) an extra loop that iterates over all slices of
each L2 table.

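Putting all changes in one patch would make it hard to read because
all semantic changes would be mixed with pure indentation changes.
This patch therefore only creates a new block and re-indents the code
inside it; it makes no semantic changes.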

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 8ffaa5e55bd51121f80e498f4045b64902a94293.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-refcount.c | 144 +++++++++++++++++++++++++------------------------
 1 file changed, 75 insertions(+), 69 deletions(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                 goto fail;
             }
 
-            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
-                (void**) &l2_table);
-            if (ret < 0) {
-                goto fail;
-            }
+            {
+                ret = qcow2_cache_get(bs, s->l2_table_cache,
+                                      l2_offset,
+                                      (void **) &l2_table);
+                if (ret < 0) {
+                    goto fail;
+                }
 
-            for (j = 0; j < s->l2_size; j++) {
-                uint64_t cluster_index;
-                uint64_t offset;
-
-                entry = be64_to_cpu(l2_table[j]);
-                old_entry = entry;
-                entry &= ~QCOW_OFLAG_COPIED;
-                offset = entry & L2E_OFFSET_MASK;
-
-                switch (qcow2_get_cluster_type(entry)) {
-                case QCOW2_CLUSTER_COMPRESSED:
-                    nb_csectors = ((entry >> s->csize_shift) &
-                                   s->csize_mask) + 1;
-                    if (addend != 0) {
-                        ret = update_refcount(bs,
-                                (entry & s->cluster_offset_mask) & ~511,
+                for (j = 0; j < s->l2_size; j++) {
+                    uint64_t cluster_index;
+                    uint64_t offset;
+
+                    entry = be64_to_cpu(l2_table[j]);
+                    old_entry = entry;
+                    entry &= ~QCOW_OFLAG_COPIED;
+                    offset = entry & L2E_OFFSET_MASK;
+
+                    switch (qcow2_get_cluster_type(entry)) {
+                    case QCOW2_CLUSTER_COMPRESSED:
+                        nb_csectors = ((entry >> s->csize_shift) &
+                                       s->csize_mask) + 1;
+                        if (addend != 0) {
+                            ret = update_refcount(
+                                bs, (entry & s->cluster_offset_mask) & ~511,
                                 nb_csectors * 512, abs(addend), addend < 0,
                                 QCOW2_DISCARD_SNAPSHOT);
-                        if (ret < 0) {
+                            if (ret < 0) {
+                                goto fail;
+                            }
+                        }
+                        /* compressed clusters are never modified */
+                        refcount = 2;
+                        break;
+
+                    case QCOW2_CLUSTER_NORMAL:
+                    case QCOW2_CLUSTER_ZERO_ALLOC:
+                        if (offset_into_cluster(s, offset)) {
+                            qcow2_signal_corruption(
+                                bs, true, -1, -1, "Cluster "
+                                "allocation offset %#" PRIx64
+                                " unaligned (L2 offset: %#"
+                                PRIx64 ", L2 index: %#x)",
+                                offset, l2_offset, j);
+                            ret = -EIO;
                             goto fail;
                         }
-                    }
-                    /* compressed clusters are never modified */
-                    refcount = 2;
-                    break;
-
-                case QCOW2_CLUSTER_NORMAL:
-                case QCOW2_CLUSTER_ZERO_ALLOC:
-                    if (offset_into_cluster(s, offset)) {
-                        qcow2_signal_corruption(bs, true, -1, -1, "Cluster "
-                                                "allocation offset %#" PRIx64
-                                                " unaligned (L2 offset: %#"
-                                                PRIx64 ", L2 index: %#x)",
-                                                offset, l2_offset, j);
-                        ret = -EIO;
-                        goto fail;
-                    }
 
-                    cluster_index = offset >> s->cluster_bits;
-                    assert(cluster_index);
-                    if (addend != 0) {
-                        ret = qcow2_update_cluster_refcount(bs,
-                                    cluster_index, abs(addend), addend < 0,
-                                    QCOW2_DISCARD_SNAPSHOT);
+                        cluster_index = offset >> s->cluster_bits;
+                        assert(cluster_index);
+                        if (addend != 0) {
+                            ret = qcow2_update_cluster_refcount(
+                                bs, cluster_index, abs(addend), addend < 0,
+                                QCOW2_DISCARD_SNAPSHOT);
+                            if (ret < 0) {
+                                goto fail;
+                            }
+                        }
+
+                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                         if (ret < 0) {
                             goto fail;
                         }
-                    }
+                        break;
 
-                    ret = qcow2_get_refcount(bs, cluster_index, &refcount);
-                    if (ret < 0) {
-                        goto fail;
-                    }
-                    break;
-
-                case QCOW2_CLUSTER_ZERO_PLAIN:
-                case QCOW2_CLUSTER_UNALLOCATED:
-                    refcount = 0;
-                    break;
+                    case QCOW2_CLUSTER_ZERO_PLAIN:
+                    case QCOW2_CLUSTER_UNALLOCATED:
+                        refcount = 0;
+                        break;
 
-                default:
-                    abort();
-                }
+                    default:
+                        abort();
+                    }
 
-                if (refcount == 1) {
-                    entry |= QCOW_OFLAG_COPIED;
-                }
-                if (entry != old_entry) {
-                    if (addend > 0) {
-                        qcow2_cache_set_dependency(bs, s->l2_table_cache,
-                            s->refcount_block_cache);
+                    if (refcount == 1) {
+                        entry |= QCOW_OFLAG_COPIED;
+                    }
+                    if (entry != old_entry) {
+                        if (addend > 0) {
+                            qcow2_cache_set_dependency(bs, s->l2_table_cache,
+                                                       s->refcount_block_cache);
+                        }
+                        l2_table[j] = cpu_to_be64(entry);
+                        qcow2_cache_entry_mark_dirty(s->l2_table_cache,
+                                                     l2_table);
                     }
-                    l2_table[j] = cpu_to_be64(entry);
-                    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
                 }
-            }
 
-            qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
+                qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
+
+            }
 
             if (addend != 0) {
                 ret = qcow2_update_cluster_refcount(bs, l2_offset >>
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

qcow2_update_snapshot_refcount() increases the refcount of all
clusters of a given snapshot. In order to do that it needs to load all
its L2 tables and iterate over their entries. Since we'll be loading
L2 slices instead of full tables we need to add an extra loop that
iterates over all slices of each L2 table.

Apart from that, this function doesn't need any additional changes, so
this patch simply updates the variable name from l2_table to l2_slice.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Message-id: 5f4db199b9637f4833b58487135124d70add8cf0.1517840877.git.berto@igalia.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-refcount.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     int64_t l1_table_offset, int l1_size, int addend)
 {
     BDRVQcow2State *s = bs->opaque;
-    uint64_t *l1_table, *l2_table, l2_offset, entry, l1_size2, refcount;
+    uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
     bool l1_allocated = false;
     int64_t old_entry, old_l2_offset;
+    unsigned slice, slice_size2, n_slices;
     int i, j, l1_modified = 0, nb_csectors;
     int ret;
 
     assert(addend >= -1 && addend <= 1);
 
-    l2_table = NULL;
+    l2_slice = NULL;
     l1_table = NULL;
     l1_size2 = l1_size * sizeof(uint64_t);
+    slice_size2 = s->l2_slice_size * sizeof(uint64_t);
+    n_slices = s->cluster_size / slice_size2;
 
     s->cache_discards = true;
 
@@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                 goto fail;
             }
 
-            {
+            for (slice = 0; slice < n_slices; slice++) {
                 ret = qcow2_cache_get(bs, s->l2_table_cache,
-                                      l2_offset,
-                                      (void **) &l2_table);
+                                      l2_offset + slice * slice_size2,
+                                      (void **) &l2_slice);
                 if (ret < 0) {
                     goto fail;
                 }
 
-                for (j = 0; j < s->l2_size; j++) {
+                for (j = 0; j < s->l2_slice_size; j++) {
                     uint64_t cluster_index;
                     uint64_t offset;
 
-                    entry = be64_to_cpu(l2_table[j]);
+                    entry = be64_to_cpu(l2_slice[j]);
                     old_entry = entry;
                     entry &= ~QCOW_OFLAG_COPIED;
                     offset = entry & L2E_OFFSET_MASK;
@@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                     case QCOW2_CLUSTER_NORMAL:
                     case QCOW2_CLUSTER_ZERO_ALLOC:
                         if (offset_into_cluster(s, offset)) {
+                            /* Here l2_index means table (not slice) index */
+                            int l2_index = slice * s->l2_slice_size + j;
                             qcow2_signal_corruption(
                                 bs, true, -1, -1, "Cluster "
                                 "allocation offset %#" PRIx64
                                 " unaligned (L2 offset: %#"
                                 PRIx64 ", L2 index: %#x)",
-                                offset, l2_offset, j);
+                                offset, l2_offset, l2_index);
                             ret = -EIO;
                             goto fail;
                         }
@@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                             qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                                        s->refcount_block_cache);
                         }
-                        l2_table[j] = cpu_to_be64(entry);
+                        l2_slice[j] = cpu_to_be64(entry);
                         qcow2_cache_entry_mark_dirty(s->l2_table_cache,
-                                                     l2_table);
+                                                     l2_slice);
                     }
                 }
 
-                qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
-
+                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
             }
 
             if (addend != 0) {
@@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
 
     ret = bdrv_flush(bs);
 fail:
-    if (l2_table) {
-        qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
+    if (l2_slice) {
+        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
     }
 
     s->cache_discards = false;
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

At the moment it doesn't really make a difference whether we call
qcow2_get_refcount() before or after reading the L2 table, but if we
want to support L2 slices we'll need to read the refcount first.

This patch simply changes the order of those two operations to prepare
for that. The patch with the actual semantic changes will be easier to
read because of this.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 947a91d934053a2dbfef979aeb9568f57ef57c5d.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

From: Alberto Garcia <berto@igalia.com>

Adding support for L2 slices to expand_zero_clusters_in_l1() needs
(among other things) an extra loop that iterates over all slices of
each L2 table.

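Putting all changes in one patch would make it hard to read because
all semantic changes would be mixed with pure indentation changes.
This patch therefore only creates a new block and re-indents the code
inside it; it makes no semantic changes.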

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: c2ae9f31ed5b6e591477ad4654448badd1c89d73.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 187 ++++++++++++++++++++++++++------------------------
 1 file changed, 96 insertions(+), 91 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
             goto fail;
         }
 
-        if (is_active_l1) {
-            /* get active L2 tables from cache */
-            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
-                    (void **)&l2_table);
-        } else {
-            /* load inactive L2 tables from disk */
-            ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE,
-                            (void *)l2_table, s->cluster_sectors);
-        }
-        if (ret < 0) {
-            goto fail;
-        }
-
-        for (j = 0; j < s->l2_size; j++) {
-            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
-            int64_t offset = l2_entry & L2E_OFFSET_MASK;
-            QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);
-
-            if (cluster_type != QCOW2_CLUSTER_ZERO_PLAIN &&
-                cluster_type != QCOW2_CLUSTER_ZERO_ALLOC) {
-                continue;
+        {
+            if (is_active_l1) {
+                /* get active L2 tables from cache */
+                ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
+                                      (void **)&l2_table);
+            } else {
+                /* load inactive L2 tables from disk */
+                ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE,
+                                (void *)l2_table, s->cluster_sectors);
+            }
+            if (ret < 0) {
+                goto fail;
             }
 
-            if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
-                if (!bs->backing) {
-                    /* not backed; therefore we can simply deallocate the
-                     * cluster */
-                    l2_table[j] = 0;
-                    l2_dirty = true;
+            for (j = 0; j < s->l2_size; j++) {
+                uint64_t l2_entry = be64_to_cpu(l2_table[j]);
+                int64_t offset = l2_entry & L2E_OFFSET_MASK;
+                QCow2ClusterType cluster_type =
+                    qcow2_get_cluster_type(l2_entry);
+
+                if (cluster_type != QCOW2_CLUSTER_ZERO_PLAIN &&
+                    cluster_type != QCOW2_CLUSTER_ZERO_ALLOC) {
                     continue;
                 }
 
-                offset = qcow2_alloc_clusters(bs, s->cluster_size);
-                if (offset < 0) {
-                    ret = offset;
-                    goto fail;
-                }
+                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
+                    if (!bs->backing) {
+                        /* not backed; therefore we can simply deallocate the
+                         * cluster */
+                        l2_table[j] = 0;
+                        l2_dirty = true;
+                        continue;
+                    }
+
+                    offset = qcow2_alloc_clusters(bs, s->cluster_size);
+                    if (offset < 0) {
+                        ret = offset;
+                        goto fail;
+                    }
 
-                if (l2_refcount > 1) {
-                    /* For shared L2 tables, set the refcount accordingly (it is
-                     * already 1 and needs to be l2_refcount) */
-                    ret = qcow2_update_cluster_refcount(bs,
-                            offset >> s->cluster_bits,
+                    if (l2_refcount > 1) {
+                        /* For shared L2 tables, set the refcount accordingly
+                         * (it is already 1 and needs to be l2_refcount) */
+                        ret = qcow2_update_cluster_refcount(
+                            bs, offset >> s->cluster_bits,
                             refcount_diff(1, l2_refcount), false,
                             QCOW2_DISCARD_OTHER);
-                    if (ret < 0) {
-                        qcow2_free_clusters(bs, offset, s->cluster_size,
-                                            QCOW2_DISCARD_OTHER);
-                        goto fail;
+                        if (ret < 0) {
+                            qcow2_free_clusters(bs, offset, s->cluster_size,
+                                                QCOW2_DISCARD_OTHER);
+                            goto fail;
+                        }
                     }
                 }
-            }
 
-            if (offset_into_cluster(s, offset)) {
-                qcow2_signal_corruption(bs, true, -1, -1,
-                                        "Cluster allocation offset "
-                                        "%#" PRIx64 " unaligned (L2 offset: %#"
-                                        PRIx64 ", L2 index: %#x)", offset,
-                                        l2_offset, j);
-                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
-                    qcow2_free_clusters(bs, offset, s->cluster_size,
-                                        QCOW2_DISCARD_ALWAYS);
+                if (offset_into_cluster(s, offset)) {
+                    qcow2_signal_corruption(
+                        bs, true, -1, -1,
+                        "Cluster allocation offset "
+                        "%#" PRIx64 " unaligned (L2 offset: %#"
+                        PRIx64 ", L2 index: %#x)", offset,
+                        l2_offset, j);
+                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
+                        qcow2_free_clusters(bs, offset, s->cluster_size,
+                                            QCOW2_DISCARD_ALWAYS);
+                    }
+                    ret = -EIO;
+                    goto fail;
                 }
-                ret = -EIO;
-                goto fail;
-            }
 
-            ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
-            if (ret < 0) {
-                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
-                    qcow2_free_clusters(bs, offset, s->cluster_size,
-                                        QCOW2_DISCARD_ALWAYS);
+                ret = qcow2_pre_write_overlap_check(bs, 0, offset,
+                                                    s->cluster_size);
+                if (ret < 0) {
+                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
+                        qcow2_free_clusters(bs, offset, s->cluster_size,
+                                            QCOW2_DISCARD_ALWAYS);
+                    }
+                    goto fail;
                 }
-                goto fail;
-            }
 
-            ret = bdrv_pwrite_zeroes(bs->file, offset, s->cluster_size, 0);
-            if (ret < 0) {
-                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
-                    qcow2_free_clusters(bs, offset, s->cluster_size,
-                                        QCOW2_DISCARD_ALWAYS);
+                ret = bdrv_pwrite_zeroes(bs->file, offset, s->cluster_size, 0);
+                if (ret < 0) {
+                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
+                        qcow2_free_clusters(bs, offset, s->cluster_size,
+                                            QCOW2_DISCARD_ALWAYS);
+                    }
+                    goto fail;
                 }
-                goto fail;
-            }
 
-            if (l2_refcount == 1) {
-                l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
-            } else {
-                l2_table[j] = cpu_to_be64(offset);
+                if (l2_refcount == 1) {
+                    l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
+                } else {
+                    l2_table[j] = cpu_to_be64(offset);
+                }
+                l2_dirty = true;
             }
-            l2_dirty = true;
-        }
 
-        if (is_active_l1) {
-            if (l2_dirty) {
-                qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
-                qcow2_cache_depends_on_flush(s->l2_table_cache);
-            }
-            qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
-        } else {
-            if (l2_dirty) {
-                ret = qcow2_pre_write_overlap_check(bs,
-                        QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset,
-                        s->cluster_size);
-                if (ret < 0) {
-                    goto fail;
+            if (is_active_l1) {
+                if (l2_dirty) {
+                    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+                    qcow2_cache_depends_on_flush(s->l2_table_cache);
                 }
+                qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
+            } else {
+                if (l2_dirty) {
+                    ret = qcow2_pre_write_overlap_check(
+                        bs, QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2,
+                        l2_offset, s->cluster_size);
+                    if (ret < 0) {
+                        goto fail;
+                    }
 
-                ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE,
-                                 (void *)l2_table, s->cluster_sectors);
-                if (ret < 0) {
-                    goto fail;
+                    ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE,
+                                     (void *)l2_table, s->cluster_sectors);
+                    if (ret < 0) {
+                        goto fail;
+                    }
                 }
             }
         }
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

expand_zero_clusters_in_l1() expands zero clusters as a necessary step
to downgrade qcow2 images to a version that doesn't support metadata
zero clusters. This function takes an L1 table (which may or may not
be active) and iterates over all its L2 tables looking for zero
clusters.

Since we'll be loading L2 slices instead of full tables we need to add
an extra loop that iterates over all slices of each L2 table, and we
should also use the slice size when allocating the buffer used when
the L1 table is not active.

This function doesn't need any additional changes so apart from that
this patch simply updates the variable name from l2_table to l2_slice.

Finally, and since we have to touch the bdrv_read() / bdrv_write()
calls anyway, this patch takes the opportunity to replace them with
the byte-based bdrv_pread() / bdrv_pwrite().

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 43590976f730501688096cff103f2923b72b0f32.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 51 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 28 insertions(+), 23 deletions(-)

From: Alberto Garcia <berto@igalia.com>

The qcow2_truncate() code is mostly independent from whether
we're using L2 slices or full L2 tables, but in full and
falloc preallocation modes new L2 tables are allocated using
qcow2_alloc_cluster_link_l2().  Therefore the code needs to be
modified to ensure that all nb_clusters that are processed in each
call can be allocated with just one L2 slice.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Message-id: 1fd7d272b5e7b66254a090b74cf2bed1cc334c0e.1517840877.git.berto@igalia.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset,
         host_offset = allocation_start;
         guest_offset = old_length;
         while (nb_new_data_clusters) {
-            int64_t guest_cluster = guest_offset >> s->cluster_bits;
-            int64_t nb_clusters = MIN(nb_new_data_clusters,
-                                      s->l2_size - guest_cluster % s->l2_size);
+            int64_t nb_clusters = MIN(
+                nb_new_data_clusters,
+                s->l2_slice_size - offset_to_l2_slice_index(s, guest_offset));
             QCowL2Meta allocation = {
                 .offset       = guest_offset,
                 .alloc_offset = host_offset,
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

This function doesn't need any changes to support L2 slices, but since
it's now dealing with slices instead of full tables, the l2_table
variable is renamed for clarity.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 0c5d4b9bf163aa3b49ec19cc512a50d83563f2ad.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
 {
     BDRVQcow2State *s = bs->opaque;
     int l2_index, ret;
-    uint64_t *l2_table;
+    uint64_t *l2_slice;
     int64_t cluster_offset;
     int nb_csectors;
 
-    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
     if (ret < 0) {
         return 0;
     }
 
     /* Compression can't overwrite anything. Fail if the cluster was already
      * allocated. */
-    cluster_offset = be64_to_cpu(l2_table[l2_index]);
+    cluster_offset = be64_to_cpu(l2_slice[l2_index]);
     if (cluster_offset & L2E_OFFSET_MASK) {
-        qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
+        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
         return 0;
     }
 
     cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
     if (cluster_offset < 0) {
-        qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
+        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
         return 0;
     }
 
@@ -XXX,XX +XXX,XX @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
     /* compressed clusters never have the copied flag */
 
     BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
-    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
-    l2_table[l2_index] = cpu_to_be64(cluster_offset);
-    qcow2_cache_put(s->l2_table_cache, (void **) &l2_table);
+    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
+    l2_slice[l2_index] = cpu_to_be64(cluster_offset);
+    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
 
     return cluster_offset;
 }
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

This function doesn't need any changes to support L2 slices, but since
it's now dealing with slices instead of full tables, the l2_table
variable is renamed for clarity.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 812b0c3505bb1687e51285dccf1a94f0cecb1f74.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ fail:
 }
 
 /*
- * Checks how many clusters in a given L2 table are contiguous in the image
+ * Checks how many clusters in a given L2 slice are contiguous in the image
  * file. As soon as one of the flags in the bitmask stop_flags changes compared
  * to the first cluster, the search is stopped and the cluster is not counted
  * as contiguous. (This allows it, for example, to stop at the first compressed
  * cluster which may require a different handling)
  */
 static int count_contiguous_clusters(int nb_clusters, int cluster_size,
-        uint64_t *l2_table, uint64_t stop_flags)
+        uint64_t *l2_slice, uint64_t stop_flags)
 {
     int i;
     QCow2ClusterType first_cluster_type;
     uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED;
-    uint64_t first_entry = be64_to_cpu(l2_table[0]);
+    uint64_t first_entry = be64_to_cpu(l2_slice[0]);
     uint64_t offset = first_entry & mask;
 
     if (!offset) {
@@ -XXX,XX +XXX,XX @@ static int count_contiguous_clusters(int nb_clusters, int cluster_size,
            first_cluster_type == QCOW2_CLUSTER_ZERO_ALLOC);
 
     for (i = 0; i < nb_clusters; i++) {
-        uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
+        uint64_t l2_entry = be64_to_cpu(l2_slice[i]) & mask;
         if (offset + (uint64_t) i * cluster_size != l2_entry) {
             break;
         }
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

This function doesn't need any changes to support L2 slices, but since
it's now dealing with slices instead of full tables, the l2_table
variable is renamed for clarity.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 78bcc54bc632574dd0b900a77a00a1b6ffc359e6.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ static int count_contiguous_clusters(int nb_clusters, int cluster_size,
 
 /*
  * Checks how many consecutive unallocated clusters in a given L2
- * table have the same cluster type.
+ * slice have the same cluster type.
  */
 static int count_contiguous_clusters_unallocated(int nb_clusters,
-                                                 uint64_t *l2_table,
+                                                 uint64_t *l2_slice,
                                                  QCow2ClusterType wanted_type)
 {
     int i;
@@ -XXX,XX +XXX,XX @@ static int count_contiguous_clusters_unallocated(int nb_clusters,
     assert(wanted_type == QCOW2_CLUSTER_ZERO_PLAIN ||
            wanted_type == QCOW2_CLUSTER_UNALLOCATED);
     for (i = 0; i < nb_clusters; i++) {
-        uint64_t entry = be64_to_cpu(l2_table[i]);
+        uint64_t entry = be64_to_cpu(l2_slice[i]);
         QCow2ClusterType type = qcow2_get_cluster_type(entry);
 
         if (type != wanted_type) {
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

This function doesn't need any changes to support L2 slices, but since
it's now dealing with slices instead of full tables, the l2_table
variable is renamed for clarity.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 6107001fc79e6739242f1de7d191375e4f130aac.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ err:
  * which must copy from the backing file)
  */
 static int count_cow_clusters(BDRVQcow2State *s, int nb_clusters,
-    uint64_t *l2_table, int l2_index)
+    uint64_t *l2_slice, int l2_index)
 {
     int i;
 
     for (i = 0; i < nb_clusters; i++) {
-        uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]);
+        uint64_t l2_entry = be64_to_cpu(l2_slice[l2_index + i]);
         QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);
 
         switch(cluster_type) {
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

Now that the code is ready to handle L2 slices we can finally add an
option to allow configuring their size.

An L2 slice is the portion of an L2 table that is read by the qcow2
cache. Until now the cache was always reading full L2 tables, and
since the L2 table size is equal to the cluster size this was not very
efficient with large clusters. Here's a more detailed explanation of
why it makes sense to have smaller cache entries in order to load L2
data:

https://lists.gnu.org/archive/html/qemu-block/2017-09/msg00635.html

This patch introduces a new command-line option to the qcow2 driver
named l2-cache-entry-size (cf. l2-cache-size). The cache entry size
has the same restrictions as the cluster size: it must be a power of
two and it has the same range of allowed values, with the additional
requirement that it must not be larger than the cluster size.

The L2 cache entry size (L2 slice size) remains equal to the cluster
size for now by default, so this feature must be explicitly enabled.
Although my tests show that 4KB slices consistently improve
performance and give the best results, let's wait and make more tests
with different cluster sizes before deciding on an optimal default.

Now that the cache entry size is not necessarily equal to the cluster
size we need to reflect that in the MIN_L2_CACHE_SIZE documentation.
That minimum value is a requirement of the COW algorithm: we need to
read two L2 slices (and not two L2 tables) in order to do COW, see
l2_allocate() for the actual code.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: c73e5611ff4a9ec5d20de68a6c289553a13d2354.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qapi/block-core.json |  6 ++++++
 block/qcow2.h        |  6 ++++--
 block/qcow2-cache.c  | 10 ++++++++--
 block/qcow2.c        | 34 +++++++++++++++++++++++++++-------
 4 files changed, 45 insertions(+), 11 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -XXX,XX +XXX,XX @@
 # @l2-cache-size:         the maximum size of the L2 table cache in
 #                         bytes (since 2.2)
 #
+# @l2-cache-entry-size:   the size of each entry in the L2 cache in
+#                         bytes. It must be a power of two between 512
+#                         and the cluster size. The default value is
+#                         the cluster size (since 2.12)
+#
 # @refcount-cache-size:   the maximum size of the refcount block cache
 #                         in bytes (since 2.2)
 #
@@ -XXX,XX +XXX,XX @@
             '*overlap-check': 'Qcow2OverlapChecks',
             '*cache-size': 'int',
             '*l2-cache-size': 'int',
+            '*l2-cache-entry-size': 'int',
             '*refcount-cache-size': 'int',
             '*cache-clean-interval': 'int',
             '*encrypt': 'BlockdevQcow2Encryption' } }
diff --git a/block/qcow2.h b/block/qcow2.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -XXX,XX +XXX,XX @@
 #define MAX_CLUSTER_BITS 21
 
 /* Must be at least 2 to cover COW */
-#define MIN_L2_CACHE_SIZE 2 /* clusters */
+#define MIN_L2_CACHE_SIZE 2 /* cache entries */
 
 /* Must be at least 4 to cover all cases of refcount table growth */
 #define MIN_REFCOUNT_CACHE_SIZE 4 /* clusters */
@@ -XXX,XX +XXX,XX @@
 #define QCOW2_OPT_OVERLAP_INACTIVE_L2 "overlap-check.inactive-l2"
 #define QCOW2_OPT_CACHE_SIZE "cache-size"
 #define QCOW2_OPT_L2_CACHE_SIZE "l2-cache-size"
+#define QCOW2_OPT_L2_CACHE_ENTRY_SIZE "l2-cache-entry-size"
 #define QCOW2_OPT_REFCOUNT_CACHE_SIZE "refcount-cache-size"
 #define QCOW2_OPT_CACHE_CLEAN_INTERVAL "cache-clean-interval"
 
@@ -XXX,XX +XXX,XX @@ void qcow2_free_snapshots(BlockDriverState *bs);
 int qcow2_read_snapshots(BlockDriverState *bs);
 
 /* qcow2-cache.c functions */
-Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables);
+Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables,
+                               unsigned table_size);
 int qcow2_cache_destroy(Qcow2Cache *c);
 
 void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table);
diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -XXX,XX +XXX,XX @@ void qcow2_cache_clean_unused(Qcow2Cache *c)
     c->cache_clean_lru_counter = c->lru_counter;
 }
 
-Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
+Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables,
+                               unsigned table_size)
 {
     BDRVQcow2State *s = bs->opaque;
     Qcow2Cache *c;
 
+    assert(num_tables > 0);
+    assert(is_power_of_2(table_size));
+    assert(table_size >= (1 << MIN_CLUSTER_BITS));
+    assert(table_size <= s->cluster_size);
+
     c = g_new0(Qcow2Cache, 1);
     c->size = num_tables;
-    c->table_size = s->cluster_size;
+    c->table_size = table_size;
     c->entries = g_try_new0(Qcow2CachedTable, num_tables);
     c->table_array = qemu_try_blockalign(bs->file->bs,
                                          (size_t) num_tables * c->table_size);
diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static QemuOptsList qcow2_runtime_opts = {
             .help = "Maximum L2 table cache size",
         },
         {
+            .name = QCOW2_OPT_L2_CACHE_ENTRY_SIZE,
+            .type = QEMU_OPT_SIZE,
+            .help = "Size of each entry in the L2 cache",
+        },
+        {
             .name = QCOW2_OPT_REFCOUNT_CACHE_SIZE,
             .type = QEMU_OPT_SIZE,
             .help = "Maximum refcount block cache size",
@@ -XXX,XX +XXX,XX @@ static void qcow2_attach_aio_context(BlockDriverState *bs,
 
 static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
                              uint64_t *l2_cache_size,
+                             uint64_t *l2_cache_entry_size,
                              uint64_t *refcount_cache_size, Error **errp)
 {
     BDRVQcow2State *s = bs->opaque;
@@ -XXX,XX +XXX,XX @@ static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
     *refcount_cache_size = qemu_opt_get_size(opts,
                                              QCOW2_OPT_REFCOUNT_CACHE_SIZE, 0);
 
+    *l2_cache_entry_size = qemu_opt_get_size(
+        opts, QCOW2_OPT_L2_CACHE_ENTRY_SIZE, s->cluster_size);
+
     if (combined_cache_size_set) {
         if (l2_cache_size_set && refcount_cache_size_set) {
             error_setg(errp, QCOW2_OPT_CACHE_SIZE ", " QCOW2_OPT_L2_CACHE_SIZE
@@ -XXX,XX +XXX,XX @@ static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
                                  / DEFAULT_L2_REFCOUNT_SIZE_RATIO;
         }
     }
+
+    if (*l2_cache_entry_size < (1 << MIN_CLUSTER_BITS) ||
+        *l2_cache_entry_size > s->cluster_size ||
+        !is_power_of_2(*l2_cache_entry_size)) {
+        error_setg(errp, "L2 cache entry size must be a power of two "
+                   "between %d and the cluster size (%d)",
+                   1 << MIN_CLUSTER_BITS, s->cluster_size);
+        return;
+    }
 }
 
 typedef struct Qcow2ReopenState {
@@ -XXX,XX +XXX,XX @@ static int qcow2_update_options_prepare(BlockDriverState *bs,
     QemuOpts *opts = NULL;
     const char *opt_overlap_check, *opt_overlap_check_template;
     int overlap_check_template = 0;
-    uint64_t l2_cache_size, refcount_cache_size;
+    uint64_t l2_cache_size, l2_cache_entry_size, refcount_cache_size;
     int i;
     const char *encryptfmt;
     QDict *encryptopts = NULL;
@@ -XXX,XX +XXX,XX @@ static int qcow2_update_options_prepare(BlockDriverState *bs,
     }
 
     /* get L2 table/refcount block cache size from command line options */
-    read_cache_sizes(bs, opts, &l2_cache_size, &refcount_cache_size,
-                     &local_err);
+    read_cache_sizes(bs, opts, &l2_cache_size, &l2_cache_entry_size,
+                     &refcount_cache_size, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
         ret = -EINVAL;
         goto fail;
     }
 
-    l2_cache_size /= s->cluster_size;
+    l2_cache_size /= l2_cache_entry_size;
     if (l2_cache_size < MIN_L2_CACHE_SIZE) {
         l2_cache_size = MIN_L2_CACHE_SIZE;
     }
@@ -XXX,XX +XXX,XX @@ static int qcow2_update_options_prepare(BlockDriverState *bs,
         }
     }
 
-    r->l2_slice_size = s->cluster_size / sizeof(uint64_t);
-    r->l2_table_cache = qcow2_cache_create(bs, l2_cache_size);
-    r->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size);
+    r->l2_slice_size = l2_cache_entry_size / sizeof(uint64_t);
+    r->l2_table_cache = qcow2_cache_create(bs, l2_cache_size,
+                                           l2_cache_entry_size);
+    r->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size,
+                                                 s->cluster_size);
     if (r->l2_table_cache == NULL || r->refcount_block_cache == NULL) {
         error_setg(errp, "Could not allocate metadata caches");
         ret = -ENOMEM;
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

The l2-cache-entry-size setting can only contain values that are
powers of two between 512 and the cluster size.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: bd3547b670b8d0af11480c760991a22bcae5b48c.1517840877.git.berto@igalia.com
[mreitz: Changed non-power-of-two test value from 300 to 4242]
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/103     | 17 +++++++++++++++++
 tests/qemu-iotests/103.out |  3 +++
 2 files changed, 20 insertions(+)

diff --git a/tests/qemu-iotests/103 b/tests/qemu-iotests/103
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/103
+++ b/tests/qemu-iotests/103
@@ -XXX,XX +XXX,XX @@ $QEMU_IO -c "open -o cache-size=1M,refcount-cache-size=2M $TEST_IMG" 2>&1 \
 $QEMU_IO -c "open -o cache-size=0,l2-cache-size=0,refcount-cache-size=0 $TEST_IMG" \
     2>&1 | _filter_testdir | _filter_imgfmt
 
+# Invalid cache entry sizes
+$QEMU_IO -c "open -o l2-cache-entry-size=256 $TEST_IMG" \
+    2>&1 | _filter_testdir | _filter_imgfmt
+$QEMU_IO -c "open -o l2-cache-entry-size=4242 $TEST_IMG" \
+    2>&1 | _filter_testdir | _filter_imgfmt
+$QEMU_IO -c "open -o l2-cache-entry-size=128k $TEST_IMG" \
+    2>&1 | _filter_testdir | _filter_imgfmt
+
 echo
 echo '=== Testing valid option combinations ==='
 echo
@@ -XXX,XX +XXX,XX @@ $QEMU_IO -c "open -o l2-cache-size=1M,refcount-cache-size=0.25M $TEST_IMG" \
          -c 'read -P 42 0 64k' \
     | _filter_qemu_io
 
+# Valid cache entry sizes
+$QEMU_IO -c "open -o l2-cache-entry-size=512 $TEST_IMG" \
+    2>&1 | _filter_testdir | _filter_imgfmt
+$QEMU_IO -c "open -o l2-cache-entry-size=16k $TEST_IMG" \
+    2>&1 | _filter_testdir | _filter_imgfmt
+$QEMU_IO -c "open -o l2-cache-entry-size=64k $TEST_IMG" \
+    2>&1 | _filter_testdir | _filter_imgfmt
+
+
 echo
 echo '=== Testing minimal L2 cache and COW ==='
 echo
diff --git a/tests/qemu-iotests/103.out b/tests/qemu-iotests/103.out
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/103.out
+++ b/tests/qemu-iotests/103.out
@@ -XXX,XX +XXX,XX @@ can't open device TEST_DIR/t.IMGFMT: cache-size, l2-cache-size and refcount-cach
 can't open device TEST_DIR/t.IMGFMT: l2-cache-size may not exceed cache-size
 can't open device TEST_DIR/t.IMGFMT: refcount-cache-size may not exceed cache-size
 can't open device TEST_DIR/t.IMGFMT: cache-size, l2-cache-size and refcount-cache-size may not be set the same time
+can't open device TEST_DIR/t.IMGFMT: L2 cache entry size must be a power of two between 512 and the cluster size (65536)
+can't open device TEST_DIR/t.IMGFMT: L2 cache entry size must be a power of two between 512 and the cluster size (65536)
+can't open device TEST_DIR/t.IMGFMT: L2 cache entry size must be a power of two between 512 and the cluster size (65536)
 
 === Testing valid option combinations ===
 
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

expand_zero_clusters_in_l1() is used when downgrading qcow2 images
from v3 to v2 (compat=0.10). This is one of the functions that needed
more changes to support L2 slices, so this patch extends iotest 061 to
test downgrading a qcow2 image using a smaller slice size.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 3e5662dce5e4926c8fabbad4c0b9142b2a506dd4.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/061     | 16 ++++++++++++
 tests/qemu-iotests/061.out | 61 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+)

diff --git a/tests/qemu-iotests/061 b/tests/qemu-iotests/061
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/061
+++ b/tests/qemu-iotests/061
@@ -XXX,XX +XXX,XX @@ $QEMU_IO -c "read -P 0 0 128k" "$TEST_IMG" | _filter_qemu_io
 _check_test_img
 
 echo
+echo "=== Testing version downgrade with zero expansion and 4K cache entries ==="
+echo
+IMGOPTS="compat=1.1,lazy_refcounts=on" _make_test_img 64M
+$QEMU_IO -c "write -z 0 128k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "write -z 32M 128k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c map "$TEST_IMG" | _filter_qemu_io
+$PYTHON qcow2.py "$TEST_IMG" dump-header
+$QEMU_IMG amend -o "compat=0.10" --image-opts \
+          driver=qcow2,file.filename=$TEST_IMG,l2-cache-entry-size=4096
+$PYTHON qcow2.py "$TEST_IMG" dump-header
+$QEMU_IO -c "read -P 0 0 128k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 32M 128k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c map "$TEST_IMG" | _filter_qemu_io
+_check_test_img
+
+echo
 echo "=== Testing dirty version downgrade ==="
 echo
 IMGOPTS="compat=1.1,lazy_refcounts=on" _make_test_img 64M
diff --git a/tests/qemu-iotests/061.out b/tests/qemu-iotests/061.out
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/061.out
+++ b/tests/qemu-iotests/061.out
@@ -XXX,XX +XXX,XX @@ read 131072/131072 bytes at offset 0
 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 No errors were found on the image.
 
+=== Testing version downgrade with zero expansion and 4K cache entries ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
+wrote 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 131072/131072 bytes at offset 33554432
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+128 KiB (0x20000) bytes     allocated at offset 0 bytes (0x0)
+31.875 MiB (0x1fe0000) bytes not allocated at offset 128 KiB (0x20000)
+128 KiB (0x20000) bytes     allocated at offset 32 MiB (0x2000000)
+31.875 MiB (0x1fe0000) bytes not allocated at offset 32.125 MiB (0x2020000)
+magic                     0x514649fb
+version                   3
+backing_file_offset       0x0
+backing_file_size         0x0
+cluster_bits              16
+size                      67108864
+crypt_method              0
+l1_size                   1
+l1_table_offset           0x30000
+refcount_table_offset     0x10000
+refcount_table_clusters   1
+nb_snapshots              0
+snapshot_offset           0x0
+incompatible_features     0x0
+compatible_features       0x1
+autoclear_features        0x0
+refcount_order            4
+header_length             104
+
+Header extension:
+magic                     0x6803f857
+length                    144
+data                      <binary>
+
+magic                     0x514649fb
+version                   2
+backing_file_offset       0x0
+backing_file_size         0x0
+cluster_bits              16
+size                      67108864
+crypt_method              0
+l1_size                   1
+l1_table_offset           0x30000
+refcount_table_offset     0x10000
+refcount_table_clusters   1
+nb_snapshots              0
+snapshot_offset           0x0
+incompatible_features     0x0
+compatible_features       0x0
+autoclear_features        0x0
+refcount_order            4
+header_length             72
+
+read 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 131072/131072 bytes at offset 33554432
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+64 MiB (0x4000000) bytes not allocated at offset 0 bytes (0x0)
+No errors were found on the image.
+
 === Testing dirty version downgrade ===
 
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

This test tries reopening a qcow2 image with valid and invalid
options. This patch adds l2-cache-entry-size to the set.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 3d3b7d2dbfc020deaef60fb58739b0801eb9517c.1517840877.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/137     | 5 +++++
 tests/qemu-iotests/137.out | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/tests/qemu-iotests/137 b/tests/qemu-iotests/137
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/137
+++ b/tests/qemu-iotests/137
@@ -XXX,XX +XXX,XX @@ $QEMU_IO \
     -c "reopen -o overlap-check.inactive-l2=off" \
     -c "reopen -o cache-size=1M" \
     -c "reopen -o l2-cache-size=512k" \
+    -c "reopen -o l2-cache-entry-size=512" \
+    -c "reopen -o l2-cache-entry-size=4k" \
+    -c "reopen -o l2-cache-entry-size=64k" \
     -c "reopen -o refcount-cache-size=128k" \
     -c "reopen -o cache-clean-interval=5" \
     -c "reopen -o cache-clean-interval=0" \
@@ -XXX,XX +XXX,XX @@ $QEMU_IO \
     -c "reopen -o cache-size=1M,l2-cache-size=2M" \
     -c "reopen -o cache-size=1M,refcount-cache-size=2M" \
     -c "reopen -o l2-cache-size=256T" \
+    -c "reopen -o l2-cache-entry-size=33k" \
+    -c "reopen -o l2-cache-entry-size=128k" \
     -c "reopen -o refcount-cache-size=256T" \
     -c "reopen -o overlap-check=constant,overlap-check.template=all" \
     -c "reopen -o overlap-check=blubb" \
diff --git a/tests/qemu-iotests/137.out b/tests/qemu-iotests/137.out
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/137.out
+++ b/tests/qemu-iotests/137.out
@@ -XXX,XX +XXX,XX @@ cache-size, l2-cache-size and refcount-cache-size may not be set the same time
 l2-cache-size may not exceed cache-size
 refcount-cache-size may not exceed cache-size
 L2 cache size too big
+L2 cache entry size must be a power of two between 512 and the cluster size (65536)
+L2 cache entry size must be a power of two between 512 and the cluster size (65536)
 L2 cache size too big
 Conflicting values for qcow2 options 'overlap-check' ('constant') and 'overlap-check.template' ('all')
 Unsupported value 'blubb' for qcow2 option 'overlap-check'. Allowed are any of the following: none, constant, cached, all
-- 
2.13.6

The following changes since commit 281f327487c9c9b1599f93c589a408bbf4a651b8:

Merge remote-tracking branch 'remotes/vivier/tags/m68k-for-2.12-pull-request' into staging (2017-12-22 00:11:36 +0000)

are available in the git repository at:

git://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to 1a63a907507fbbcfaee3f622907ec244b7eabda8:

block: Keep nodes drained between reopen_queue/multiple (2017-12-22 15:05:32 +0100)

----------------------------------------------------------------
Block layer patches

----------------------------------------------------------------
Doug Gale (1):
      nvme: Add tracing

Edgar Kaziakhmedov (1):
      qcow2: get rid of qcow2_backing_read1 routine

Fam Zheng (2):
      block: Open backing image in force share mode for size probe
      block: Remove unused bdrv_requests_pending

John Snow (1):
      iotests: fix 197 for vpc

Kevin Wolf (27):
      block: Formats don't need CONSISTENT_READ with NO_IO
      block: Make bdrv_drain_invoke() recursive
      block: Call .drain_begin only once in bdrv_drain_all_begin()
      test-bdrv-drain: Test BlockDriver callbacks for drain
      block: bdrv_drain_recurse(): Remove unused begin parameter
      block: Don't wait for requests in bdrv_drain*_end()
      block: Unify order in drain functions
      block: Don't acquire AioContext in hmp_qemu_io()
      block: Document that x-blockdev-change breaks quorum children list
      block: Assert drain_all is only called from main AioContext
      block: Make bdrv_drain() driver callbacks non-recursive
      test-bdrv-drain: Test callback for bdrv_drain
      test-bdrv-drain: Test bs->quiesce_counter
      blockjob: Pause job on draining any job BDS
      test-bdrv-drain: Test drain vs. block jobs
      block: Don't block_job_pause_all() in bdrv_drain_all()
      block: Nested drain_end must still call callbacks
      test-bdrv-drain: Test nested drain sections
      block: Don't notify parents in drain call chain
      block: Add bdrv_subtree_drained_begin/end()
      test-bdrv-drain: Tests for bdrv_subtree_drain
      test-bdrv-drain: Test behaviour in coroutine context
      test-bdrv-drain: Recursive draining with multiple parents
      block: Allow graph changes in subtree drained section
      test-bdrv-drain: Test graph changes in drained section
      commit: Simplify reopen of base
      block: Keep nodes drained between reopen_queue/multiple

Thomas Huth (3):
      block: Remove the obsolete -drive boot=on|off parameter
      block: Remove the deprecated -hdachs option
      block: Mention -drive cyls/heads/secs/trans/serial/addr in deprecation chapter

Commit 1f4ad7d fixed 'qemu-img info' for raw images that are currently
in use as a mirror target. It is not enough for image formats, though,
as these still unconditionally request BLK_PERM_CONSISTENT_READ.

As this permission is geared towards whether the guest-visible data is
consistent, and has no impact on whether the metadata is sane, and
'qemu-img info' does not read guest-visible data (except for the raw
format), it makes sense to not require BLK_PERM_CONSISTENT_READ if there
is not going to be any guest I/O performed, regardless of image format.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
     assert(role == &child_backing || role == &child_file);
 
     if (!backing) {
+        int flags = bdrv_reopen_get_flags(reopen_queue, bs);
+
         /* Apart from the modifications below, the same permissions are
          * forwarded and left alone as for filters */
         bdrv_filter_default_perms(bs, c, role, reopen_queue, perm, shared,
@@ -XXX,XX +XXX,XX @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
 
         /* bs->file always needs to be consistent because of the metadata. We
          * can never allow other users to resize or write to it. */
-        perm |= BLK_PERM_CONSISTENT_READ;
+        if (!(flags & BDRV_O_NO_IO)) {
+            perm |= BLK_PERM_CONSISTENT_READ;
+        }
         shared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
     } else {
         /* We want consistent read from backing files if the parent needs it.
-- 
2.13.6

From: John Snow <jsnow@redhat.com>

VPC has some difficulty creating geometries of particular size.
However, we can indeed force it to use a literal one, so let's
do that for the sake of test 197, which is testing some specific
offsets.

Signed-off-by: John Snow <jsnow@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Lukáš Doktor <ldoktor@redhat.com>
---
 tests/qemu-iotests/197           | 4 ++++
 tests/qemu-iotests/common.filter | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/197 b/tests/qemu-iotests/197
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/197
+++ b/tests/qemu-iotests/197
@@ -XXX,XX +XXX,XX @@ echo '=== Copy-on-read ==='
 echo
 
 # Prep the images
+# VPC rounds image sizes to a specific geometry, force a specific size.
+if [ "$IMGFMT" = "vpc" ]; then
+    IMGOPTS=$(_optstr_add "$IMGOPTS" "force_size")
+fi
 _make_test_img 4G
 $QEMU_IO -c "write -P 55 3G 1k" "$TEST_IMG" | _filter_qemu_io
 IMGPROTO=file IMGFMT=qcow2 IMGOPTS= TEST_IMG_FILE="$TEST_WRAP" \
diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/common.filter
+++ b/tests/qemu-iotests/common.filter
@@ -XXX,XX +XXX,XX @@ _filter_img_create()
         -e "s# log_size=[0-9]\\+##g" \
         -e "s# refcount_bits=[0-9]\\+##g" \
         -e "s# key-secret=[a-zA-Z0-9]\\+##g" \
-        -e "s# iter-time=[0-9]\\+##g"
+        -e "s# iter-time=[0-9]\\+##g" \
+        -e "s# force_size=\\(on\\|off\\)##g"
 }
 
 _filter_img_info()
-- 
2.13.6

This change separates bdrv_drain_invoke(), which calls the BlockDriver
drain callbacks, from bdrv_drain_recurse(). Instead, the function
performs its own recursion now.

One reason for this is that bdrv_drain_recurse() can be called multiple
times by bdrv_drain_all_begin(), but the callbacks may only be called
once. The separation is necessary to fix this bug.

The other reason is that we intend to go to a model where we call all
driver callbacks first, and only then start polling. This is not fully
achieved yet with this patch, as bdrv_drain_invoke() contains a
BDRV_POLL_WHILE() loop for the block driver callbacks, which can still
call callbacks for any unrelated event. It's a step in this direction
anyway.

Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
     bdrv_wakeup(bs);
 }
 
+/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
 static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
 {
+    BdrvChild *child, *tmp;
     BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};
 
     if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
     data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
     bdrv_coroutine_enter(bs, data.co);
     BDRV_POLL_WHILE(bs, !data.done);
+
+    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
+        bdrv_drain_invoke(child->bs, begin);
+    }
 }
 
 static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
     BdrvChild *child, *tmp;
     bool waited;
 
-    /* Ensure any pending metadata writes are submitted to bs->file.  */
-    bdrv_drain_invoke(bs, begin);
-
     /* Wait for drained requests to finish */
     waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
         bdrv_parent_drained_begin(bs);
     }
 
+    bdrv_drain_invoke(bs, true);
     bdrv_drain_recurse(bs, true);
 }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
     }
 
     bdrv_parent_drained_end(bs);
+    bdrv_drain_invoke(bs, false);
     bdrv_drain_recurse(bs, false);
     aio_enable_external(bdrv_get_aio_context(bs));
 }
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
             aio_context_acquire(aio_context);
             for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
                 if (aio_context == bdrv_get_aio_context(bs)) {
+                    /* FIXME Calling this multiple times is wrong */
+                    bdrv_drain_invoke(bs, true);
                     waited |= bdrv_drain_recurse(bs, true);
                 }
             }
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         aio_context_acquire(aio_context);
         aio_enable_external(aio_context);
         bdrv_parent_drained_end(bs);
+        bdrv_drain_invoke(bs, false);
         bdrv_drain_recurse(bs, false);
         aio_context_release(aio_context);
     }
-- 
2.13.6

bdrv_drain_all_begin() used to call the .bdrv_co_drain_begin() driver
callback inside its polling loop. This means that how many times it got
called for each node depended on how long it had to poll the event loop.

This is obviously not right and results in nodes that stay drained even
after bdrv_drain_all_end(), which calls .bdrv_co_drain_end() once per
node.

Fix bdrv_drain_all_begin() to call the callback only once, too.

Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         aio_context_acquire(aio_context);
         bdrv_parent_drained_begin(bs);
         aio_disable_external(aio_context);
+        bdrv_drain_invoke(bs, true);
         aio_context_release(aio_context);
 
         if (!g_slist_find(aio_ctxs, aio_context)) {
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
             aio_context_acquire(aio_context);
             for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
                 if (aio_context == bdrv_get_aio_context(bs)) {
-                    /* FIXME Calling this multiple times is wrong */
-                    bdrv_drain_invoke(bs, true);
                     waited |= bdrv_drain_recurse(bs, true);
                 }
             }
-- 
2.13.6

This adds a test case that the BlockDriver callbacks for drain are
called in bdrv_drain_all_begin/end(), and that both of them are called
exactly once.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
---
 tests/test-bdrv-drain.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++
 tests/Makefile.include  |   2 +
 2 files changed, 139 insertions(+)
 create mode 100644 tests/test-bdrv-drain.c

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Block node draining tests
+ *
+ * Copyright (c) 2017 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "block/block.h"
+#include "sysemu/block-backend.h"
+#include "qapi/error.h"
+
+typedef struct BDRVTestState {
+    int drain_count;
+} BDRVTestState;
+
+static void coroutine_fn bdrv_test_co_drain_begin(BlockDriverState *bs)
+{
+    BDRVTestState *s = bs->opaque;
+    s->drain_count++;
+}
+
+static void coroutine_fn bdrv_test_co_drain_end(BlockDriverState *bs)
+{
+    BDRVTestState *s = bs->opaque;
+    s->drain_count--;
+}
+
+static void bdrv_test_close(BlockDriverState *bs)
+{
+    BDRVTestState *s = bs->opaque;
+    g_assert_cmpint(s->drain_count, >, 0);
+}
+
+static int coroutine_fn bdrv_test_co_preadv(BlockDriverState *bs,
+                                            uint64_t offset, uint64_t bytes,
+                                            QEMUIOVector *qiov, int flags)
+{
+    /* We want this request to stay until the polling loop in drain waits for
+     * it to complete. We need to sleep a while as bdrv_drain_invoke() comes
+     * first and polls its result, too, but it shouldn't accidentally complete
+     * this request yet. */
+    qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
+
+    return 0;
+}
+
+static BlockDriver bdrv_test = {
+    .format_name            = "test",
+    .instance_size          = sizeof(BDRVTestState),
+
+    .bdrv_close             = bdrv_test_close,
+    .bdrv_co_preadv         = bdrv_test_co_preadv,
+
+    .bdrv_co_drain_begin    = bdrv_test_co_drain_begin,
+    .bdrv_co_drain_end      = bdrv_test_co_drain_end,
+};
+
+static void aio_ret_cb(void *opaque, int ret)
+{
+    int *aio_ret = opaque;
+    *aio_ret = ret;
+}
+
+static void test_drv_cb_drain_all(void)
+{
+    BlockBackend *blk;
+    BlockDriverState *bs;
+    BDRVTestState *s;
+    BlockAIOCB *acb;
+    int aio_ret;
+
+    QEMUIOVector qiov;
+    struct iovec iov = {
+        .iov_base = NULL,
+        .iov_len = 0,
+    };
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
+                              &error_abort);
+    s = bs->opaque;
+    blk_insert_bs(blk, bs, &error_abort);
+
+    /* Simple bdrv_drain_all_begin/end pair, check that CBs are called */
+    g_assert_cmpint(s->drain_count, ==, 0);
+    bdrv_drain_all_begin();
+    g_assert_cmpint(s->drain_count, ==, 1);
+    bdrv_drain_all_end();
+    g_assert_cmpint(s->drain_count, ==, 0);
+
+    /* Now do the same while a request is pending */
+    aio_ret = -EINPROGRESS;
+    acb = blk_aio_preadv(blk, 0, &qiov, 0, aio_ret_cb, &aio_ret);
+    g_assert(acb != NULL);
+    g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
+
+    g_assert_cmpint(s->drain_count, ==, 0);
+    bdrv_drain_all_begin();
+    g_assert_cmpint(aio_ret, ==, 0);
+    g_assert_cmpint(s->drain_count, ==, 1);
+    bdrv_drain_all_end();
+    g_assert_cmpint(s->drain_count, ==, 0);
+
+    bdrv_unref(bs);
+    blk_unref(blk);
+}
+
+int main(int argc, char **argv)
+{
+    bdrv_init();
+    qemu_init_main_loop(&error_abort);
+
+    g_test_init(&argc, &argv, NULL);
+
+    g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
+
+    return g_test_run();
+}
diff --git a/tests/Makefile.include b/tests/Makefile.include
index XXXXXXX..XXXXXXX 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -XXX,XX +XXX,XX @@ gcov-files-test-thread-pool-y = thread-pool.c
 gcov-files-test-hbitmap-y = util/hbitmap.c
 check-unit-y += tests/test-hbitmap$(EXESUF)
 gcov-files-test-hbitmap-y = blockjob.c
+check-unit-y += tests/test-bdrv-drain$(EXESUF)
 check-unit-y += tests/test-blockjob$(EXESUF)
 check-unit-y += tests/test-blockjob-txn$(EXESUF)
 check-unit-y += tests/test-x86-cpuid$(EXESUF)
@@ -XXX,XX +XXX,XX @@ tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
 tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
 tests/test-aio-multithread$(EXESUF): tests/test-aio-multithread.o $(test-block-obj-y)
 tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
+tests/test-bdrv-drain$(EXESUF): tests/test-bdrv-drain.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(test-block-obj-y)
-- 
2.13.6

Now that the bdrv_drain_invoke() calls are pulled up to the callers of
bdrv_drain_recurse(), the 'begin' parameter isn't needed any more.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
     }
 }
 
-static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
+static bool bdrv_drain_recurse(BlockDriverState *bs)
 {
     BdrvChild *child, *tmp;
     bool waited;
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
              */
             bdrv_ref(bs);
         }
-        waited |= bdrv_drain_recurse(bs, begin);
+        waited |= bdrv_drain_recurse(bs);
         if (in_main_loop) {
             bdrv_unref(bs);
         }
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
     }
 
     bdrv_drain_invoke(bs, true);
-    bdrv_drain_recurse(bs, true);
+    bdrv_drain_recurse(bs);
 }
 
 void bdrv_drained_end(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
 
     bdrv_parent_drained_end(bs);
     bdrv_drain_invoke(bs, false);
-    bdrv_drain_recurse(bs, false);
+    bdrv_drain_recurse(bs);
     aio_enable_external(bdrv_get_aio_context(bs));
 }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
             aio_context_acquire(aio_context);
             for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
                 if (aio_context == bdrv_get_aio_context(bs)) {
-                    waited |= bdrv_drain_recurse(bs, true);
+                    waited |= bdrv_drain_recurse(bs);
                 }
             }
             aio_context_release(aio_context);
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         aio_enable_external(aio_context);
         bdrv_parent_drained_end(bs);
         bdrv_drain_invoke(bs, false);
-        bdrv_drain_recurse(bs, false);
+        bdrv_drain_recurse(bs);
         aio_context_release(aio_context);
     }
 
-- 
2.13.6

The device is drained, so there is no point in waiting for requests at
the end of the drained section. Remove the bdrv_drain_recurse() calls
there.

The bdrv_drain_recurse() calls were introduced in commit 481cad48e5e
in order to call the .bdrv_co_drain_end() driver callback. This is now
done by a separate bdrv_drain_invoke() call.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
 
     bdrv_parent_drained_end(bs);
     bdrv_drain_invoke(bs, false);
-    bdrv_drain_recurse(bs);
     aio_enable_external(bdrv_get_aio_context(bs));
 }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         aio_enable_external(aio_context);
         bdrv_parent_drained_end(bs);
         bdrv_drain_invoke(bs, false);
-        bdrv_drain_recurse(bs);
         aio_context_release(aio_context);
     }
 
-- 
2.13.6

Drain requests are propagated to child nodes, parent nodes and directly
to the AioContext. The order in which this happened was different
between all combinations of drain/drain_all and begin/end.

The correct order is to keep children only drained when their parents
are also drained. This means that at the start of a drained section, the
AioContext needs to be drained first, the parents second and only then
the children. The correct order for the end of a drained section is the
opposite.

This patch changes the three other functions to follow the example of
bdrv_drained_begin(), which is the only one that got it right.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
         return;
     }
 
+    /* Stop things in parent-to-child order */
     if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
         aio_disable_external(bdrv_get_aio_context(bs));
         bdrv_parent_drained_begin(bs);
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
         return;
     }
 
-    bdrv_parent_drained_end(bs);
+    /* Re-enable things in child-to-parent order */
     bdrv_drain_invoke(bs, false);
+    bdrv_parent_drained_end(bs);
     aio_enable_external(bdrv_get_aio_context(bs));
 }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
+        /* Stop things in parent-to-child order */
         aio_context_acquire(aio_context);
-        bdrv_parent_drained_begin(bs);
         aio_disable_external(aio_context);
+        bdrv_parent_drained_begin(bs);
         bdrv_drain_invoke(bs, true);
         aio_context_release(aio_context);
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
+        /* Re-enable things in child-to-parent order */
         aio_context_acquire(aio_context);
-        aio_enable_external(aio_context);
-        bdrv_parent_drained_end(bs);
         bdrv_drain_invoke(bs, false);
+        bdrv_parent_drained_end(bs);
+        aio_enable_external(aio_context);
         aio_context_release(aio_context);
     }
 
-- 
2.13.6

Commit 15afd94a047 added code to acquire and release the AioContext in
qemuio_command(). This means that the lock is taken twice now in the
call path from hmp_qemu_io(). This causes BDRV_POLL_WHILE() to hang for
any requests issued to nodes in a non-mainloop AioContext.

Dropping the first locking from hmp_qemu_io() fixes the problem.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 hmp.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/hmp.c b/hmp.c
index XXXXXXX..XXXXXXX 100644
--- a/hmp.c
+++ b/hmp.c
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
 {
     BlockBackend *blk;
     BlockBackend *local_blk = NULL;
-    AioContext *aio_context;
     const char* device = qdict_get_str(qdict, "device");
     const char* command = qdict_get_str(qdict, "command");
     Error *err = NULL;
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
         }
     }
 
-    aio_context = blk_get_aio_context(blk);
-    aio_context_acquire(aio_context);
-
     /*
      * Notably absent: Proper permission management. This is sad, but it seems
      * almost impossible to achieve without changing the semantics and thereby
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
      */
     qemuio_command(blk, command);
 
-    aio_context_release(aio_context);
-
 fail:
     blk_unref(local_blk);
     hmp_handle_error(mon, &err);
-- 
2.13.6

From: Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com>

Since bdrv_co_preadv does all necessary checks including
reading after the end of the backing file, avoid duplication
of verification before bdrv_co_preadv call.

Signed-off-by: Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2.h |  3 ---
 block/qcow2.c | 51 ++++++++-------------------------------------------
 2 files changed, 8 insertions(+), 46 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -XXX,XX +XXX,XX @@ uint32_t offset_to_reftable_index(BDRVQcow2State *s, uint64_t offset)
 }
 
 /* qcow2.c functions */
-int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
-                  int64_t sector_num, int nb_sectors);
-
 int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,
                                      int refcount_order, bool generous_increase,
                                      uint64_t *refblock_count);
diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
     return status;
 }
 
-/* handle reading after the end of the backing file */
-int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
-                        int64_t offset, int bytes)
-{
-    uint64_t bs_size = bs->total_sectors * BDRV_SECTOR_SIZE;
-    int n1;
-
-    if ((offset + bytes) <= bs_size) {
-        return bytes;
-    }
-
-    if (offset >= bs_size) {
-        n1 = 0;
-    } else {
-        n1 = bs_size - offset;
-    }
-
-    qemu_iovec_memset(qiov, n1, 0, bytes - n1);
-
-    return n1;
-}
-
 static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
                                         uint64_t bytes, QEMUIOVector *qiov,
                                         int flags)
 {
     BDRVQcow2State *s = bs->opaque;
-    int offset_in_cluster, n1;
+    int offset_in_cluster;
     int ret;
     unsigned int cur_bytes; /* number of bytes in current iteration */
     uint64_t cluster_offset = 0;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
         case QCOW2_CLUSTER_UNALLOCATED:
 
             if (bs->backing) {
-                /* read from the base image */
-                n1 = qcow2_backing_read1(bs->backing->bs, &hd_qiov,
-                                         offset, cur_bytes);
-                if (n1 > 0) {
-                    QEMUIOVector local_qiov;
-
-                    qemu_iovec_init(&local_qiov, hd_qiov.niov);
-                    qemu_iovec_concat(&local_qiov, &hd_qiov, 0, n1);
-
-                    BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
-                    qemu_co_mutex_unlock(&s->lock);
-                    ret = bdrv_co_preadv(bs->backing, offset, n1,
-                                         &local_qiov, 0);
-                    qemu_co_mutex_lock(&s->lock);
-
-                    qemu_iovec_destroy(&local_qiov);
-
-                    if (ret < 0) {
-                        goto fail;
-                    }
+                BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
+                qemu_co_mutex_unlock(&s->lock);
+                ret = bdrv_co_preadv(bs->backing, offset, cur_bytes,
+                                     &hd_qiov, 0);
+                qemu_co_mutex_lock(&s->lock);
+                if (ret < 0) {
+                    goto fail;
                 }
             } else {
                 /* Note: in this case, no need to wait */
-- 
2.13.6

Removing a quorum child node with x-blockdev-change results in a quorum
driver state that cannot be recreated with create options because it
would require a list with gaps. This causes trouble in at least
.bdrv_refresh_filename().

Document this problem so that we won't accidentally mark the command
stable without having addressed it.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
---
 qapi/block-core.json | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -XXX,XX +XXX,XX @@
 # does not support all kinds of operations, all kinds of children, nor
 # all block drivers.
 #
+# FIXME Removing children from a quorum node means introducing gaps in the
+# child indices. This cannot be represented in the 'children' list of
+# BlockdevOptionsQuorum, as returned by .bdrv_refresh_filename().
+#
 # Warning: The data in a new quorum child MUST be consistent with that of
 # the rest of the array.
 #
-- 
2.13.6

From: Doug Gale <doug16k@gmail.com>

Add trace output for commands, errors, and undefined behavior.
Add guest error log output for undefined behavior.
Report invalid undefined accesses to MMIO.
Annotate unlikely error checks with unlikely.

Signed-off-by: Doug Gale <doug16k@gmail.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/block/nvme.c       | 349 ++++++++++++++++++++++++++++++++++++++++++--------
 hw/block/trace-events |  93 ++++++++++++++
 2 files changed, 390 insertions(+), 52 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -XXX,XX +XXX,XX @@
 #include "qapi/visitor.h"
 #include "sysemu/block-backend.h"
 
+#include "qemu/log.h"
+#include "trace.h"
 #include "nvme.h"
 
+#define NVME_GUEST_ERR(trace, fmt, ...) \
+    do { \
+        (trace_##trace)(__VA_ARGS__); \
+        qemu_log_mask(LOG_GUEST_ERROR, #trace \
+            " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
+    } while (0)
+
 static void nvme_process_sq(void *opaque);
 
 static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
@@ -XXX,XX +XXX,XX @@ static void nvme_isr_notify(NvmeCtrl *n, NvmeCQueue *cq)
 {
     if (cq->irq_enabled) {
         if (msix_enabled(&(n->parent_obj))) {
+            trace_nvme_irq_msix(cq->vector);
             msix_notify(&(n->parent_obj), cq->vector);
         } else {
+            trace_nvme_irq_pin();
             pci_irq_pulse(&n->parent_obj);
         }
+    } else {
+        trace_nvme_irq_masked();
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
     trans_len = MIN(len, trans_len);
     int num_prps = (len >> n->page_bits) + 1;
 
-    if (!prp1) {
+    if (unlikely(!prp1)) {
+        trace_nvme_err_invalid_prp();
         return NVME_INVALID_FIELD | NVME_DNR;
     } else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
                prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
     }
     len -= trans_len;
     if (len) {
-        if (!prp2) {
+        if (unlikely(!prp2)) {
+            trace_nvme_err_invalid_prp2_missing();
             goto unmap;
         }
         if (len > n->page_size) {
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
                 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
 
                 if (i == n->max_prp_ents - 1 && len > n->page_size) {
-                    if (!prp_ent || prp_ent & (n->page_size - 1)) {
+                    if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
+                        trace_nvme_err_invalid_prplist_ent(prp_ent);
                         goto unmap;
                     }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
                     prp_ent = le64_to_cpu(prp_list[i]);
                 }
 
-                if (!prp_ent || prp_ent & (n->page_size - 1)) {
+                if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
+                    trace_nvme_err_invalid_prplist_ent(prp_ent);
                     goto unmap;
                 }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
                 i++;
             }
         } else {
-            if (prp2 & (n->page_size - 1)) {
+            if (unlikely(prp2 & (n->page_size - 1))) {
+                trace_nvme_err_invalid_prp2_align(prp2);
                 goto unmap;
             }
             if (qsg->nsg) {
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
     QEMUIOVector iov;
     uint16_t status = NVME_SUCCESS;
 
+    trace_nvme_dma_read(prp1, prp2);
+
     if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
         return NVME_INVALID_FIELD | NVME_DNR;
     }
     if (qsg.nsg > 0) {
-        if (dma_buf_read(ptr, len, &qsg)) {
+        if (unlikely(dma_buf_read(ptr, len, &qsg))) {
+            trace_nvme_err_invalid_dma();
             status = NVME_INVALID_FIELD | NVME_DNR;
         }
         qemu_sglist_destroy(&qsg);
     } else {
-        if (qemu_iovec_to_buf(&iov, 0, ptr, len) != len) {
+        if (unlikely(qemu_iovec_to_buf(&iov, 0, ptr, len) != len)) {
+            trace_nvme_err_invalid_dma();
             status = NVME_INVALID_FIELD | NVME_DNR;
         }
         qemu_iovec_destroy(&iov);
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS);
     uint32_t aio_nlb = nlb << (data_shift - BDRV_SECTOR_BITS);
 
-    if (slba + nlb > ns->id_ns.nsze) {
+    if (unlikely(slba + nlb > ns->id_ns.nsze)) {
+        trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
         return NVME_LBA_RANGE | NVME_DNR;
     }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
     enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
 
-    if ((slba + nlb) > ns->id_ns.nsze) {
+    trace_nvme_rw(is_write ? "write" : "read", nlb, data_size, slba);
+
+    if (unlikely((slba + nlb) > ns->id_ns.nsze)) {
         block_acct_invalid(blk_get_stats(n->conf.blk), acct);
+        trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
         return NVME_LBA_RANGE | NVME_DNR;
     }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     NvmeNamespace *ns;
     uint32_t nsid = le32_to_cpu(cmd->nsid);
 
-    if (nsid == 0 || nsid > n->num_namespaces) {
+    if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
+        trace_nvme_err_invalid_ns(nsid, n->num_namespaces);
         return NVME_INVALID_NSID | NVME_DNR;
     }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     case NVME_CMD_READ:
         return nvme_rw(n, ns, cmd, req);
     default:
+        trace_nvme_err_invalid_opc(cmd->opcode);
         return NVME_INVALID_OPCODE | NVME_DNR;
     }
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
     NvmeCQueue *cq;
     uint16_t qid = le16_to_cpu(c->qid);
 
-    if (!qid || nvme_check_sqid(n, qid)) {
+    if (unlikely(!qid || nvme_check_sqid(n, qid))) {
+        trace_nvme_err_invalid_del_sq(qid);
         return NVME_INVALID_QID | NVME_DNR;
     }
 
+    trace_nvme_del_sq(qid);
+
     sq = n->sq[qid];
     while (!QTAILQ_EMPTY(&sq->out_req_list)) {
         req = QTAILQ_FIRST(&sq->out_req_list);
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
     uint16_t qflags = le16_to_cpu(c->sq_flags);
     uint64_t prp1 = le64_to_cpu(c->prp1);
 
-    if (!cqid || nvme_check_cqid(n, cqid)) {
+    trace_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
+
+    if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
+        trace_nvme_err_invalid_create_sq_cqid(cqid);
         return NVME_INVALID_CQID | NVME_DNR;
     }
-    if (!sqid || !nvme_check_sqid(n, sqid)) {
+    if (unlikely(!sqid || !nvme_check_sqid(n, sqid))) {
+        trace_nvme_err_invalid_create_sq_sqid(sqid);
         return NVME_INVALID_QID | NVME_DNR;
     }
-    if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
+    if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
+        trace_nvme_err_invalid_create_sq_size(qsize);
         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
     }
-    if (!prp1 || prp1 & (n->page_size - 1)) {
+    if (unlikely(!prp1 || prp1 & (n->page_size - 1))) {
+        trace_nvme_err_invalid_create_sq_addr(prp1);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
-    if (!(NVME_SQ_FLAGS_PC(qflags))) {
+    if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
+        trace_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
         return NVME_INVALID_FIELD | NVME_DNR;
     }
     sq = g_malloc0(sizeof(*sq));
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
     NvmeCQueue *cq;
     uint16_t qid = le16_to_cpu(c->qid);
 
-    if (!qid || nvme_check_cqid(n, qid)) {
+    if (unlikely(!qid || nvme_check_cqid(n, qid))) {
+        trace_nvme_err_invalid_del_cq_cqid(qid);
         return NVME_INVALID_CQID | NVME_DNR;
     }
 
     cq = n->cq[qid];
-    if (!QTAILQ_EMPTY(&cq->sq_list)) {
+    if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
+        trace_nvme_err_invalid_del_cq_notempty(qid);
         return NVME_INVALID_QUEUE_DEL;
     }
+    trace_nvme_del_cq(qid);
     nvme_free_cq(cq, n);
     return NVME_SUCCESS;
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
     uint16_t qflags = le16_to_cpu(c->cq_flags);
     uint64_t prp1 = le64_to_cpu(c->prp1);
 
-    if (!cqid || !nvme_check_cqid(n, cqid)) {
+    trace_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
+                         NVME_CQ_FLAGS_IEN(qflags) != 0);
+
+    if (unlikely(!cqid || !nvme_check_cqid(n, cqid))) {
+        trace_nvme_err_invalid_create_cq_cqid(cqid);
         return NVME_INVALID_CQID | NVME_DNR;
     }
-    if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
+    if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
+        trace_nvme_err_invalid_create_cq_size(qsize);
         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
     }
-    if (!prp1) {
+    if (unlikely(!prp1)) {
+        trace_nvme_err_invalid_create_cq_addr(prp1);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
-    if (vector > n->num_queues) {
+    if (unlikely(vector > n->num_queues)) {
+        trace_nvme_err_invalid_create_cq_vector(vector);
         return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
     }
-    if (!(NVME_CQ_FLAGS_PC(qflags))) {
+    if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
+        trace_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c)
     uint64_t prp1 = le64_to_cpu(c->prp1);
     uint64_t prp2 = le64_to_cpu(c->prp2);
 
+    trace_nvme_identify_ctrl();
+
     return nvme_dma_read_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
         prp1, prp2);
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c)
     uint64_t prp1 = le64_to_cpu(c->prp1);
     uint64_t prp2 = le64_to_cpu(c->prp2);
 
-    if (nsid == 0 || nsid > n->num_namespaces) {
+    trace_nvme_identify_ns(nsid);
+
+    if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
+        trace_nvme_err_invalid_ns(nsid, n->num_namespaces);
         return NVME_INVALID_NSID | NVME_DNR;
     }
 
     ns = &n->namespaces[nsid - 1];
+
     return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
         prp1, prp2);
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c)
     uint16_t ret;
     int i, j = 0;
 
+    trace_nvme_identify_nslist(min_nsid);
+
     list = g_malloc0(data_len);
     for (i = 0; i < n->num_namespaces; i++) {
         if (i < min_nsid) {
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
     case 0x02:
         return nvme_identify_nslist(n, c);
     default:
+        trace_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     switch (dw10) {
     case NVME_VOLATILE_WRITE_CACHE:
         result = blk_enable_write_cache(n->conf.blk);
+        trace_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
         break;
     case NVME_NUMBER_OF_QUEUES:
         result = cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
+        trace_nvme_getfeat_numq(result);
         break;
     default:
+        trace_nvme_err_invalid_getfeat(dw10);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
         blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
         break;
     case NVME_NUMBER_OF_QUEUES:
+        trace_nvme_setfeat_numq((dw11 & 0xFFFF) + 1,
+                                ((dw11 >> 16) & 0xFFFF) + 1,
+                                n->num_queues - 1, n->num_queues - 1);
         req->cqe.result =
             cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
         break;
     default:
+        trace_nvme_err_invalid_setfeat(dw10);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
     return NVME_SUCCESS;
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     case NVME_ADM_CMD_GET_FEATURES:
         return nvme_get_feature(n, cmd, req);
     default:
+        trace_nvme_err_invalid_admin_opc(cmd->opcode);
         return NVME_INVALID_OPCODE | NVME_DNR;
     }
 }
@@ -XXX,XX +XXX,XX @@ static int nvme_start_ctrl(NvmeCtrl *n)
     uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
     uint32_t page_size = 1 << page_bits;
 
-    if (n->cq[0] || n->sq[0] || !n->bar.asq || !n->bar.acq ||
-            n->bar.asq & (page_size - 1) || n->bar.acq & (page_size - 1) ||
-            NVME_CC_MPS(n->bar.cc) < NVME_CAP_MPSMIN(n->bar.cap) ||
-            NVME_CC_MPS(n->bar.cc) > NVME_CAP_MPSMAX(n->bar.cap) ||
-            NVME_CC_IOCQES(n->bar.cc) < NVME_CTRL_CQES_MIN(n->id_ctrl.cqes) ||
-            NVME_CC_IOCQES(n->bar.cc) > NVME_CTRL_CQES_MAX(n->id_ctrl.cqes) ||
-            NVME_CC_IOSQES(n->bar.cc) < NVME_CTRL_SQES_MIN(n->id_ctrl.sqes) ||
-            NVME_CC_IOSQES(n->bar.cc) > NVME_CTRL_SQES_MAX(n->id_ctrl.sqes) ||
-            !NVME_AQA_ASQS(n->bar.aqa) || !NVME_AQA_ACQS(n->bar.aqa)) {
+    if (unlikely(n->cq[0])) {
+        trace_nvme_err_startfail_cq();
+        return -1;
+    }
+    if (unlikely(n->sq[0])) {
+        trace_nvme_err_startfail_sq();
+        return -1;
+    }
+    if (unlikely(!n->bar.asq)) {
+        trace_nvme_err_startfail_nbarasq();
+        return -1;
+    }
+    if (unlikely(!n->bar.acq)) {
+        trace_nvme_err_startfail_nbaracq();
+        return -1;
+    }
+    if (unlikely(n->bar.asq & (page_size - 1))) {
+        trace_nvme_err_startfail_asq_misaligned(n->bar.asq);
+        return -1;
+    }
+    if (unlikely(n->bar.acq & (page_size - 1))) {
+        trace_nvme_err_startfail_acq_misaligned(n->bar.acq);
+        return -1;
+    }
+    if (unlikely(NVME_CC_MPS(n->bar.cc) <
+                 NVME_CAP_MPSMIN(n->bar.cap))) {
+        trace_nvme_err_startfail_page_too_small(
+                    NVME_CC_MPS(n->bar.cc),
+                    NVME_CAP_MPSMIN(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_MPS(n->bar.cc) >
+                 NVME_CAP_MPSMAX(n->bar.cap))) {
+        trace_nvme_err_startfail_page_too_large(
+                    NVME_CC_MPS(n->bar.cc),
+                    NVME_CAP_MPSMAX(n->bar.cap));
+        return -1;
+    }
+    /*
+     * Queue entry size checks: each trace reports the same limit the
+     * condition compares against (taken from n->id_ctrl, not from CAP).
+     */
+    if (unlikely(NVME_CC_IOCQES(n->bar.cc) <
+                 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
+        trace_nvme_err_startfail_cqent_too_small(
+                    NVME_CC_IOCQES(n->bar.cc),
+                    NVME_CTRL_CQES_MIN(n->id_ctrl.cqes));
+        return -1;
+    }
+    if (unlikely(NVME_CC_IOCQES(n->bar.cc) >
+                 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
+        trace_nvme_err_startfail_cqent_too_large(
+                    NVME_CC_IOCQES(n->bar.cc),
+                    NVME_CTRL_CQES_MAX(n->id_ctrl.cqes));
+        return -1;
+    }
+    if (unlikely(NVME_CC_IOSQES(n->bar.cc) <
+                 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
+        trace_nvme_err_startfail_sqent_too_small(
+                    NVME_CC_IOSQES(n->bar.cc),
+                    NVME_CTRL_SQES_MIN(n->id_ctrl.sqes));
+        return -1;
+    }
+    if (unlikely(NVME_CC_IOSQES(n->bar.cc) >
+                 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
+        trace_nvme_err_startfail_sqent_too_large(
+                    NVME_CC_IOSQES(n->bar.cc),
+                    NVME_CTRL_SQES_MAX(n->id_ctrl.sqes));
+        return -1;
+    }
+    if (unlikely(!NVME_AQA_ASQS(n->bar.aqa))) {
+        trace_nvme_err_startfail_asqent_sz_zero();
+        return -1;
+    }
+    if (unlikely(!NVME_AQA_ACQS(n->bar.aqa))) {
+        trace_nvme_err_startfail_acqent_sz_zero();
         return -1;
     }
 
@@ -XXX,XX +XXX,XX @@ static int nvme_start_ctrl(NvmeCtrl *n)
 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
     unsigned size)
 {
+    if (unlikely(offset & (sizeof(uint32_t) - 1))) {
+        NVME_GUEST_ERR(nvme_ub_mmiowr_misaligned32,
+                       "MMIO write not 32-bit aligned,"
+                       " offset=0x%"PRIx64"", offset);
+        /* should be ignored, fall through for now */
+    }
+
+    if (unlikely(size < sizeof(uint32_t))) {
+        NVME_GUEST_ERR(nvme_ub_mmiowr_toosmall,
+                       "MMIO write smaller than 32-bits,"
+                       " offset=0x%"PRIx64", size=%u",
+                       offset, size);
+        /* should be ignored, fall through for now */
+    }
+
     switch (offset) {
-    case 0xc:
+    case 0xc:   /* INTMS */
+        if (unlikely(msix_enabled(&(n->parent_obj)))) {
+            NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix,
+                           "undefined access to interrupt mask set"
+                           " when MSI-X is enabled");
+            /* should be ignored, fall through for now */
+        }
         n->bar.intms |= data & 0xffffffff;
         n->bar.intmc = n->bar.intms;
+        trace_nvme_mmio_intm_set(data & 0xffffffff,
+                                 n->bar.intmc);
         break;
-    case 0x10:
+    case 0x10:  /* INTMC */
+        if (unlikely(msix_enabled(&(n->parent_obj)))) {
+            NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix,
+                           "undefined access to interrupt mask clr"
+                           " when MSI-X is enabled");
+            /* should be ignored, fall through for now */
+        }
         n->bar.intms &= ~(data & 0xffffffff);
         n->bar.intmc = n->bar.intms;
+        trace_nvme_mmio_intm_clr(data & 0xffffffff,
+                                 n->bar.intmc);
         break;
-    case 0x14:
+    case 0x14:  /* CC */
+        trace_nvme_mmio_cfg(data & 0xffffffff);
         /* Windows first sends data, then sends enable bit */
         if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) &&
             !NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc))
@@ -XXX,XX +XXX,XX @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
 
         if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
             n->bar.cc = data;
-            if (nvme_start_ctrl(n)) {
+            if (unlikely(nvme_start_ctrl(n))) {
+                trace_nvme_err_startfail();
                 n->bar.csts = NVME_CSTS_FAILED;
             } else {
+                trace_nvme_mmio_start_success();
                 n->bar.csts = NVME_CSTS_READY;
             }
         } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
+            trace_nvme_mmio_stopped();
             nvme_clear_ctrl(n);
             n->bar.csts &= ~NVME_CSTS_READY;
         }
         if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
-                nvme_clear_ctrl(n);
-                n->bar.cc = data;
-                n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
+            trace_nvme_mmio_shutdown_set();
+            nvme_clear_ctrl(n);
+            n->bar.cc = data;
+            n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
         } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
-                n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
-                n->bar.cc = data;
+            trace_nvme_mmio_shutdown_cleared();
+            n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
+            n->bar.cc = data;
+        }
+        break;
+    case 0x1C:  /* CSTS */
+        if (data & (1 << 4)) {
+            NVME_GUEST_ERR(nvme_ub_mmiowr_ssreset_w1c_unsupported,
+                           "attempted to W1C CSTS.NSSRO"
+                           " but CAP.NSSRS is zero (not supported)");
+        } else if (data != 0) {
+            NVME_GUEST_ERR(nvme_ub_mmiowr_ro_csts,
+                           "attempted to set a read only bit"
+                           " of controller status");
+        }
+        break;
+    case 0x20:  /* NSSR */
+        if (data == 0x4E564D65) {
+            trace_nvme_ub_mmiowr_ssreset_unsupported();
+        } else {
+            /* The spec says that writes of other values have no effect */
+            return;
         }
         break;
-    case 0x24:
+    case 0x24:  /* AQA */
         n->bar.aqa = data & 0xffffffff;
+        trace_nvme_mmio_aqattr(data & 0xffffffff);
         break;
-    case 0x28:
+    case 0x28:  /* ASQ */
         n->bar.asq = data;
+        trace_nvme_mmio_asqaddr(data);
         break;
-    case 0x2c:
+    case 0x2c:  /* ASQ hi */
         n->bar.asq |= data << 32;
+        trace_nvme_mmio_asqaddr_hi(data, n->bar.asq);
         break;
-    case 0x30:
+    case 0x30:  /* ACQ */
+        trace_nvme_mmio_acqaddr(data);
         n->bar.acq = data;
         break;
-    case 0x34:
+    case 0x34:  /* ACQ hi */
         n->bar.acq |= data << 32;
+        trace_nvme_mmio_acqaddr_hi(data, n->bar.acq);
         break;
+    case 0x38:  /* CMBLOC */
+        NVME_GUEST_ERR(nvme_ub_mmiowr_cmbloc_reserved,
+                       "invalid write to reserved CMBLOC"
+                       " when CMBSZ is zero, ignored");
+        return;
+    case 0x3C:  /* CMBSZ */
+        NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
+                       "invalid write to read only CMBSZ, ignored");
+        return;
     default:
+        NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
+                       "invalid MMIO write,"
+                       " offset=0x%"PRIx64", data=%"PRIx64"",
+                       offset, data);
         break;
     }
 }
@@ -XXX,XX +XXX,XX @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
     uint8_t *ptr = (uint8_t *)&n->bar;
     uint64_t val = 0;
 
+    if (unlikely(addr & (sizeof(uint32_t) - 1))) {
+        NVME_GUEST_ERR(nvme_ub_mmiord_misaligned32,
+                       "MMIO read not 32-bit aligned,"
+                       " offset=0x%"PRIx64"", addr);
+        /* should RAZ, fall through for now */
+    } else if (unlikely(size < sizeof(uint32_t))) {
+        NVME_GUEST_ERR(nvme_ub_mmiord_toosmall,
+                       "MMIO read smaller than 32-bits,"
+                       " offset=0x%"PRIx64"", addr);
+        /* should RAZ, fall through for now */
+    }
+
     if (addr < sizeof(n->bar)) {
         memcpy(&val, ptr + addr, size);
+    } else {
+        NVME_GUEST_ERR(nvme_ub_mmiord_invalid_ofs,
+                       "MMIO read beyond last register,"
+                       " offset=0x%"PRIx64", returning 0", addr);
     }
+
     return val;
 }
 
@@ -XXX,XX +XXX,XX @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
 {
     uint32_t qid;
 
-    if (addr & ((1 << 2) - 1)) {
+    if (unlikely(addr & ((1 << 2) - 1))) {
+        NVME_GUEST_ERR(nvme_ub_db_wr_misaligned,
+                       "doorbell write not 32-bit aligned,"
+                       " offset=0x%"PRIx64", ignoring", addr);
         return;
     }
 
     if (((addr - 0x1000) >> 2) & 1) {
+        /* Completion queue doorbell write */
+
         uint16_t new_head = val & 0xffff;
         int start_sqs;
         NvmeCQueue *cq;
 
         qid = (addr - (0x1000 + (1 << 2))) >> 3;
-        if (nvme_check_cqid(n, qid)) {
+        if (unlikely(nvme_check_cqid(n, qid))) {
+            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cq,
+                           "completion queue doorbell write"
+                           " for nonexistent queue,"
+                           " cqid=%"PRIu32", ignoring", qid);
             return;
         }
 
         cq = n->cq[qid];
-        if (new_head >= cq->size) {
+        if (unlikely(new_head >= cq->size)) {
+            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cqhead,
+                           "completion queue doorbell write value"
+                           " beyond queue size, cqid=%"PRIu32","
+                           " new_head=%"PRIu16", ignoring",
+                           qid, new_head);
             return;
         }
 
@@ -XXX,XX +XXX,XX @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
             nvme_isr_notify(n, cq);
         }
     } else {
+        /* Submission queue doorbell write */
+
         uint16_t new_tail = val & 0xffff;
         NvmeSQueue *sq;
 
         qid = (addr - 0x1000) >> 3;
-        if (nvme_check_sqid(n, qid)) {
+        if (unlikely(nvme_check_sqid(n, qid))) {
+            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sq,
+                           "submission queue doorbell write"
+                           " for nonexistent queue,"
+                           " sqid=%"PRIu32", ignoring", qid);
             return;
         }
 
         sq = n->sq[qid];
-        if (new_tail >= sq->size) {
+        if (unlikely(new_tail >= sq->size)) {
+            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sqtail,
+                           "submission queue doorbell write value"
+                           " beyond queue size, sqid=%"PRIu32","
+                           " new_tail=%"PRIu16", ignoring",
+                           qid, new_tail);
             return;
         }
 
diff --git a/hw/block/trace-events b/hw/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -XXX,XX +XXX,XX @@ virtio_blk_submit_multireq(void *vdev, void *mrb, int start, int num_reqs, uint6
 hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p LCHS %d %d %d"
 hd_geometry_guess(void *blk, uint32_t cyls, uint32_t heads, uint32_t secs, int trans) "blk %p CHS %u %u %u trans %d"
 
+# hw/block/nvme.c
+# nvme traces for successful events
+nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u"
+nvme_irq_pin(void) "pulsing IRQ pin"
+nvme_irq_masked(void) "IRQ is masked"
+nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
+nvme_rw(char const *verb, uint32_t blk_count, uint64_t byte_count, uint64_t lba) "%s %"PRIu32" blocks (%"PRIu64" bytes) from LBA %"PRIu64""
+nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
+nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
+nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
+nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16""
+nvme_identify_ctrl(void) "identify controller"
+nvme_identify_ns(uint16_t ns) "identify namespace, nsid=%"PRIu16""
+nvme_identify_nslist(uint16_t ns) "identify namespace list, nsid=%"PRIu16""
+nvme_getfeat_vwcache(char const* result) "get feature volatile write cache, result=%s"
+nvme_getfeat_numq(int result) "get feature number of queues, result=%d"
+nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d"
+nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64""
+nvme_mmio_intm_clr(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask clr, data=0x%"PRIx64", new_mask=0x%"PRIx64""
+nvme_mmio_cfg(uint64_t data) "wrote MMIO, config controller config=0x%"PRIx64""
+nvme_mmio_aqattr(uint64_t data) "wrote MMIO, admin queue attributes=0x%"PRIx64""
+nvme_mmio_asqaddr(uint64_t data) "wrote MMIO, admin submission queue address=0x%"PRIx64""
+nvme_mmio_acqaddr(uint64_t data) "wrote MMIO, admin completion queue address=0x%"PRIx64""
+nvme_mmio_asqaddr_hi(uint64_t data, uint64_t new_addr) "wrote MMIO, admin submission queue high half=0x%"PRIx64", new_address=0x%"PRIx64""
+nvme_mmio_acqaddr_hi(uint64_t data, uint64_t new_addr) "wrote MMIO, admin completion queue high half=0x%"PRIx64", new_address=0x%"PRIx64""
+nvme_mmio_start_success(void) "setting controller enable bit succeeded"
+nvme_mmio_stopped(void) "cleared controller enable bit"
+nvme_mmio_shutdown_set(void) "shutdown bit set"
+nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
+
+# nvme traces for error conditions
+nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
+nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
+nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""
+nvme_err_invalid_prp2_missing(void) "PRP2 is null and more data to be transferred"
+nvme_err_invalid_field(void) "invalid field"
+nvme_err_invalid_prp(void) "invalid PRP"
+nvme_err_invalid_sgl(void) "invalid SGL"
+nvme_err_invalid_ns(uint32_t ns, uint32_t limit) "invalid namespace %u not within 1-%u"
+nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8""
+nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8""
+nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64""
+nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sqid=%"PRIu16""
+nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16""
+nvme_err_invalid_create_sq_sqid(uint16_t sqid) "failed creating submission queue, invalid sqid=%"PRIu16""
+nvme_err_invalid_create_sq_size(uint16_t qsize) "failed creating submission queue, invalid qsize=%"PRIu16""
+nvme_err_invalid_create_sq_addr(uint64_t addr) "failed creating submission queue, addr=0x%"PRIx64""
+nvme_err_invalid_create_sq_qflags(uint16_t qflags) "failed creating submission queue, qflags=%"PRIu16""
+nvme_err_invalid_del_cq_cqid(uint16_t cqid) "failed deleting completion queue, cqid=%"PRIu16""
+nvme_err_invalid_del_cq_notempty(uint16_t cqid) "failed deleting completion queue, it is not empty, cqid=%"PRIu16""
+nvme_err_invalid_create_cq_cqid(uint16_t cqid) "failed creating completion queue, cqid=%"PRIu16""
+nvme_err_invalid_create_cq_size(uint16_t size) "failed creating completion queue, size=%"PRIu16""
+nvme_err_invalid_create_cq_addr(uint64_t addr) "failed creating completion queue, addr=0x%"PRIx64""
+nvme_err_invalid_create_cq_vector(uint16_t vector) "failed creating completion queue, vector=%"PRIu16""
+nvme_err_invalid_create_cq_qflags(uint16_t qflags) "failed creating completion queue, qflags=%"PRIu16""
+nvme_err_invalid_identify_cns(uint16_t cns) "identify, invalid cns=0x%"PRIx16""
+nvme_err_invalid_getfeat(uint32_t dw10) "invalid get features, dw10=0x%"PRIx32""
+nvme_err_invalid_setfeat(uint32_t dw10) "invalid set features, dw10=0x%"PRIx32""
+nvme_err_startfail_cq(void) "nvme_start_ctrl failed because there are non-admin completion queues"
+nvme_err_startfail_sq(void) "nvme_start_ctrl failed because there are non-admin submission queues"
+nvme_err_startfail_nbarasq(void) "nvme_start_ctrl failed because the admin submission queue address is null"
+nvme_err_startfail_nbaracq(void) "nvme_start_ctrl failed because the admin completion queue address is null"
+nvme_err_startfail_asq_misaligned(uint64_t addr) "nvme_start_ctrl failed because the admin submission queue address is misaligned: 0x%"PRIx64""
+nvme_err_startfail_acq_misaligned(uint64_t addr) "nvme_start_ctrl failed because the admin completion queue address is misaligned: 0x%"PRIx64""
+nvme_err_startfail_page_too_small(uint8_t log2ps, uint8_t minlog2ps) "nvme_start_ctrl failed because the page size is too small: log2size=%u, min=%u"
+nvme_err_startfail_page_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the page size is too large: log2size=%u, max=%u"
+nvme_err_startfail_cqent_too_small(uint8_t log2ps, uint8_t minlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too small: log2size=%u, min=%u"
+nvme_err_startfail_cqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too large: log2size=%u, max=%u"
+nvme_err_startfail_sqent_too_small(uint8_t log2ps, uint8_t minlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too small: log2size=%u, min=%u"
+nvme_err_startfail_sqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too large: log2size=%u, max=%u"
+nvme_err_startfail_asqent_sz_zero(void) "nvme_start_ctrl failed because the admin submission queue size is zero"
+nvme_err_startfail_acqent_sz_zero(void) "nvme_start_ctrl failed because the admin completion queue size is zero"
+nvme_err_startfail(void) "setting controller enable bit failed"
+
+# Traces for undefined behavior
+nvme_ub_mmiowr_misaligned32(uint64_t offset) "MMIO write not 32-bit aligned, offset=0x%"PRIx64""
+nvme_ub_mmiowr_toosmall(uint64_t offset, unsigned size) "MMIO write smaller than 32 bits, offset=0x%"PRIx64", size=%u"
+nvme_ub_mmiowr_intmask_with_msix(void) "undefined access to interrupt mask set when MSI-X is enabled"
+nvme_ub_mmiowr_ro_csts(void) "attempted to set a read only bit of controller status"
+nvme_ub_mmiowr_ssreset_w1c_unsupported(void) "attempted to W1C CSTS.NSSRO but CAP.NSSRS is zero (not supported)"
+nvme_ub_mmiowr_ssreset_unsupported(void) "attempted NVM subsystem reset but CAP.NSSRS is zero (not supported)"
+nvme_ub_mmiowr_cmbloc_reserved(void) "invalid write to reserved CMBLOC when CMBSZ is zero, ignored"
+nvme_ub_mmiowr_cmbsz_readonly(void) "invalid write to read only CMBSZ, ignored"
+nvme_ub_mmiowr_invalid(uint64_t offset, uint64_t data) "invalid MMIO write, offset=0x%"PRIx64", data=0x%"PRIx64""
+nvme_ub_mmiord_misaligned32(uint64_t offset) "MMIO read not 32-bit aligned, offset=0x%"PRIx64""
+nvme_ub_mmiord_toosmall(uint64_t offset) "MMIO read smaller than 32-bits, offset=0x%"PRIx64""
+nvme_ub_mmiord_invalid_ofs(uint64_t offset) "MMIO read beyond last register, offset=0x%"PRIx64", returning 0"
+nvme_ub_db_wr_misaligned(uint64_t offset) "doorbell write not 32-bit aligned, offset=0x%"PRIx64", ignoring"
+nvme_ub_db_wr_invalid_cq(uint32_t qid) "completion queue doorbell write for nonexistent queue, cqid=%"PRIu32", ignoring"
+nvme_ub_db_wr_invalid_cqhead(uint32_t qid, uint16_t new_head) "completion queue doorbell write value beyond queue size, cqid=%"PRIu32", new_head=%"PRIu16", ignoring"
+nvme_ub_db_wr_invalid_sq(uint32_t qid) "submission queue doorbell write for nonexistent queue, sqid=%"PRIu32", ignoring"
+nvme_ub_db_wr_invalid_sqtail(uint32_t qid, uint16_t new_tail) "submission queue doorbell write value beyond queue size, sqid=%"PRIu32", new_tail=%"PRIu16", ignoring"
+
 # hw/block/xen_disk.c
 xen_disk_alloc(char *name) "%s"
 xen_disk_init(char *name) "%s"
-- 
2.13.6

From: Fam Zheng <famz@redhat.com>

Management tools create overlays of running guests with qemu-img:

$ qemu-img create -b /image/in/use.qcow2 -f qcow2 /overlay/image.qcow2

but this doesn't work anymore due to image locking:

qemu-img: /overlay/image.qcow2: Failed to get shared "write" lock
    Is another process using the image?
    Could not open backing image to determine size.
Use the force share option to allow this use case again.

Cc: qemu-stable@nongnu.org
Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ void bdrv_img_create(const char *filename, const char *fmt,
         back_flags = flags;
         back_flags &= ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
 
+        backing_options = qdict_new();
         if (backing_fmt) {
-            backing_options = qdict_new();
             qdict_put_str(backing_options, "driver", backing_fmt);
         }
+        qdict_put_bool(backing_options, BDRV_OPT_FORCE_SHARE, true);
 
         bs = bdrv_open(full_backing, NULL, backing_options, back_flags,
                        &local_err);
-- 
2.13.6

From: Thomas Huth <thuth@redhat.com>

It's not working anymore since QEMU v1.3.0 - time to remove it now.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c    | 11 -----------
 qemu-doc.texi |  6 ------
 2 files changed, 17 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index XXXXXXX..XXXXXXX 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -XXX,XX +XXX,XX @@ QemuOptsList qemu_legacy_drive_opts = {
             .type = QEMU_OPT_STRING,
             .help = "chs translation (auto, lba, none)",
         },{
-            .name = "boot",
-            .type = QEMU_OPT_BOOL,
-            .help = "(deprecated, ignored)",
-        },{
             .name = "addr",
             .type = QEMU_OPT_STRING,
             .help = "pci address (virtio only)",
@@ -XXX,XX +XXX,XX @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type)
         goto fail;
     }
 
-    /* Deprecated option boot=[on|off] */
-    if (qemu_opt_get(legacy_opts, "boot") != NULL) {
-        fprintf(stderr, "qemu-kvm: boot=on|off is deprecated and will be "
-                "ignored. Future versions will reject this parameter. Please "
-                "update your scripts.\n");
-    }
-
     /* Other deprecated options */
     if (!qtest_enabled()) {
         for (i = 0; i < ARRAY_SIZE(deprecated); i++) {
diff --git a/qemu-doc.texi b/qemu-doc.texi
index XXXXXXX..XXXXXXX 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -XXX,XX +XXX,XX @@ deprecated.
 
 @section System emulator command line arguments
 
-@subsection -drive boot=on|off (since 1.3.0)
-
-The ``boot=on|off'' option to the ``-drive'' argument is
-ignored. Applications should use the ``bootindex=N'' parameter
-to set an absolute ordering between devices instead.
-
 @subsection -tdf (since 1.3.0)
 
 The ``-tdf'' argument is ignored. The behaviour implemented
-- 
2.13.6

From: Thomas Huth <thuth@redhat.com>

It's been marked as deprecated since QEMU v2.10.0, and so far nobody
complained that we should keep it, so let's remove this legacy option
now to simplify the code quite a bit.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 vl.c            | 86 ++-------------------------------------------------------
 qemu-doc.texi   |  8 ------
 qemu-options.hx | 19 ++-----------
 3 files changed, 4 insertions(+), 109 deletions(-)

diff --git a/vl.c b/vl.c
index XXXXXXX..XXXXXXX 100644
--- a/vl.c
+++ b/vl.c
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
     const char *boot_order = NULL;
     const char *boot_once = NULL;
     DisplayState *ds;
-    int cyls, heads, secs, translation;
     QemuOpts *opts, *machine_opts;
-    QemuOpts *hda_opts = NULL, *icount_opts = NULL, *accel_opts = NULL;
+    QemuOpts *icount_opts = NULL, *accel_opts = NULL;
     QemuOptsList *olist;
     int optind;
     const char *optarg;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
 
     cpu_model = NULL;
     snapshot = 0;
-    cyls = heads = secs = 0;
-    translation = BIOS_ATA_TRANSLATION_AUTO;
 
     nb_nics = 0;
 
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
         if (optind >= argc)
             break;
         if (argv[optind][0] != '-') {
-            hda_opts = drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);
+            drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);
         } else {
             const QEMUOption *popt;
 
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
                 cpu_model = optarg;
                 break;
             case QEMU_OPTION_hda:
-                {
-                    char buf[256];
-                    if (cyls == 0)
-                        snprintf(buf, sizeof(buf), "%s", HD_OPTS);
-                    else
-                        snprintf(buf, sizeof(buf),
-                                 "%s,cyls=%d,heads=%d,secs=%d%s",
-                                 HD_OPTS , cyls, heads, secs,
-                                 translation == BIOS_ATA_TRANSLATION_LBA ?
-                                 ",trans=lba" :
-                                 translation == BIOS_ATA_TRANSLATION_NONE ?
-                                 ",trans=none" : "");
-                    drive_add(IF_DEFAULT, 0, optarg, buf);
-                    break;
-                }
             case QEMU_OPTION_hdb:
             case QEMU_OPTION_hdc:
             case QEMU_OPTION_hdd:
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
             case QEMU_OPTION_snapshot:
                 snapshot = 1;
                 break;
-            case QEMU_OPTION_hdachs:
-                {
-                    const char *p;
-                    p = optarg;
-                    cyls = strtol(p, (char **)&p, 0);
-                    if (cyls < 1 || cyls > 16383)
-                        goto chs_fail;
-                    if (*p != ',')
-                        goto chs_fail;
-                    p++;
-                    heads = strtol(p, (char **)&p, 0);
-                    if (heads < 1 || heads > 16)
-                        goto chs_fail;
-                    if (*p != ',')
-                        goto chs_fail;
-                    p++;
-                    secs = strtol(p, (char **)&p, 0);
-                    if (secs < 1 || secs > 63)
-                        goto chs_fail;
-                    if (*p == ',') {
-                        p++;
-                        if (!strcmp(p, "large")) {
-                            translation = BIOS_ATA_TRANSLATION_LARGE;
-                        } else if (!strcmp(p, "rechs")) {
-                            translation = BIOS_ATA_TRANSLATION_RECHS;
-                        } else if (!strcmp(p, "none")) {
-                            translation = BIOS_ATA_TRANSLATION_NONE;
-                        } else if (!strcmp(p, "lba")) {
-                            translation = BIOS_ATA_TRANSLATION_LBA;
-                        } else if (!strcmp(p, "auto")) {
-                            translation = BIOS_ATA_TRANSLATION_AUTO;
-                        } else {
-                            goto chs_fail;
-                        }
-                    } else if (*p != '\0') {
-                    chs_fail:
-                        error_report("invalid physical CHS format");
-                        exit(1);
-                    }
-                    if (hda_opts != NULL) {
-                        qemu_opt_set_number(hda_opts, "cyls", cyls,
-                                            &error_abort);
-                        qemu_opt_set_number(hda_opts, "heads", heads,
-                                            &error_abort);
-                        qemu_opt_set_number(hda_opts, "secs", secs,
-                                            &error_abort);
-                        if (translation == BIOS_ATA_TRANSLATION_LARGE) {
-                            qemu_opt_set(hda_opts, "trans", "large",
-                                         &error_abort);
-                        } else if (translation == BIOS_ATA_TRANSLATION_RECHS) {
-                            qemu_opt_set(hda_opts, "trans", "rechs",
-                                         &error_abort);
-                        } else if (translation == BIOS_ATA_TRANSLATION_LBA) {
-                            qemu_opt_set(hda_opts, "trans", "lba",
-                                         &error_abort);
-                        } else if (translation == BIOS_ATA_TRANSLATION_NONE) {
-                            qemu_opt_set(hda_opts, "trans", "none",
-                                         &error_abort);
-                        }
-                    }
-                }
-                error_report("'-hdachs' is deprecated, please use '-device"
-                             " ide-hd,cyls=c,heads=h,secs=s,...' instead");
-                break;
             case QEMU_OPTION_numa:
                 opts = qemu_opts_parse_noisily(qemu_find_opts("numa"),
                                                optarg, true);
diff --git a/qemu-doc.texi b/qemu-doc.texi
index XXXXXXX..XXXXXXX 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -XXX,XX +XXX,XX @@ The ``--net dump'' argument is now replaced with the
 ``-object filter-dump'' argument which works in combination
 with the modern ``-netdev`` backends instead.
 
-@subsection -hdachs (since 2.10.0)
-
-The ``-hdachs'' argument is now a synonym for setting
-the ``cyls'', ``heads'', ``secs'', and ``trans'' properties
-on the ``ide-hd'' device using the ``-device'' argument.
-The new syntax allows different settings to be provided
-per disk.
-
 @subsection -usbdevice (since 2.10.0)
 
 The ``-usbdevice DEV'' argument is now a synonym for setting
diff --git a/qemu-options.hx b/qemu-options.hx
index XXXXXXX..XXXXXXX 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -XXX,XX +XXX,XX @@ of available connectors of a given interface type.
 @item media=@var{media}
 This option defines the type of the media: disk or cdrom.
 @item cyls=@var{c},heads=@var{h},secs=@var{s}[,trans=@var{t}]
-These options have the same definition as they have in @option{-hdachs}.
-These parameters are deprecated, use the corresponding parameters
+Force disk physical geometry and the optional BIOS translation (trans=none or
+lba). These parameters are deprecated, use the corresponding parameters
 of @code{-device} instead.
 @item snapshot=@var{snapshot}
 @var{snapshot} is "on" or "off" and controls snapshot mode for the given drive
@@ -XXX,XX +XXX,XX @@ the raw disk image you use is not written back. You can however force
 the write back by pressing @key{C-a s} (@pxref{disk_images}).
 ETEXI
 
-DEF("hdachs", HAS_ARG, QEMU_OPTION_hdachs, \
-    "-hdachs c,h,s[,t]\n" \
-    "                force hard disk 0 physical geometry and the optional BIOS\n" \
-    "                translation (t=none or lba) (usually QEMU can guess them)\n",
-    QEMU_ARCH_ALL)
-STEXI
-@item -hdachs @var{c},@var{h},@var{s},[,@var{t}]
-@findex -hdachs
-Force hard disk 0 physical geometry (1 <= @var{c} <= 16383, 1 <=
-@var{h} <= 16, 1 <= @var{s} <= 63) and optionally force the BIOS
-translation mode (@var{t}=none, lba or auto). Usually QEMU can guess
-all those parameters. This option is deprecated, please use
-@code{-device ide-hd,cyls=c,heads=h,secs=s,...} instead.
-ETEXI
-
 DEF("fsdev", HAS_ARG, QEMU_OPTION_fsdev,
     "-fsdev fsdriver,id=id[,path=path,][security_model={mapped-xattr|mapped-file|passthrough|none}]\n"
     " [,writeout=immediate][,readonly][,socket=socket|sock_fd=sock_fd][,fmode=fmode][,dmode=dmode]\n"
-- 
2.13.6

From: Thomas Huth <thuth@redhat.com>

Looks like we forgot to announce the deprecation of these options in
the corresponding chapter of the qemu-doc text, so let's do that now.

diff --git a/qemu-doc.texi b/qemu-doc.texi
index XXXXXXX..XXXXXXX 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -XXX,XX +XXX,XX @@ longer be directly supported in QEMU.
 The ``-drive if=scsi'' argument is replaced by the the
 ``-device BUS-TYPE'' argument combined with ``-drive if=none''.
 
+@subsection -drive cyls=...,heads=...,secs=...,trans=... (since 2.10.0)
+
+The drive geometry arguments are replaced by the geometry arguments
+that can be specified with the ``-device'' parameter.
+
+@subsection -drive serial=... (since 2.10.0)
+
+The drive serial argument is replaced by the serial argument
+that can be specified with the ``-device'' parameter.
+
+@subsection -drive addr=... (since 2.10.0)
+
+The drive addr argument is replaced by the addr argument
+that can be specified with the ``-device'' parameter.
+
 @subsection -net dump (since 2.10.0)
 
 The ``--net dump'' argument is now replaced with the
-- 
2.13.6

From: Fam Zheng <famz@redhat.com>

Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block_int.h |  1 -
 block/io.c                | 18 ------------------
 2 files changed, 19 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ bool blk_dev_is_tray_open(BlockBackend *blk);
 bool blk_dev_is_medium_locked(BlockBackend *blk);
 
 void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes);
-bool bdrv_requests_pending(BlockDriverState *bs);
 
 void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
 void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in);
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_disable_copy_on_read(BlockDriverState *bs)
     assert(old >= 1);
 }
 
-/* Check if any requests are in-flight (including throttled requests) */
-bool bdrv_requests_pending(BlockDriverState *bs)
-{
-    BdrvChild *child;
-
-    if (atomic_read(&bs->in_flight)) {
-        return true;
-    }
-
-    QLIST_FOREACH(child, &bs->children, next) {
-        if (bdrv_requests_pending(child->bs)) {
-            return true;
-        }
-    }
-
-    return false;
-}
-
 typedef struct {
     Coroutine *co;
     BlockDriverState *bs;
-- 
2.13.6

bdrv_drained_begin() doesn't increase bs->quiesce_counter recursively
and also doesn't notify other parent nodes of children. Both of these
mean that the child nodes are not actually drained, and
bdrv_drained_begin() is providing useful functionality only on a single
node.

To keep things consistent, we also shouldn't call the block driver
callbacks recursively.

A proper recursive drain version that provides an actually working
drained section for child nodes will be introduced later.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
---
 block/io.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
 }
 
 /* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
-static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
+static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, bool recursive)
 {
     BdrvChild *child, *tmp;
     BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
     bdrv_coroutine_enter(bs, data.co);
     BDRV_POLL_WHILE(bs, !data.done);
 
-    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
-        bdrv_drain_invoke(child->bs, begin);
+    if (recursive) {
+        QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
+            bdrv_drain_invoke(child->bs, begin, true);
+        }
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
         bdrv_parent_drained_begin(bs);
     }
 
-    bdrv_drain_invoke(bs, true);
+    bdrv_drain_invoke(bs, true, false);
     bdrv_drain_recurse(bs);
 }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
     }
 
     /* Re-enable things in child-to-parent order */
-    bdrv_drain_invoke(bs, false);
+    bdrv_drain_invoke(bs, false, false);
     bdrv_parent_drained_end(bs);
     aio_enable_external(bdrv_get_aio_context(bs));
 }
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         aio_context_acquire(aio_context);
         aio_disable_external(aio_context);
         bdrv_parent_drained_begin(bs);
-        bdrv_drain_invoke(bs, true);
+        bdrv_drain_invoke(bs, true, true);
         aio_context_release(aio_context);
 
         if (!g_slist_find(aio_ctxs, aio_context)) {
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
 
         /* Re-enable things in child-to-parent order */
         aio_context_acquire(aio_context);
-        bdrv_drain_invoke(bs, false);
+        bdrv_drain_invoke(bs, false, true);
         bdrv_parent_drained_end(bs);
         aio_enable_external(aio_context);
         aio_context_release(aio_context);
-- 
2.13.6

The existing test is for bdrv_drain_all_begin/end() only. Generalise the
test case so that it can be run for the other variants as well. At the
moment this is only bdrv_drained_begin/end(), but in a while, we'll add
another one.

Also, add a backing file to the test node to test whether the operations
work recursively.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 69 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 62 insertions(+), 7 deletions(-)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_test = {
 
     .bdrv_co_drain_begin    = bdrv_test_co_drain_begin,
     .bdrv_co_drain_end      = bdrv_test_co_drain_end,
+
+    .bdrv_child_perm        = bdrv_format_default_perms,
 };
 
 static void aio_ret_cb(void *opaque, int ret)
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
     *aio_ret = ret;
 }
 
-static void test_drv_cb_drain_all(void)
+enum drain_type {
+    BDRV_DRAIN_ALL,
+    BDRV_DRAIN,
+};
+
+static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
+{
+    switch (drain_type) {
+    case BDRV_DRAIN_ALL:        bdrv_drain_all_begin(); break;
+    case BDRV_DRAIN:            bdrv_drained_begin(bs); break;
+    default:                    g_assert_not_reached();
+    }
+}
+
+static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs)
+{
+    switch (drain_type) {
+    case BDRV_DRAIN_ALL:        bdrv_drain_all_end(); break;
+    case BDRV_DRAIN:            bdrv_drained_end(bs); break;
+    default:                    g_assert_not_reached();
+    }
+}
+
+static void test_drv_cb_common(enum drain_type drain_type, bool recursive)
 {
     BlockBackend *blk;
-    BlockDriverState *bs;
-    BDRVTestState *s;
+    BlockDriverState *bs, *backing;
+    BDRVTestState *s, *backing_s;
     BlockAIOCB *acb;
     int aio_ret;
 
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_all(void)
     s = bs->opaque;
     blk_insert_bs(blk, bs, &error_abort);
 
+    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
+    backing_s = backing->opaque;
+    bdrv_set_backing_hd(bs, backing, &error_abort);
+
     /* Simple bdrv_drain_all_begin/end pair, check that CBs are called */
     g_assert_cmpint(s->drain_count, ==, 0);
-    bdrv_drain_all_begin();
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+    do_drain_begin(drain_type, bs);
+
     g_assert_cmpint(s->drain_count, ==, 1);
-    bdrv_drain_all_end();
+    g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
+
+    do_drain_end(drain_type, bs);
+
     g_assert_cmpint(s->drain_count, ==, 0);
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
 
     /* Now do the same while a request is pending */
     aio_ret = -EINPROGRESS;
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_all(void)
     g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
 
     g_assert_cmpint(s->drain_count, ==, 0);
-    bdrv_drain_all_begin();
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+    do_drain_begin(drain_type, bs);
+
     g_assert_cmpint(aio_ret, ==, 0);
     g_assert_cmpint(s->drain_count, ==, 1);
-    bdrv_drain_all_end();
+    g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
+
+    do_drain_end(drain_type, bs);
+
     g_assert_cmpint(s->drain_count, ==, 0);
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
 
+    bdrv_unref(backing);
     bdrv_unref(bs);
     blk_unref(blk);
 }
 
+static void test_drv_cb_drain_all(void)
+{
+    test_drv_cb_common(BDRV_DRAIN_ALL, true);
+}
+
+static void test_drv_cb_drain(void)
+{
+    test_drv_cb_common(BDRV_DRAIN, false);
+}
+
 int main(int argc, char **argv)
 {
     bdrv_init();
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_init(&argc, &argv, NULL);
 
     g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
+    g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
 
     return g_test_run();
 }
-- 
2.13.6

This is currently only working correctly for bdrv_drain(), not for
bdrv_drain_all(). Leave a comment for the drain_all case, we'll address
it later.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain(void)
     test_drv_cb_common(BDRV_DRAIN, false);
 }
 
+static void test_quiesce_common(enum drain_type drain_type, bool recursive)
+{
+    BlockBackend *blk;
+    BlockDriverState *bs, *backing;
+
+    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
+                              &error_abort);
+    blk_insert_bs(blk, bs, &error_abort);
+
+    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
+    bdrv_set_backing_hd(bs, backing, &error_abort);
+
+    g_assert_cmpint(bs->quiesce_counter, ==, 0);
+    g_assert_cmpint(backing->quiesce_counter, ==, 0);
+
+    do_drain_begin(drain_type, bs);
+
+    g_assert_cmpint(bs->quiesce_counter, ==, 1);
+    g_assert_cmpint(backing->quiesce_counter, ==, !!recursive);
+
+    do_drain_end(drain_type, bs);
+
+    g_assert_cmpint(bs->quiesce_counter, ==, 0);
+    g_assert_cmpint(backing->quiesce_counter, ==, 0);
+
+    bdrv_unref(backing);
+    bdrv_unref(bs);
+    blk_unref(blk);
+}
+
+static void test_quiesce_drain_all(void)
+{
+    // XXX drain_all doesn't quiesce
+    //test_quiesce_common(BDRV_DRAIN_ALL, true);
+}
+
+static void test_quiesce_drain(void)
+{
+    test_quiesce_common(BDRV_DRAIN, false);
+}
+
 int main(int argc, char **argv)
 {
     bdrv_init();
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
     g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
 
+    g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
+    g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
+
     return g_test_run();
 }
-- 
2.13.6

Block jobs already paused themselves when their main BlockBackend
entered a drained section. This is not good enough: We also want to
pause a block job, and it must not submit new requests, if, for
example, the mirror target node should be drained.

This implements .drained_begin/end callbacks in child_job in order to
consider all block nodes related to the job, and removes the
BlockBackend callbacks which are unnecessary now because the root of the
job main BlockBackend is always referenced with a child_job, too.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockjob.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/blockjob.c b/blockjob.c
index XXXXXXX..XXXXXXX 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -XXX,XX +XXX,XX @@ static char *child_job_get_parent_desc(BdrvChild *c)
                            job->id);
 }
 
-static const BdrvChildRole child_job = {
-    .get_parent_desc    = child_job_get_parent_desc,
-    .stay_at_node       = true,
-};
-
-static void block_job_drained_begin(void *opaque)
+static void child_job_drained_begin(BdrvChild *c)
 {
-    BlockJob *job = opaque;
+    BlockJob *job = c->opaque;
     block_job_pause(job);
 }
 
-static void block_job_drained_end(void *opaque)
+static void child_job_drained_end(BdrvChild *c)
 {
-    BlockJob *job = opaque;
+    BlockJob *job = c->opaque;
     block_job_resume(job);
 }
 
-static const BlockDevOps block_job_dev_ops = {
-    .drained_begin = block_job_drained_begin,
-    .drained_end = block_job_drained_end,
+static const BdrvChildRole child_job = {
+    .get_parent_desc    = child_job_get_parent_desc,
+    .drained_begin      = child_job_drained_begin,
+    .drained_end        = child_job_drained_end,
+    .stay_at_node       = true,
 };
 
 void block_job_remove_all_bdrv(BlockJob *job)
@@ -XXX,XX +XXX,XX @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
     block_job_add_bdrv(job, "main node", bs, 0, BLK_PERM_ALL, &error_abort);
     bs->job = job;
 
-    blk_set_dev_ops(blk, &block_job_dev_ops, job);
     bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);
 
     QLIST_INSERT_HEAD(&block_jobs, job, job_list);
-- 
2.13.6

Block jobs must be paused if any of the involved nodes are drained.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu/osdep.h"
 #include "block/block.h"
+#include "block/blockjob_int.h"
 #include "sysemu/block-backend.h"
 #include "qapi/error.h"
 
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
     test_quiesce_common(BDRV_DRAIN, false);
 }
 
+
+typedef struct TestBlockJob {
+    BlockJob common;
+    bool should_complete;
+} TestBlockJob;
+
+static void test_job_completed(BlockJob *job, void *opaque)
+{
+    block_job_completed(job, 0);
+}
+
+static void coroutine_fn test_job_start(void *opaque)
+{
+    TestBlockJob *s = opaque;
+
+    while (!s->should_complete) {
+        block_job_sleep_ns(&s->common, 100000);
+    }
+
+    block_job_defer_to_main_loop(&s->common, test_job_completed, NULL);
+}
+
+static void test_job_complete(BlockJob *job, Error **errp)
+{
+    TestBlockJob *s = container_of(job, TestBlockJob, common);
+    s->should_complete = true;
+}
+
+BlockJobDriver test_job_driver = {
+    .instance_size  = sizeof(TestBlockJob),
+    .start          = test_job_start,
+    .complete       = test_job_complete,
+};
+
+static void test_blockjob_common(enum drain_type drain_type)
+{
+    BlockBackend *blk_src, *blk_target;
+    BlockDriverState *src, *target;
+    BlockJob *job;
+    int ret;
+
+    src = bdrv_new_open_driver(&bdrv_test, "source", BDRV_O_RDWR,
+                               &error_abort);
+    blk_src = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_insert_bs(blk_src, src, &error_abort);
+
+    target = bdrv_new_open_driver(&bdrv_test, "target", BDRV_O_RDWR,
+                                  &error_abort);
+    blk_target = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_insert_bs(blk_target, target, &error_abort);
+
+    job = block_job_create("job0", &test_job_driver, src, 0, BLK_PERM_ALL, 0,
+                           0, NULL, NULL, &error_abort);
+    block_job_add_bdrv(job, "target", target, 0, BLK_PERM_ALL, &error_abort);
+    block_job_start(job);
+
+    g_assert_cmpint(job->pause_count, ==, 0);
+    g_assert_false(job->paused);
+    g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
+
+    do_drain_begin(drain_type, src);
+
+    if (drain_type == BDRV_DRAIN_ALL) {
+        /* bdrv_drain_all() drains both src and target, and involves an
+         * additional block_job_pause_all() */
+        g_assert_cmpint(job->pause_count, ==, 3);
+    } else {
+        g_assert_cmpint(job->pause_count, ==, 1);
+    }
+    /* XXX We don't wait until the job is actually paused. Is this okay? */
+    /* g_assert_true(job->paused); */
+    g_assert_false(job->busy); /* The job is paused */
+
+    do_drain_end(drain_type, src);
+
+    g_assert_cmpint(job->pause_count, ==, 0);
+    g_assert_false(job->paused);
+    g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
+
+    do_drain_begin(drain_type, target);
+
+    if (drain_type == BDRV_DRAIN_ALL) {
+        /* bdrv_drain_all() drains both src and target, and involves an
+         * additional block_job_pause_all() */
+        g_assert_cmpint(job->pause_count, ==, 3);
+    } else {
+        g_assert_cmpint(job->pause_count, ==, 1);
+    }
+    /* XXX We don't wait until the job is actually paused. Is this okay? */
+    /* g_assert_true(job->paused); */
+    g_assert_false(job->busy); /* The job is paused */
+
+    do_drain_end(drain_type, target);
+
+    g_assert_cmpint(job->pause_count, ==, 0);
+    g_assert_false(job->paused);
+    g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
+
+    ret = block_job_complete_sync(job, &error_abort);
+    g_assert_cmpint(ret, ==, 0);
+
+    blk_unref(blk_src);
+    blk_unref(blk_target);
+    bdrv_unref(src);
+    bdrv_unref(target);
+}
+
+static void test_blockjob_drain_all(void)
+{
+    test_blockjob_common(BDRV_DRAIN_ALL);
+}
+
+static void test_blockjob_drain(void)
+{
+    test_blockjob_common(BDRV_DRAIN);
+}
+
 int main(int argc, char **argv)
 {
     bdrv_init();
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
     g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
 
+    g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
+    g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
+
     return g_test_run();
 }
-- 
2.13.6

Block jobs are already paused using the BdrvChildRole drain callbacks,
so we don't need an additional block_job_pause_all() call.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/io.c              |  4 ----
 tests/test-bdrv-drain.c | 10 ++++------
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
      * context. */
     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 
-    block_job_pause_all();
-
     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         aio_enable_external(aio_context);
         aio_context_release(aio_context);
     }
-
-    block_job_resume_all();
 }
 
 void bdrv_drain_all(void)
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_common(enum drain_type drain_type)
     do_drain_begin(drain_type, src);
 
     if (drain_type == BDRV_DRAIN_ALL) {
-        /* bdrv_drain_all() drains both src and target, and involves an
-         * additional block_job_pause_all() */
-        g_assert_cmpint(job->pause_count, ==, 3);
+        /* bdrv_drain_all() drains both src and target */
+        g_assert_cmpint(job->pause_count, ==, 2);
     } else {
         g_assert_cmpint(job->pause_count, ==, 1);
     }
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_common(enum drain_type drain_type)
     do_drain_begin(drain_type, target);
 
     if (drain_type == BDRV_DRAIN_ALL) {
-        /* bdrv_drain_all() drains both src and target, and involves an
-         * additional block_job_pause_all() */
-        g_assert_cmpint(job->pause_count, ==, 3);
+        /* bdrv_drain_all() drains both src and target */
+        g_assert_cmpint(job->pause_count, ==, 2);
     } else {
         g_assert_cmpint(job->pause_count, ==, 1);
     }
-- 
2.13.6

bdrv_do_drained_begin() restricts the call of parent callbacks and
aio_disable_external() to the outermost drain section, but the block
driver callbacks are always called. bdrv_do_drained_end() must match
this behaviour, otherwise nodes stay drained even if begin/end calls
were balanced.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/io.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
 
 void bdrv_drained_end(BlockDriverState *bs)
 {
+    int old_quiesce_counter;
+
     if (qemu_in_coroutine()) {
         bdrv_co_yield_to_drain(bs, false);
         return;
     }
     assert(bs->quiesce_counter > 0);
-    if (atomic_fetch_dec(&bs->quiesce_counter) > 1) {
-        return;
-    }
+    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
 
     /* Re-enable things in child-to-parent order */
     bdrv_drain_invoke(bs, false, false);
-    bdrv_parent_drained_end(bs);
-    aio_enable_external(bdrv_get_aio_context(bs));
+    if (old_quiesce_counter == 1) {
+        bdrv_parent_drained_end(bs);
+        aio_enable_external(bdrv_get_aio_context(bs));
+    }
 }
 
 /*
-- 
2.13.6

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
 enum drain_type {
     BDRV_DRAIN_ALL,
     BDRV_DRAIN,
+    DRAIN_TYPE_MAX,
 };
 
 static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
     test_quiesce_common(BDRV_DRAIN, false);
 }
 
+static void test_nested(void)
+{
+    BlockBackend *blk;
+    BlockDriverState *bs, *backing;
+    BDRVTestState *s, *backing_s;
+    enum drain_type outer, inner;
+
+    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
+                              &error_abort);
+    s = bs->opaque;
+    blk_insert_bs(blk, bs, &error_abort);
+
+    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
+    backing_s = backing->opaque;
+    bdrv_set_backing_hd(bs, backing, &error_abort);
+
+    for (outer = 0; outer < DRAIN_TYPE_MAX; outer++) {
+        for (inner = 0; inner < DRAIN_TYPE_MAX; inner++) {
+            /* XXX bdrv_drain_all() doesn't increase the quiesce_counter */
+            int bs_quiesce      = (outer != BDRV_DRAIN_ALL) +
+                                  (inner != BDRV_DRAIN_ALL);
+            int backing_quiesce = 0;
+            int backing_cb_cnt  = (outer != BDRV_DRAIN) +
+                                  (inner != BDRV_DRAIN);
+
+            g_assert_cmpint(bs->quiesce_counter, ==, 0);
+            g_assert_cmpint(backing->quiesce_counter, ==, 0);
+            g_assert_cmpint(s->drain_count, ==, 0);
+            g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+            do_drain_begin(outer, bs);
+            do_drain_begin(inner, bs);
+
+            g_assert_cmpint(bs->quiesce_counter, ==, bs_quiesce);
+            g_assert_cmpint(backing->quiesce_counter, ==, backing_quiesce);
+            g_assert_cmpint(s->drain_count, ==, 2);
+            g_assert_cmpint(backing_s->drain_count, ==, backing_cb_cnt);
+
+            do_drain_end(inner, bs);
+            do_drain_end(outer, bs);
+
+            g_assert_cmpint(bs->quiesce_counter, ==, 0);
+            g_assert_cmpint(backing->quiesce_counter, ==, 0);
+            g_assert_cmpint(s->drain_count, ==, 0);
+            g_assert_cmpint(backing_s->drain_count, ==, 0);
+        }
+    }
+
+    bdrv_unref(backing);
+    bdrv_unref(bs);
+    blk_unref(blk);
+}
+
 
 typedef struct TestBlockJob {
     BlockJob common;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
     g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
 
+    g_test_add_func("/bdrv-drain/nested", test_nested);
+
     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
     g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
 
-- 
2.13.6

This is in preparation for subtree drains, i.e. drained sections that
affect not only a single node, but recursively all child nodes, too.

Calling the parent callbacks for drain is pointless when we just came
from that parent node recursively and leads to multiple increases of
bs->quiesce_counter in a single drain call. Don't do it.

In order for this to work correctly, the parent callback must be called
for every bdrv_drained_begin/end() call, not only for the outermost one:

If we have a node N with two parents A and B, recursive draining of A
should cause the quiesce_counter of B to increase because its child N is
drained independently of B. If B is now recursively drained, too, A must
increase its quiesce_counter because only now is N drained independently
of A, even if N is going from quiesce_counter 1 to 2.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block.h |  4 ++--
 block.c               | 13 +++++++++----
 block/io.c            | 47 ++++++++++++++++++++++++++++++++++-------------
 3 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ void bdrv_io_unplug(BlockDriverState *bs);
  * Begin a quiesced section of all users of @bs. This is part of
  * bdrv_drained_begin.
  */
-void bdrv_parent_drained_begin(BlockDriverState *bs);
+void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore);
 
 /**
  * bdrv_parent_drained_end:
@@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_begin(BlockDriverState *bs);
  * End a quiesced section of all users of @bs. This is part of
  * bdrv_drained_end.
  */
-void bdrv_parent_drained_end(BlockDriverState *bs);
+void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);
 
 /**
  * bdrv_drained_begin:
diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
                                       BlockDriverState *new_bs)
 {
     BlockDriverState *old_bs = child->bs;
+    int i;
 
     if (old_bs && new_bs) {
         assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
     }
     if (old_bs) {
         if (old_bs->quiesce_counter && child->role->drained_end) {
-            child->role->drained_end(child);
+            for (i = 0; i < old_bs->quiesce_counter; i++) {
+                child->role->drained_end(child);
+            }
         }
         if (child->role->detach) {
             child->role->detach(child);
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
     if (new_bs) {
         QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
         if (new_bs->quiesce_counter && child->role->drained_begin) {
-            child->role->drained_begin(child);
+            for (i = 0; i < new_bs->quiesce_counter; i++) {
+                child->role->drained_begin(child);
+            }
         }
 
         if (child->role->attach) {
@@ -XXX,XX +XXX,XX @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
     AioContext *ctx = bdrv_get_aio_context(bs);
 
     aio_disable_external(ctx);
-    bdrv_parent_drained_begin(bs);
+    bdrv_parent_drained_begin(bs, NULL);
     bdrv_drain(bs); /* ensure there are no in-flight requests */
 
     while (aio_poll(ctx, false)) {
@@ -XXX,XX +XXX,XX @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
      */
     aio_context_acquire(new_context);
     bdrv_attach_aio_context(bs, new_context);
-    bdrv_parent_drained_end(bs);
+    bdrv_parent_drained_end(bs, NULL);
     aio_enable_external(ctx);
     aio_context_release(new_context);
 }
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@
 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
     int64_t offset, int bytes, BdrvRequestFlags flags);
 
-void bdrv_parent_drained_begin(BlockDriverState *bs)
+void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
 {
     BdrvChild *c, *next;
 
     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
+        if (c == ignore) {
+            continue;
+        }
         if (c->role->drained_begin) {
             c->role->drained_begin(c);
         }
     }
 }
 
-void bdrv_parent_drained_end(BlockDriverState *bs)
+void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
 {
     BdrvChild *c, *next;
 
     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
+        if (c == ignore) {
+            continue;
+        }
         if (c->role->drained_end) {
             c->role->drained_end(c);
         }
@@ -XXX,XX +XXX,XX @@ typedef struct {
     BlockDriverState *bs;
     bool done;
     bool begin;
+    BdrvChild *parent;
 } BdrvCoDrainData;
 
 static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs)
     return waited;
 }
 
+static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent);
+static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent);
+
 static void bdrv_co_drain_bh_cb(void *opaque)
 {
     BdrvCoDrainData *data = opaque;
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
 
     bdrv_dec_in_flight(bs);
     if (data->begin) {
-        bdrv_drained_begin(bs);
+        bdrv_do_drained_begin(bs, data->parent);
     } else {
-        bdrv_drained_end(bs);
+        bdrv_do_drained_end(bs, data->parent);
     }
 
     data->done = true;
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
 }
 
 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
-                                                bool begin)
+                                                bool begin, BdrvChild *parent)
 {
     BdrvCoDrainData data;
 
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
         .bs = bs,
         .done = false,
         .begin = begin,
+        .parent = parent,
     };
     bdrv_inc_in_flight(bs);
     aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
     assert(data.done);
 }
 
-void bdrv_drained_begin(BlockDriverState *bs)
+static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent)
 {
     if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, true);
+        bdrv_co_yield_to_drain(bs, true, parent);
         return;
     }
 
     /* Stop things in parent-to-child order */
     if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
         aio_disable_external(bdrv_get_aio_context(bs));
-        bdrv_parent_drained_begin(bs);
     }
 
+    bdrv_parent_drained_begin(bs, parent);
     bdrv_drain_invoke(bs, true, false);
     bdrv_drain_recurse(bs);
 }
 
-void bdrv_drained_end(BlockDriverState *bs)
+void bdrv_drained_begin(BlockDriverState *bs)
+{
+    bdrv_do_drained_begin(bs, NULL);
+}
+
+static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
 {
     int old_quiesce_counter;
 
     if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, false);
+        bdrv_co_yield_to_drain(bs, false, parent);
         return;
     }
     assert(bs->quiesce_counter > 0);
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
 
     /* Re-enable things in child-to-parent order */
     bdrv_drain_invoke(bs, false, false);
+    bdrv_parent_drained_end(bs, parent);
     if (old_quiesce_counter == 1) {
-        bdrv_parent_drained_end(bs);
         aio_enable_external(bdrv_get_aio_context(bs));
     }
 }
 
+void bdrv_drained_end(BlockDriverState *bs)
+{
+    bdrv_do_drained_end(bs, NULL);
+}
+
 /*
  * Wait for pending requests to complete on a single BlockDriverState subtree,
  * and suspend block driver's internal I/O until next request arrives.
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         /* Stop things in parent-to-child order */
         aio_context_acquire(aio_context);
         aio_disable_external(aio_context);
-        bdrv_parent_drained_begin(bs);
+        bdrv_parent_drained_begin(bs, NULL);
         bdrv_drain_invoke(bs, true, true);
         aio_context_release(aio_context);
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         /* Re-enable things in child-to-parent order */
         aio_context_acquire(aio_context);
         bdrv_drain_invoke(bs, false, true);
-        bdrv_parent_drained_end(bs);
+        bdrv_parent_drained_end(bs, NULL);
         aio_enable_external(aio_context);
         aio_context_release(aio_context);
     }
-- 
2.13.6

bdrv_drained_begin() waits for the completion of requests in the whole
subtree, but it only actually keeps its immediate bs parameter quiesced
until bdrv_drained_end().

Add a version that keeps the whole subtree drained. As of this commit,
graph changes cannot be allowed during a subtree drained section, but
this will be fixed soon.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block.h | 13 +++++++++++++
 block/io.c            | 54 ++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);
 void bdrv_drained_begin(BlockDriverState *bs);
 
 /**
+ * Like bdrv_drained_begin, but recursively begins a quiesced section for
+ * exclusive access to all child nodes as well.
+ *
+ * Graph changes are not allowed during a subtree drain section.
+ */
+void bdrv_subtree_drained_begin(BlockDriverState *bs);
+
+/**
  * bdrv_drained_end:
  *
  * End a quiescent section started by bdrv_drained_begin().
  */
 void bdrv_drained_end(BlockDriverState *bs);
 
+/**
+ * End a quiescent section started by bdrv_subtree_drained_begin().
+ */
+void bdrv_subtree_drained_end(BlockDriverState *bs);
+
 void bdrv_add_child(BlockDriverState *parent, BlockDriverState *child,
                     Error **errp);
 void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp);
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ typedef struct {
     BlockDriverState *bs;
     bool done;
     bool begin;
+    bool recursive;
     BdrvChild *parent;
 } BdrvCoDrainData;
 
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs)
     return waited;
 }
 
-static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent);
-static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent);
+static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
+                                  BdrvChild *parent);
+static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
+                                BdrvChild *parent);
 
 static void bdrv_co_drain_bh_cb(void *opaque)
 {
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
 
     bdrv_dec_in_flight(bs);
     if (data->begin) {
-        bdrv_do_drained_begin(bs, data->parent);
+        bdrv_do_drained_begin(bs, data->recursive, data->parent);
     } else {
-        bdrv_do_drained_end(bs, data->parent);
+        bdrv_do_drained_end(bs, data->recursive, data->parent);
     }
 
     data->done = true;
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
 }
 
 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
-                                                bool begin, BdrvChild *parent)
+                                                bool begin, bool recursive,
+                                                BdrvChild *parent)
 {
     BdrvCoDrainData data;
 
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
         .bs = bs,
         .done = false,
         .begin = begin,
+        .recursive = recursive,
         .parent = parent,
     };
     bdrv_inc_in_flight(bs);
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
     assert(data.done);
 }
 
-static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent)
+static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
+                                  BdrvChild *parent)
 {
+    BdrvChild *child, *next;
+
     if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, true, parent);
+        bdrv_co_yield_to_drain(bs, true, recursive, parent);
         return;
     }
 
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent)
     bdrv_parent_drained_begin(bs, parent);
     bdrv_drain_invoke(bs, true, false);
     bdrv_drain_recurse(bs);
+
+    if (recursive) {
+        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
+            bdrv_do_drained_begin(child->bs, true, child);
+        }
+    }
 }
 
 void bdrv_drained_begin(BlockDriverState *bs)
 {
-    bdrv_do_drained_begin(bs, NULL);
+    bdrv_do_drained_begin(bs, false, NULL);
+}
+
+void bdrv_subtree_drained_begin(BlockDriverState *bs)
+{
+    bdrv_do_drained_begin(bs, true, NULL);
 }
 
-static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
+static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
+                                BdrvChild *parent)
 {
+    BdrvChild *child, *next;
     int old_quiesce_counter;
 
     if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, false, parent);
+        bdrv_co_yield_to_drain(bs, false, recursive, parent);
         return;
     }
     assert(bs->quiesce_counter > 0);
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
     if (old_quiesce_counter == 1) {
         aio_enable_external(bdrv_get_aio_context(bs));
     }
+
+    if (recursive) {
+        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
+            bdrv_do_drained_end(child->bs, true, child);
+        }
+    }
 }
 
 void bdrv_drained_end(BlockDriverState *bs)
 {
-    bdrv_do_drained_end(bs, NULL);
+    bdrv_do_drained_end(bs, false, NULL);
+}
+
+void bdrv_subtree_drained_end(BlockDriverState *bs)
+{
+    bdrv_do_drained_end(bs, true, NULL);
 }
 
 /*
-- 
2.13.6

Add a subtree drain version to the existing test cases.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
 enum drain_type {
     BDRV_DRAIN_ALL,
     BDRV_DRAIN,
+    BDRV_SUBTREE_DRAIN,
     DRAIN_TYPE_MAX,
 };
 
@@ -XXX,XX +XXX,XX @@ static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
     switch (drain_type) {
     case BDRV_DRAIN_ALL:        bdrv_drain_all_begin(); break;
     case BDRV_DRAIN:            bdrv_drained_begin(bs); break;
+    case BDRV_SUBTREE_DRAIN:    bdrv_subtree_drained_begin(bs); break;
     default:                    g_assert_not_reached();
     }
 }
@@ -XXX,XX +XXX,XX @@ static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs)
     switch (drain_type) {
     case BDRV_DRAIN_ALL:        bdrv_drain_all_end(); break;
     case BDRV_DRAIN:            bdrv_drained_end(bs); break;
+    case BDRV_SUBTREE_DRAIN:    bdrv_subtree_drained_end(bs); break;
     default:                    g_assert_not_reached();
     }
 }
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain(void)
     test_drv_cb_common(BDRV_DRAIN, false);
 }
 
+static void test_drv_cb_drain_subtree(void)
+{
+    test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
+}
+
 static void test_quiesce_common(enum drain_type drain_type, bool recursive)
 {
     BlockBackend *blk;
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
     test_quiesce_common(BDRV_DRAIN, false);
 }
 
+static void test_quiesce_drain_subtree(void)
+{
+    test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
+}
+
 static void test_nested(void)
 {
     BlockBackend *blk;
@@ -XXX,XX +XXX,XX @@ static void test_nested(void)
             /* XXX bdrv_drain_all() doesn't increase the quiesce_counter */
             int bs_quiesce      = (outer != BDRV_DRAIN_ALL) +
                                   (inner != BDRV_DRAIN_ALL);
-            int backing_quiesce = 0;
+            int backing_quiesce = (outer == BDRV_SUBTREE_DRAIN) +
+                                  (inner == BDRV_SUBTREE_DRAIN);
             int backing_cb_cnt  = (outer != BDRV_DRAIN) +
                                   (inner != BDRV_DRAIN);
 
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_drain(void)
     test_blockjob_common(BDRV_DRAIN);
 }
 
+static void test_blockjob_drain_subtree(void)
+{
+    test_blockjob_common(BDRV_SUBTREE_DRAIN);
+}
+
 int main(int argc, char **argv)
 {
     bdrv_init();
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
 
     g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
     g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
+    g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
+                    test_drv_cb_drain_subtree);
 
     g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
     g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
+    g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
+                    test_quiesce_drain_subtree);
 
     g_test_add_func("/bdrv-drain/nested", test_nested);
 
     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
     g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
+    g_test_add_func("/bdrv-drain/blockjob/drain_subtree",
+                    test_blockjob_drain_subtree);
 
     return g_test_run();
 }
-- 
2.13.6

If bdrv_do_drained_begin/end() are called in coroutine context, they
first use a BH to get out of the coroutine context. Call some existing
tests again from a coroutine to cover this code path.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
     *aio_ret = ret;
 }
 
+typedef struct CallInCoroutineData {
+    void (*entry)(void);
+    bool done;
+} CallInCoroutineData;
+
+static coroutine_fn void call_in_coroutine_entry(void *opaque)
+{
+    CallInCoroutineData *data = opaque;
+
+    data->entry();
+    data->done = true;
+}
+
+static void call_in_coroutine(void (*entry)(void))
+{
+    Coroutine *co;
+    CallInCoroutineData data = {
+        .entry  = entry,
+        .done   = false,
+    };
+
+    co = qemu_coroutine_create(call_in_coroutine_entry, &data);
+    qemu_coroutine_enter(co);
+    while (!data.done) {
+        aio_poll(qemu_get_aio_context(), true);
+    }
+}
+
 enum drain_type {
     BDRV_DRAIN_ALL,
     BDRV_DRAIN,
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_subtree(void)
     test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
 }
 
+static void test_drv_cb_co_drain(void)
+{
+    call_in_coroutine(test_drv_cb_drain);
+}
+
+static void test_drv_cb_co_drain_subtree(void)
+{
+    call_in_coroutine(test_drv_cb_drain_subtree);
+}
+
 static void test_quiesce_common(enum drain_type drain_type, bool recursive)
 {
     BlockBackend *blk;
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain_subtree(void)
     test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
 }
 
+static void test_quiesce_co_drain(void)
+{
+    call_in_coroutine(test_quiesce_drain);
+}
+
+static void test_quiesce_co_drain_subtree(void)
+{
+    call_in_coroutine(test_quiesce_drain_subtree);
+}
+
 static void test_nested(void)
 {
     BlockBackend *blk;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
                     test_drv_cb_drain_subtree);
 
+    // XXX bdrv_drain_all() doesn't work in coroutine context
+    g_test_add_func("/bdrv-drain/driver-cb/co/drain", test_drv_cb_co_drain);
+    g_test_add_func("/bdrv-drain/driver-cb/co/drain_subtree",
+                    test_drv_cb_co_drain_subtree);
+
+
     g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
     g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
     g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
                     test_quiesce_drain_subtree);
 
+    // XXX bdrv_drain_all() doesn't work in coroutine context
+    g_test_add_func("/bdrv-drain/quiesce/co/drain", test_quiesce_co_drain);
+    g_test_add_func("/bdrv-drain/quiesce/co/drain_subtree",
+                    test_quiesce_co_drain_subtree);
+
     g_test_add_func("/bdrv-drain/nested", test_nested);
 
     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
-- 
2.13.6

Test that drain sections are correctly propagated through the graph.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_nested(void)
     blk_unref(blk);
 }
 
+static void test_multiparent(void)
+{
+    BlockBackend *blk_a, *blk_b;
+    BlockDriverState *bs_a, *bs_b, *backing;
+    BDRVTestState *a_s, *b_s, *backing_s;
+
+    blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
+                                &error_abort);
+    a_s = bs_a->opaque;
+    blk_insert_bs(blk_a, bs_a, &error_abort);
+
+    blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
+                                &error_abort);
+    b_s = bs_b->opaque;
+    blk_insert_bs(blk_b, bs_b, &error_abort);
+
+    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
+    backing_s = backing->opaque;
+    bdrv_set_backing_hd(bs_a, backing, &error_abort);
+    bdrv_set_backing_hd(bs_b, backing, &error_abort);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
+    g_assert_cmpint(backing->quiesce_counter, ==, 0);
+    g_assert_cmpint(a_s->drain_count, ==, 0);
+    g_assert_cmpint(b_s->drain_count, ==, 0);
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
+    g_assert_cmpint(backing->quiesce_counter, ==, 1);
+    g_assert_cmpint(a_s->drain_count, ==, 1);
+    g_assert_cmpint(b_s->drain_count, ==, 1);
+    g_assert_cmpint(backing_s->drain_count, ==, 1);
+
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 2);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
+    g_assert_cmpint(backing->quiesce_counter, ==, 2);
+    g_assert_cmpint(a_s->drain_count, ==, 2);
+    g_assert_cmpint(b_s->drain_count, ==, 2);
+    g_assert_cmpint(backing_s->drain_count, ==, 2);
+
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
+    g_assert_cmpint(backing->quiesce_counter, ==, 1);
+    g_assert_cmpint(a_s->drain_count, ==, 1);
+    g_assert_cmpint(b_s->drain_count, ==, 1);
+    g_assert_cmpint(backing_s->drain_count, ==, 1);
+
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
+    g_assert_cmpint(backing->quiesce_counter, ==, 0);
+    g_assert_cmpint(a_s->drain_count, ==, 0);
+    g_assert_cmpint(b_s->drain_count, ==, 0);
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+    bdrv_unref(backing);
+    bdrv_unref(bs_a);
+    bdrv_unref(bs_b);
+    blk_unref(blk_a);
+    blk_unref(blk_b);
+}
+
 
 typedef struct TestBlockJob {
     BlockJob common;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
                     test_quiesce_co_drain_subtree);
 
     g_test_add_func("/bdrv-drain/nested", test_nested);
+    g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
 
     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
     g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
-- 
2.13.6

We need to remember how many of the drain sections that a node is in
were recursive (i.e. subtree drain rather than node drain), so that they
can be correctly applied when children are added or removed during the
drained section.

With this change, it is safe to modify the graph even inside a
bdrv_subtree_drained_begin/end() section.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block.h     |  2 --
 include/block/block_int.h |  5 +++++
 block.c                   | 32 +++++++++++++++++++++++++++++---
 block/io.c                | 28 ++++++++++++++++++++++++----
 4 files changed, 58 insertions(+), 9 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs);
 /**
  * Like bdrv_drained_begin, but recursively begins a quiesced section for
  * exclusive access to all child nodes as well.
- *
- * Graph changes are not allowed during a subtree drain section.
  */
 void bdrv_subtree_drained_begin(BlockDriverState *bs);
 
diff --git a/include/block/block_int.h b/include/block/block_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
 
     /* Accessed with atomic ops.  */
     int quiesce_counter;
+    int recursive_quiesce_counter;
+
     unsigned int write_gen;               /* Current data generation */
 
     /* Protected by reqs_lock.  */
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
     BdrvRequestFlags flags);
 
+void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
+void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);
+
 int get_tmp_filename(char *filename, int size);
 BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
                             const char *filename);
diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_child_cb_drained_end(BdrvChild *child)
     bdrv_drained_end(bs);
 }
 
+static void bdrv_child_cb_attach(BdrvChild *child)
+{
+    BlockDriverState *bs = child->opaque;
+    bdrv_apply_subtree_drain(child, bs);
+}
+
+static void bdrv_child_cb_detach(BdrvChild *child)
+{
+    BlockDriverState *bs = child->opaque;
+    bdrv_unapply_subtree_drain(child, bs);
+}
+
 static int bdrv_child_cb_inactivate(BdrvChild *child)
 {
     BlockDriverState *bs = child->opaque;
@@ -XXX,XX +XXX,XX @@ const BdrvChildRole child_file = {
     .inherit_options = bdrv_inherited_options,
     .drained_begin   = bdrv_child_cb_drained_begin,
     .drained_end     = bdrv_child_cb_drained_end,
+    .attach          = bdrv_child_cb_attach,
+    .detach          = bdrv_child_cb_detach,
     .inactivate      = bdrv_child_cb_inactivate,
 };
 
@@ -XXX,XX +XXX,XX @@ const BdrvChildRole child_format = {
     .inherit_options = bdrv_inherited_fmt_options,
     .drained_begin   = bdrv_child_cb_drained_begin,
     .drained_end     = bdrv_child_cb_drained_end,
+    .attach          = bdrv_child_cb_attach,
+    .detach          = bdrv_child_cb_detach,
     .inactivate      = bdrv_child_cb_inactivate,
 };
 
@@ -XXX,XX +XXX,XX @@ static void bdrv_backing_attach(BdrvChild *c)
                     parent->backing_blocker);
     bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET,
                     parent->backing_blocker);
+
+    bdrv_child_cb_attach(c);
 }
 
 static void bdrv_backing_detach(BdrvChild *c)
@@ -XXX,XX +XXX,XX @@ static void bdrv_backing_detach(BdrvChild *c)
     bdrv_op_unblock_all(c->bs, parent->backing_blocker);
     error_free(parent->backing_blocker);
     parent->backing_blocker = NULL;
+
+    bdrv_child_cb_detach(c);
 }
 
 /*
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
         assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
     }
     if (old_bs) {
+        /* Detach first so that the recursive drain sections coming from @child
+         * are already gone and we only end the drain sections that came from
+         * elsewhere. */
+        if (child->role->detach) {
+            child->role->detach(child);
+        }
         if (old_bs->quiesce_counter && child->role->drained_end) {
             for (i = 0; i < old_bs->quiesce_counter; i++) {
                 child->role->drained_end(child);
             }
         }
-        if (child->role->detach) {
-            child->role->detach(child);
-        }
         QLIST_REMOVE(child, next_parent);
     }
 
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
             }
         }
 
+        /* Attach only after starting new drained sections, so that recursive
+         * drain sections coming from @child don't get an extra .drained_begin
+         * callback. */
         if (child->role->attach) {
             child->role->attach(child);
         }
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
     assert(data.done);
 }
 
-static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
-                                  BdrvChild *parent)
+void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
+                           BdrvChild *parent)
 {
     BdrvChild *child, *next;
 
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
     bdrv_drain_recurse(bs);
 
     if (recursive) {
+        bs->recursive_quiesce_counter++;
         QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
             bdrv_do_drained_begin(child->bs, true, child);
         }
@@ -XXX,XX +XXX,XX @@ void bdrv_subtree_drained_begin(BlockDriverState *bs)
     bdrv_do_drained_begin(bs, true, NULL);
 }
 
-static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
-                                BdrvChild *parent)
+void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
+                         BdrvChild *parent)
 {
     BdrvChild *child, *next;
     int old_quiesce_counter;
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
     }
 
     if (recursive) {
+        bs->recursive_quiesce_counter--;
         QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
             bdrv_do_drained_end(child->bs, true, child);
         }
@@ -XXX,XX +XXX,XX @@ void bdrv_subtree_drained_end(BlockDriverState *bs)
     bdrv_do_drained_end(bs, true, NULL);
 }
 
+void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
+{
+    int i;
+
+    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
+        bdrv_do_drained_begin(child->bs, true, child);
+    }
+}
+
+void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
+{
+    int i;
+
+    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
+        bdrv_do_drained_end(child->bs, true, child);
+    }
+}
+
 /*
  * Wait for pending requests to complete on a single BlockDriverState subtree,
  * and suspend block driver's internal I/O until next request arrives.
-- 
2.13.6

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_multiparent(void)
     blk_unref(blk_b);
 }
 
+static void test_graph_change(void)
+{
+    BlockBackend *blk_a, *blk_b;
+    BlockDriverState *bs_a, *bs_b, *backing;
+    BDRVTestState *a_s, *b_s, *backing_s;
+
+    blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
+                                &error_abort);
+    a_s = bs_a->opaque;
+    blk_insert_bs(blk_a, bs_a, &error_abort);
+
+    blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
+                                &error_abort);
+    b_s = bs_b->opaque;
+    blk_insert_bs(blk_b, bs_b, &error_abort);
+
+    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
+    backing_s = backing->opaque;
+    bdrv_set_backing_hd(bs_a, backing, &error_abort);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
+    g_assert_cmpint(backing->quiesce_counter, ==, 0);
+    g_assert_cmpint(a_s->drain_count, ==, 0);
+    g_assert_cmpint(b_s->drain_count, ==, 0);
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
+
+    bdrv_set_backing_hd(bs_b, backing, &error_abort);
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
+    g_assert_cmpint(backing->quiesce_counter, ==, 5);
+    g_assert_cmpint(a_s->drain_count, ==, 5);
+    g_assert_cmpint(b_s->drain_count, ==, 5);
+    g_assert_cmpint(backing_s->drain_count, ==, 5);
+
+    bdrv_set_backing_hd(bs_b, NULL, &error_abort);
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 3);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
+    g_assert_cmpint(backing->quiesce_counter, ==, 3);
+    g_assert_cmpint(a_s->drain_count, ==, 3);
+    g_assert_cmpint(b_s->drain_count, ==, 2);
+    g_assert_cmpint(backing_s->drain_count, ==, 3);
+
+    bdrv_set_backing_hd(bs_b, backing, &error_abort);
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
+    g_assert_cmpint(backing->quiesce_counter, ==, 5);
+    g_assert_cmpint(a_s->drain_count, ==, 5);
+    g_assert_cmpint(b_s->drain_count, ==, 5);
+    g_assert_cmpint(backing_s->drain_count, ==, 5);
+
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
+    g_assert_cmpint(backing->quiesce_counter, ==, 0);
+    g_assert_cmpint(a_s->drain_count, ==, 0);
+    g_assert_cmpint(b_s->drain_count, ==, 0);
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+    bdrv_unref(backing);
+    bdrv_unref(bs_a);
+    bdrv_unref(bs_b);
+    blk_unref(blk_a);
+    blk_unref(blk_b);
+}
+
 
 typedef struct TestBlockJob {
     BlockJob common;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
 
     g_test_add_func("/bdrv-drain/nested", test_nested);
     g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
+    g_test_add_func("/bdrv-drain/graph-change", test_graph_change);
 
     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
     g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
-- 
2.13.6

Since commit bde70715, base is the only node that is reopened in
commit_start(). This means that the code, which still involves an
explicit BlockReopenQueue, can now be simplified by using bdrv_reopen().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
---
 block/commit.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/block/commit.c b/block/commit.c
index XXXXXXX..XXXXXXX 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -XXX,XX +XXX,XX @@ void commit_start(const char *job_id, BlockDriverState *bs,
                   const char *filter_node_name, Error **errp)
 {
     CommitBlockJob *s;
-    BlockReopenQueue *reopen_queue = NULL;
     int orig_base_flags;
     BlockDriverState *iter;
     BlockDriverState *commit_top_bs = NULL;
@@ -XXX,XX +XXX,XX @@ void commit_start(const char *job_id, BlockDriverState *bs,
     /* convert base to r/w, if necessary */
     orig_base_flags = bdrv_get_flags(base);
     if (!(orig_base_flags & BDRV_O_RDWR)) {
-        reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
-                                         orig_base_flags | BDRV_O_RDWR);
-    }
-
-    if (reopen_queue) {
-        bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, &local_err);
+        bdrv_reopen(base, orig_base_flags | BDRV_O_RDWR, &local_err);
         if (local_err != NULL) {
             error_propagate(errp, local_err);
             goto fail;
-- 
2.13.6

The bdrv_reopen*() implementation doesn't like it if the graph is
changed between queuing nodes for reopen and actually reopening them
(one of the reasons is that queuing can be recursive).

So instead of draining the device only in bdrv_reopen_multiple(),
require that callers have already drained all affected nodes, and assert
this in both bdrv_reopen_queue() and bdrv_reopen_multiple().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
---
 block.c             | 23 ++++++++++++++++-------
 block/replication.c |  6 ++++++
 qemu-io-cmds.c      |  3 +++
 3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ BlockDriverState *bdrv_open(const char *filename, const char *reference,
  * returns a pointer to bs_queue, which is either the newly allocated
  * bs_queue, or the existing bs_queue being used.
  *
+ * bs must be drained between bdrv_reopen_queue() and bdrv_reopen_multiple().
  */
 static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
                                                  BlockDriverState *bs,
@@ -XXX,XX +XXX,XX @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
     BdrvChild *child;
     QDict *old_options, *explicit_options;
 
+    /* Make sure that the caller remembered to use a drained section. This is
+     * important to avoid graph changes between the recursive queuing here and
+     * bdrv_reopen_multiple(). */
+    assert(bs->quiesce_counter > 0);
+
     if (bs_queue == NULL) {
         bs_queue = g_new0(BlockReopenQueue, 1);
         QSIMPLEQ_INIT(bs_queue);
@@ -XXX,XX +XXX,XX @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
  * If all devices prepare successfully, then the changes are committed
  * to all devices.
  *
+ * All affected nodes must be drained between bdrv_reopen_queue() and
+ * bdrv_reopen_multiple().
  */
 int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **errp)
 {
@@ -XXX,XX +XXX,XX @@ int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **er
 
     assert(bs_queue != NULL);
 
-    aio_context_release(ctx);
-    bdrv_drain_all_begin();
-    aio_context_acquire(ctx);
-
     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
+        assert(bs_entry->state.bs->quiesce_counter > 0);
         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
             error_propagate(errp, local_err);
             goto cleanup;
@@ -XXX,XX +XXX,XX @@ cleanup:
     }
     g_free(bs_queue);
 
-    bdrv_drain_all_end();
-
     return ret;
 }
 
@@ -XXX,XX +XXX,XX @@ int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
 {
     int ret = -1;
     Error *local_err = NULL;
-    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);
+    BlockReopenQueue *queue;
 
+    bdrv_subtree_drained_begin(bs);
+
+    queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);
     ret = bdrv_reopen_multiple(bdrv_get_aio_context(bs), queue, &local_err);
     if (local_err != NULL) {
         error_propagate(errp, local_err);
     }
+
+    bdrv_subtree_drained_end(bs);
+
     return ret;
 }
 
diff --git a/block/replication.c b/block/replication.c
index XXXXXXX..XXXXXXX 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -XXX,XX +XXX,XX @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
         new_secondary_flags = s->orig_secondary_flags;
     }
 
+    bdrv_subtree_drained_begin(s->hidden_disk->bs);
+    bdrv_subtree_drained_begin(s->secondary_disk->bs);
+
     if (orig_hidden_flags != new_hidden_flags) {
         reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs, NULL,
                                          new_hidden_flags);
@@ -XXX,XX +XXX,XX @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
                              reopen_queue, &local_err);
         error_propagate(errp, local_err);
     }
+
+    bdrv_subtree_drained_end(s->hidden_disk->bs);
+    bdrv_subtree_drained_end(s->secondary_disk->bs);
 }
 
 static void backup_job_cleanup(BlockDriverState *bs)
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -XXX,XX +XXX,XX @@ static int reopen_f(BlockBackend *blk, int argc, char **argv)
     opts = qopts ? qemu_opts_to_qdict(qopts, NULL) : NULL;
     qemu_opts_reset(&reopen_opts);
 
+    bdrv_subtree_drained_begin(bs);
     brq = bdrv_reopen_queue(NULL, bs, opts, flags);
     bdrv_reopen_multiple(bdrv_get_aio_context(bs), brq, &local_err);
+    bdrv_subtree_drained_end(bs);
+
     if (local_err) {
         error_report_err(local_err);
     } else {
-- 
2.13.6