Series comparison

-[Qemu-devel] [PULL 00/37] Block layer patches
+[PULL 00/15] Block layer patches
-The following changes since commit ad1b4ec39caa5b3f17cbd8160283a03a3dcfe2ae:
+The following changes since commit 16aaacb307ed607b9780c12702c44f0fe52edc7e:
-  Merge remote-tracking branch 'remotes/kraxel/tags/input-20180515-pull-request' into staging (2018-05-15 12:50:06 +0100)
+  Merge remote-tracking branch 'remotes/cohuck/tags/s390x-20200430' into staging (2020-04-30 14:00:36 +0100)
-are available in the git repository at:
+are available in the Git repository at:
   git://repo.or.cz/qemu/kevin.git tags/for-upstream
-for you to fetch changes up to 1fce860ea5eba1ca00a67911fc0b8a5d80009514:
+for you to fetch changes up to eaae29ef89d498d0eac553c77b554f310a47f809:
-  Merge remote-tracking branch 'mreitz/tags/pull-block-2018-05-15' into queue-block (2018-05-15 16:19:53 +0200)
+  qemu-storage-daemon: Fix non-string --object properties (2020-04-30 17:51:07 +0200)
 ----------------------------------------------------------------
 Block layer patches:
-- Switch AIO/callback based block drivers to a byte-based interface
+- Fix resize (extending) of short overlays
-- Block jobs: Expose error string via query-block-jobs
+- nvme: introduce PMR support from NVMe 1.4 spec
-- Block job cleanups and fixes
+- qemu-storage-daemon: Fix non-string --object properties
-- hmp: Allow using a qdev id in block_set_io_throttle
 - Copy-on-read block driver
 - The qcow2 default refcount cache size has been decreased
 - Various bug fixes
 ----------------------------------------------------------------
-Alberto Garcia (5):
+Alberto Garcia (1):
-      hmp: Allow using a qdev id in block_set_io_throttle
+      qcow2: Add incompatibility note between backing files and raw external data files
       Fix error message about compressed clusters with OFLAG_COPIED
       specs/qcow2: Clarify that compressed clusters have the COPIED bit reset
       qcow2: Give the refcount cache the minimum possible size by default
       docs: Document the new default sizes of the qcow2 caches
-Daniel Henrique Barboza (1):
+Andrzej Jakowski (1):
-      block-backend: simplify blk_get_aio_context
+      nvme: introduce PMR support from NVMe 1.4 spec
-Eric Blake (7):
+Kevin Wolf (12):
-      block: Support byte-based aio callbacks
+      block: Add flags to BlockDriver.bdrv_co_truncate()
-      file-win32: Switch to byte-based callbacks
+      block: Add flags to bdrv(_co)_truncate()
-      null: Switch to byte-based read/write
+      block-backend: Add flags to blk_truncate()
-      rbd: Switch to byte-based callbacks
+      qcow2: Support BDRV_REQ_ZERO_WRITE for truncate
-      vxhs: Switch to byte-based callbacks
+      raw-format: Support BDRV_REQ_ZERO_WRITE for truncate
-      block: Drop last of the sector-based aio callbacks
+      file-posix: Support BDRV_REQ_ZERO_WRITE for truncate
-      block: Merge .bdrv_co_writev{,_flags} in drivers
+      block: truncate: Don't make backing file data visible
       iotests: Filter testfiles out in filter_img_info()
       iotests: Test committing to short backing file
       qcow2: Forward ZERO_WRITE flag for full preallocation
       qom: Factor out user_creatable_add_dict()
       qemu-storage-daemon: Fix non-string --object properties
-John Snow (1):
+Paolo Bonzini (1):
-      blockjob: expose error string via query
+      qemu-iotests: allow qcow2 external discarded clusters to contain stale data
-Kevin Wolf (7):
+ docs/interop/qcow2.txt          |   3 +
-      blockjob: Fix assertion in block_job_finalize()
+ hw/block/nvme.h                 |   2 +
-      blockjob: Wrappers for progress counter access
+ include/block/block.h           |   5 +-
-      blockjob: Move RateLimit to BlockJob
+ include/block/block_int.h       |  10 +-
-      blockjob: Implement block_job_set_speed() centrally
+ include/block/nvme.h            | 172 ++++++++++++++++++++++++++
-      blockjob: Introduce block_job_ratelimit_get_delay()
+ include/qom/object_interfaces.h |  16 +++
-      blockjob: Add block_job_driver()
+ include/sysemu/block-backend.h  |   2 +-
-      Merge remote-tracking branch 'mreitz/tags/pull-block-2018-05-15' into queue-block
+ block.c                         |   3 +-
  block/block-backend.c           |   4 +-
  block/commit.c                  |   4 +-
  block/crypto.c                  |   7 +-
  block/file-posix.c              |   6 +-
  block/file-win32.c              |   2 +-
  block/gluster.c                 |   1 +
  block/io.c                      |  43 ++++++-
  block/iscsi.c                   |   2 +-
  block/mirror.c                  |   2 +-
  block/nfs.c                     |   3 +-
  block/parallels.c               |   6 +-
  block/qcow.c                    |   4 +-
  block/qcow2-cluster.c           |   2 +-
  block/qcow2-refcount.c          |   2 +-
  block/qcow2.c                   |  73 +++++++++--
  block/qed.c                     |   3 +-
  block/raw-format.c              |   6 +-
  block/rbd.c                     |   1 +
  block/sheepdog.c                |   4 +-
  block/ssh.c                     |   2 +-
  block/vdi.c                     |   2 +-
  block/vhdx-log.c                |   2 +-
  block/vhdx.c                    |   6 +-
  block/vmdk.c                    |   8 +-
  block/vpc.c                     |   2 +-
  blockdev.c                      |   2 +-
  hw/block/nvme.c                 | 109 ++++++++++++++++
  qemu-img.c                      |   2 +-
  qemu-io-cmds.c                  |   2 +-
  qemu-storage-daemon.c           |   4 +-
  qom/object_interfaces.c         |  31 +++++
  qom/qom-qmp-cmds.c              |  24 +---
  tests/test-block-iothread.c     |   9 +-
  tests/qemu-iotests/iotests.py   |   5 +-
  hw/block/Makefile.objs          |   2 +-
  hw/block/trace-events           |   4 +
  tests/qemu-iotests/244          |  10 +-
  tests/qemu-iotests/244.out      |   9 +-
  tests/qemu-iotests/274          | 155 +++++++++++++++++++++++
  tests/qemu-iotests/274.out      | 268 ++++++++++++++++++++++++++++++++++++++++
  tests/qemu-iotests/group        |   1 +
 files changed, 951 insertions(+), 96 deletions(-)
  create mode 100755 tests/qemu-iotests/274
  create mode 100644 tests/qemu-iotests/274.out
-Max Reitz (17):
-      iotests: Split 214 off of 122
-      iotests: Add failure matching to common.qemu
-      iotests: Skip 181 and 201 without userfaultfd
-      block: Add COR filter driver
-      block: BLK_PERM_WRITE includes ..._UNCHANGED
-      block: Add BDRV_REQ_WRITE_UNCHANGED flag
-      block: Set BDRV_REQ_WRITE_UNCHANGED for COR writes
-      block/quorum: Support BDRV_REQ_WRITE_UNCHANGED
-      block: Support BDRV_REQ_WRITE_UNCHANGED in filters
-      iotests: Clean up wrap image in 197
-      iotests: Copy 197 for COR filter driver
-      iotests: Add test for COR across nodes
-      qemu-img: Check post-truncation size
-      block: Document BDRV_REQ_WRITE_UNCHANGED support
-      qemu-io: Use purely string blockdev options
-      qemu-img: Use only string options in img_open_opts
-      iotests: Add test for -U/force-share conflicts
- qapi/block-core.json           |  11 ++-
- docs/interop/qcow2.txt         |   8 +-
- docs/qcow2-cache.txt           |  33 ++++----
- block/qcow2.h                  |   4 -
- include/block/block.h          |   9 ++-
- include/block/block_int.h      |  28 +++++--
- include/block/blockjob.h       |  32 ++++++++
- include/block/blockjob_int.h   |  11 ++-
- include/block/raw-aio.h        |   2 +-
- block/backup.c                 |  62 ++++++---------
- block/blkdebug.c               |   9 ++-
- block/blkreplay.c              |   3 +
- block/blkverify.c              |   3 +
- block/block-backend.c          |   8 +-
- block/commit.c                 |  35 +++------
- block/copy-on-read.c           | 173 +++++++++++++++++++++++++++++++++++++++++
- block/file-win32.c             |  47 ++++++-----
- block/gluster.c                |   4 +-
- block/io.c                     |  75 ++++++++++--------
- block/iscsi.c                  |   8 +-
- block/mirror.c                 |  44 ++++-------
- block/null.c                   |  45 +++++------
- block/parallels.c              |   4 +-
- block/qcow.c                   |   6 +-
- block/qcow2-refcount.c         |   4 +-
- block/qcow2.c                  |  31 +++++---
- block/qed.c                    |   3 +-
- block/quorum.c                 |  19 +++--
- block/raw-format.c             |   9 ++-
- block/rbd.c                    |  40 +++++-----
- block/replication.c            |   4 +-
- block/sheepdog.c               |   4 +-
- block/ssh.c                    |   4 +-
- block/stream.c                 |  33 +++-----
- block/throttle.c               |   6 +-
- block/vhdx.c                   |   4 +-
- block/vxhs.c                   |  43 +++++-----
- block/win32-aio.c              |   5 +-
- blockjob.c                     |  40 +++++++---
- hmp.c                          |  14 +++-
- qemu-img.c                     |  43 ++++++++--
- qemu-io.c                      |   4 +-
- block/Makefile.objs            |   2 +-
- hmp-commands.hx                |   3 +-
- tests/qemu-iotests/122         |  47 -----------
- tests/qemu-iotests/122.out     |  33 --------
- tests/qemu-iotests/137.out     |   2 +-
- tests/qemu-iotests/153         |  17 ++++
- tests/qemu-iotests/153.out     |  16 ++++
- tests/qemu-iotests/181         |  13 ++++
- tests/qemu-iotests/197         |   1 +
- tests/qemu-iotests/201         |  13 ++++
- tests/qemu-iotests/214         |  97 +++++++++++++++++++++++
- tests/qemu-iotests/214.out     |  35 +++++++++
- tests/qemu-iotests/215         | 120 ++++++++++++++++++++++++++++
- tests/qemu-iotests/215.out     |  26 +++++++
- tests/qemu-iotests/216         | 115 +++++++++++++++++++++++++++
- tests/qemu-iotests/216.out     |  28 +++++++
- tests/qemu-iotests/common.qemu |  58 ++++++++++++--
- tests/qemu-iotests/group       |   3 +
-files changed, 1174 insertions(+), 429 deletions(-)
- create mode 100644 block/copy-on-read.c
- create mode 100755 tests/qemu-iotests/214
- create mode 100644 tests/qemu-iotests/214.out
- create mode 100755 tests/qemu-iotests/215
- create mode 100644 tests/qemu-iotests/215.out
- create mode 100755 tests/qemu-iotests/216
- create mode 100644 tests/qemu-iotests/216.out

-[Qemu-devel] [PULL 01/37] block-backend: simplify blk_get_aio_context
+Deleted patch
-From: Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com>
-blk_get_aio_context verifies if BlockDriverState bs is not NULL,
-return bdrv_get_aio_context(bs) if true or qemu_get_aio_context()
-otherwise. However, bdrv_get_aio_context from block.c already does
-this verification itself, also returning qemu_get_aio_context()
-if bs is NULL:
-AioContext *bdrv_get_aio_context(BlockDriverState *bs)
-{
-    return bs ? bs->aio_context : qemu_get_aio_context();
-}
-This patch simplifies blk_get_aio_context to simply call
-bdrv_get_aio_context instead of replicating the same logic.
-Signed-off-by: Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com>
-Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
-Signed-off-by: Kevin Wolf <kwolf@redhat.com>
----
- block/block-backend.c | 8 +-------
-file changed, 1 insertion(+), 7 deletions(-)
-diff --git a/block/block-backend.c b/block/block-backend.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/block-backend.c
-+++ b/block/block-backend.c
-@@ -XXX,XX +XXX,XX @@ void blk_op_unblock_all(BlockBackend *blk, Error *reason)
- AioContext *blk_get_aio_context(BlockBackend *blk)
- {
--    BlockDriverState *bs = blk_bs(blk);
--
--    if (bs) {
--        return bdrv_get_aio_context(bs);
--    } else {
--        return qemu_get_aio_context();
--    }
-+    return bdrv_get_aio_context(blk_bs(blk));
- }
- static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
---
-.13.6

-[Qemu-devel] [PULL 19/37] specs/qcow2: Clarify that compressed clusters have the COPIED bit reset
+[PULL 01/15] qcow2: Add incompatibility note between backing files and raw external data files
 From: Alberto Garcia <berto@igalia.com>
-Compressed clusters are not supposed to have the COPIED bit set, but
+Backing files and raw external data files are mutually exclusive.
-this is not made explicit in the specs, so let's document it.
+The documentation of the raw external data bit (in autoclear_features)
 already indicates that, but we should also mention it on the other
 side.
+Suggested-by: Eric Blake <eblake@redhat.com>
 Signed-off-by: Alberto Garcia <berto@igalia.com>
-Message-id: 74552e1d6e858d3159cb0c0e188e80bc9248e337.1523376013.git.berto@igalia.com
+Message-Id: <20200410121816.8334-1-berto@igalia.com>
 Reviewed-by: Eric Blake <eblake@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- docs/interop/qcow2.txt | 8 ++++----
+ docs/interop/qcow2.txt | 3 +++
-file changed, 4 insertions(+), 4 deletions(-)
+file changed, 3 insertions(+)
 diff --git a/docs/interop/qcow2.txt b/docs/interop/qcow2.txt
 index XXXXXXX..XXXXXXX 100644
 --- a/docs/interop/qcow2.txt
 +++ b/docs/interop/qcow2.txt
-@@ -XXX,XX +XXX,XX @@ L2 table entry:
+@@ -XXX,XX +XXX,XX @@ The first cluster of a qcow2 image contains the file header:
-:   0 for standard clusters
+                     is stored (NB: The string is not null terminated). 0 if the
-for compressed clusters
+                     image doesn't have a backing file.
--              63:   0 for a cluster that is unused or requires COW, 1 if its
++                    Note: backing files are incompatible with raw external data
--                    refcount is exactly one. This information is only accurate
++                    files (auto-clear feature bit 1).
--                    in L2 tables that are reachable from the active L1
++
--                    table.
+- 19:   backing_file_size
-+              63:   0 for clusters that are unused, compressed or require COW.
+                     Length of the backing file name in bytes. Must not be
-+                    1 for standard clusters whose refcount is exactly one.
+                     longer than 1023 bytes. Undefined if the image doesn't have
 +                    This information is only accurate in L2 tables
 +                    that are reachable from the active L1 table.
  Standard Cluster Descriptor:
 --
-.13.6
+.25.3

-[Qemu-devel] [PULL 06/37] vxhs: Switch to byte-based callbacks
+[PULL 02/15] qemu-iotests: allow qcow2 external discarded clusters to contain stale data
-From: Eric Blake <eblake@redhat.com>
+From: Paolo Bonzini <pbonzini@redhat.com>
-We are gradually moving away from sector-based interfaces, towards
+Test 244 checks the expected behavior of qcow2 external data files
-byte-based.  Make the change for the last few sector-based callbacks
+with respect to zero and discarded clusters.  Filesystems however
-in the vxhs driver.
+are free to ignore discard requests, and this seems to be the
 case for overlayfs.  Relax the tests to skip checks on the
 external data file for discarded areas, which implies not using
 qemu-img compare in the data_file_raw=on case.
-Note that the driver was already using byte-based calls for
+This fixes docker tests on RHEL8.
 performing actual I/O, so this just gets rid of a round trip
 of scaling; however, as I don't know if VxHS is tolerant of
 non-sector AIO operations, I went with the conservative approach
 of adding .bdrv_refresh_limits to override the block layer
 defaults back to the pre-patch value of 512.
-Signed-off-by: Eric Blake <eblake@redhat.com>
+Cc: Kevin Wolf <kwolf@redhat.com>
 Cc: qemu-block@nongnu.org
 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Message-Id: <20200409191006.24429-1-pbonzini@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- block/vxhs.c | 43 ++++++++++++++++++++++---------------------
+ tests/qemu-iotests/244     | 10 ++++++++--
-file changed, 22 insertions(+), 21 deletions(-)
+ tests/qemu-iotests/244.out |  9 ++++++---
 files changed, 14 insertions(+), 5 deletions(-)
-diff --git a/block/vxhs.c b/block/vxhs.c
+diff --git a/tests/qemu-iotests/244 b/tests/qemu-iotests/244
 index XXXXXXX..XXXXXXX 100755
 --- a/tests/qemu-iotests/244
 +++ b/tests/qemu-iotests/244
@@ -XXX,XX +XXX,XX @@ $QEMU_IO -c 'read -P 0 0 1M' \
  echo
  $QEMU_IO -c 'read -P 0 0 1M' \
           -c 'read -P 0x11 1M 1M' \
 -         -c 'read -P 0 2M 2M' \
           -c 'read -P 0x11 4M 1M' \
           -c 'read -P 0 5M 1M' \
           -f raw "$TEST_IMG.data" |
@@ -XXX,XX +XXX,XX @@ $QEMU_IO -c 'read -P 0 0 1M' \
           -f $IMGFMT "$TEST_IMG" |
           _filter_qemu_io
 +# Discarded clusters are only marked as such in the qcow2 metadata, but
 +# they can contain stale data in the external data file.  Instead, zero
 +# clusters must be zeroed in the external data file too.
  echo
 -$QEMU_IMG compare "$TEST_IMG" "$TEST_IMG.data"
 +$QEMU_IO -c 'read -P 0 0 1M' \
 +         -c 'read -P 0x11 1M 1M' \
 +         -c 'read -P 0 3M 3M' \
 +         -f raw "$TEST_IMG".data |
 +         _filter_qemu_io
  echo -n "qcow2 file size after I/O: "
  du -b $TEST_IMG | cut -f1
 diff --git a/tests/qemu-iotests/244.out b/tests/qemu-iotests/244.out
 index XXXXXXX..XXXXXXX 100644
---- a/block/vxhs.c
+--- a/tests/qemu-iotests/244.out
-+++ b/block/vxhs.c
++++ b/tests/qemu-iotests/244.out
-@@ -XXX,XX +XXX,XX @@ static void vxhs_parse_filename(const char *filename, QDict *options,
+@@ -XXX,XX +XXX,XX @@ read 1048576/1048576 bytes at offset 0
-     }
+MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
- }
+ read 1048576/1048576 bytes at offset 1048576
+MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+static void vxhs_refresh_limits(BlockDriverState *bs, Error **errp)
+-read 2097152/2097152 bytes at offset 2097152
-+{
+-2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+    /* XXX Does VXHS support AIO on less than 512-byte alignment? */
+ read 1048576/1048576 bytes at offset 4194304
-+    bs->bl.request_alignment = 512;
+MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+}
+ read 1048576/1048576 bytes at offset 5242880
-+
+@@ -XXX,XX +XXX,XX @@ read 1048576/1048576 bytes at offset 1048576
- static int vxhs_init_and_ref(void)
+ read 4194304/4194304 bytes at offset 2097152
- {
+MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-     if (vxhs_ref++ == 0) {
-@@ -XXX,XX +XXX,XX @@ static const AIOCBInfo vxhs_aiocb_info = {
+-Images are identical.
-  * and is passed to QNIO. When QNIO completes the work,
++read 1048576/1048576 bytes at offset 0
-  * it will be passed back through the callback.
++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-  */
++read 1048576/1048576 bytes at offset 1048576
--static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num,
++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
--                               QEMUIOVector *qiov, int nb_sectors,
++read 3145728/3145728 bytes at offset 3145728
-+static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, uint64_t offset,
++3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+                               QEMUIOVector *qiov, uint64_t size,
+ qcow2 file size after I/O: 327680
-                                BlockCompletionFunc *cb, void *opaque,
-                                VDISKAIOCmd iodir)
+ === bdrv_co_block_status test for file and offset=0 ===
  {
      VXHSAIOCB *acb = NULL;
      BDRVVXHSState *s = bs->opaque;
 -    size_t size;
 -    uint64_t offset;
      int iio_flags = 0;
      int ret = 0;
      void *dev_handle = s->vdisk_hostinfo.dev_handle;
 -    offset = sector_num * BDRV_SECTOR_SIZE;
 -    size = nb_sectors * BDRV_SECTOR_SIZE;
      acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque);
      /*
@@ -XXX,XX +XXX,XX @@ static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num,
      switch (iodir) {
      case VDISK_AIO_WRITE:
              ret = iio_writev(dev_handle, acb, qiov->iov, qiov->niov,
 -                             offset, (uint64_t)size, iio_flags);
 +                             offset, size, iio_flags);
              break;
      case VDISK_AIO_READ:
              ret = iio_readv(dev_handle, acb, qiov->iov, qiov->niov,
 -                            offset, (uint64_t)size, iio_flags);
 +                            offset, size, iio_flags);
              break;
      default:
              trace_vxhs_aio_rw_invalid(iodir);
@@ -XXX,XX +XXX,XX @@ errout:
      return NULL;
  }
 -static BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs,
 -                                   int64_t sector_num, QEMUIOVector *qiov,
 -                                   int nb_sectors,
 +static BlockAIOCB *vxhs_aio_preadv(BlockDriverState *bs,
 +                                   uint64_t offset, uint64_t bytes,
 +                                   QEMUIOVector *qiov, int flags,
                                     BlockCompletionFunc *cb, void *opaque)
  {
 -    return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, cb,
 -                       opaque, VDISK_AIO_READ);
 +    return vxhs_aio_rw(bs, offset, qiov, bytes, cb, opaque, VDISK_AIO_READ);
  }
 -static BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs,
 -                                   int64_t sector_num, QEMUIOVector *qiov,
 -                                   int nb_sectors,
 -                                   BlockCompletionFunc *cb, void *opaque)
 +static BlockAIOCB *vxhs_aio_pwritev(BlockDriverState *bs,
 +                                    uint64_t offset, uint64_t bytes,
 +                                    QEMUIOVector *qiov, int flags,
 +                                    BlockCompletionFunc *cb, void *opaque)
  {
 -    return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors,
 -                       cb, opaque, VDISK_AIO_WRITE);
 +    return vxhs_aio_rw(bs, offset, qiov, bytes, cb, opaque, VDISK_AIO_WRITE);
  }
  static void vxhs_close(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_vxhs = {
      .instance_size                = sizeof(BDRVVXHSState),
      .bdrv_file_open               = vxhs_open,
      .bdrv_parse_filename          = vxhs_parse_filename,
 +    .bdrv_refresh_limits          = vxhs_refresh_limits,
      .bdrv_close                   = vxhs_close,
      .bdrv_getlength               = vxhs_getlength,
 -    .bdrv_aio_readv               = vxhs_aio_readv,
 -    .bdrv_aio_writev              = vxhs_aio_writev,
 +    .bdrv_aio_preadv              = vxhs_aio_preadv,
 +    .bdrv_aio_pwritev             = vxhs_aio_pwritev,
  };
  static void bdrv_vxhs_init(void)
 --
-.13.6
+.25.3

-[Qemu-devel] [PULL 08/37] block: Merge .bdrv_co_writev{, _flags} in drivers
+[PULL 03/15] block: Add flags to BlockDriver.bdrv_co_truncate()
-From: Eric Blake <eblake@redhat.com>
+This adds a new BdrvRequestFlags parameter to the .bdrv_co_truncate()
+driver callbacks, and a supported_truncate_flags field in
-We have too many driver callback interfaces; simplify the mess
+BlockDriverState that allows drivers to advertise support for request
-somewhat by merging the flags parameter of .bdrv_co_writev_flags()
+flags in the context of truncate.
-into .bdrv_co_writev().  Note that as long as a driver doesn't set
-.supported_write_flags, the flags argument will be 0 and behavior is
+For now, we always pass 0 and no drivers declare support for any flag.
-identical.  Also note that the public function bdrv_co_writev() still
-lacks a flags argument; so the driver signature is thus intentionally
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-slightly different.  But that's not the end of the world, nor the first
+Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
-time that the driver interface differs slightly from the public
+Reviewed-by: Alberto Garcia <berto@igalia.com>
-interface.
+Reviewed-by: Max Reitz <mreitz@redhat.com>
+Message-Id: <20200424125448.63318-2-kwolf@redhat.com>
 Ideally, we should be rewriting all of these drivers to use modern
 byte-based interfaces.  But that's a more invasive patch to write
 and audit, compared to the simplification done here.
 Signed-off-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- include/block/block_int.h |  2 --
+ include/block/block_int.h   | 10 +++++++++-
- block/gluster.c           |  4 +++-
+ block/crypto.c              |  3 ++-
- block/io.c                | 13 ++++---------
+ block/file-posix.c          |  2 +-
- block/iscsi.c             |  8 ++++----
+ block/file-win32.c          |  2 +-
- block/parallels.c         |  4 +++-
+ block/gluster.c             |  1 +
- block/qcow.c              |  6 ++++--
+ block/io.c                  |  8 +++++++-
- block/qed.c               |  3 ++-
+ block/iscsi.c               |  2 +-
- block/replication.c       |  4 +++-
+ block/nfs.c                 |  3 ++-
- block/sheepdog.c          |  4 +++-
+ block/qcow2.c               |  2 +-
- block/ssh.c               |  4 +++-
+ block/qed.c                 |  1 +
- block/vhdx.c              |  4 +++-
+ block/raw-format.c          |  2 +-
-files changed, 32 insertions(+), 24 deletions(-)
+ block/rbd.c                 |  1 +
  block/sheepdog.c            |  4 ++--
  block/ssh.c                 |  2 +-
  tests/test-block-iothread.c |  3 ++-
 files changed, 33 insertions(+), 13 deletions(-)
 diff --git a/include/block/block_int.h b/include/block/block_int.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/block/block_int.h
 +++ b/include/block/block_int.h
 @@ -XXX,XX +XXX,XX @@ struct BlockDriver {
-     int coroutine_fn (*bdrv_co_preadv)(BlockDriverState *bs,
+      */
-         uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags);
+     int coroutine_fn (*bdrv_co_truncate)(BlockDriverState *bs, int64_t offset,
-     int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
+                                          bool exact, PreallocMode prealloc,
--        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
+-                                         Error **errp);
--    int coroutine_fn (*bdrv_co_writev_flags)(BlockDriverState *bs,
++                                         BdrvRequestFlags flags, Error **errp);
-         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags);
-     /**
+     int64_t (*bdrv_getlength)(BlockDriverState *bs);
-      * @offset: position in bytes to write at
+     bool has_variable_length;
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
      /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA,
       * BDRV_REQ_MAY_UNMAP, BDRV_REQ_WRITE_UNCHANGED) */
      unsigned int supported_zero_flags;
 +    /*
 +     * Flags honoured during truncate (so far: BDRV_REQ_ZERO_WRITE).
 +     *
 +     * If BDRV_REQ_ZERO_WRITE is given, the truncate operation must make sure
 +     * that any added space reads as all zeros. If this can't be guaranteed,
 +     * the operation must fail.
 +     */
 +    unsigned int supported_truncate_flags;
      /* the following member gives a name to every node on the bs graph. */
      char node_name[32];
 diff --git a/block/crypto.c b/block/crypto.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/crypto.c
 +++ b/block/crypto.c
@@ -XXX,XX +XXX,XX @@ static int block_crypto_co_create_generic(BlockDriverState *bs,
  static int coroutine_fn
  block_crypto_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
 -                         PreallocMode prealloc, Error **errp)
 +                         PreallocMode prealloc, BdrvRequestFlags flags,
 +                         Error **errp)
  {
      BlockCrypto *crypto = bs->opaque;
      uint64_t payload_offset =
 diff --git a/block/file-posix.c b/block/file-posix.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/file-posix.c
 +++ b/block/file-posix.c
@@ -XXX,XX +XXX,XX @@ raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
  static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
                                          bool exact, PreallocMode prealloc,
 -                                        Error **errp)
 +                                        BdrvRequestFlags flags, Error **errp)
  {
      BDRVRawState *s = bs->opaque;
      struct stat st;
 diff --git a/block/file-win32.c b/block/file-win32.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/file-win32.c
 +++ b/block/file-win32.c
@@ -XXX,XX +XXX,XX @@ static void raw_close(BlockDriverState *bs)
  static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
                                          bool exact, PreallocMode prealloc,
 -                                        Error **errp)
 +                                        BdrvRequestFlags flags, Error **errp)
  {
      BDRVRawState *s = bs->opaque;
      LONG low, high;
 diff --git a/block/gluster.c b/block/gluster.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/gluster.c
 +++ b/block/gluster.c
-@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs,
+@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qemu_gluster_co_truncate(BlockDriverState *bs,
- static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs,
+                                                  int64_t offset,
-                                                int64_t sector_num,
+                                                  bool exact,
-                                                int nb_sectors,
+                                                  PreallocMode prealloc,
--                                               QEMUIOVector *qiov)
++                                                 BdrvRequestFlags flags,
-+                                               QEMUIOVector *qiov,
+                                                  Error **errp)
-+                                               int flags)
+ {
- {
+     BDRVGlusterState *s = bs->opaque;
 +    assert(!flags);
      return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1);
  }
 diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/io.c
 +++ b/block/io.c
-@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
+@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
-     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+     BlockDriverState *bs = child->bs;
-     assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
+     BlockDriver *drv = bs->drv;
+     BdrvTrackedRequest req;
--    if (drv->bdrv_co_writev_flags) {
++    BdrvRequestFlags flags = 0;
--        ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
+     int64_t old_size, new_bytes;
--                                        flags & bs->supported_write_flags);
+     int ret;
--        flags &= ~bs->supported_write_flags;
--    } else {
+@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
--        assert(drv->bdrv_co_writev);
+     }
--        assert(!bs->supported_write_flags);
--        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
+     if (drv->bdrv_co_truncate) {
--    }
+-        ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, errp);
-+    assert(drv->bdrv_co_writev);
++        if (flags & ~bs->supported_truncate_flags) {
-+    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
++            error_setg(errp, "Block driver does not support requested flags");
-+                              flags & bs->supported_write_flags);
++            ret = -ENOTSUP;
-+    flags &= ~bs->supported_write_flags;
++            goto out;
++        }
- emulate_flags:
++        ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp);
-     if (ret == 0 && (flags & BDRV_REQ_FUA)) {
+     } else if (bs->file && drv->is_filter) {
          ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, errp);
      } else {
 diff --git a/block/iscsi.c b/block/iscsi.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/iscsi.c
 +++ b/block/iscsi.c
-@@ -XXX,XX +XXX,XX @@ static inline bool iscsi_allocmap_is_valid(IscsiLun *iscsilun,
+@@ -XXX,XX +XXX,XX @@ static void iscsi_reopen_commit(BDRVReopenState *reopen_state)
- }
  static int coroutine_fn iscsi_co_truncate(BlockDriverState *bs, int64_t offset,
                                            bool exact, PreallocMode prealloc,
 -                                          Error **errp)
 +                                          BdrvRequestFlags flags, Error **errp)
  {
      IscsiLun *iscsilun = bs->opaque;
      int64_t cur_length;
 diff --git a/block/nfs.c b/block/nfs.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/nfs.c
 +++ b/block/nfs.c
@@ -XXX,XX +XXX,XX @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs)
  static int coroutine_fn
--iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
+ nfs_file_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
--                      QEMUIOVector *iov, int flags)
+-                     PreallocMode prealloc, Error **errp)
-+iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
++                     PreallocMode prealloc, BdrvRequestFlags flags,
-+                QEMUIOVector *iov, int flags)
++                     Error **errp)
  {
-     IscsiLun *iscsilun = bs->opaque;
+     NFSClient *client = bs->opaque;
-     struct IscsiTask iTask;
+     int ret;
-@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_iscsi = {
+diff --git a/block/qcow2.c b/block/qcow2.c
-     .bdrv_co_pdiscard      = iscsi_co_pdiscard,
+index XXXXXXX..XXXXXXX 100644
-     .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes,
+--- a/block/qcow2.c
-     .bdrv_co_readv         = iscsi_co_readv,
++++ b/block/qcow2.c
--    .bdrv_co_writev_flags  = iscsi_co_writev_flags,
+@@ -XXX,XX +XXX,XX @@ fail:
-+    .bdrv_co_writev        = iscsi_co_writev,
-     .bdrv_co_flush_to_disk = iscsi_co_flush,
+ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
+                                           bool exact, PreallocMode prealloc,
- #ifdef __linux__
+-                                          Error **errp)
-@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_iser = {
++                                          BdrvRequestFlags flags, Error **errp)
-     .bdrv_co_pdiscard      = iscsi_co_pdiscard,
+ {
-     .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes,
+     BDRVQcow2State *s = bs->opaque;
-     .bdrv_co_readv         = iscsi_co_readv,
+     uint64_t old_length;
 -    .bdrv_co_writev_flags  = iscsi_co_writev_flags,
 +    .bdrv_co_writev        = iscsi_co_writev,
      .bdrv_co_flush_to_disk = iscsi_co_flush,
  #ifdef __linux__
 diff --git a/block/parallels.c b/block/parallels.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/parallels.c
 +++ b/block/parallels.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn parallels_co_block_status(BlockDriverState *bs,
  }
  static coroutine_fn int parallels_co_writev(BlockDriverState *bs,
 -        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 +                                            int64_t sector_num, int nb_sectors,
 +                                            QEMUIOVector *qiov, int flags)
  {
      BDRVParallelsState *s = bs->opaque;
      uint64_t bytes_done = 0;
      QEMUIOVector hd_qiov;
      int ret = 0;
 +    assert(!flags);
      qemu_iovec_init(&hd_qiov, qiov->niov);
      while (nb_sectors > 0) {
 diff --git a/block/qcow.c b/block/qcow.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow.c
 +++ b/block/qcow.c
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
  }
  static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
 -                          int nb_sectors, QEMUIOVector *qiov)
 +                                       int nb_sectors, QEMUIOVector *qiov,
 +                                       int flags)
  {
      BDRVQcowState *s = bs->opaque;
      int index_in_cluster;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
      uint8_t *buf;
      void *orig_buf;
 +    assert(!flags);
      s->cluster_cache_offset = -1; /* disable compressed cache */
      /* We must always copy the iov when encrypting, so we
@@ -XXX,XX +XXX,XX @@ qcow_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
      if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
          /* could not compress: write normal cluster */
          ret = qcow_co_writev(bs, offset >> BDRV_SECTOR_BITS,
 -                             bytes >> BDRV_SECTOR_BITS, qiov);
 +                             bytes >> BDRV_SECTOR_BITS, qiov, 0);
          if (ret < 0) {
              goto fail;
          }
 diff --git a/block/qed.c b/block/qed.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qed.c
 +++ b/block/qed.c
-@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_qed_co_readv(BlockDriverState *bs,
+@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_qed_co_truncate(BlockDriverState *bs,
+                                              int64_t offset,
- static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs,
+                                              bool exact,
-                                            int64_t sector_num, int nb_sectors,
+                                              PreallocMode prealloc,
--                                           QEMUIOVector *qiov)
++                                             BdrvRequestFlags flags,
-+                                           QEMUIOVector *qiov, int flags)
+                                              Error **errp)
  {
-+    assert(!flags);
+     BDRVQEDState *s = bs->opaque;
-     return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE);
+diff --git a/block/raw-format.c b/block/raw-format.c
- }
+index XXXXXXX..XXXXXXX 100644
+--- a/block/raw-format.c
-diff --git a/block/replication.c b/block/replication.c
++++ b/block/raw-format.c
-index XXXXXXX..XXXXXXX 100644
+@@ -XXX,XX +XXX,XX @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
---- a/block/replication.c
-+++ b/block/replication.c
+ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
-@@ -XXX,XX +XXX,XX @@ out:
+                                         bool exact, PreallocMode prealloc,
- static coroutine_fn int replication_co_writev(BlockDriverState *bs,
+-                                        Error **errp)
-                                               int64_t sector_num,
++                                        BdrvRequestFlags flags, Error **errp)
-                                               int remaining_sectors,
+ {
--                                              QEMUIOVector *qiov)
+     BDRVRawState *s = bs->opaque;
-+                                              QEMUIOVector *qiov,
-+                                              int flags)
+diff --git a/block/rbd.c b/block/rbd.c
- {
+index XXXXXXX..XXXXXXX 100644
-     BDRVReplicationState *s = bs->opaque;
+--- a/block/rbd.c
-     QEMUIOVector hd_qiov;
++++ b/block/rbd.c
-@@ -XXX,XX +XXX,XX @@ static coroutine_fn int replication_co_writev(BlockDriverState *bs,
+@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qemu_rbd_co_truncate(BlockDriverState *bs,
-     int ret;
+                                              int64_t offset,
-     int64_t n;
+                                              bool exact,
+                                              PreallocMode prealloc,
-+    assert(!flags);
++                                             BdrvRequestFlags flags,
-     ret = replication_get_io_status(s);
+                                              Error **errp)
-     if (ret < 0) {
+ {
-         goto out;
+     int r;
 diff --git a/block/sheepdog.c b/block/sheepdog.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/sheepdog.c
 +++ b/block/sheepdog.c
-@@ -XXX,XX +XXX,XX @@ static void sd_aio_complete(SheepdogAIOCB *acb)
+@@ -XXX,XX +XXX,XX @@ static int64_t sd_getlength(BlockDriverState *bs)
- }
+ static int coroutine_fn sd_co_truncate(BlockDriverState *bs, int64_t offset,
- static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
+                                        bool exact, PreallocMode prealloc,
--                        int nb_sectors, QEMUIOVector *qiov)
+-                                       Error **errp)
-+                                     int nb_sectors, QEMUIOVector *qiov,
++                                       BdrvRequestFlags flags, Error **errp)
-+                                     int flags)
+ {
  {
      SheepdogAIOCB acb;
      int ret;
      int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE;
      BDRVSheepdogState *s = bs->opaque;
+     int ret, fd;
-+    assert(!flags);
+@@ -XXX,XX +XXX,XX @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
      assert(!flags);
      if (offset > s->inode.vdi_size) {
-         ret = sd_truncate(bs, offset, PREALLOC_MODE_OFF, NULL);
+-        ret = sd_co_truncate(bs, offset, false, PREALLOC_MODE_OFF, NULL);
 +        ret = sd_co_truncate(bs, offset, false, PREALLOC_MODE_OFF, 0, NULL);
          if (ret < 0) {
+             return ret;
+         }
 diff --git a/block/ssh.c b/block/ssh.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/ssh.c
 +++ b/block/ssh.c
-@@ -XXX,XX +XXX,XX @@ static int ssh_write(BDRVSSHState *s, BlockDriverState *bs,
+@@ -XXX,XX +XXX,XX @@ static int64_t ssh_getlength(BlockDriverState *bs)
- static coroutine_fn int ssh_co_writev(BlockDriverState *bs,
+ static int coroutine_fn ssh_co_truncate(BlockDriverState *bs, int64_t offset,
-                                       int64_t sector_num,
+                                         bool exact, PreallocMode prealloc,
--                                      int nb_sectors, QEMUIOVector *qiov)
+-                                        Error **errp)
-+                                      int nb_sectors, QEMUIOVector *qiov,
++                                        BdrvRequestFlags flags, Error **errp)
 +                                      int flags)
  {
      BDRVSSHState *s = bs->opaque;
-     int ret;
+diff --git a/tests/test-block-iothread.c b/tests/test-block-iothread.c
-+    assert(!flags);
+index XXXXXXX..XXXXXXX 100644
-     qemu_co_mutex_lock(&s->lock);
+--- a/tests/test-block-iothread.c
-     ret = ssh_write(s, bs, sector_num * BDRV_SECTOR_SIZE,
++++ b/tests/test-block-iothread.c
-                     nb_sectors * BDRV_SECTOR_SIZE, qiov);
+@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_test_co_pdiscard(BlockDriverState *bs,
-diff --git a/block/vhdx.c b/block/vhdx.c
-index XXXXXXX..XXXXXXX 100644
+ static int coroutine_fn
---- a/block/vhdx.c
+ bdrv_test_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
-+++ b/block/vhdx.c
+-                      PreallocMode prealloc, Error **errp)
-@@ -XXX,XX +XXX,XX @@ int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s)
++                      PreallocMode prealloc, BdrvRequestFlags flags,
 +                      Error **errp)
  {
      return 0;
  }
- static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
--                                      int nb_sectors, QEMUIOVector *qiov)
-+                                       int nb_sectors, QEMUIOVector *qiov,
-+                                       int flags)
- {
-     int ret = -ENOTSUP;
-     BDRVVHDXState *s = bs->opaque;
-@@ -XXX,XX +XXX,XX @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
-     uint64_t bat_prior_offset = 0;
-     bool bat_update = false;
-+    assert(!flags);
-     qemu_iovec_init(&hd_qiov, qiov->niov);
-     qemu_co_mutex_lock(&s->lock);
 --
-.13.6
+.25.3

-[Qemu-devel] [PULL 26/37] block: Add BDRV_REQ_WRITE_UNCHANGED flag
+[PULL 04/15] block: Add flags to bdrv(_co)_truncate()
-From: Max Reitz <mreitz@redhat.com>
+Now that block drivers can support flags for .bdrv_co_truncate, expose
 the parameter in the node level interfaces bdrv_co_truncate() and
 bdrv_truncate().
-This flag signifies that a write request will not change the visible
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-disk content.  With this flag set, it is sufficient to have the
+Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 BLK_PERM_WRITE_UNCHANGED permission instead of BLK_PERM_WRITE.
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Reviewed-by: Alberto Garcia <berto@igalia.com>
-Message-id: 20180421132929.21610-4-mreitz@redhat.com
+Reviewed-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: Kevin Wolf <kwolf@redhat.com>
+Message-Id: <20200424125448.63318-3-kwolf@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- include/block/block.h | 6 +++++-
+ include/block/block.h       |  5 +++--
- block/io.c            | 6 +++++-
+ block/block-backend.c       |  2 +-
-files changed, 10 insertions(+), 2 deletions(-)
+ block/crypto.c              |  2 +-
  block/io.c                  | 12 +++++++-----
  block/parallels.c           |  6 +++---
  block/qcow.c                |  4 ++--
  block/qcow2-refcount.c      |  2 +-
  block/qcow2.c               | 15 +++++++++------
  block/raw-format.c          |  2 +-
  block/vhdx-log.c            |  2 +-
  block/vhdx.c                |  2 +-
  block/vmdk.c                |  2 +-
  tests/test-block-iothread.c |  6 +++---
 files changed, 34 insertions(+), 28 deletions(-)
 diff --git a/include/block/block.h b/include/block/block.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/block/block.h
 +++ b/include/block/block.h
-@@ -XXX,XX +XXX,XX @@ typedef enum {
+@@ -XXX,XX +XXX,XX @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
-     BDRV_REQ_FUA                = 0x10,
+ void bdrv_refresh_filename(BlockDriverState *bs);
-     BDRV_REQ_WRITE_COMPRESSED   = 0x20,
+ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
-+    /* Signifies that this write request will not change the visible disk
+-                                  PreallocMode prealloc, Error **errp);
-+     * content. */
++                                  PreallocMode prealloc, BdrvRequestFlags flags,
-+    BDRV_REQ_WRITE_UNCHANGED    = 0x40,
++                                  Error **errp);
-+
+ int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
-     /* Mask of valid flags */
+-                  PreallocMode prealloc, Error **errp);
--    BDRV_REQ_MASK               = 0x3f,
++                  PreallocMode prealloc, BdrvRequestFlags flags, Error **errp);
-+    BDRV_REQ_MASK               = 0x7f,
- } BdrvRequestFlags;
+ int64_t bdrv_nb_sectors(BlockDriverState *bs);
+ int64_t bdrv_getlength(BlockDriverState *bs);
- typedef struct BlockSizes {
+diff --git a/block/block-backend.c b/block/block-backend.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/block-backend.c
 +++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ int blk_truncate(BlockBackend *blk, int64_t offset, bool exact,
          return -ENOMEDIUM;
      }
 -    return bdrv_truncate(blk->root, offset, exact, prealloc, errp);
 +    return bdrv_truncate(blk->root, offset, exact, prealloc, 0, errp);
  }
  int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
 diff --git a/block/crypto.c b/block/crypto.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/crypto.c
 +++ b/block/crypto.c
@@ -XXX,XX +XXX,XX @@ block_crypto_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
      offset += payload_offset;
 -    return bdrv_co_truncate(bs->file, offset, exact, prealloc, errp);
 +    return bdrv_co_truncate(bs->file, offset, exact, prealloc, 0, errp);
  }
  static void block_crypto_close(BlockDriverState *bs)
 diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/io.c
 +++ b/block/io.c
-@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
+@@ -XXX,XX +XXX,XX @@ static void bdrv_parent_cb_resize(BlockDriverState *bs)
-     assert(!waited || !req->serialising);
+  * 'offset' bytes in length.
-     assert(req->overlap_offset <= offset);
+  */
-     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
+ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
--    assert(child->perm & BLK_PERM_WRITE);
+-                                  PreallocMode prealloc, Error **errp)
-+    if (flags & BDRV_REQ_WRITE_UNCHANGED) {
++                                  PreallocMode prealloc, BdrvRequestFlags flags,
-+        assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
++                                  Error **errp)
-+    } else {
+ {
-+        assert(child->perm & BLK_PERM_WRITE);
+     BlockDriverState *bs = child->bs;
-+    }
+     BlockDriver *drv = bs->drv;
-     assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
+     BdrvTrackedRequest req;
+-    BdrvRequestFlags flags = 0;
-     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
+     int64_t old_size, new_bytes;
      int ret;
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
          }
          ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp);
      } else if (bs->file && drv->is_filter) {
 -        ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, errp);
 +        ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
      } else {
          error_setg(errp, "Image format driver does not support resize");
          ret = -ENOTSUP;
@@ -XXX,XX +XXX,XX @@ typedef struct TruncateCo {
      int64_t offset;
      bool exact;
      PreallocMode prealloc;
 +    BdrvRequestFlags flags;
      Error **errp;
      int ret;
  } TruncateCo;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_truncate_co_entry(void *opaque)
  {
      TruncateCo *tco = opaque;
      tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->exact,
 -                                tco->prealloc, tco->errp);
 +                                tco->prealloc, tco->flags, tco->errp);
      aio_wait_kick();
  }
  int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
 -                  PreallocMode prealloc, Error **errp)
 +                  PreallocMode prealloc, BdrvRequestFlags flags, Error **errp)
  {
      Coroutine *co;
      TruncateCo tco = {
@@ -XXX,XX +XXX,XX @@ int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
          .offset     = offset,
          .exact      = exact,
          .prealloc   = prealloc,
 +        .flags      = flags,
          .errp       = errp,
          .ret        = NOT_DONE,
      };
 diff --git a/block/parallels.c b/block/parallels.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/parallels.c
 +++ b/block/parallels.c
@@ -XXX,XX +XXX,XX @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num,
          } else {
              ret = bdrv_truncate(bs->file,
                                  (s->data_end + space) << BDRV_SECTOR_BITS,
 -                                false, PREALLOC_MODE_OFF, NULL);
 +                                false, PREALLOC_MODE_OFF, 0, NULL);
          }
          if (ret < 0) {
              return ret;
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs,
               * That means we have to pass exact=true.
               */
              ret = bdrv_truncate(bs->file, res->image_end_offset, true,
 -                                PREALLOC_MODE_OFF, &local_err);
 +                                PREALLOC_MODE_OFF, 0, &local_err);
              if (ret < 0) {
                  error_report_err(local_err);
                  res->check_errors++;
@@ -XXX,XX +XXX,XX @@ static void parallels_close(BlockDriverState *bs)
          /* errors are ignored, so we might as well pass exact=true */
          bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS, true,
 -                      PREALLOC_MODE_OFF, NULL);
 +                      PREALLOC_MODE_OFF, 0, NULL);
      }
      g_free(s->bat_dirty_bmap);
 diff --git a/block/qcow.c b/block/qcow.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow.c
 +++ b/block/qcow.c
@@ -XXX,XX +XXX,XX @@ static int get_cluster_offset(BlockDriverState *bs,
                      return -E2BIG;
                  }
                  ret = bdrv_truncate(bs->file, cluster_offset + s->cluster_size,
 -                                    false, PREALLOC_MODE_OFF, NULL);
 +                                    false, PREALLOC_MODE_OFF, 0, NULL);
                  if (ret < 0) {
                      return ret;
                  }
@@ -XXX,XX +XXX,XX @@ static int qcow_make_empty(BlockDriverState *bs)
              l1_length) < 0)
          return -1;
      ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length, false,
 -                        PREALLOC_MODE_OFF, NULL);
 +                        PREALLOC_MODE_OFF, 0, NULL);
      if (ret < 0)
          return ret;
 diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow2-refcount.c
 +++ b/block/qcow2-refcount.c
@@ -XXX,XX +XXX,XX @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
                  }
                  ret = bdrv_truncate(bs->file, offset + s->cluster_size, false,
 -                                    PREALLOC_MODE_OFF, &local_err);
 +                                    PREALLOC_MODE_OFF, 0, &local_err);
                  if (ret < 0) {
                      error_report_err(local_err);
                      goto resize_fail;
 diff --git a/block/qcow2.c b/block/qcow2.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow2.c
 +++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn preallocate_co(BlockDriverState *bs, uint64_t offset,
              mode = PREALLOC_MODE_OFF;
          }
          ret = bdrv_co_truncate(s->data_file, host_offset + cur_bytes, false,
 -                               mode, errp);
 +                               mode, 0, errp);
          if (ret < 0) {
              return ret;
          }
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
               * always fulfilled, so there is no need to pass it on.)
               */
              bdrv_co_truncate(bs->file, (last_cluster + 1) * s->cluster_size,
 -                             false, PREALLOC_MODE_OFF, &local_err);
 +                             false, PREALLOC_MODE_OFF, 0, &local_err);
              if (local_err) {
                  warn_reportf_err(local_err,
                                   "Failed to truncate the tail of the image: ");
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
               * file should be resized to the exact target size, too,
               * so we pass @exact here.
               */
 -            ret = bdrv_co_truncate(s->data_file, offset, exact, prealloc, errp);
 +            ret = bdrv_co_truncate(s->data_file, offset, exact, prealloc, 0,
 +                                   errp);
              if (ret < 0) {
                  goto fail;
              }
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
          new_file_size = allocation_start +
                          nb_new_data_clusters * s->cluster_size;
          /* Image file grows, so @exact does not matter */
 -        ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, errp);
 +        ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, 0,
 +                               errp);
          if (ret < 0) {
              error_prepend(errp, "Failed to resize underlying file: ");
              qcow2_free_clusters(bs, allocation_start,
@@ -XXX,XX +XXX,XX @@ qcow2_co_pwritev_compressed_part(BlockDriverState *bs,
          if (len < 0) {
              return len;
          }
 -        return bdrv_co_truncate(bs->file, len, false, PREALLOC_MODE_OFF, NULL);
 +        return bdrv_co_truncate(bs->file, len, false, PREALLOC_MODE_OFF, 0,
 +                                NULL);
      }
      if (offset_into_cluster(s, offset)) {
@@ -XXX,XX +XXX,XX @@ static int make_completely_empty(BlockDriverState *bs)
      }
      ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size, false,
 -                        PREALLOC_MODE_OFF, &local_err);
 +                        PREALLOC_MODE_OFF, 0, &local_err);
      if (ret < 0) {
          error_report_err(local_err);
          goto fail;
 diff --git a/block/raw-format.c b/block/raw-format.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/raw-format.c
 +++ b/block/raw-format.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
      s->size = offset;
      offset += s->offset;
 -    return bdrv_co_truncate(bs->file, offset, exact, prealloc, errp);
 +    return bdrv_co_truncate(bs->file, offset, exact, prealloc, 0, errp);
  }
  static void raw_eject(BlockDriverState *bs, bool eject_flag)
 diff --git a/block/vhdx-log.c b/block/vhdx-log.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/vhdx-log.c
 +++ b/block/vhdx-log.c
@@ -XXX,XX +XXX,XX @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s,
                      goto exit;
                  }
                  ret = bdrv_truncate(bs->file, new_file_size, false,
 -                                    PREALLOC_MODE_OFF, NULL);
 +                                    PREALLOC_MODE_OFF, 0, NULL);
                  if (ret < 0) {
                      goto exit;
                  }
 diff --git a/block/vhdx.c b/block/vhdx.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/vhdx.c
 +++ b/block/vhdx.c
@@ -XXX,XX +XXX,XX @@ static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s,
      }
      return bdrv_truncate(bs->file, *new_offset + s->block_size, false,
 -                         PREALLOC_MODE_OFF, NULL);
 +                         PREALLOC_MODE_OFF, 0, NULL);
  }
  /*
 diff --git a/block/vmdk.c b/block/vmdk.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/vmdk.c
 +++ b/block/vmdk.c
@@ -XXX,XX +XXX,XX @@ vmdk_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
              }
              length = QEMU_ALIGN_UP(length, BDRV_SECTOR_SIZE);
              ret = bdrv_truncate(s->extents[i].file, length, false,
 -                                PREALLOC_MODE_OFF, NULL);
 +                                PREALLOC_MODE_OFF, 0, NULL);
              if (ret < 0) {
                  return ret;
              }
 diff --git a/tests/test-block-iothread.c b/tests/test-block-iothread.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tests/test-block-iothread.c
 +++ b/tests/test-block-iothread.c
@@ -XXX,XX +XXX,XX @@ static void test_sync_op_truncate(BdrvChild *c)
      int ret;
      /* Normal success path */
 -    ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, NULL);
 +    ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, 0, NULL);
      g_assert_cmpint(ret, ==, 0);
      /* Early error: Negative offset */
 -    ret = bdrv_truncate(c, -2, false, PREALLOC_MODE_OFF, NULL);
 +    ret = bdrv_truncate(c, -2, false, PREALLOC_MODE_OFF, 0, NULL);
      g_assert_cmpint(ret, ==, -EINVAL);
      /* Error: Read-only image */
      c->bs->read_only = true;
      c->bs->open_flags &= ~BDRV_O_RDWR;
 -    ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, NULL);
 +    ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, 0, NULL);
      g_assert_cmpint(ret, ==, -EACCES);
      c->bs->read_only = false;
 --
-.13.6
+.25.3

-[Qemu-devel] [PULL 12/37] blockjob: Wrappers for progress counter access
+[PULL 05/15] block-backend: Add flags to blk_truncate()
-Block job drivers are not expected to mess with the internals of the
+Now that node level interface bdrv_truncate() supports passing request
-BlockJob object, so provide wrapper functions for one of the cases where
+flags to the block driver, expose this on the BlockBackend level, too.
 they still do it: Updating the progress counter.
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
+Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 Reviewed-by: Alberto Garcia <berto@igalia.com>
 Reviewed-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: John Snow <jsnow@redhat.com>
+Message-Id: <20200424125448.63318-4-kwolf@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- include/block/blockjob.h | 19 +++++++++++++++++++
+ include/sysemu/block-backend.h | 2 +-
- block/backup.c           | 22 +++++++++++++---------
+ block.c                        | 3 ++-
- block/commit.c           | 16 ++++++++--------
+ block/block-backend.c          | 4 ++--
- block/mirror.c           | 11 +++++------
+ block/commit.c                 | 4 ++--
- block/stream.c           | 14 ++++++++------
+ block/crypto.c                 | 2 +-
- blockjob.c               | 10 ++++++++++
+ block/mirror.c                 | 2 +-
-files changed, 63 insertions(+), 29 deletions(-)
+ block/qcow2.c                  | 4 ++--
+ block/qed.c                    | 2 +-
-diff --git a/include/block/blockjob.h b/include/block/blockjob.h
+ block/vdi.c                    | 2 +-
-index XXXXXXX..XXXXXXX 100644
+ block/vhdx.c                   | 4 ++--
---- a/include/block/blockjob.h
+ block/vmdk.c                   | 6 +++---
-+++ b/include/block/blockjob.h
+ block/vpc.c                    | 2 +-
-@@ -XXX,XX +XXX,XX @@ void block_job_finalize(BlockJob *job, Error **errp);
+ blockdev.c                     | 2 +-
- void block_job_dismiss(BlockJob **job, Error **errp);
+ qemu-img.c                     | 2 +-
+ qemu-io-cmds.c                 | 2 +-
- /**
+files changed, 22 insertions(+), 21 deletions(-)
-+ * block_job_progress_update:
-+ * @job: The job that has made progress
+diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
-+ * @done: How much progress the job made
+index XXXXXXX..XXXXXXX 100644
-+ *
+--- a/include/sysemu/block-backend.h
-+ * Updates the progress counter of the job.
++++ b/include/sysemu/block-backend.h
-+ */
+@@ -XXX,XX +XXX,XX @@ int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
-+void block_job_progress_update(BlockJob *job, uint64_t done);
+ int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf,
-+
+                           int bytes);
-+/**
+ int blk_truncate(BlockBackend *blk, int64_t offset, bool exact,
-+ * block_job_progress_set_remaining:
+-                 PreallocMode prealloc, Error **errp);
-+ * @job: The job whose expected progress end value is set
++                 PreallocMode prealloc, BdrvRequestFlags flags, Error **errp);
-+ * @remaining: Expected end value of the progress counter of the job
+ int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes);
-+ *
+ int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
-+ * Sets the expected end value of the progress counter of a job so that a
+                      int64_t pos, int size);
-+ * completion percentage can be calculated when the progress is updated.
+diff --git a/block.c b/block.c
-+ */
+index XXXXXXX..XXXXXXX 100644
-+void block_job_progress_set_remaining(BlockJob *job, uint64_t remaining);
+--- a/block.c
-+
++++ b/block.c
-+/**
+@@ -XXX,XX +XXX,XX @@ static int64_t create_file_fallback_truncate(BlockBackend *blk,
-  * block_job_query:
+     int64_t size;
-  * @job: The job to get information about.
+     int ret;
-  *
-diff --git a/block/backup.c b/block/backup.c
+-    ret = blk_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, &local_err);
-index XXXXXXX..XXXXXXX 100644
++    ret = blk_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, 0,
---- a/block/backup.c
++                       &local_err);
-+++ b/block/backup.c
+     if (ret < 0 && ret != -ENOTSUP) {
-@@ -XXX,XX +XXX,XX @@ typedef struct BackupBlockJob {
+         error_propagate(errp, local_err);
-     BlockdevOnError on_source_error;
+         return ret;
-     BlockdevOnError on_target_error;
+diff --git a/block/block-backend.c b/block/block-backend.c
-     CoRwlock flush_rwlock;
+index XXXXXXX..XXXXXXX 100644
-+    uint64_t len;
+--- a/block/block-backend.c
-     uint64_t bytes_read;
++++ b/block/block-backend.c
-     int64_t cluster_size;
+@@ -XXX,XX +XXX,XX @@ int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf,
      bool compress;
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
          trace_backup_do_cow_process(job, start);
 -        n = MIN(job->cluster_size, job->common.len - start);
 +        n = MIN(job->cluster_size, job->len - start);
          if (!bounce_buffer) {
              bounce_buffer = blk_blockalign(blk, job->cluster_size);
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
           * offset field is an opaque progress value, it is not a disk offset.
           */
          job->bytes_read += n;
 -        job->common.offset += n;
 +        block_job_progress_update(&job->common, n);
      }
  out:
@@ -XXX,XX +XXX,XX @@ void backup_do_checkpoint(BlockJob *job, Error **errp)
          return;
      }
 -    len = DIV_ROUND_UP(backup_job->common.len, backup_job->cluster_size);
 +    len = DIV_ROUND_UP(backup_job->len, backup_job->cluster_size);
      hbitmap_set(backup_job->copy_bitmap, 0, len);
  }
-@@ -XXX,XX +XXX,XX @@ static void backup_incremental_init_copy_bitmap(BackupBlockJob *job)
+ int blk_truncate(BlockBackend *blk, int64_t offset, bool exact,
-         bdrv_set_dirty_iter(dbi, next_cluster * job->cluster_size);
+-                 PreallocMode prealloc, Error **errp)
-     }
++                 PreallocMode prealloc, BdrvRequestFlags flags, Error **errp)
+ {
--    job->common.offset = job->common.len -
+     if (!blk_is_available(blk)) {
--                         hbitmap_count(job->copy_bitmap) * job->cluster_size;
+         error_setg(errp, "No medium inserted");
-+    /* TODO block_job_progress_set_remaining() would make more sense */
+         return -ENOMEDIUM;
-+    block_job_progress_update(&job->common,
+     }
-+        job->len - hbitmap_count(job->copy_bitmap) * job->cluster_size);
+-    return bdrv_truncate(blk->root, offset, exact, prealloc, 0, errp);
-     bdrv_dirty_iter_free(dbi);
++    return bdrv_truncate(blk->root, offset, exact, prealloc, flags, errp);
  }
-@@ -XXX,XX +XXX,XX @@ static void coroutine_fn backup_run(void *opaque)
-     QLIST_INIT(&job->inflight_reqs);
+ int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
      qemu_co_rwlock_init(&job->flush_rwlock);
 -    nb_clusters = DIV_ROUND_UP(job->common.len, job->cluster_size);
 +    nb_clusters = DIV_ROUND_UP(job->len, job->cluster_size);
 +    block_job_progress_set_remaining(&job->common, job->len);
 +
      job->copy_bitmap = hbitmap_alloc(nb_clusters, 0);
      if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
          backup_incremental_init_copy_bitmap(job);
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn backup_run(void *opaque)
          ret = backup_run_incremental(job);
      } else {
          /* Both FULL and TOP SYNC_MODE's require copying.. */
 -        for (offset = 0; offset < job->common.len;
 +        for (offset = 0; offset < job->len;
               offset += job->cluster_size) {
              bool error_is_read;
              int alloced = 0;
@@ -XXX,XX +XXX,XX @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
          goto error;
      }
 -    /* job->common.len is fixed, so we can't allow resize */
 +    /* job->len is fixed, so we can't allow resize */
      job = block_job_create(job_id, &backup_job_driver, txn, bs,
                             BLK_PERM_CONSISTENT_READ,
                             BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
@@ -XXX,XX +XXX,XX @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
      /* Required permissions are already taken with target's blk_new() */
      block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL,
                         &error_abort);
 -    job->common.len = len;
 +    job->len = len;
      return &job->common;
 diff --git a/block/commit.c b/block/commit.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/commit.c
 +++ b/block/commit.c
-@@ -XXX,XX +XXX,XX @@ static void coroutine_fn commit_run(void *opaque)
+@@ -XXX,XX +XXX,XX @@ static int coroutine_fn commit_run(Job *job, Error **errp)
-     int64_t n = 0; /* bytes */
+     }
-     void *buf = NULL;
-     int bytes_written = 0;
+     if (base_len < len) {
--    int64_t base_len;
+-        ret = blk_truncate(s->base, len, false, PREALLOC_MODE_OFF, NULL);
-+    int64_t len, base_len;
++        ret = blk_truncate(s->base, len, false, PREALLOC_MODE_OFF, 0, NULL);
 -    ret = s->common.len = blk_getlength(s->top);
 -
 -    if (s->common.len < 0) {
 +    ret = len = blk_getlength(s->top);
 +    if (len < 0) {
          goto out;
      }
 +    block_job_progress_set_remaining(&s->common, len);
      ret = base_len = blk_getlength(s->base);
      if (base_len < 0) {
          goto out;
      }
 -    if (base_len < s->common.len) {
 -        ret = blk_truncate(s->base, s->common.len, PREALLOC_MODE_OFF, NULL);
 +    if (base_len < len) {
 +        ret = blk_truncate(s->base, len, PREALLOC_MODE_OFF, NULL);
          if (ret) {
              goto out;
          }
-@@ -XXX,XX +XXX,XX @@ static void coroutine_fn commit_run(void *opaque)
+@@ -XXX,XX +XXX,XX @@ int bdrv_commit(BlockDriverState *bs)
+      * grow the backing file image if possible.  If not possible,
-     buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);
+      * we must return an error */
+     if (length > backing_length) {
--    for (offset = 0; offset < s->common.len; offset += n) {
+-        ret = blk_truncate(backing, length, false, PREALLOC_MODE_OFF,
-+    for (offset = 0; offset < len; offset += n) {
++        ret = blk_truncate(backing, length, false, PREALLOC_MODE_OFF, 0,
-         bool copy;
+                            &local_err);
+         if (ret < 0) {
-         /* Note that even when no rate limit is applied we need to yield
+             error_report_err(local_err);
-@@ -XXX,XX +XXX,XX @@ static void coroutine_fn commit_run(void *opaque)
+diff --git a/block/crypto.c b/block/crypto.c
-             }
+index XXXXXXX..XXXXXXX 100644
-         }
+--- a/block/crypto.c
-         /* Publish progress */
++++ b/block/crypto.c
--        s->common.offset += n;
+@@ -XXX,XX +XXX,XX @@ static ssize_t block_crypto_init_func(QCryptoBlock *block,
-+        block_job_progress_update(&s->common, n);
+      * which will be used by the crypto header
+      */
-         if (copy && s->common.speed) {
+     return blk_truncate(data->blk, data->size + headerlen, false,
-             delay_ns = ratelimit_calculate_delay(&s->limit, n);
+-                        data->prealloc, errp);
 +                        data->prealloc, 0, errp);
  }
 diff --git a/block/mirror.c b/block/mirror.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/mirror.c
 +++ b/block/mirror.c
-@@ -XXX,XX +XXX,XX @@ static void mirror_iteration_done(MirrorOp *op, int ret)
+@@ -XXX,XX +XXX,XX @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
-             bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
-         }
+         if (s->bdev_length > base_length) {
-         if (!s->initial_zeroing_ongoing) {
+             ret = blk_truncate(s->target, s->bdev_length, false,
--            s->common.offset += op->bytes;
+-                               PREALLOC_MODE_OFF, NULL);
-+            block_job_progress_update(&s->common, op->bytes);
++                               PREALLOC_MODE_OFF, 0, NULL);
-         }
+             if (ret < 0) {
-     }
+                 goto immediate_exit;
-     qemu_iovec_destroy(&op->qiov);
+             }
-@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_run(void *opaque)
+diff --git a/block/qcow2.c b/block/qcow2.c
-         block_job_pause_point(&s->common);
+index XXXXXXX..XXXXXXX 100644
+--- a/block/qcow2.c
-         cnt = bdrv_get_dirty_count(s->dirty_bitmap);
++++ b/block/qcow2.c
--        /* s->common.offset contains the number of bytes already processed so
+@@ -XXX,XX +XXX,XX @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
--         * far, cnt is the number of dirty bytes remaining and
--         * s->bytes_in_flight is the number of bytes currently being
+     /* Okay, now that we have a valid image, let's give it the right size */
--         * processed; together those are the current total operation length */
+     ret = blk_truncate(blk, qcow2_opts->size, false, qcow2_opts->preallocation,
--        s->common.len = s->common.offset + s->bytes_in_flight + cnt;
+-                       errp);
-+        /* cnt is the number of dirty bytes remaining and s->bytes_in_flight is
++                       0, errp);
-+         * the number of bytes currently being processed; together those are
+     if (ret < 0) {
-+         * the current remaining operation length */
+         error_prepend(errp, "Could not resize image: ");
 +        block_job_progress_set_remaining(&s->common, s->bytes_in_flight + cnt);
          /* Note that even when no rate limit is applied we need to yield
           * periodically with no pending I/O so that bdrv_drain_all() returns.
 diff --git a/block/stream.c b/block/stream.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/stream.c
 +++ b/block/stream.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
      BlockBackend *blk = s->common.blk;
      BlockDriverState *bs = blk_bs(blk);
      BlockDriverState *base = s->base;
 +    int64_t len;
      int64_t offset = 0;
      uint64_t delay_ns = 0;
      int error = 0;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
          goto out;
-     }
+@@ -XXX,XX +XXX,XX @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
+          * Amending image options should ensure that the image has
--    s->common.len = bdrv_getlength(bs);
+          * exactly the given new values, so pass exact=true here.
--    if (s->common.len < 0) {
+          */
--        ret = s->common.len;
+-        ret = blk_truncate(blk, new_size, true, PREALLOC_MODE_OFF, errp);
-+    len = bdrv_getlength(bs);
++        ret = blk_truncate(blk, new_size, true, PREALLOC_MODE_OFF, 0, errp);
-+    if (len < 0) {
+         blk_unref(blk);
-+        ret = len;
+         if (ret < 0) {
              return ret;
 diff --git a/block/qed.c b/block/qed.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qed.c
 +++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_qed_co_create(BlockdevCreateOptions *opts,
       * The QED format associates file length with allocation status,
       * so a new file (which is empty) must have a length of 0.
       */
 -    ret = blk_truncate(blk, 0, true, PREALLOC_MODE_OFF, errp);
 +    ret = blk_truncate(blk, 0, true, PREALLOC_MODE_OFF, 0, errp);
      if (ret < 0) {
          goto out;
      }
-+    block_job_progress_set_remaining(&s->common, len);
+diff --git a/block/vdi.c b/block/vdi.c
+index XXXXXXX..XXXXXXX 100644
-     buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE);
+--- a/block/vdi.c
++++ b/block/vdi.c
-@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
+@@ -XXX,XX +XXX,XX @@ static int coroutine_fn vdi_co_do_create(BlockdevCreateOptions *create_options,
-         bdrv_enable_copy_on_read(bs);
-     }
+     if (image_type == VDI_TYPE_STATIC) {
+         ret = blk_truncate(blk, offset + blocks * block_size, false,
--    for ( ; offset < s->common.len; offset += n) {
+-                           PREALLOC_MODE_OFF, errp);
-+    for ( ; offset < len; offset += n) {
++                           PREALLOC_MODE_OFF, 0, errp);
-         bool copy;
+         if (ret < 0) {
+             error_prepend(errp, "Failed to statically allocate file");
-         /* Note that even when no rate limit is applied we need to yield
+             goto exit;
-@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
+diff --git a/block/vhdx.c b/block/vhdx.c
+index XXXXXXX..XXXXXXX 100644
-             /* Finish early if end of backing file has been reached */
+--- a/block/vhdx.c
-             if (ret == 0 && n == 0) {
++++ b/block/vhdx.c
--                n = s->common.len - offset;
+@@ -XXX,XX +XXX,XX @@ static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s,
-+                n = len - offset;
+         /* All zeroes, so we can just extend the file - the end of the BAT
-             }
+          * is the furthest thing we have written yet */
+         ret = blk_truncate(blk, data_file_offset, false, PREALLOC_MODE_OFF,
-             copy = (ret == 1);
+-                           errp);
-@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
++                           0, errp);
-         ret = 0;
+         if (ret < 0) {
+             goto exit;
-         /* Publish progress */
+         }
--        s->common.offset += n;
+     } else if (type == VHDX_TYPE_FIXED) {
-+        block_job_progress_update(&s->common, n);
+         ret = blk_truncate(blk, data_file_offset + image_size, false,
-         if (copy && s->common.speed) {
+-                           PREALLOC_MODE_OFF, errp);
-             delay_ns = ratelimit_calculate_delay(&s->limit, n);
++                           PREALLOC_MODE_OFF, 0, errp);
-         } else {
+         if (ret < 0) {
-diff --git a/blockjob.c b/blockjob.c
+             goto exit;
-index XXXXXXX..XXXXXXX 100644
+         }
---- a/blockjob.c
+diff --git a/block/vmdk.c b/block/vmdk.c
-+++ b/blockjob.c
+index XXXXXXX..XXXXXXX 100644
-@@ -XXX,XX +XXX,XX @@ int block_job_complete_sync(BlockJob *job, Error **errp)
+--- a/block/vmdk.c
-     return block_job_finish_sync(job, &block_job_complete, errp);
++++ b/block/vmdk.c
- }
+@@ -XXX,XX +XXX,XX @@ static int vmdk_init_extent(BlockBackend *blk,
+     int gd_buf_size;
-+void block_job_progress_update(BlockJob *job, uint64_t done)
-+{
+     if (flat) {
-+    job->offset += done;
+-        ret = blk_truncate(blk, filesize, false, PREALLOC_MODE_OFF, errp);
-+}
++        ret = blk_truncate(blk, filesize, false, PREALLOC_MODE_OFF, 0, errp);
-+
+         goto exit;
-+void block_job_progress_set_remaining(BlockJob *job, uint64_t remaining)
+     }
-+{
+     magic = cpu_to_be32(VMDK4_MAGIC);
-+    job->len = job->offset + remaining;
+@@ -XXX,XX +XXX,XX @@ static int vmdk_init_extent(BlockBackend *blk,
-+}
+     }
-+
- BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
+     ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9, false,
- {
+-                       PREALLOC_MODE_OFF, errp);
-     BlockJobInfo *info;
++                       PREALLOC_MODE_OFF, 0, errp);
      if (ret < 0) {
          goto exit;
      }
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn vmdk_co_do_create(int64_t size,
      /* bdrv_pwrite write padding zeros to align to sector, we don't need that
       * for description file */
      if (desc_offset == 0) {
 -        ret = blk_truncate(blk, desc_len, false, PREALLOC_MODE_OFF, errp);
 +        ret = blk_truncate(blk, desc_len, false, PREALLOC_MODE_OFF, 0, errp);
          if (ret < 0) {
              goto exit;
          }
 diff --git a/block/vpc.c b/block/vpc.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/vpc.c
 +++ b/block/vpc.c
@@ -XXX,XX +XXX,XX @@ static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
      /* Add footer to total size */
      total_size += HEADER_SIZE;
 -    ret = blk_truncate(blk, total_size, false, PREALLOC_MODE_OFF, errp);
 +    ret = blk_truncate(blk, total_size, false, PREALLOC_MODE_OFF, 0, errp);
      if (ret < 0) {
          return ret;
      }
 diff --git a/blockdev.c b/blockdev.c
 index XXXXXXX..XXXXXXX 100644
 --- a/blockdev.c
 +++ b/blockdev.c
@@ -XXX,XX +XXX,XX @@ void qmp_block_resize(bool has_device, const char *device,
      }
      bdrv_drained_begin(bs);
 -    ret = blk_truncate(blk, size, false, PREALLOC_MODE_OFF, errp);
 +    ret = blk_truncate(blk, size, false, PREALLOC_MODE_OFF, 0, errp);
      bdrv_drained_end(bs);
  out:
 diff --git a/qemu-img.c b/qemu-img.c
 index XXXXXXX..XXXXXXX 100644
 --- a/qemu-img.c
 +++ b/qemu-img.c
@@ -XXX,XX +XXX,XX @@ static int img_resize(int argc, char **argv)
       * resizing, so pass @exact=true.  It is of no use to report
       * success when the image has not actually been resized.
       */
 -    ret = blk_truncate(blk, total_size, true, prealloc, &err);
 +    ret = blk_truncate(blk, total_size, true, prealloc, 0, &err);
      if (!ret) {
          qprintf(quiet, "Image resized.\n");
      } else {
 diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
 index XXXXXXX..XXXXXXX 100644
 --- a/qemu-io-cmds.c
 +++ b/qemu-io-cmds.c
@@ -XXX,XX +XXX,XX @@ static int truncate_f(BlockBackend *blk, int argc, char **argv)
       * exact=true.  It is better to err on the "emit more errors" side
       * than to be overly permissive.
       */
 -    ret = blk_truncate(blk, offset, true, PREALLOC_MODE_OFF, &local_err);
 +    ret = blk_truncate(blk, offset, true, PREALLOC_MODE_OFF, 0, &local_err);
      if (ret < 0) {
          error_report_err(local_err);
          return ret;
 --
-.13.6
+.25.3

-[Qemu-devel] [PULL 15/37] blockjob: Introduce block_job_ratelimit_get_delay()
+[PULL 06/15] qcow2: Support BDRV_REQ_ZERO_WRITE for truncate
-This gets us rid of more direct accesses to BlockJob fields from the
+If BDRV_REQ_ZERO_WRITE is set and we're extending the image, calling
-job drivers.
+qcow2_cluster_zeroize() with flags=0 does the right thing: It doesn't
 undo any previous preallocation, but just adds the zero flag to all
 relevant L2 entries. If an external data file is in use, a write_zeroes
 request to the data file is made instead.
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
+Message-Id: <20200424125448.63318-5-kwolf@redhat.com>
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: John Snow <jsnow@redhat.com>
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- include/block/blockjob_int.h |  8 ++++++++
+ block/qcow2-cluster.c |  2 +-
- block/backup.c               | 18 +++++++-----------
+ block/qcow2.c         | 34 ++++++++++++++++++++++++++++++++++
- block/commit.c               |  4 ++--
+files changed, 35 insertions(+), 1 deletion(-)
  block/mirror.c               |  5 +----
  block/stream.c               |  4 ++--
  blockjob.c                   |  9 +++++++++
 files changed, 29 insertions(+), 19 deletions(-)
-diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
+diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/block/blockjob_int.h
+--- a/block/qcow2-cluster.c
-+++ b/include/block/blockjob_int.h
++++ b/block/qcow2-cluster.c
-@@ -XXX,XX +XXX,XX @@ void block_job_sleep_ns(BlockJob *job, int64_t ns);
+@@ -XXX,XX +XXX,XX @@ int qcow2_cluster_zeroize(BlockDriverState *bs, uint64_t offset,
- void block_job_yield(BlockJob *job);
+     /* Caller must pass aligned values, except at image end */
+     assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
- /**
+     assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
-+ * block_job_ratelimit_get_delay:
+-           end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
-+ *
++           end_offset >= bs->total_sectors << BDRV_SECTOR_BITS);
-+ * Calculate and return delay for the next request in ns. See the documentation
-+ * of ratelimit_calculate_delay() for details.
+     /* The zero flag is only supported by version 3 and newer */
-+ */
+     if (s->qcow_version < 3) {
-+int64_t block_job_ratelimit_get_delay(BlockJob *job, uint64_t n);
+diff --git a/block/qcow2.c b/block/qcow2.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow2.c
 +++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
      bs->supported_zero_flags = header.version >= 3 ?
                                 BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK : 0;
 +    bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
      /* Repair image if dirty */
      if (!(flags & (BDRV_O_CHECK | BDRV_O_INACTIVE)) && !bs->read_only &&
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
          g_assert_not_reached();
      }
 +    if ((flags & BDRV_REQ_ZERO_WRITE) && offset > old_length) {
 +        uint64_t zero_start = QEMU_ALIGN_UP(old_length, s->cluster_size);
 +
-+/**
++        /*
-  * block_job_early_fail:
++         * Use zero clusters as much as we can. qcow2_cluster_zeroize()
-  * @bs: The block device.
++         * requires a cluster-aligned start. The end may be unaligned if it is
-  *
++         * at the end of the image (which it is here).
-diff --git a/block/backup.c b/block/backup.c
++         */
-index XXXXXXX..XXXXXXX 100644
++        ret = qcow2_cluster_zeroize(bs, zero_start, offset - zero_start, 0);
---- a/block/backup.c
++        if (ret < 0) {
-+++ b/block/backup.c
++            error_setg_errno(errp, -ret, "Failed to zero out new clusters");
-@@ -XXX,XX +XXX,XX @@ static void backup_complete(BlockJob *job, void *opaque)
++            goto fail;
++        }
  static bool coroutine_fn yield_and_check(BackupBlockJob *job)
  {
 +    uint64_t delay_ns;
 +
-     if (block_job_is_cancelled(&job->common)) {
++        /* Write explicit zeros for the unaligned head */
-         return true;
++        if (zero_start > old_length) {
-     }
++            uint64_t len = zero_start - old_length;
++            uint8_t *buf = qemu_blockalign0(bs, len);
--    /* we need to yield so that bdrv_drain_all() returns.
++            QEMUIOVector qiov;
--     * (without, VM does not reboot)
++            qemu_iovec_init_buf(&qiov, buf, len);
--     */
++
--    if (job->common.speed) {
++            qemu_co_mutex_unlock(&s->lock);
--        uint64_t delay_ns = ratelimit_calculate_delay(&job->common.limit,
++            ret = qcow2_co_pwritev_part(bs, old_length, len, &qiov, 0, 0);
--                                                      job->bytes_read);
++            qemu_co_mutex_lock(&s->lock);
--        job->bytes_read = 0;
++
--        block_job_sleep_ns(&job->common, delay_ns);
++            qemu_vfree(buf);
--    } else {
++            if (ret < 0) {
--        block_job_sleep_ns(&job->common, 0);
++                error_setg_errno(errp, -ret, "Failed to zero out the new area");
--    }
++                goto fail;
-+    /* We need to yield even for delay_ns = 0 so that bdrv_drain_all() can
++            }
-+     * return. Without a yield, the VM would not reboot. */
++        }
 +    delay_ns = block_job_ratelimit_get_delay(&job->common, job->bytes_read);
 +    job->bytes_read = 0;
 +    block_job_sleep_ns(&job->common, delay_ns);
      if (block_job_is_cancelled(&job->common)) {
          return true;
 diff --git a/block/commit.c b/block/commit.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/commit.c
 +++ b/block/commit.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn commit_run(void *opaque)
          /* Publish progress */
          block_job_progress_update(&s->common, n);
 -        if (copy && s->common.speed) {
 -            delay_ns = ratelimit_calculate_delay(&s->common.limit, n);
 +        if (copy) {
 +            delay_ns = block_job_ratelimit_get_delay(&s->common, n);
          } else {
              delay_ns = 0;
          }
 diff --git a/block/mirror.c b/block/mirror.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/mirror.c
 +++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
          assert(io_bytes);
          offset += io_bytes;
          nb_chunks -= DIV_ROUND_UP(io_bytes, s->granularity);
 -        if (s->common.speed) {
 -            delay_ns = ratelimit_calculate_delay(&s->common.limit,
 -                                                 io_bytes_acct);
 -        }
 +        delay_ns = block_job_ratelimit_get_delay(&s->common, io_bytes_acct);
      }
      return delay_ns;
  }
 diff --git a/block/stream.c b/block/stream.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/stream.c
 +++ b/block/stream.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
          /* Publish progress */
          block_job_progress_update(&s->common, n);
 -        if (copy && s->common.speed) {
 -            delay_ns = ratelimit_calculate_delay(&s->common.limit, n);
 +        if (copy) {
 +            delay_ns = block_job_ratelimit_get_delay(&s->common, n);
          } else {
              delay_ns = 0;
          }
 diff --git a/blockjob.c b/blockjob.c
 index XXXXXXX..XXXXXXX 100644
 --- a/blockjob.c
 +++ b/blockjob.c
@@ -XXX,XX +XXX,XX @@ void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
      block_job_enter_cond(job, block_job_timer_pending);
  }
 +int64_t block_job_ratelimit_get_delay(BlockJob *job, uint64_t n)
 +{
 +    if (!job->speed) {
 +        return 0;
 +    }
 +
-+    return ratelimit_calculate_delay(&job->limit, n);
+     if (prealloc != PREALLOC_MODE_OFF) {
-+}
+         /* Flush metadata before actually changing the image size */
-+
+         ret = qcow2_write_caches(bs);
  void block_job_complete(BlockJob *job, Error **errp)
  {
      /* Should not be reachable via external interface for internal jobs */
 --
-.13.6
+.25.3

-[Qemu-devel] [PULL 29/37] block: Support BDRV_REQ_WRITE_UNCHANGED in filters
+[PULL 07/15] raw-format: Support BDRV_REQ_ZERO_WRITE for truncate
-From: Max Reitz <mreitz@redhat.com>
+The raw format driver can simply forward the flag and let its bs->file
 child take care of actually providing the zeros.
-Update the rest of the filter drivers to support
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-BDRV_REQ_WRITE_UNCHANGED.  They already forward write request flags to
+Reviewed-by: Max Reitz <mreitz@redhat.com>
-their children, so we just have to announce support for it.
+Reviewed-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 Message-Id: <20200424125448.63318-6-kwolf@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
  block/raw-format.c | 4 +++-
 file changed, 3 insertions(+), 1 deletion(-)
-This patch does not cover the replication driver because that currently
-does not support flags at all, and because it just grabs the WRITE
-permission for its children when it can, so we should be fine just
-submitting the incoming WRITE_UNCHANGED requests as normal writes.
-It also does not cover format drivers for similar reasons.  They all use
-bdrv_format_default_perms() as their .bdrv_child_perm() implementation
-so they just always grab the WRITE permission for their file children
-whenever possible.  In addition, it often would be difficult to
-ascertain whether incoming unchanging writes end up as unchanging writes
-in their files.  So we just leave them as normal potentially changing
-writes.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Reviewed-by: Alberto Garcia <berto@igalia.com>
-Message-id: 20180421132929.21610-7-mreitz@redhat.com
-Reviewed-by: Kevin Wolf <kwolf@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/blkdebug.c     |  9 +++++----
- block/blkreplay.c    |  3 +++
- block/blkverify.c    |  3 +++
- block/copy-on-read.c | 10 ++++++----
- block/mirror.c       |  2 ++
- block/raw-format.c   |  9 +++++----
- block/throttle.c     |  6 ++++--
-files changed, 28 insertions(+), 14 deletions(-)
-diff --git a/block/blkdebug.c b/block/blkdebug.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/blkdebug.c
-+++ b/block/blkdebug.c
-@@ -XXX,XX +XXX,XX @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
-         goto out;
-     }
--    bs->supported_write_flags = BDRV_REQ_FUA &
--        bs->file->bs->supported_write_flags;
--    bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
--        bs->file->bs->supported_zero_flags;
-+    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
-+        (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
-+    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
-+        ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
-+            bs->file->bs->supported_zero_flags);
-     ret = -EINVAL;
-     /* Set alignment overrides */
-diff --git a/block/blkreplay.c b/block/blkreplay.c
-index XXXXXXX..XXXXXXX 100755
---- a/block/blkreplay.c
-+++ b/block/blkreplay.c
-@@ -XXX,XX +XXX,XX @@ static int blkreplay_open(BlockDriverState *bs, QDict *options, int flags,
-         goto fail;
-     }
-+    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED;
-+    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED;
-+
-     ret = 0;
- fail:
-     return ret;
-diff --git a/block/blkverify.c b/block/blkverify.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/blkverify.c
-+++ b/block/blkverify.c
-@@ -XXX,XX +XXX,XX @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags,
-         goto fail;
-     }
-+    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED;
-+    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED;
-+
-     ret = 0;
- fail:
-     qemu_opts_del(opts);
-diff --git a/block/copy-on-read.c b/block/copy-on-read.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/copy-on-read.c
-+++ b/block/copy-on-read.c
-@@ -XXX,XX +XXX,XX @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags,
-         return -EINVAL;
-     }
--    bs->supported_write_flags = BDRV_REQ_FUA &
--                                    bs->file->bs->supported_write_flags;
-+    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
-+                                (BDRV_REQ_FUA &
-+                                    bs->file->bs->supported_write_flags);
--    bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
--                                    bs->file->bs->supported_zero_flags;
-+    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
-+                               ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
-+                                    bs->file->bs->supported_zero_flags);
-     return 0;
- }
-diff --git a/block/mirror.c b/block/mirror.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/mirror.c
-+++ b/block/mirror.c
-@@ -XXX,XX +XXX,XX @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
-         mirror_top_bs->implicit = true;
-     }
-     mirror_top_bs->total_sectors = bs->total_sectors;
-+    mirror_top_bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED;
-+    mirror_top_bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED;
-     bdrv_set_aio_context(mirror_top_bs, bdrv_get_aio_context(bs));
-     /* bdrv_append takes ownership of the mirror_top_bs reference, need to keep
 diff --git a/block/raw-format.c b/block/raw-format.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/raw-format.c
 +++ b/block/raw-format.c
+@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
+     s->size = offset;
+     offset += s->offset;
+-    return bdrv_co_truncate(bs->file, offset, exact, prealloc, 0, errp);
++    return bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
+ }
+ static void raw_eject(BlockDriverState *bs, bool eject_flag)
 @@ -XXX,XX +XXX,XX @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
-     }
+     bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
+         ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
-     bs->sg = bs->file->bs->sg;
+             bs->file->bs->supported_zero_flags);
--    bs->supported_write_flags = BDRV_REQ_FUA &
++    bs->supported_truncate_flags = bs->file->bs->supported_truncate_flags &
--        bs->file->bs->supported_write_flags;
++                                   BDRV_REQ_ZERO_WRITE;
 -    bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
 -        bs->file->bs->supported_zero_flags;
 +    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
 +        (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
 +    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
 +        ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
 +            bs->file->bs->supported_zero_flags);
      if (bs->probed && !bdrv_is_read_only(bs)) {
-         fprintf(stderr,
+         bdrv_refresh_filename(bs->file->bs);
 diff --git a/block/throttle.c b/block/throttle.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/throttle.c
 +++ b/block/throttle.c
@@ -XXX,XX +XXX,XX @@ static int throttle_open(BlockDriverState *bs, QDict *options,
      if (!bs->file) {
          return -EINVAL;
      }
 -    bs->supported_write_flags = bs->file->bs->supported_write_flags;
 -    bs->supported_zero_flags = bs->file->bs->supported_zero_flags;
 +    bs->supported_write_flags = bs->file->bs->supported_write_flags |
 +                                BDRV_REQ_WRITE_UNCHANGED;
 +    bs->supported_zero_flags = bs->file->bs->supported_zero_flags |
 +                               BDRV_REQ_WRITE_UNCHANGED;
      return throttle_configure_tgm(bs, tgm, options, errp);
  }
 --
-.13.6
+.25.3

-[Qemu-devel] [PULL 10/37] blockjob: expose error string via query
+[PULL 08/15] file-posix: Support BDRV_REQ_ZERO_WRITE for truncate
-From: John Snow <jsnow@redhat.com>
+For regular files, we always get BDRV_REQ_ZERO_WRITE behaviour from the
 OS, so we can advertise the flag and just ignore it.
-When we've reached the concluded state, we need to expose the error
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-state if applicable. Add the new field.
+Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 This should be sufficient for determining if a job completed
 successfully or not after concluding; if we want to discriminate
 based on how it failed more mechanically, we can always add an
 explicit return code enumeration later.
 I didn't bother to make it only show up if we are in the concluded
 state; I don't think it's necessary.
 Cc: qemu-stable@nongnu.org
 Signed-off-by: John Snow <jsnow@redhat.com>
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Alberto Garcia <berto@igalia.com>
+Reviewed-by: Max Reitz <mreitz@redhat.com>
+Message-Id: <20200424125448.63318-7-kwolf@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- qapi/block-core.json | 6 +++++-
+ block/file-posix.c | 4 ++++
- blockjob.c           | 2 ++
+file changed, 4 insertions(+)
 files changed, 7 insertions(+), 1 deletion(-)
-diff --git a/qapi/block-core.json b/qapi/block-core.json
+diff --git a/block/file-posix.c b/block/file-posix.c
 index XXXXXXX..XXXXXXX 100644
---- a/qapi/block-core.json
+--- a/block/file-posix.c
-+++ b/qapi/block-core.json
++++ b/block/file-posix.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
- # @auto-dismiss: Job will dismiss itself when CONCLUDED, moving to the NULL
+ #endif
- #                state and disappearing from the query list. (since 2.12)
- #
+     bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
-+# @error: Error information if the job did not complete successfully.
++    if (S_ISREG(st.st_mode)) {
-+#         Not set if the job completed successfully. (since 2.12.1)
++        /* When extending regular files, we get zeros from the OS */
-+#
++        bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
- # Since: 1.1
++    }
- ##
+     ret = 0;
- { 'struct': 'BlockJobInfo',
+ fail:
-@@ -XXX,XX +XXX,XX @@
+     if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
             'offset': 'int', 'busy': 'bool', 'paused': 'bool', 'speed': 'int',
             'io-status': 'BlockDeviceIoStatus', 'ready': 'bool',
             'status': 'BlockJobStatus',
 -           'auto-finalize': 'bool', 'auto-dismiss': 'bool' } }
 +           'auto-finalize': 'bool', 'auto-dismiss': 'bool',
 +           '*error': 'str' } }
  ##
  # @query-block-jobs:
 diff --git a/blockjob.c b/blockjob.c
 index XXXXXXX..XXXXXXX 100644
 --- a/blockjob.c
 +++ b/blockjob.c
@@ -XXX,XX +XXX,XX @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
      info->status    = job->status;
      info->auto_finalize = job->auto_finalize;
      info->auto_dismiss  = job->auto_dismiss;
 +    info->has_error = job->ret != 0;
 +    info->error     = job->ret ? g_strdup(strerror(-job->ret)) : NULL;
      return info;
  }
 --
-.13.6
+.25.3

-[Qemu-devel] [PULL 02/37] block: Support byte-based aio callbacks
+[PULL 09/15] block: truncate: Don't make backing file data visible
-From: Eric Blake <eblake@redhat.com>
+When extending the size of an image that has a backing file larger than
 its old size, make sure that the backing file data doesn't become
 visible in the guest, but the added area is properly zeroed out.
-We are gradually moving away from sector-based interfaces, towards
+Consider the following scenario where the overlay is shorter than its
-byte-based.  Add new byte-based aio callbacks for read and write,
+backing file:
 to match the fact that bdrv_aio_pdiscard is already byte-based.
-Ideally, drivers should be converted to use coroutine callbacks
+    base.qcow2:     AAAAAAAA
-rather than aio; but that is not quite as trivial (and if we were
+    overlay.qcow2:  BBBB
 to do that conversion, the null-aio driver would disappear), so for
 the short term, converting the signature but keeping things with
 aio is easier.  However, we CAN declare that a driver that uses
 the byte-based aio interfaces now defaults to byte-based
 operations, and must explicitly provide a refresh_limits override
 to stick with larger alignments (making the alignment issues more
 obvious directly in the drivers touched in the next few patches).
-Once all drivers are converted, the sector-based aio callbacks will
+When resizing (extending) overlay.qcow2, the new blocks should not stay
-be removed; in the meantime, a FIXME comment is added due to a
+unallocated and make the additional As from base.qcow2 visible like
-slight inefficiency that will be touched up as part of that later
+before this patch, but zeros should be read.
 cleanup.
-Simplify some instances of 'bs->drv' into 'drv' while touching this,
+A similar case happens with the various variants of a commit job when an
-since the local variable already exists to reduce typing.
+intermediate file is short (- for unallocated):
-Signed-off-by: Eric Blake <eblake@redhat.com>
+    base.qcow2:     A-A-AAAA
     mid.qcow2:      BB-B
     top.qcow2:      C--C--C-
 After commit top.qcow2 to mid.qcow2, the following happens:
     mid.qcow2:      CB-C00C0 (correct result)
     mid.qcow2:      CB-C--C- (before this fix)
 Without the fix, blocks that previously read as zeros on top.qcow2
 suddenly turn into A.
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 Message-Id: <20200424125448.63318-8-kwolf@redhat.com>
 Reviewed-by: Max Reitz <mreitz@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- include/block/block_int.h |  6 ++++++
+ block/io.c | 25 +++++++++++++++++++++++++
- block/io.c                | 38 +++++++++++++++++++++++++++++---------
+file changed, 25 insertions(+)
 files changed, 35 insertions(+), 9 deletions(-)
-diff --git a/include/block/block_int.h b/include/block/block_int.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/block/block_int.h
-+++ b/include/block/block_int.h
-@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
-     BlockAIOCB *(*bdrv_aio_readv)(BlockDriverState *bs,
-         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-         BlockCompletionFunc *cb, void *opaque);
-+    BlockAIOCB *(*bdrv_aio_preadv)(BlockDriverState *bs,
-+        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags,
-+        BlockCompletionFunc *cb, void *opaque);
-     BlockAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs,
-         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-         BlockCompletionFunc *cb, void *opaque);
-+    BlockAIOCB *(*bdrv_aio_pwritev)(BlockDriverState *bs,
-+        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags,
-+        BlockCompletionFunc *cb, void *opaque);
-     BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
-         BlockCompletionFunc *cb, void *opaque);
-     BlockAIOCB *(*bdrv_aio_pdiscard)(BlockDriverState *bs,
 diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/io.c
 +++ b/block/io.c
-@@ -XXX,XX +XXX,XX @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
+@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
          goto out;
      }
-     /* Default alignment based on whether driver has byte interface */
++    /*
--    bs->bl.request_alignment = drv->bdrv_co_preadv ? 1 : 512;
++     * If the image has a backing file that is large enough that it would
-+    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
++     * provide data for the new area, we cannot leave it unallocated because
-+                                drv->bdrv_aio_preadv) ? 1 : 512;
++     * then the backing file content would become visible. Instead, zero-fill
++     * the new area.
-     /* Take some limits from the children as a default */
++     *
-     if (bs->file) {
++     * Note that if the image has a backing file, but was opened without the
-@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
++     * backing file, taking care of keeping things consistent with that backing
-         return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
++     * file is the user's responsibility.
-     }
++     */
++    if (new_bytes && bs->backing) {
-+    /* FIXME - no need to calculate these if .bdrv_aio_preadv exists */
++        int64_t backing_len;
-     sector_num = offset >> BDRV_SECTOR_BITS;
++
-     nb_sectors = bytes >> BDRV_SECTOR_BITS;
++        backing_len = bdrv_getlength(backing_bs(bs));
++        if (backing_len < 0) {
--    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
++            ret = backing_len;
--    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
++            error_setg_errno(errp, -ret, "Could not get backing file size");
--    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
++            goto out;
-+    if (!drv->bdrv_aio_preadv) {
++        }
-+        assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
++
-+        assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
++        if (backing_len > old_size) {
-+        assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
++            flags |= BDRV_REQ_ZERO_WRITE;
 +        }
 +    }
++
-     if (drv->bdrv_co_readv) {
+     if (drv->bdrv_co_truncate) {
-         return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+         if (flags & ~bs->supported_truncate_flags) {
-@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
+             error_setg(errp, "Block driver does not support requested flags");
              .coroutine = qemu_coroutine_self(),
          };
 -        acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
 +        if (drv->bdrv_aio_preadv) {
 +            acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
 +                                       bdrv_co_io_em_complete, &co);
 +        } else {
 +            acb = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
 +        }
          if (acb == NULL) {
              return -EIO;
          } else {
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
          goto emulate_flags;
      }
 +    /* FIXME - no need to calculate these if .bdrv_aio_pwritev exists */
      sector_num = offset >> BDRV_SECTOR_BITS;
      nb_sectors = bytes >> BDRV_SECTOR_BITS;
 -    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
 -    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 -    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
 +    if (!drv->bdrv_aio_pwritev) {
 +        assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
 +        assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 +        assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
 +    }
      if (drv->bdrv_co_writev_flags) {
          ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
              .coroutine = qemu_coroutine_self(),
          };
 -        acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
 +        if (drv->bdrv_aio_pwritev) {
 +            acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
 +                                        flags & bs->supported_write_flags,
 +                                        bdrv_co_io_em_complete, &co);
 +            flags &= ~bs->supported_write_flags;
 +        } else {
 +            assert(!bs->supported_write_flags);
 +            acb = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
                                         bdrv_co_io_em_complete, &co);
 +        }
          if (acb == NULL) {
              ret = -EIO;
          } else {
 --
-.13.6
+.25.3

-[Qemu-devel] [PULL 03/37] file-win32: Switch to byte-based callbacks
+Deleted patch
-From: Eric Blake <eblake@redhat.com>
-We are gradually moving away from sector-based interfaces, towards
-byte-based.  Make the change for the last few sector-based callbacks
-in the file-win32 driver.
-Note that the driver was already using byte-based calls for
-performing actual I/O, so this just gets rid of a round trip
-of scaling; however, as I don't know if Windows is tolerant of
-non-sector AIO operations, I went with the conservative approach
-of modifying .bdrv_refresh_limits to override the block layer
-defaults back to the pre-patch value of 512.
-Signed-off-by: Eric Blake <eblake@redhat.com>
-Signed-off-by: Kevin Wolf <kwolf@redhat.com>
----
- include/block/raw-aio.h |  2 +-
- block/file-win32.c      | 47 +++++++++++++++++++++++++++++------------------
- block/win32-aio.c       |  5 ++---
-files changed, 32 insertions(+), 22 deletions(-)
-diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/block/raw-aio.h
-+++ b/include/block/raw-aio.h
-@@ -XXX,XX +XXX,XX @@ void win32_aio_cleanup(QEMUWin32AIOState *aio);
- int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile);
- BlockAIOCB *win32_aio_submit(BlockDriverState *bs,
-         QEMUWin32AIOState *aio, HANDLE hfile,
--        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-+        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov,
-         BlockCompletionFunc *cb, void *opaque, int type);
- void win32_aio_detach_aio_context(QEMUWin32AIOState *aio,
-                                   AioContext *old_context);
-diff --git a/block/file-win32.c b/block/file-win32.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/file-win32.c
-+++ b/block/file-win32.c
-@@ -XXX,XX +XXX,XX @@ static void raw_probe_alignment(BlockDriverState *bs, Error **errp)
-                          &dg.Geometry.BytesPerSector,
-                          &freeClusters, &totalClusters);
-         bs->bl.request_alignment = dg.Geometry.BytesPerSector;
-+        return;
-     }
-+
-+    /* XXX Does Windows support AIO on less than 512-byte alignment? */
-+    bs->bl.request_alignment = 512;
- }
- static void raw_parse_flags(int flags, bool use_aio, int *access_flags,
-@@ -XXX,XX +XXX,XX @@ fail:
-     return ret;
- }
--static BlockAIOCB *raw_aio_readv(BlockDriverState *bs,
--                         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
--                         BlockCompletionFunc *cb, void *opaque)
-+static BlockAIOCB *raw_aio_preadv(BlockDriverState *bs,
-+                                  uint64_t offset, uint64_t bytes,
-+                                  QEMUIOVector *qiov, int flags,
-+                                  BlockCompletionFunc *cb, void *opaque)
- {
-     BDRVRawState *s = bs->opaque;
-     if (s->aio) {
--        return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
--                                nb_sectors, cb, opaque, QEMU_AIO_READ);
-+        return win32_aio_submit(bs, s->aio, s->hfile, offset, bytes, qiov,
-+                                cb, opaque, QEMU_AIO_READ);
-     } else {
--        return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov,
--                           nb_sectors << BDRV_SECTOR_BITS,
-+        return paio_submit(bs, s->hfile, offset, qiov, bytes,
-                            cb, opaque, QEMU_AIO_READ);
-     }
- }
--static BlockAIOCB *raw_aio_writev(BlockDriverState *bs,
--                          int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
--                          BlockCompletionFunc *cb, void *opaque)
-+static BlockAIOCB *raw_aio_pwritev(BlockDriverState *bs,
-+                                   uint64_t offset, uint64_t bytes,
-+                                   QEMUIOVector *qiov, int flags,
-+                                   BlockCompletionFunc *cb, void *opaque)
- {
-     BDRVRawState *s = bs->opaque;
-     if (s->aio) {
--        return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
--                                nb_sectors, cb, opaque, QEMU_AIO_WRITE);
-+        return win32_aio_submit(bs, s->aio, s->hfile, offset, bytes, qiov,
-+                                cb, opaque, QEMU_AIO_WRITE);
-     } else {
--        return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov,
--                           nb_sectors << BDRV_SECTOR_BITS,
-+        return paio_submit(bs, s->hfile, offset, qiov, bytes,
-                            cb, opaque, QEMU_AIO_WRITE);
-     }
- }
-@@ -XXX,XX +XXX,XX @@ BlockDriver bdrv_file = {
-     .bdrv_co_create_opts = raw_co_create_opts,
-     .bdrv_has_zero_init = bdrv_has_zero_init_1,
--    .bdrv_aio_readv     = raw_aio_readv,
--    .bdrv_aio_writev    = raw_aio_writev,
-+    .bdrv_aio_preadv    = raw_aio_preadv,
-+    .bdrv_aio_pwritev   = raw_aio_pwritev,
-     .bdrv_aio_flush     = raw_aio_flush,
-     .bdrv_truncate    = raw_truncate,
-@@ -XXX,XX +XXX,XX @@ static void hdev_parse_filename(const char *filename, QDict *options,
-     bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
- }
-+static void hdev_refresh_limits(BlockDriverState *bs, Error **errp)
-+{
-+    /* XXX Does Windows support AIO on less than 512-byte alignment? */
-+    bs->bl.request_alignment = 512;
-+}
-+
- static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
-                      Error **errp)
- {
-@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_device = {
-     .bdrv_probe_device    = hdev_probe_device,
-     .bdrv_file_open    = hdev_open,
-     .bdrv_close        = raw_close,
-+    .bdrv_refresh_limits = hdev_refresh_limits,
--    .bdrv_aio_readv     = raw_aio_readv,
--    .bdrv_aio_writev    = raw_aio_writev,
-+    .bdrv_aio_preadv    = raw_aio_preadv,
-+    .bdrv_aio_pwritev   = raw_aio_pwritev,
-     .bdrv_aio_flush     = raw_aio_flush,
-     .bdrv_detach_aio_context = raw_detach_aio_context,
-diff --git a/block/win32-aio.c b/block/win32-aio.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/win32-aio.c
-+++ b/block/win32-aio.c
-@@ -XXX,XX +XXX,XX @@ static const AIOCBInfo win32_aiocb_info = {
- BlockAIOCB *win32_aio_submit(BlockDriverState *bs,
-         QEMUWin32AIOState *aio, HANDLE hfile,
--        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-+        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov,
-         BlockCompletionFunc *cb, void *opaque, int type)
- {
-     struct QEMUWin32AIOCB *waiocb;
--    uint64_t offset = sector_num * 512;
-     DWORD rc;
-     waiocb = qemu_aio_get(&win32_aiocb_info, bs, cb, opaque);
--    waiocb->nbytes = nb_sectors * 512;
-+    waiocb->nbytes = bytes;
-     waiocb->qiov = qiov;
-     waiocb->is_read = (type == QEMU_AIO_READ);
---
-.13.6

-[Qemu-devel] [PULL 04/37] null: Switch to byte-based read/write
+Deleted patch
-From: Eric Blake <eblake@redhat.com>
-We are gradually moving away from sector-based interfaces, towards
-byte-based.  Make the change for the last few sector-based callbacks
-in the null-co and null-aio drivers.
-Note that since the null driver does nothing on writes, it trivially
-supports the BDRV_REQ_FUA flag (all writes have already landed to
-the same bit-bucket without needing an extra flush call).  Also, since
-the null driver does just as well with byte-based requests, we can
-now avoid cycles wasted on read-modify-write by taking advantage of
-the block layer now defaulting the alignment to 1 instead of 512.
-Signed-off-by: Eric Blake <eblake@redhat.com>
-Signed-off-by: Kevin Wolf <kwolf@redhat.com>
----
- block/null.c | 45 +++++++++++++++++++++++----------------------
-file changed, 23 insertions(+), 22 deletions(-)
-diff --git a/block/null.c b/block/null.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/null.c
-+++ b/block/null.c
-@@ -XXX,XX +XXX,XX @@ static int null_file_open(BlockDriverState *bs, QDict *options, int flags,
-     }
-     s->read_zeroes = qemu_opt_get_bool(opts, NULL_OPT_ZEROES, false);
-     qemu_opts_del(opts);
-+    bs->supported_write_flags = BDRV_REQ_FUA;
-     return ret;
- }
-@@ -XXX,XX +XXX,XX @@ static coroutine_fn int null_co_common(BlockDriverState *bs)
-     return 0;
- }
--static coroutine_fn int null_co_readv(BlockDriverState *bs,
--                                      int64_t sector_num, int nb_sectors,
--                                      QEMUIOVector *qiov)
-+static coroutine_fn int null_co_preadv(BlockDriverState *bs,
-+                                       uint64_t offset, uint64_t bytes,
-+                                       QEMUIOVector *qiov, int flags)
- {
-     BDRVNullState *s = bs->opaque;
-     if (s->read_zeroes) {
--        qemu_iovec_memset(qiov, 0, 0, nb_sectors * BDRV_SECTOR_SIZE);
-+        qemu_iovec_memset(qiov, 0, 0, bytes);
-     }
-     return null_co_common(bs);
- }
--static coroutine_fn int null_co_writev(BlockDriverState *bs,
--                                       int64_t sector_num, int nb_sectors,
--                                       QEMUIOVector *qiov)
-+static coroutine_fn int null_co_pwritev(BlockDriverState *bs,
-+                                        uint64_t offset, uint64_t bytes,
-+                                        QEMUIOVector *qiov, int flags)
- {
-     return null_co_common(bs);
- }
-@@ -XXX,XX +XXX,XX @@ static inline BlockAIOCB *null_aio_common(BlockDriverState *bs,
-     return &acb->common;
- }
--static BlockAIOCB *null_aio_readv(BlockDriverState *bs,
--                                  int64_t sector_num, QEMUIOVector *qiov,
--                                  int nb_sectors,
--                                  BlockCompletionFunc *cb,
--                                  void *opaque)
-+static BlockAIOCB *null_aio_preadv(BlockDriverState *bs,
-+                                   uint64_t offset, uint64_t bytes,
-+                                   QEMUIOVector *qiov, int flags,
-+                                   BlockCompletionFunc *cb,
-+                                   void *opaque)
- {
-     BDRVNullState *s = bs->opaque;
-     if (s->read_zeroes) {
--        qemu_iovec_memset(qiov, 0, 0, nb_sectors * BDRV_SECTOR_SIZE);
-+        qemu_iovec_memset(qiov, 0, 0, bytes);
-     }
-     return null_aio_common(bs, cb, opaque);
- }
--static BlockAIOCB *null_aio_writev(BlockDriverState *bs,
--                                   int64_t sector_num, QEMUIOVector *qiov,
--                                   int nb_sectors,
--                                   BlockCompletionFunc *cb,
--                                   void *opaque)
-+static BlockAIOCB *null_aio_pwritev(BlockDriverState *bs,
-+                                    uint64_t offset, uint64_t bytes,
-+                                    QEMUIOVector *qiov, int flags,
-+                                    BlockCompletionFunc *cb,
-+                                    void *opaque)
- {
-     return null_aio_common(bs, cb, opaque);
- }
-@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_null_co = {
-     .bdrv_close             = null_close,
-     .bdrv_getlength         = null_getlength,
--    .bdrv_co_readv          = null_co_readv,
--    .bdrv_co_writev         = null_co_writev,
-+    .bdrv_co_preadv         = null_co_preadv,
-+    .bdrv_co_pwritev        = null_co_pwritev,
-     .bdrv_co_flush_to_disk  = null_co_flush,
-     .bdrv_reopen_prepare    = null_reopen_prepare,
-@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_null_aio = {
-     .bdrv_close             = null_close,
-     .bdrv_getlength         = null_getlength,
--    .bdrv_aio_readv         = null_aio_readv,
--    .bdrv_aio_writev        = null_aio_writev,
-+    .bdrv_aio_preadv        = null_aio_preadv,
-+    .bdrv_aio_pwritev       = null_aio_pwritev,
-     .bdrv_aio_flush         = null_aio_flush,
-     .bdrv_reopen_prepare    = null_reopen_prepare,
---
-.13.6

-[Qemu-devel] [PULL 09/37] hmp: Allow using a qdev id in block_set_io_throttle
+[PULL 10/15] iotests: Filter testfiles out in filter_img_info()
-From: Alberto Garcia <berto@igalia.com>
+We want to keep TEST_IMG for the full path of the main test image, but
 filter_testfiles() must be called for other test images before replacing
 other things like the image format because the test directory path could
 contain the format as a substring.
-The QMP version of this command can take a qdev ID since 7a9877a02635,
+Insert a filter_testfiles() call between both.
 but the HMP version is still using the deprecated block device name so
 there's no way to refer to a block device added like this:
-  -blockdev node-name=disk0,driver=qcow2,file.driver=file,file.filename=hd.qcow2
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-  -device virtio-blk-pci,id=virtio-blk-pci0,drive=disk0
+Reviewed-by: Max Reitz <mreitz@redhat.com>
+Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
-This patch works around this problem by using the specified name as a
+Message-Id: <20200424125448.63318-9-kwolf@redhat.com>
 qdev ID if the block device name is not found.
 Signed-off-by: Alberto Garcia <berto@igalia.com>
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- hmp.c           | 14 ++++++++++++--
+ tests/qemu-iotests/iotests.py | 5 +++--
- hmp-commands.hx |  3 ++-
+file changed, 3 insertions(+), 2 deletions(-)
 files changed, 14 insertions(+), 3 deletions(-)
-diff --git a/hmp.c b/hmp.c
+diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
 index XXXXXXX..XXXXXXX 100644
---- a/hmp.c
+--- a/tests/qemu-iotests/iotests.py
-+++ b/hmp.c
++++ b/tests/qemu-iotests/iotests.py
-@@ -XXX,XX +XXX,XX @@ void hmp_change(Monitor *mon, const QDict *qdict)
+@@ -XXX,XX +XXX,XX @@ def filter_img_info(output, filename):
- void hmp_block_set_io_throttle(Monitor *mon, const QDict *qdict)
+     for line in output.split('\n'):
- {
+         if 'disk size' in line or 'actual-size' in line:
-     Error *err = NULL;
+             continue
-+    char *device = (char *) qdict_get_str(qdict, "device");
+-        line = line.replace(filename, 'TEST_IMG') \
-     BlockIOThrottle throttle = {
+-                   .replace(imgfmt, 'IMGFMT')
--        .has_device = true,
++        line = line.replace(filename, 'TEST_IMG')
--        .device = (char *) qdict_get_str(qdict, "device"),
++        line = filter_testfiles(line)
-         .bps = qdict_get_int(qdict, "bps"),
++        line = line.replace(imgfmt, 'IMGFMT')
-         .bps_rd = qdict_get_int(qdict, "bps_rd"),
+         line = re.sub('iters: [0-9]+', 'iters: XXX', line)
-         .bps_wr = qdict_get_int(qdict, "bps_wr"),
+         line = re.sub('uuid: [-a-f0-9]+', 'uuid: XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX', line)
-@@ -XXX,XX +XXX,XX @@ void hmp_block_set_io_throttle(Monitor *mon, const QDict *qdict)
+         line = re.sub('cid: [0-9]+', 'cid: XXXXXXXXXX', line)
          .iops_wr = qdict_get_int(qdict, "iops_wr"),
      };
 +    /* qmp_block_set_io_throttle has separate parameters for the
 +     * (deprecated) block device name and the qdev ID but the HMP
 +     * version has only one, so we must decide which one to pass. */
 +    if (blk_by_name(device)) {
 +        throttle.has_device = true;
 +        throttle.device = device;
 +    } else {
 +        throttle.has_id = true;
 +        throttle.id = device;
 +    }
 +
      qmp_block_set_io_throttle(&throttle, &err);
      hmp_handle_error(mon, &err);
  }
 diff --git a/hmp-commands.hx b/hmp-commands.hx
 index XXXXXXX..XXXXXXX 100644
 --- a/hmp-commands.hx
 +++ b/hmp-commands.hx
@@ -XXX,XX +XXX,XX @@ ETEXI
  STEXI
  @item block_set_io_throttle @var{device} @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr}
  @findex block_set_io_throttle
 -Change I/O throttle limits for a block drive to @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr}
 +Change I/O throttle limits for a block drive to @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr}.
 +@var{device} can be a block device name, a qdev ID or a QOM path.
  ETEXI
      {
 --
-.13.6
+.25.3

-[Qemu-devel] [PULL 32/37] iotests: Add test for COR across nodes
+[PULL 11/15] iotests: Test committing to short backing file
-From: Max Reitz <mreitz@redhat.com>
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 Message-Id: <20200424125448.63318-10-kwolf@redhat.com>
 Reviewed-by: Max Reitz <mreitz@redhat.com>
 Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
  tests/qemu-iotests/274     | 155 +++++++++++++++++++++
  tests/qemu-iotests/274.out | 268 +++++++++++++++++++++++++++++++++++++
  tests/qemu-iotests/group   |   1 +
 files changed, 424 insertions(+)
  create mode 100755 tests/qemu-iotests/274
  create mode 100644 tests/qemu-iotests/274.out
-COR across nodes (that is, you have some filter node between the
+diff --git a/tests/qemu-iotests/274 b/tests/qemu-iotests/274
 actually COR target and the node that performs the COR) cannot reliably
 work together with the permission system when there is no explicit COR
 node that can request the WRITE_UNCHANGED permission for its child.
 This is because COR (currently) sneaks its requests by the usual
 permission checks, so it can work without a WRITE* permission; but if
 there is a filter node in between, that will re-issue the request, which
 then passes through the usual check -- and if nobody has requested a
 WRITE_UNCHANGED permission, that check will fail.
 There is no real direct fix apart from hoping that there is someone who
 has requested that permission; in case of just the qemu-io HMP command
 (and no guest device), however, that is not the case.  The real real fix
 is to implement the copy-on-read flag through an implicitly added COR
 node.  Such a node can request the necessary permissions as shown in
 this test.
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 Message-id: 20180421132929.21610-10-mreitz@redhat.com
 Reviewed-by: Kevin Wolf <kwolf@redhat.com>
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
  tests/qemu-iotests/216     | 115 +++++++++++++++++++++++++++++++++++++++++++++
  tests/qemu-iotests/216.out |  28 +++++++++++
  tests/qemu-iotests/group   |   1 +
 files changed, 144 insertions(+)
  create mode 100755 tests/qemu-iotests/216
  create mode 100644 tests/qemu-iotests/216.out
 diff --git a/tests/qemu-iotests/216 b/tests/qemu-iotests/216
 new file mode 100755
 index XXXXXXX..XXXXXXX
 --- /dev/null
-+++ b/tests/qemu-iotests/216
++++ b/tests/qemu-iotests/274
 @@ -XXX,XX +XXX,XX @@
-+#!/usr/bin/env python
++#!/usr/bin/env python3
 +#
-+# Copy-on-read tests using a COR filter node
++# Copyright (C) 2019 Red Hat, Inc.
 +#
 +# Copyright (C) 2018 Red Hat, Inc.
 +#
 +# This program is free software; you can redistribute it and/or modify
 +# it under the terms of the GNU General Public License as published by
 +# the Free Software Foundation; either version 2 of the License, or
 +# (at your option) any later version.
 ...
 +# GNU General Public License for more details.
 +#
 +# You should have received a copy of the GNU General Public License
 +# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 +#
-+# Creator/Owner: Max Reitz <mreitz@redhat.com>
++# Creator/Owner: Kevin Wolf <kwolf@redhat.com>
 +#
 +# Some tests for short backing files and short overlays
 +
 +import iotests
-+from iotests import log, qemu_img_pipe, qemu_io, filter_qemu_io
++
-+
++iotests.verify_image_format(supported_fmts=['qcow2'])
 +# Need backing file support
 +iotests.verify_image_format(supported_fmts=['qcow2', 'qcow', 'qed', 'vmdk'])
 +iotests.verify_platform(['linux'])
 +
-+log('')
++size_short = 1 * 1024 * 1024
-+log('=== Copy-on-read across nodes ===')
++size_long = 2 * 1024 * 1024
-+log('')
++size_diff = size_long - size_short
 +
-+# The old copy-on-read mechanism without a filter node cannot request
++def create_chain() -> None:
-+# WRITE_UNCHANGED permissions for its child.  Therefore it just tries
++    iotests.qemu_img_log('create', '-f', iotests.imgfmt, base,
-+# to sneak its write by the usual permission system and holds its
++                         str(size_long))
-+# fingers crossed.  However, that sneaking does not work so well when
++    iotests.qemu_img_log('create', '-f', iotests.imgfmt, '-b', base, mid,
-+# there is a filter node in the way: That will receive the write
++                         str(size_short))
-+# request and re-issue a new one to its child, which this time is a
++    iotests.qemu_img_log('create', '-f', iotests.imgfmt, '-b', mid, top,
-+# proper write request that will make the permission system cough --
++                         str(size_long))
-+# unless there is someone at the top (like a guest device) that has
++
-+# requested write permissions.
++    iotests.qemu_io_log('-c', 'write -P 1 0 %d' % size_long, base)
-+#
++
-+# A COR filter node, however, can request the proper permissions for
++def create_vm() -> iotests.VM:
-+# its child and therefore is not hit by this issue.
++    vm = iotests.VM()
-+
++    vm.add_blockdev('file,filename=%s,node-name=base-file' % base)
-+with iotests.FilePath('base.img') as base_img_path, \
++    vm.add_blockdev('%s,file=base-file,node-name=base' % iotests.imgfmt)
-+     iotests.FilePath('top.img') as top_img_path, \
++    vm.add_blockdev('file,filename=%s,node-name=mid-file' % mid)
-+     iotests.VM() as vm:
++    vm.add_blockdev('%s,file=mid-file,node-name=mid,backing=base'
-+
++                    % iotests.imgfmt)
-+    log('--- Setting up images ---')
++    vm.add_drive(top, 'backing=mid,node-name=top')
-+    log('')
++    return vm
 +
-+    qemu_img_pipe('create', '-f', iotests.imgfmt, base_img_path, '64M')
++with iotests.FilePath('base') as base, \
-+
++     iotests.FilePath('mid') as mid, \
-+    log(filter_qemu_io(qemu_io(base_img_path, '-c', 'write -P 1 0M 1M')))
++     iotests.FilePath('top') as top:
 +
-+    qemu_img_pipe('create', '-f', iotests.imgfmt, '-b', base_img_path,
++    iotests.log('== Commit tests ==')
-+                  top_img_path)
++
-+
++    create_chain()
-+    log(filter_qemu_io(qemu_io(top_img_path,  '-c', 'write -P 2 1M 1M')))
++
-+
++    iotests.log('=== Check visible data ===')
-+    log('')
++
-+    log('--- Doing COR ---')
++    iotests.qemu_io_log('-c', 'read -P 1 0 %d' % size_short, top)
-+    log('')
++    iotests.qemu_io_log('-c', 'read -P 0 %d %d' % (size_short, size_diff), top)
 +
-+    # Compare with e.g. the following:
++    iotests.log('=== Checking allocation status ===')
-+    #   vm.add_drive_raw('if=none,node-name=node0,copy-on-read=on,driver=raw,' \
++
-+    #                    'file.driver=%s,file.file.filename=%s' %
++    iotests.qemu_io_log('-c', 'alloc 0 %d' % size_short,
-+    #                       (iotests.imgfmt, top_img_path))
++                        '-c', 'alloc %d %d' % (size_short, size_diff),
-+    # (Remove the blockdev-add instead.)
++                        base)
-+    # ((Not tested here because it hits an assertion in the permission
++
-+    #   system.))
++    iotests.qemu_io_log('-c', 'alloc 0 %d' % size_short,
-+
++                        '-c', 'alloc %d %d' % (size_short, size_diff),
-+    vm.launch()
++                        mid)
 +
-+    log(vm.qmp('blockdev-add',
++    iotests.qemu_io_log('-c', 'alloc 0 %d' % size_short,
-+                    node_name='node0',
++                        '-c', 'alloc %d %d' % (size_short, size_diff),
-+                    driver='copy-on-read',
++                        top)
-+                    file={
++
-+                        'driver': 'raw',
++    iotests.log('=== Checking map ===')
-+                        'file': {
++
-+                            'driver': 'copy-on-read',
++    iotests.qemu_img_log('map', '--output=json', base)
-+                            'file': {
++    iotests.qemu_img_log('map', '--output=human', base)
-+                                'driver': 'raw',
++    iotests.qemu_img_log('map', '--output=json', mid)
-+                                'file': {
++    iotests.qemu_img_log('map', '--output=human', mid)
-+                                    'driver': iotests.imgfmt,
++    iotests.qemu_img_log('map', '--output=json', top)
-+                                    'file': {
++    iotests.qemu_img_log('map', '--output=human', top)
-+                                        'driver': 'file',
++
-+                                        'filename': top_img_path
++    iotests.log('=== Testing qemu-img commit (top -> mid) ===')
-+                                    },
++
-+                                    'backing': {
++    iotests.qemu_img_log('commit', top)
-+                                        'driver': iotests.imgfmt,
++    iotests.img_info_log(mid)
-+                                        'file': {
++    iotests.qemu_io_log('-c', 'read -P 1 0 %d' % size_short, mid)
-+                                            'driver': 'file',
++    iotests.qemu_io_log('-c', 'read -P 0 %d %d' % (size_short, size_diff), mid)
-+                                            'filename': base_img_path
++
-+                                        }
++    iotests.log('=== Testing HMP commit (top -> mid) ===')
-+                                    }
++
-+                                }
++    create_chain()
-+                            }
++    with create_vm() as vm:
-+                        }
++        vm.launch()
-+                    }))
++        vm.qmp_log('human-monitor-command', command_line='commit drive0')
 +
-+    # Trigger COR
++    iotests.img_info_log(mid)
-+    log(vm.qmp('human-monitor-command',
++    iotests.qemu_io_log('-c', 'read -P 1 0 %d' % size_short, mid)
-+               command_line='qemu-io node0 "read 0 64M"'))
++    iotests.qemu_io_log('-c', 'read -P 0 %d %d' % (size_short, size_diff), mid)
 +
-+    vm.shutdown()
++    iotests.log('=== Testing QMP active commit (top -> mid) ===')
 +
-+    log('')
++    create_chain()
-+    log('--- Checking COR result ---')
++    with create_vm() as vm:
-+    log('')
++        vm.launch()
-+
++        vm.qmp_log('block-commit', device='top', base_node='mid',
-+    log(filter_qemu_io(qemu_io(base_img_path, '-c', 'discard 0 64M')))
++                   job_id='job0', auto_dismiss=False)
-+    log(filter_qemu_io(qemu_io(top_img_path,  '-c', 'read -P 1 0M 1M')))
++        vm.run_job('job0', wait=5)
-+    log(filter_qemu_io(qemu_io(top_img_path,  '-c', 'read -P 2 1M 1M')))
++
-diff --git a/tests/qemu-iotests/216.out b/tests/qemu-iotests/216.out
++    iotests.img_info_log(mid)
 +    iotests.qemu_io_log('-c', 'read -P 1 0 %d' % size_short, mid)
 +    iotests.qemu_io_log('-c', 'read -P 0 %d %d' % (size_short, size_diff), mid)
 +
 +
 +    iotests.log('== Resize tests ==')
 +
 +    # Use different sizes for different allocation modes:
 +    #
 +    # We want to have at least one test where 32 bit truncation in the size of
 +    # the overlapping area becomes visible. This is covered by the
 +    # prealloc='off' case (1G to 6G is an overlap of 5G).
 +    #
 +    # However, we can only do this for modes that don't preallocate data
 +    # because otherwise we might run out of space on the test host.
 +    #
 +    # We also want to test some unaligned combinations.
 +    for (prealloc, base_size, top_size_old, top_size_new, off) in [
 +            ('off',       '6G',    '1G',   '8G',   '5G'),
 +            ('metadata', '32G',   '30G',  '33G',  '31G'),
 +            ('falloc',   '10M',    '5M',  '15M',   '9M'),
 +            ('full',     '16M',    '8M',  '12M',  '11M'),
 +            ('off',      '384k', '253k', '512k', '253k'),
 +            ('off',      '400k', '256k', '512k', '336k'),
 +            ('off',      '512k', '256k', '500k', '436k')]:
 +
 +        iotests.log('=== preallocation=%s ===' % prealloc)
 +        iotests.qemu_img_log('create', '-f', iotests.imgfmt, base, base_size)
 +        iotests.qemu_img_log('create', '-f', iotests.imgfmt, '-b', base, top,
 +                             top_size_old)
 +        iotests.qemu_io_log('-c', 'write -P 1 %s 64k' % off, base)
 +
 +        # After this, top_size_old to base_size should be allocated/zeroed.
 +        #
 +        # In theory, leaving base_size to top_size_new unallocated would be
 +        # correct, but in practice, if we zero out anything, we zero out
 +        # everything up to top_size_new.
 +        iotests.qemu_img_log('resize', '-f', iotests.imgfmt,
 +                             '--preallocation', prealloc, top, top_size_new)
 +        iotests.qemu_io_log('-c', 'read -P 0 %s 64k' % off, top)
 +        iotests.qemu_io_log('-c', 'map', top)
 +        iotests.qemu_img_log('map', '--output=json', top)
 diff --git a/tests/qemu-iotests/274.out b/tests/qemu-iotests/274.out
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
-+++ b/tests/qemu-iotests/216.out
++++ b/tests/qemu-iotests/274.out
 @@ -XXX,XX +XXX,XX @@
-+
++== Commit tests ==
-+=== Copy-on-read across nodes ===
++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=2097152 cluster_size=65536 lazy_refcounts=off refcount_bits=16
 +
-+--- Setting up images ---
++Formatting 'TEST_DIR/PID-mid', fmt=qcow2 size=1048576 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
 +
-+wrote 1048576/1048576 bytes at offset 0
++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=2097152 backing_file=TEST_DIR/PID-mid cluster_size=65536 lazy_refcounts=off refcount_bits=16
-+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
-+
++wrote 2097152/2097152 bytes at offset 0
-+wrote 1048576/1048576 bytes at offset 1048576
++2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
-+
++=== Check visible data ===
 +
 +--- Doing COR ---
 +
 +{u'return': {}}
 +{u'return': u''}
 +
 +--- Checking COR result ---
 +
 +discard 67108864/67108864 bytes at offset 0
 +64 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +
 +read 1048576/1048576 bytes at offset 0
 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +
 +read 1048576/1048576 bytes at offset 1048576
 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++=== Checking allocation status ===
++1048576/1048576 bytes allocated at offset 0 bytes
++1048576/1048576 bytes allocated at offset 1 MiB
++
++0/1048576 bytes allocated at offset 0 bytes
++0/0 bytes allocated at offset 1 MiB
++
++0/1048576 bytes allocated at offset 0 bytes
++0/1048576 bytes allocated at offset 1 MiB
++
++=== Checking map ===
++[{ "start": 0, "length": 2097152, "depth": 0, "zero": false, "data": true, "offset": 327680}]
++
++Offset          Length          Mapped to       File
++0               0x200000        0x50000         TEST_DIR/PID-base
++
++[{ "start": 0, "length": 1048576, "depth": 1, "zero": false, "data": true, "offset": 327680}]
++
++Offset          Length          Mapped to       File
++0               0x100000        0x50000         TEST_DIR/PID-base
++
++[{ "start": 0, "length": 1048576, "depth": 2, "zero": false, "data": true, "offset": 327680},
++{ "start": 1048576, "length": 1048576, "depth": 0, "zero": true, "data": false}]
++
++Offset          Length          Mapped to       File
++0               0x100000        0x50000         TEST_DIR/PID-base
++
++=== Testing qemu-img commit (top -> mid) ===
++Image committed.
++
++image: TEST_IMG
++file format: IMGFMT
++virtual size: 2 MiB (2097152 bytes)
++cluster_size: 65536
++backing file: TEST_DIR/PID-base
++Format specific information:
++    compat: 1.1
++    lazy refcounts: false
++    refcount bits: 16
++    corrupt: false
++
++read 1048576/1048576 bytes at offset 0
++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++read 1048576/1048576 bytes at offset 1048576
++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++=== Testing HMP commit (top -> mid) ===
++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=2097152 cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++Formatting 'TEST_DIR/PID-mid', fmt=qcow2 size=1048576 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=2097152 backing_file=TEST_DIR/PID-mid cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++wrote 2097152/2097152 bytes at offset 0
++2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++{"execute": "human-monitor-command", "arguments": {"command-line": "commit drive0"}}
++{"return": ""}
++image: TEST_IMG
++file format: IMGFMT
++virtual size: 2 MiB (2097152 bytes)
++cluster_size: 65536
++backing file: TEST_DIR/PID-base
++Format specific information:
++    compat: 1.1
++    lazy refcounts: false
++    refcount bits: 16
++    corrupt: false
++
++read 1048576/1048576 bytes at offset 0
++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++read 1048576/1048576 bytes at offset 1048576
++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++=== Testing QMP active commit (top -> mid) ===
++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=2097152 cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++Formatting 'TEST_DIR/PID-mid', fmt=qcow2 size=1048576 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=2097152 backing_file=TEST_DIR/PID-mid cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++wrote 2097152/2097152 bytes at offset 0
++2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++{"execute": "block-commit", "arguments": {"auto-dismiss": false, "base-node": "mid", "device": "top", "job-id": "job0"}}
++{"return": {}}
++{"execute": "job-complete", "arguments": {"id": "job0"}}
++{"return": {}}
++{"data": {"device": "job0", "len": 0, "offset": 0, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_READY", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
++{"data": {"device": "job0", "len": 0, "offset": 0, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
++{"execute": "job-dismiss", "arguments": {"id": "job0"}}
++{"return": {}}
++image: TEST_IMG
++file format: IMGFMT
++virtual size: 2 MiB (2097152 bytes)
++cluster_size: 65536
++backing file: TEST_DIR/PID-base
++Format specific information:
++    compat: 1.1
++    lazy refcounts: false
++    refcount bits: 16
++    corrupt: false
++
++read 1048576/1048576 bytes at offset 0
++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++read 1048576/1048576 bytes at offset 1048576
++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++== Resize tests ==
++=== preallocation=off ===
++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=6442450944 cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=1073741824 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++wrote 65536/65536 bytes at offset 5368709120
++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++Image resized.
++
++read 65536/65536 bytes at offset 5368709120
++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++1 GiB (0x40000000) bytes not allocated at offset 0 bytes (0x0)
++7 GiB (0x1c0000000) bytes     allocated at offset 1 GiB (0x40000000)
++
++[{ "start": 0, "length": 1073741824, "depth": 1, "zero": true, "data": false},
++{ "start": 1073741824, "length": 7516192768, "depth": 0, "zero": true, "data": false}]
++
++=== preallocation=metadata ===
++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=34359738368 cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=32212254720 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++wrote 65536/65536 bytes at offset 33285996544
++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++Image resized.
++
++read 65536/65536 bytes at offset 33285996544
++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++30 GiB (0x780000000) bytes not allocated at offset 0 bytes (0x0)
++3 GiB (0xc0000000) bytes     allocated at offset 30 GiB (0x780000000)
++
++[{ "start": 0, "length": 32212254720, "depth": 1, "zero": true, "data": false},
++{ "start": 32212254720, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 327680},
++{ "start": 32749125632, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 537264128},
++{ "start": 33285996544, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 1074200576},
++{ "start": 33822867456, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 1611137024},
++{ "start": 34359738368, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 2148139008},
++{ "start": 34896609280, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 2685075456}]
++
++=== preallocation=falloc ===
++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=10485760 cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=5242880 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++wrote 65536/65536 bytes at offset 9437184
++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++Image resized.
++
++read 65536/65536 bytes at offset 9437184
++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++5 MiB (0x500000) bytes not allocated at offset 0 bytes (0x0)
++10 MiB (0xa00000) bytes     allocated at offset 5 MiB (0x500000)
++
++[{ "start": 0, "length": 5242880, "depth": 1, "zero": true, "data": false},
++{ "start": 5242880, "length": 10485760, "depth": 0, "zero": true, "data": false, "offset": 327680}]
++
++=== preallocation=full ===
++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=16777216 cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=8388608 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++wrote 65536/65536 bytes at offset 11534336
++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++Image resized.
++
++read 65536/65536 bytes at offset 11534336
++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++8 MiB (0x800000) bytes not allocated at offset 0 bytes (0x0)
++4 MiB (0x400000) bytes     allocated at offset 8 MiB (0x800000)
++
++[{ "start": 0, "length": 8388608, "depth": 1, "zero": true, "data": false},
++{ "start": 8388608, "length": 4194304, "depth": 0, "zero": true, "data": false, "offset": 327680}]
++
++=== preallocation=off ===
++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=393216 cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=259072 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++wrote 65536/65536 bytes at offset 259072
++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++Image resized.
++
++read 65536/65536 bytes at offset 259072
++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++192 KiB (0x30000) bytes not allocated at offset 0 bytes (0x0)
++320 KiB (0x50000) bytes     allocated at offset 192 KiB (0x30000)
++
++[{ "start": 0, "length": 196608, "depth": 1, "zero": true, "data": false},
++{ "start": 196608, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680},
++{ "start": 262144, "length": 262144, "depth": 0, "zero": true, "data": false}]
++
++=== preallocation=off ===
++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=409600 cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=262144 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++wrote 65536/65536 bytes at offset 344064
++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++Image resized.
++
++read 65536/65536 bytes at offset 344064
++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++256 KiB (0x40000) bytes not allocated at offset 0 bytes (0x0)
++256 KiB (0x40000) bytes     allocated at offset 256 KiB (0x40000)
++
++[{ "start": 0, "length": 262144, "depth": 1, "zero": true, "data": false},
++{ "start": 262144, "length": 262144, "depth": 0, "zero": true, "data": false}]
++
++=== preallocation=off ===
++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=524288 cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=262144 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
++
++wrote 65536/65536 bytes at offset 446464
++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++Image resized.
++
++read 65536/65536 bytes at offset 446464
++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++
++256 KiB (0x40000) bytes not allocated at offset 0 bytes (0x0)
++244 KiB (0x3d000) bytes     allocated at offset 256 KiB (0x40000)
++
++[{ "start": 0, "length": 262144, "depth": 1, "zero": true, "data": false},
++{ "start": 262144, "length": 249856, "depth": 0, "zero": true, "data": false}]
 +
 diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
 index XXXXXXX..XXXXXXX 100644
 --- a/tests/qemu-iotests/group
 +++ b/tests/qemu-iotests/group
 @@ -XXX,XX +XXX,XX @@
-rw auto quick
+rw backing quick
-rw auto
+rw
-rw auto quick
+backing quick
-+216 rw auto quick
++274 rw backing
-rw auto quick
+rw quick
 rw backing quick
 rw migration quick
 --
-.13.6
+.25.3

-[Qemu-devel] [PULL 20/37] qcow2: Give the refcount cache the minimum possible size by default
+[PULL 12/15] qcow2: Forward ZERO_WRITE flag for full preallocation
-From: Alberto Garcia <berto@igalia.com>
+The BDRV_REQ_ZERO_WRITE is currently implemented in a way that first the
 image is possibly preallocated and then the zero flag is added to all
 clusters. This means that a copy-on-write operation may be needed when
 writing to these clusters, despite having used preallocation, negating
 one of the major benefits of preallocation.
-The L2 and refcount caches have default sizes that can be overridden
+Instead, try to forward the BDRV_REQ_ZERO_WRITE to the protocol driver,
-using the l2-cache-size and refcount-cache-size (an additional
+and if the protocol driver can ensure that the new area reads as zeros,
-parameter named cache-size sets the combined size of both caches).
+we can skip setting the zero flag in the qcow2 layer.
-Unless forced by one of the aforementioned parameters, QEMU will set
+Unfortunately, the same approach doesn't work for metadata
-the unspecified sizes so that the L2 cache is 4 times larger than the
+preallocation, so we'll still set the zero flag there.
 refcount cache.
-This is based on the premise that the refcount metadata needs to be
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-only a fourth of the L2 metadata to cover the same amount of disk
+Reviewed-by: Max Reitz <mreitz@redhat.com>
-space. This is incorrect for two reasons:
+Message-Id: <20200424142701.67053-1-kwolf@redhat.com>
 Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
  block/qcow2.c              | 22 +++++++++++++++++++---
  tests/qemu-iotests/274.out |  4 ++--
 files changed, 21 insertions(+), 5 deletions(-)
- a) The amount of disk covered by an L2 table depends solely on the
-    cluster size, but in the case of a refcount block it depends on
-    the cluster size *and* the width of each refcount entry.
-    The 4/1 ratio is only valid with 16-bit entries (the default).
- b) When we talk about disk space and L2 tables we are talking about
-    guest space (L2 tables map guest clusters to host clusters),
-    whereas refcount blocks are used for host clusters (including
-    L1/L2 tables and the refcount blocks themselves). On a fully
-    populated (and uncompressed) qcow2 file, image size > virtual size
-    so there are more refcount entries than L2 entries.
-Problem (a) could be fixed by adjusting the algorithm to take into
-account the refcount entry width. Problem (b) could be fixed by
-increasing a bit the refcount cache size to account for the clusters
-used for qcow2 metadata.
-However this patch takes a completely different approach and instead
-of keeping a ratio between both cache sizes it assigns as much as
-possible to the L2 cache and the remainder to the refcount cache.
-The reason is that L2 tables are used for every single I/O request
-from the guest and the effect of increasing the cache is significant
-and clearly measurable. Refcount blocks are however only used for
-cluster allocation and internal snapshots and in practice are accessed
-sequentially in most cases, so the effect of increasing the cache is
-negligible (even when doing random writes from the guest).
-So, make the refcount cache as small as possible unless the user
-explicitly asks for a larger one.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Message-id: 9695182c2eb11b77cb319689a1ebaa4e7c9d6591.1523968389.git.berto@igalia.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2.h              |  4 ----
- block/qcow2.c              | 31 +++++++++++++++++++------------
- tests/qemu-iotests/137.out |  2 +-
-files changed, 20 insertions(+), 17 deletions(-)
-diff --git a/block/qcow2.h b/block/qcow2.h
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2.h
-+++ b/block/qcow2.h
-@@ -XXX,XX +XXX,XX @@
- #define DEFAULT_L2_CACHE_CLUSTERS 8 /* clusters */
- #define DEFAULT_L2_CACHE_BYTE_SIZE 1048576 /* bytes */
--/* The refblock cache needs only a fourth of the L2 cache size to cover as many
-- * clusters */
--#define DEFAULT_L2_REFCOUNT_SIZE_RATIO 4
--
- #define DEFAULT_CLUSTER_SIZE 65536
 diff --git a/block/qcow2.c b/block/qcow2.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow2.c
 +++ b/block/qcow2.c
-@@ -XXX,XX +XXX,XX @@ static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
+@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
-         } else if (refcount_cache_size_set) {
+         /* Allocate the data area */
-             *l2_cache_size = combined_cache_size - *refcount_cache_size;
+         new_file_size = allocation_start +
-         } else {
+                         nb_new_data_clusters * s->cluster_size;
--            *refcount_cache_size = combined_cache_size
+-        /* Image file grows, so @exact does not matter */
--                                 / (DEFAULT_L2_REFCOUNT_SIZE_RATIO + 1);
+-        ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, 0,
--            *l2_cache_size = combined_cache_size - *refcount_cache_size;
+-                               errp);
-+            uint64_t virtual_disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
++        /*
-+            uint64_t max_l2_cache = virtual_disk_size / (s->cluster_size / 8);
++         * Image file grows, so @exact does not matter.
-+            uint64_t min_refcount_cache =
++         *
-+                (uint64_t) MIN_REFCOUNT_CACHE_SIZE * s->cluster_size;
++         * If we need to zero out the new area, try first whether the protocol
-+
++         * driver can already take care of this.
-+            /* Assign as much memory as possible to the L2 cache, and
++         */
-+             * use the remainder for the refcount cache */
++        if (flags & BDRV_REQ_ZERO_WRITE) {
-+            if (combined_cache_size >= max_l2_cache + min_refcount_cache) {
++            ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc,
-+                *l2_cache_size = max_l2_cache;
++                                   BDRV_REQ_ZERO_WRITE, NULL);
-+                *refcount_cache_size = combined_cache_size - *l2_cache_size;
++            if (ret >= 0) {
-+            } else {
++                flags &= ~BDRV_REQ_ZERO_WRITE;
 +                *refcount_cache_size =
 +                    MIN(combined_cache_size, min_refcount_cache);
 +                *l2_cache_size = combined_cache_size - *refcount_cache_size;
 +            }
-         }
++        } else {
-     } else {
++            ret = -1;
 -        if (!l2_cache_size_set && !refcount_cache_size_set) {
 +        if (!l2_cache_size_set) {
              *l2_cache_size = MAX(DEFAULT_L2_CACHE_BYTE_SIZE,
                                   (uint64_t)DEFAULT_L2_CACHE_CLUSTERS
                                   * s->cluster_size);
 -            *refcount_cache_size = *l2_cache_size
 -                                 / DEFAULT_L2_REFCOUNT_SIZE_RATIO;
 -        } else if (!l2_cache_size_set) {
 -            *l2_cache_size = *refcount_cache_size
 -                           * DEFAULT_L2_REFCOUNT_SIZE_RATIO;
 -        } else if (!refcount_cache_size_set) {
 -            *refcount_cache_size = *l2_cache_size
 -                                 / DEFAULT_L2_REFCOUNT_SIZE_RATIO;
 +        }
-+        if (!refcount_cache_size_set) {
++        if (ret < 0) {
-+            *refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE * s->cluster_size;
++            ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, 0,
-         }
++                                   errp);
-     }
++        }
+         if (ret < 0) {
-diff --git a/tests/qemu-iotests/137.out b/tests/qemu-iotests/137.out
+             error_prepend(errp, "Failed to resize underlying file: ");
              qcow2_free_clusters(bs, allocation_start,
 diff --git a/tests/qemu-iotests/274.out b/tests/qemu-iotests/274.out
 index XXXXXXX..XXXXXXX 100644
---- a/tests/qemu-iotests/137.out
+--- a/tests/qemu-iotests/274.out
-+++ b/tests/qemu-iotests/137.out
++++ b/tests/qemu-iotests/274.out
-@@ -XXX,XX +XXX,XX @@ refcount-cache-size may not exceed cache-size
+@@ -XXX,XX +XXX,XX @@ read 65536/65536 bytes at offset 9437184
- L2 cache size too big
+MiB (0xa00000) bytes     allocated at offset 5 MiB (0x500000)
- L2 cache entry size must be a power of two between 512 and the cluster size (65536)
- L2 cache entry size must be a power of two between 512 and the cluster size (65536)
+ [{ "start": 0, "length": 5242880, "depth": 1, "zero": true, "data": false},
--L2 cache size too big
+-{ "start": 5242880, "length": 10485760, "depth": 0, "zero": true, "data": false, "offset": 327680}]
-+Refcount cache size too big
++{ "start": 5242880, "length": 10485760, "depth": 0, "zero": false, "data": true, "offset": 327680}]
- Conflicting values for qcow2 options 'overlap-check' ('constant') and 'overlap-check.template' ('all')
- Unsupported value 'blubb' for qcow2 option 'overlap-check'. Allowed are any of the following: none, constant, cached, all
+ === preallocation=full ===
- Unsupported value 'blubb' for qcow2 option 'overlap-check'. Allowed are any of the following: none, constant, cached, all
+ Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=16777216 cluster_size=65536 lazy_refcounts=off refcount_bits=16
@@ -XXX,XX +XXX,XX @@ read 65536/65536 bytes at offset 11534336
 MiB (0x400000) bytes     allocated at offset 8 MiB (0x800000)
  [{ "start": 0, "length": 8388608, "depth": 1, "zero": true, "data": false},
 -{ "start": 8388608, "length": 4194304, "depth": 0, "zero": true, "data": false, "offset": 327680}]
 +{ "start": 8388608, "length": 4194304, "depth": 0, "zero": false, "data": true, "offset": 327680}]
  === preallocation=off ===
  Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=393216 cluster_size=65536 lazy_refcounts=off refcount_bits=16
 --
-.13.6
+.25.3

-[Qemu-devel] [PULL 07/37] block: Drop last of the sector-based aio callbacks
+[PULL 13/15] nvme: introduce PMR support from NVMe 1.4 spec
-From: Eric Blake <eblake@redhat.com>
+From: Andrzej Jakowski <andrzej.jakowski@linux.intel.com>
-We are gradually moving away from sector-based interfaces, towards
+This patch introduces support for PMR that has been defined as part of NVMe 1.4
-byte-based.  Now that all drivers with aio callbacks are using the
+spec. User can now specify a pmrdev option that should point to HostMemoryBackend.
-byte-based interfaces, we can remove the sector-based versions.
+pmrdev memory region will subsequently be exposed as PCI BAR 2 in emulated NVMe
 device. Guest OS can perform mmio reads and writes to the PMR region that will stay
 persistent across system reboot.
-Signed-off-by: Eric Blake <eblake@redhat.com>
+Signed-off-by: Andrzej Jakowski <andrzej.jakowski@linux.intel.com>
 Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Message-Id: <20200330164656.9348-1-andrzej.jakowski@linux.intel.com>
 Reviewed-by: Keith Busch <kbusch@kernel.org>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- include/block/block_int.h |  6 ----
+ hw/block/nvme.h        |   2 +
- block/io.c                | 84 ++++++++++++++++++++---------------------------
+ include/block/nvme.h   | 172 +++++++++++++++++++++++++++++++++++++++++
-files changed, 36 insertions(+), 54 deletions(-)
+ hw/block/nvme.c        | 109 ++++++++++++++++++++++++++
  hw/block/Makefile.objs |   2 +-
  hw/block/trace-events  |   4 +
 files changed, 288 insertions(+), 1 deletion(-)
-diff --git a/include/block/block_int.h b/include/block/block_int.h
+diff --git a/hw/block/nvme.h b/hw/block/nvme.h
 index XXXXXXX..XXXXXXX 100644
---- a/include/block/block_int.h
+--- a/hw/block/nvme.h
-+++ b/include/block/block_int.h
++++ b/hw/block/nvme.h
-@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
+@@ -XXX,XX +XXX,XX @@ typedef struct NvmeCtrl {
-     void (*bdrv_refresh_filename)(BlockDriverState *bs, QDict *options);
+     uint64_t    timestamp_set_qemu_clock_ms;    /* QEMU clock time */
-     /* aio */
+     char            *serial;
--    BlockAIOCB *(*bdrv_aio_readv)(BlockDriverState *bs,
++    HostMemoryBackend *pmrdev;
--        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
++
--        BlockCompletionFunc *cb, void *opaque);
+     NvmeNamespace   *namespaces;
-     BlockAIOCB *(*bdrv_aio_preadv)(BlockDriverState *bs,
+     NvmeSQueue      **sq;
-         uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags,
+     NvmeCQueue      **cq;
-         BlockCompletionFunc *cb, void *opaque);
+diff --git a/include/block/nvme.h b/include/block/nvme.h
 -    BlockAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs,
 -        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
 -        BlockCompletionFunc *cb, void *opaque);
      BlockAIOCB *(*bdrv_aio_pwritev)(BlockDriverState *bs,
          uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags,
          BlockCompletionFunc *cb, void *opaque);
 diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/io.c
+--- a/include/block/nvme.h
-+++ b/block/io.c
++++ b/include/block/nvme.h
-@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
+@@ -XXX,XX +XXX,XX @@ typedef struct NvmeBar {
-         return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
+     uint64_t    acq;
      uint32_t    cmbloc;
      uint32_t    cmbsz;
 +    uint8_t     padding[3520]; /* not used by QEMU */
 +    uint32_t    pmrcap;
 +    uint32_t    pmrctl;
 +    uint32_t    pmrsts;
 +    uint32_t    pmrebs;
 +    uint32_t    pmrswtp;
 +    uint32_t    pmrmsc;
  } NvmeBar;
  enum NvmeCapShift {
@@ -XXX,XX +XXX,XX @@ enum NvmeCapShift {
      CAP_CSS_SHIFT      = 37,
      CAP_MPSMIN_SHIFT   = 48,
      CAP_MPSMAX_SHIFT   = 52,
 +    CAP_PMR_SHIFT      = 56,
  };
  enum NvmeCapMask {
@@ -XXX,XX +XXX,XX @@ enum NvmeCapMask {
      CAP_CSS_MASK       = 0xff,
      CAP_MPSMIN_MASK    = 0xf,
      CAP_MPSMAX_MASK    = 0xf,
 +    CAP_PMR_MASK       = 0x1,
  };
  #define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
@@ -XXX,XX +XXX,XX @@ enum NvmeCapMask {
                                                             << CAP_MPSMIN_SHIFT)
  #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
                                                              << CAP_MPSMAX_SHIFT)
 +#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\
 +                                                            << CAP_PMR_SHIFT)
  enum NvmeCcShift {
      CC_EN_SHIFT     = 0,
@@ -XXX,XX +XXX,XX @@ enum NvmeCmbszMask {
  #define NVME_CMBSZ_GETSIZE(cmbsz) \
      (NVME_CMBSZ_SZ(cmbsz) * (1 << (12 + 4 * NVME_CMBSZ_SZU(cmbsz))))
 +enum NvmePmrcapShift {
 +    PMRCAP_RDS_SHIFT      = 3,
 +    PMRCAP_WDS_SHIFT      = 4,
 +    PMRCAP_BIR_SHIFT      = 5,
 +    PMRCAP_PMRTU_SHIFT    = 8,
 +    PMRCAP_PMRWBM_SHIFT   = 10,
 +    PMRCAP_PMRTO_SHIFT    = 16,
 +    PMRCAP_CMSS_SHIFT     = 24,
 +};
 +
 +enum NvmePmrcapMask {
 +    PMRCAP_RDS_MASK      = 0x1,
 +    PMRCAP_WDS_MASK      = 0x1,
 +    PMRCAP_BIR_MASK      = 0x7,
 +    PMRCAP_PMRTU_MASK    = 0x3,
 +    PMRCAP_PMRWBM_MASK   = 0xf,
 +    PMRCAP_PMRTO_MASK    = 0xff,
 +    PMRCAP_CMSS_MASK     = 0x1,
 +};
 +
 +#define NVME_PMRCAP_RDS(pmrcap)    \
 +    ((pmrcap >> PMRCAP_RDS_SHIFT)   & PMRCAP_RDS_MASK)
 +#define NVME_PMRCAP_WDS(pmrcap)    \
 +    ((pmrcap >> PMRCAP_WDS_SHIFT)   & PMRCAP_WDS_MASK)
 +#define NVME_PMRCAP_BIR(pmrcap)    \
 +    ((pmrcap >> PMRCAP_BIR_SHIFT)   & PMRCAP_BIR_MASK)
 +#define NVME_PMRCAP_PMRTU(pmrcap)    \
 +    ((pmrcap >> PMRCAP_PMRTU_SHIFT)   & PMRCAP_PMRTU_MASK)
 +#define NVME_PMRCAP_PMRWBM(pmrcap)    \
 +    ((pmrcap >> PMRCAP_PMRWBM_SHIFT)   & PMRCAP_PMRWBM_MASK)
 +#define NVME_PMRCAP_PMRTO(pmrcap)    \
 +    ((pmrcap >> PMRCAP_PMRTO_SHIFT)   & PMRCAP_PMRTO_MASK)
 +#define NVME_PMRCAP_CMSS(pmrcap)    \
 +    ((pmrcap >> PMRCAP_CMSS_SHIFT)   & PMRCAP_CMSS_MASK)
 +
 +#define NVME_PMRCAP_SET_RDS(pmrcap, val)   \
 +    (pmrcap |= (uint64_t)(val & PMRCAP_RDS_MASK) << PMRCAP_RDS_SHIFT)
 +#define NVME_PMRCAP_SET_WDS(pmrcap, val)   \
 +    (pmrcap |= (uint64_t)(val & PMRCAP_WDS_MASK) << PMRCAP_WDS_SHIFT)
 +#define NVME_PMRCAP_SET_BIR(pmrcap, val)   \
 +    (pmrcap |= (uint64_t)(val & PMRCAP_BIR_MASK) << PMRCAP_BIR_SHIFT)
 +#define NVME_PMRCAP_SET_PMRTU(pmrcap, val)   \
 +    (pmrcap |= (uint64_t)(val & PMRCAP_PMRTU_MASK) << PMRCAP_PMRTU_SHIFT)
 +#define NVME_PMRCAP_SET_PMRWBM(pmrcap, val)   \
 +    (pmrcap |= (uint64_t)(val & PMRCAP_PMRWBM_MASK) << PMRCAP_PMRWBM_SHIFT)
 +#define NVME_PMRCAP_SET_PMRTO(pmrcap, val)   \
 +    (pmrcap |= (uint64_t)(val & PMRCAP_PMRTO_MASK) << PMRCAP_PMRTO_SHIFT)
 +#define NVME_PMRCAP_SET_CMSS(pmrcap, val)   \
 +    (pmrcap |= (uint64_t)(val & PMRCAP_CMSS_MASK) << PMRCAP_CMSS_SHIFT)
 +
 +enum NvmePmrctlShift {
 +    PMRCTL_EN_SHIFT   = 0,
 +};
 +
 +enum NvmePmrctlMask {
 +    PMRCTL_EN_MASK   = 0x1,
 +};
 +
 +#define NVME_PMRCTL_EN(pmrctl)  ((pmrctl >> PMRCTL_EN_SHIFT)   & PMRCTL_EN_MASK)
 +
 +#define NVME_PMRCTL_SET_EN(pmrctl, val)   \
 +    (pmrctl |= (uint64_t)(val & PMRCTL_EN_MASK) << PMRCTL_EN_SHIFT)
 +
 +enum NvmePmrstsShift {
 +    PMRSTS_ERR_SHIFT    = 0,
 +    PMRSTS_NRDY_SHIFT   = 8,
 +    PMRSTS_HSTS_SHIFT   = 9,
 +    PMRSTS_CBAI_SHIFT   = 12,
 +};
 +
 +enum NvmePmrstsMask {
 +    PMRSTS_ERR_MASK    = 0xff,
 +    PMRSTS_NRDY_MASK   = 0x1,
 +    PMRSTS_HSTS_MASK   = 0x7,
 +    PMRSTS_CBAI_MASK   = 0x1,
 +};
 +
 +#define NVME_PMRSTS_ERR(pmrsts)     \
 +    ((pmrsts >> PMRSTS_ERR_SHIFT)   & PMRSTS_ERR_MASK)
 +#define NVME_PMRSTS_NRDY(pmrsts)    \
 +    ((pmrsts >> PMRSTS_NRDY_SHIFT)   & PMRSTS_NRDY_MASK)
 +#define NVME_PMRSTS_HSTS(pmrsts)    \
 +    ((pmrsts >> PMRSTS_HSTS_SHIFT)   & PMRSTS_HSTS_MASK)
 +#define NVME_PMRSTS_CBAI(pmrsts)    \
 +    ((pmrsts >> PMRSTS_CBAI_SHIFT)   & PMRSTS_CBAI_MASK)
 +
 +#define NVME_PMRSTS_SET_ERR(pmrsts, val)   \
 +    (pmrsts |= (uint64_t)(val & PMRSTS_ERR_MASK) << PMRSTS_ERR_SHIFT)
 +#define NVME_PMRSTS_SET_NRDY(pmrsts, val)   \
 +    (pmrsts |= (uint64_t)(val & PMRSTS_NRDY_MASK) << PMRSTS_NRDY_SHIFT)
 +#define NVME_PMRSTS_SET_HSTS(pmrsts, val)   \
 +    (pmrsts |= (uint64_t)(val & PMRSTS_HSTS_MASK) << PMRSTS_HSTS_SHIFT)
 +#define NVME_PMRSTS_SET_CBAI(pmrsts, val)   \
 +    (pmrsts |= (uint64_t)(val & PMRSTS_CBAI_MASK) << PMRSTS_CBAI_SHIFT)
 +
 +enum NvmePmrebsShift {
 +    PMREBS_PMRSZU_SHIFT   = 0,
 +    PMREBS_RBB_SHIFT      = 4,
 +    PMREBS_PMRWBZ_SHIFT   = 8,
 +};
 +
 +enum NvmePmrebsMask {
 +    PMREBS_PMRSZU_MASK   = 0xf,
 +    PMREBS_RBB_MASK      = 0x1,
 +    PMREBS_PMRWBZ_MASK   = 0xffffff,
 +};
 +
 +#define NVME_PMREBS_PMRSZU(pmrebs)  \
 +    ((pmrebs >> PMREBS_PMRSZU_SHIFT)   & PMREBS_PMRSZU_MASK)
 +#define NVME_PMREBS_RBB(pmrebs)     \
 +    ((pmrebs >> PMREBS_RBB_SHIFT)   & PMREBS_RBB_MASK)
 +#define NVME_PMREBS_PMRWBZ(pmrebs)  \
 +    ((pmrebs >> PMREBS_PMRWBZ_SHIFT)   & PMREBS_PMRWBZ_MASK)
 +
 +#define NVME_PMREBS_SET_PMRSZU(pmrebs, val)   \
 +    (pmrebs |= (uint64_t)(val & PMREBS_PMRSZU_MASK) << PMREBS_PMRSZU_SHIFT)
 +#define NVME_PMREBS_SET_RBB(pmrebs, val)   \
 +    (pmrebs |= (uint64_t)(val & PMREBS_RBB_MASK) << PMREBS_RBB_SHIFT)
 +#define NVME_PMREBS_SET_PMRWBZ(pmrebs, val)   \
 +    (pmrebs |= (uint64_t)(val & PMREBS_PMRWBZ_MASK) << PMREBS_PMRWBZ_SHIFT)
 +
 +enum NvmePmrswtpShift {
 +    PMRSWTP_PMRSWTU_SHIFT   = 0,
 +    PMRSWTP_PMRSWTV_SHIFT   = 8,
 +};
 +
 +enum NvmePmrswtpMask {
 +    PMRSWTP_PMRSWTU_MASK   = 0xf,
 +    PMRSWTP_PMRSWTV_MASK   = 0xffffff,
 +};
 +
 +#define NVME_PMRSWTP_PMRSWTU(pmrswtp)   \
 +    ((pmrswtp >> PMRSWTP_PMRSWTU_SHIFT)   & PMRSWTP_PMRSWTU_MASK)
 +#define NVME_PMRSWTP_PMRSWTV(pmrswtp)   \
 +    ((pmrswtp >> PMRSWTP_PMRSWTV_SHIFT)   & PMRSWTP_PMRSWTV_MASK)
 +
 +#define NVME_PMRSWTP_SET_PMRSWTU(pmrswtp, val)   \
 +    (pmrswtp |= (uint64_t)(val & PMRSWTP_PMRSWTU_MASK) << PMRSWTP_PMRSWTU_SHIFT)
 +#define NVME_PMRSWTP_SET_PMRSWTV(pmrswtp, val)   \
 +    (pmrswtp |= (uint64_t)(val & PMRSWTP_PMRSWTV_MASK) << PMRSWTP_PMRSWTV_SHIFT)
 +
 +enum NvmePmrmscShift {
 +    PMRMSC_CMSE_SHIFT   = 1,
 +    PMRMSC_CBA_SHIFT    = 12,
 +};
 +
 +enum NvmePmrmscMask {
 +    PMRMSC_CMSE_MASK   = 0x1,
 +    PMRMSC_CBA_MASK    = 0xfffffffffffff,
 +};
 +
 +#define NVME_PMRMSC_CMSE(pmrmsc)    \
 +    ((pmrmsc >> PMRMSC_CMSE_SHIFT)   & PMRMSC_CMSE_MASK)
 +#define NVME_PMRMSC_CBA(pmrmsc)     \
 +    ((pmrmsc >> PMRMSC_CBA_SHIFT)   & PMRMSC_CBA_MASK)
 +
 +#define NVME_PMRMSC_SET_CMSE(pmrmsc, val)   \
 +    (pmrmsc |= (uint64_t)(val & PMRMSC_CMSE_MASK) << PMRMSC_CMSE_SHIFT)
 +#define NVME_PMRMSC_SET_CBA(pmrmsc, val)   \
 +    (pmrmsc |= (uint64_t)(val & PMRMSC_CBA_MASK) << PMRMSC_CBA_SHIFT)
 +
  typedef struct NvmeCmd {
      uint8_t     opcode;
      uint8_t     fuse;
 diff --git a/hw/block/nvme.c b/hw/block/nvme.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/block/nvme.c
 +++ b/hw/block/nvme.c
@@ -XXX,XX +XXX,XX @@
   *      -drive file=<file>,if=none,id=<drive_id>
   *      -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>, \
   *              cmb_size_mb=<cmb_size_mb[optional]>, \
 + *              [pmrdev=<mem_backend_file_id>,] \
   *              num_queues=<N[optional]>
   *
   * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
   * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
 + *
 + * cmb_size_mb= and pmrdev= options are mutually exclusive due to the limited
 + * number of available BARs. cmb_size_mb= will take precedence over pmrdev=
 + * when both are provided.
 + * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
 + * For example:
 + * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
 + *  size=<size> .... -device nvme,...,pmrdev=<mem_id>
   */
  #include "qemu/osdep.h"
@@ -XXX,XX +XXX,XX @@
  #include "sysemu/sysemu.h"
  #include "qapi/error.h"
  #include "qapi/visitor.h"
 +#include "sysemu/hostmem.h"
  #include "sysemu/block-backend.h"
 +#include "exec/ram_addr.h"
  #include "qemu/log.h"
  #include "qemu/module.h"
@@ -XXX,XX +XXX,XX @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
          NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
                         "invalid write to read only CMBSZ, ignored");
          return;
 +    case 0xE00: /* PMRCAP */
 +        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrcap_readonly,
 +                       "invalid write to PMRCAP register, ignored");
 +        return;
 +    case 0xE04: /* TODO PMRCTL */
 +        break;
 +    case 0xE08: /* PMRSTS */
 +        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrsts_readonly,
 +                       "invalid write to PMRSTS register, ignored");
 +        return;
 +    case 0xE0C: /* PMREBS */
 +        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrebs_readonly,
 +                       "invalid write to PMREBS register, ignored");
 +        return;
 +    case 0xE10: /* PMRSWTP */
 +        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrswtp_readonly,
 +                       "invalid write to PMRSWTP register, ignored");
 +        return;
 +    case 0xE14: /* TODO PMRMSC */
 +         break;
      default:
          NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
                         "invalid MMIO write,"
@@ -XXX,XX +XXX,XX @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
      }
--    /* FIXME - no need to calculate these if .bdrv_aio_preadv exists */
+     if (addr < sizeof(n->bar)) {
--    sector_num = offset >> BDRV_SECTOR_BITS;
++        /*
--    nb_sectors = bytes >> BDRV_SECTOR_BITS;
++         * When PMRWBM bit 1 is set then read from
--
++         * PMRSTS should ensure prior writes
--    if (!drv->bdrv_aio_preadv) {
++         * made it to persistent media
--        assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
++         */
--        assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
++        if (addr == 0xE08 &&
--        assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
++            (NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02)) {
--    }
++            qemu_ram_writeback(n->pmrdev->mr.ram_block,
--
++                               0, n->pmrdev->size);
--    if (drv->bdrv_co_readv) {
++        }
--        return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+         memcpy(&val, ptr + addr, size);
--    } else {
+     } else {
-+    if (drv->bdrv_aio_preadv) {
+         NVME_GUEST_ERR(nvme_ub_mmiord_invalid_ofs,
-         BlockAIOCB *acb;
+@@ -XXX,XX +XXX,XX @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
-         CoroutineIOCompletion co = {
+         error_setg(errp, "serial property not set");
-             .coroutine = qemu_coroutine_self(),
+         return;
          };
 -        if (drv->bdrv_aio_preadv) {
 -            acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
 -                                       bdrv_co_io_em_complete, &co);
 -        } else {
 -            acb = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
 -                                      bdrv_co_io_em_complete, &co);
 -        }
 +        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
 +                                   bdrv_co_io_em_complete, &co);
          if (acb == NULL) {
              return -EIO;
          } else {
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
              return co.ret;
          }
      }
 +
-+    sector_num = offset >> BDRV_SECTOR_BITS;
++    if (!n->cmb_size_mb && n->pmrdev) {
-+    nb_sectors = bytes >> BDRV_SECTOR_BITS;
++        if (host_memory_backend_is_mapped(n->pmrdev)) {
-+
++            char *path = object_get_canonical_path_component(OBJECT(n->pmrdev));
-+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
++            error_setg(errp, "can't use already busy memdev: %s", path);
-+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
++            g_free(path);
-+    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
++            return;
-+    assert(drv->bdrv_co_readv);
++        }
 +
-+    return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
++        if (!is_power_of_2(n->pmrdev->size)) {
 +            error_setg(errp, "pmr backend size needs to be power of 2 in size");
 +            return;
 +        }
 +
 +        host_memory_backend_set_mapped(n->pmrdev, true);
 +    }
 +
      blkconf_blocksizes(&n->conf);
      if (!blkconf_apply_backend_options(&n->conf, blk_is_read_only(n->conf.blk),
                                         false, errp)) {
@@ -XXX,XX +XXX,XX @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
              PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 |
              PCI_BASE_ADDRESS_MEM_PREFETCH, &n->ctrl_mem);
 +    } else if (n->pmrdev) {
 +        /* Controller Capabilities register */
 +        NVME_CAP_SET_PMRS(n->bar.cap, 1);
 +
 +        /* PMR Capabilities register */
 +        n->bar.pmrcap = 0;
 +        NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 0);
 +        NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 0);
 +        NVME_PMRCAP_SET_BIR(n->bar.pmrcap, 2);
 +        NVME_PMRCAP_SET_PMRTU(n->bar.pmrcap, 0);
 +        /* Turn on bit 1 support */
 +        NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02);
 +        NVME_PMRCAP_SET_PMRTO(n->bar.pmrcap, 0);
 +        NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 0);
 +
 +        /* PMR Control register */
 +        n->bar.pmrctl = 0;
 +        NVME_PMRCTL_SET_EN(n->bar.pmrctl, 0);
 +
 +        /* PMR Status register */
 +        n->bar.pmrsts = 0;
 +        NVME_PMRSTS_SET_ERR(n->bar.pmrsts, 0);
 +        NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 0);
 +        NVME_PMRSTS_SET_HSTS(n->bar.pmrsts, 0);
 +        NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 0);
 +
 +        /* PMR Elasticity Buffer Size register */
 +        n->bar.pmrebs = 0;
 +        NVME_PMREBS_SET_PMRSZU(n->bar.pmrebs, 0);
 +        NVME_PMREBS_SET_RBB(n->bar.pmrebs, 0);
 +        NVME_PMREBS_SET_PMRWBZ(n->bar.pmrebs, 0);
 +
 +        /* PMR Sustained Write Throughput register */
 +        n->bar.pmrswtp = 0;
 +        NVME_PMRSWTP_SET_PMRSWTU(n->bar.pmrswtp, 0);
 +        NVME_PMRSWTP_SET_PMRSWTV(n->bar.pmrswtp, 0);
 +
 +        /* PMR Memory Space Control register */
 +        n->bar.pmrmsc = 0;
 +        NVME_PMRMSC_SET_CMSE(n->bar.pmrmsc, 0);
 +        NVME_PMRMSC_SET_CBA(n->bar.pmrmsc, 0);
 +
 +        pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap),
 +            PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 |
 +            PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmrdev->mr);
      }
      for (i = 0; i < n->num_namespaces; i++) {
@@ -XXX,XX +XXX,XX @@ static void nvme_exit(PCIDevice *pci_dev)
      if (n->cmb_size_mb) {
          g_free(n->cmbuf);
      }
 +
 +    if (n->pmrdev) {
 +        host_memory_backend_set_mapped(n->pmrdev, false);
 +    }
      msix_uninit_exclusive_bar(pci_dev);
  }
- static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
+ static Property nvme_props[] = {
-@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
+     DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf),
-         goto emulate_flags;
++    DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmrdev, TYPE_MEMORY_BACKEND,
-     }
++                     HostMemoryBackend *),
+     DEFINE_PROP_STRING("serial", NvmeCtrl, serial),
--    /* FIXME - no need to calculate these if .bdrv_aio_pwritev exists */
+     DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, cmb_size_mb, 0),
--    sector_num = offset >> BDRV_SECTOR_BITS;
+     DEFINE_PROP_UINT32("num_queues", NvmeCtrl, num_queues, 64),
--    nb_sectors = bytes >> BDRV_SECTOR_BITS;
+diff --git a/hw/block/Makefile.objs b/hw/block/Makefile.objs
--
+index XXXXXXX..XXXXXXX 100644
--    if (!drv->bdrv_aio_pwritev) {
+--- a/hw/block/Makefile.objs
--        assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
++++ b/hw/block/Makefile.objs
--        assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+@@ -XXX,XX +XXX,XX @@ common-obj-$(CONFIG_PFLASH_CFI02) += pflash_cfi02.o
--        assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
+ common-obj-$(CONFIG_XEN) += xen-block.o
--    }
+ common-obj-$(CONFIG_ECC) += ecc.o
--
+ common-obj-$(CONFIG_ONENAND) += onenand.o
--    if (drv->bdrv_co_writev_flags) {
+-common-obj-$(CONFIG_NVME_PCI) += nvme.o
--        ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
+ common-obj-$(CONFIG_SWIM) += swim.o
--                                        flags & bs->supported_write_flags);
--        flags &= ~bs->supported_write_flags;
+ common-obj-$(CONFIG_SH4) += tc58128.o
--    } else if (drv->bdrv_co_writev) {
--        assert(!bs->supported_write_flags);
+ obj-$(CONFIG_VIRTIO_BLK) += virtio-blk.o
--        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
+ obj-$(CONFIG_VHOST_USER_BLK) += vhost-user-blk.o
--    } else {
++obj-$(CONFIG_NVME_PCI) += nvme.o
-+    if (drv->bdrv_aio_pwritev) {
-         BlockAIOCB *acb;
+ obj-y += dataplane/
-         CoroutineIOCompletion co = {
+diff --git a/hw/block/trace-events b/hw/block/trace-events
-             .coroutine = qemu_coroutine_self(),
+index XXXXXXX..XXXXXXX 100644
-         };
+--- a/hw/block/trace-events
++++ b/hw/block/trace-events
--        if (drv->bdrv_aio_pwritev) {
+@@ -XXX,XX +XXX,XX @@ nvme_ub_mmiowr_ssreset_w1c_unsupported(void) "attempted to W1C CSTS.NSSRO but CA
--            acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
+ nvme_ub_mmiowr_ssreset_unsupported(void) "attempted NVM subsystem reset but CAP.NSSRS is zero (not supported)"
--                                        flags & bs->supported_write_flags,
+ nvme_ub_mmiowr_cmbloc_reserved(void) "invalid write to reserved CMBLOC when CMBSZ is zero, ignored"
--                                        bdrv_co_io_em_complete, &co);
+ nvme_ub_mmiowr_cmbsz_readonly(void) "invalid write to read only CMBSZ, ignored"
--            flags &= ~bs->supported_write_flags;
++nvme_ub_mmiowr_pmrcap_readonly(void) "invalid write to read only PMRCAP, ignored"
--        } else {
++nvme_ub_mmiowr_pmrsts_readonly(void) "invalid write to read only PMRSTS, ignored"
--            assert(!bs->supported_write_flags);
++nvme_ub_mmiowr_pmrebs_readonly(void) "invalid write to read only PMREBS, ignored"
--            acb = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
++nvme_ub_mmiowr_pmrswtp_readonly(void) "invalid write to read only PMRSWTP, ignored"
--                                       bdrv_co_io_em_complete, &co);
+ nvme_ub_mmiowr_invalid(uint64_t offset, uint64_t data) "invalid MMIO write, offset=0x%"PRIx64", data=0x%"PRIx64""
--        }
+ nvme_ub_mmiord_misaligned32(uint64_t offset) "MMIO read not 32-bit aligned, offset=0x%"PRIx64""
-+        acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
+ nvme_ub_mmiord_toosmall(uint64_t offset) "MMIO read smaller than 32-bits, offset=0x%"PRIx64""
 +                                    flags & bs->supported_write_flags,
 +                                    bdrv_co_io_em_complete, &co);
 +        flags &= ~bs->supported_write_flags;
          if (acb == NULL) {
              ret = -EIO;
          } else {
              qemu_coroutine_yield();
              ret = co.ret;
          }
 +        goto emulate_flags;
 +    }
 +
 +    sector_num = offset >> BDRV_SECTOR_BITS;
 +    nb_sectors = bytes >> BDRV_SECTOR_BITS;
 +
 +    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
 +    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 +    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
 +
 +    if (drv->bdrv_co_writev_flags) {
 +        ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
 +                                        flags & bs->supported_write_flags);
 +        flags &= ~bs->supported_write_flags;
 +    } else {
 +        assert(drv->bdrv_co_writev);
 +        assert(!bs->supported_write_flags);
 +        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
      }
  emulate_flags:
 --
-.13.6
+.25.3

-[Qemu-devel] [PULL 14/37] blockjob: Implement block_job_set_speed() centrally
+[PULL 14/15] qom: Factor out user_creatable_add_dict()
-All block job drivers support .set_speed and all of them duplicate the
+The QMP handler qmp_object_add() and the implementation of --object in
-same code to implement it. Move that code to blockjob.c and remove the
+qemu-storage-daemon can share most of the code. Currently,
-now useless callback.
+qemu-storage-daemon calls qmp_object_add(), but this is not correct
 because different visitors need to be used.
 As a first step towards a fix, make qmp_object_add() a wrapper around a
 new function user_creatable_add_dict() that can get an additional
 parameter. The handling of "props" is only required for compatibility
 and not required for the qemu-storage-daemon command line, so it stays
 in qmp_object_add().
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: John Snow <jsnow@redhat.com>
 ---
- include/block/blockjob.h     |  2 ++
+ include/qom/object_interfaces.h | 12 ++++++++++++
- include/block/blockjob_int.h |  3 ---
+ qom/object_interfaces.c         | 27 +++++++++++++++++++++++++++
- block/backup.c               | 13 -------------
+ qom/qom-qmp-cmds.c              | 24 +-----------------------
- block/commit.c               | 14 --------------
+files changed, 40 insertions(+), 23 deletions(-)
  block/mirror.c               | 26 ++++++--------------------
  block/stream.c               | 14 --------------
  blockjob.c                   | 12 ++++--------
 files changed, 12 insertions(+), 72 deletions(-)
-diff --git a/include/block/blockjob.h b/include/block/blockjob.h
+diff --git a/include/qom/object_interfaces.h b/include/qom/object_interfaces.h
 index XXXXXXX..XXXXXXX 100644
---- a/include/block/blockjob.h
+--- a/include/qom/object_interfaces.h
-+++ b/include/block/blockjob.h
++++ b/include/qom/object_interfaces.h
@@ -XXX,XX +XXX,XX @@ Object *user_creatable_add_type(const char *type, const char *id,
                                  const QDict *qdict,
                                  Visitor *v, Error **errp);
 +/**
 + * user_creatable_add_dict:
 + * @qdict: the object definition
 + * @errp: if an error occurs, a pointer to an area to store the error
 + *
 + * Create an instance of the user creatable object that is defined by
 + * @qdict.  The object type is taken from the QDict key 'qom-type', its
 + * ID from the key 'id'. The remaining entries in @qdict are used to
 + * initialize the object properties.
 + */
 +void user_creatable_add_dict(QDict *qdict, Error **errp);
 +
  /**
   * user_creatable_add_opts:
   * @opts: the object definition
 diff --git a/qom/object_interfaces.c b/qom/object_interfaces.c
 index XXXXXXX..XXXXXXX 100644
 --- a/qom/object_interfaces.c
 +++ b/qom/object_interfaces.c
 @@ -XXX,XX +XXX,XX @@
- #include "block/block.h"
+ #include "qapi/qmp/qerror.h"
- #include "qemu/ratelimit.h"
+ #include "qapi/qmp/qjson.h"
+ #include "qapi/qmp/qstring.h"
-+#define BLOCK_JOB_SLICE_TIME 100000000ULL /* ns */
++#include "qapi/qobject-input-visitor.h"
  #include "qom/object_interfaces.h"
  #include "qemu/help_option.h"
  #include "qemu/module.h"
@@ -XXX,XX +XXX,XX @@ out:
      return obj;
  }
 +void user_creatable_add_dict(QDict *qdict, Error **errp)
 +{
 +    Visitor *v;
 +    Object *obj;
 +    g_autofree char *type = NULL;
 +    g_autofree char *id = NULL;
 +
- typedef struct BlockJobDriver BlockJobDriver;
++    type = g_strdup(qdict_get_try_str(qdict, "qom-type"));
- typedef struct BlockJobTxn BlockJobTxn;
++    if (!type) {
++        error_setg(errp, QERR_MISSING_PARAMETER, "qom-type");
-diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
++        return;
 +    }
 +    qdict_del(qdict, "qom-type");
 +
 +    id = g_strdup(qdict_get_try_str(qdict, "id"));
 +    if (!id) {
 +        error_setg(errp, QERR_MISSING_PARAMETER, "id");
 +        return;
 +    }
 +    qdict_del(qdict, "id");
 +
 +    v = qobject_input_visitor_new(QOBJECT(qdict));
 +    obj = user_creatable_add_type(type, id, qdict, v, errp);
 +    visit_free(v);
 +    object_unref(obj);
 +}
  Object *user_creatable_add_opts(QemuOpts *opts, Error **errp)
  {
 diff --git a/qom/qom-qmp-cmds.c b/qom/qom-qmp-cmds.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/block/blockjob_int.h
+--- a/qom/qom-qmp-cmds.c
-+++ b/include/block/blockjob_int.h
++++ b/qom/qom-qmp-cmds.c
-@@ -XXX,XX +XXX,XX @@ struct BlockJobDriver {
+@@ -XXX,XX +XXX,XX @@
-     /** String describing the operation, part of query-block-jobs QMP API */
+ #include "qapi/qapi-commands-qom.h"
-     BlockJobType job_type;
+ #include "qapi/qmp/qdict.h"
+ #include "qapi/qmp/qerror.h"
--    /** Optional callback for job types that support setting a speed limit */
+-#include "qapi/qobject-input-visitor.h"
--    void (*set_speed)(BlockJob *job, int64_t speed, Error **errp);
+ #include "qemu/cutils.h"
  #include "qom/object_interfaces.h"
  #include "qom/qom-qobject.h"
@@ -XXX,XX +XXX,XX @@ void qmp_object_add(QDict *qdict, QObject **ret_data, Error **errp)
  {
      QObject *props;
      QDict *pdict;
 -    Visitor *v;
 -    Object *obj;
 -    g_autofree char *type = NULL;
 -    g_autofree char *id = NULL;
 -
-     /** Mandatory: Entrypoint for the Coroutine. */
+-    type = g_strdup(qdict_get_try_str(qdict, "qom-type"));
-     CoroutineEntry *start;
+-    if (!type) {
+-        error_setg(errp, QERR_MISSING_PARAMETER, "qom-type");
 diff --git a/block/backup.c b/block/backup.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/backup.c
 +++ b/block/backup.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/error-report.h"
  #define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)
 -#define SLICE_TIME 100000000ULL /* ns */
  typedef struct BackupBlockJob {
      BlockJob common;
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn backup_before_write_notify(
      return backup_do_cow(job, req->offset, req->bytes, NULL, true);
  }
 -static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
 -{
 -    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
 -
 -    if (speed < 0) {
 -        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
 -        return;
 -    }
--    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
+-    qdict_del(qdict, "qom-type");
 -}
 -
- static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
+-    id = g_strdup(qdict_get_try_str(qdict, "id"));
- {
+-    if (!id) {
-     BdrvDirtyBitmap *bm;
+-        error_setg(errp, QERR_MISSING_PARAMETER, "id");
@@ -XXX,XX +XXX,XX @@ static const BlockJobDriver backup_job_driver = {
      .instance_size          = sizeof(BackupBlockJob),
      .job_type               = BLOCK_JOB_TYPE_BACKUP,
      .start                  = backup_run,
 -    .set_speed              = backup_set_speed,
      .commit                 = backup_commit,
      .abort                  = backup_abort,
      .clean                  = backup_clean,
 diff --git a/block/commit.c b/block/commit.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/commit.c
 +++ b/block/commit.c
@@ -XXX,XX +XXX,XX @@ enum {
      COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
  };
 -#define SLICE_TIME 100000000ULL /* ns */
 -
  typedef struct CommitBlockJob {
      BlockJob common;
      BlockDriverState *commit_top_bs;
@@ -XXX,XX +XXX,XX @@ out:
      block_job_defer_to_main_loop(&s->common, commit_complete, data);
  }
 -static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
 -{
 -    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
 -
 -    if (speed < 0) {
 -        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
 -        return;
 -    }
--    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
+-    qdict_del(qdict, "id");
--}
--
+     props = qdict_get(qdict, "props");
- static const BlockJobDriver commit_job_driver = {
+     if (props) {
-     .instance_size = sizeof(CommitBlockJob),
+@@ -XXX,XX +XXX,XX @@ void qmp_object_add(QDict *qdict, QObject **ret_data, Error **errp)
-     .job_type      = BLOCK_JOB_TYPE_COMMIT,
+         qobject_unref(pdict);
--    .set_speed     = commit_set_speed,
+     }
-     .start         = commit_run,
- };
+-    v = qobject_input_visitor_new(QOBJECT(qdict));
+-    obj = user_creatable_add_type(type, id, qdict, v, errp);
-diff --git a/block/mirror.c b/block/mirror.c
+-    visit_free(v);
-index XXXXXXX..XXXXXXX 100644
+-    object_unref(obj);
---- a/block/mirror.c
++    user_creatable_add_dict(qdict, errp);
 +++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/ratelimit.h"
  #include "qemu/bitmap.h"
 -#define SLICE_TIME    100000000ULL /* ns */
  #define MAX_IN_FLIGHT 16
  #define MAX_IO_BYTES (1 << 20) /* 1 Mb */
  #define DEFAULT_MIRROR_BUF_SIZE (MAX_IN_FLIGHT * MAX_IO_BYTES)
@@ -XXX,XX +XXX,XX @@ static void mirror_throttle(MirrorBlockJob *s)
  {
      int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 -    if (now - s->last_pause_ns > SLICE_TIME) {
 +    if (now - s->last_pause_ns > BLOCK_JOB_SLICE_TIME) {
          s->last_pause_ns = now;
          block_job_sleep_ns(&s->common, 0);
      } else {
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_run(void *opaque)
          /* Note that even when no rate limit is applied we need to yield
           * periodically with no pending I/O so that bdrv_drain_all() returns.
 -         * We do so every SLICE_TIME nanoseconds, or when there is an error,
 -         * or when the source is clean, whichever comes first.
 -         */
 +         * We do so every BLOCK_JOB_SLICE_TIME nanoseconds, or when there is
 +         * an error, or when the source is clean, whichever comes first. */
          delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns;
 -        if (delta < SLICE_TIME &&
 +        if (delta < BLOCK_JOB_SLICE_TIME &&
              s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
              if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                  (cnt == 0 && s->in_flight > 0)) {
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_run(void *opaque)
          ret = 0;
          if (s->synced && !should_complete) {
 -            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
 +            delay_ns = (s->in_flight == 0 &&
 +                        cnt == 0 ? BLOCK_JOB_SLICE_TIME : 0);
          }
          trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
          block_job_sleep_ns(&s->common, delay_ns);
@@ -XXX,XX +XXX,XX @@ immediate_exit:
      block_job_defer_to_main_loop(&s->common, mirror_exit, data);
  }
--static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
+ void qmp_object_del(const char *id, Error **errp)
 -{
 -    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
 -
 -    if (speed < 0) {
 -        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
 -        return;
 -    }
 -    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
 -}
 -
  static void mirror_complete(BlockJob *job, Error **errp)
  {
      MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
@@ -XXX,XX +XXX,XX @@ static void mirror_drain(BlockJob *job)
  static const BlockJobDriver mirror_job_driver = {
      .instance_size          = sizeof(MirrorBlockJob),
      .job_type               = BLOCK_JOB_TYPE_MIRROR,
 -    .set_speed              = mirror_set_speed,
      .start                  = mirror_run,
      .complete               = mirror_complete,
      .pause                  = mirror_pause,
@@ -XXX,XX +XXX,XX @@ static const BlockJobDriver mirror_job_driver = {
  static const BlockJobDriver commit_active_job_driver = {
      .instance_size          = sizeof(MirrorBlockJob),
      .job_type               = BLOCK_JOB_TYPE_COMMIT,
 -    .set_speed              = mirror_set_speed,
      .start                  = mirror_run,
      .complete               = mirror_complete,
      .pause                  = mirror_pause,
 diff --git a/block/stream.c b/block/stream.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/stream.c
 +++ b/block/stream.c
@@ -XXX,XX +XXX,XX @@ enum {
      STREAM_BUFFER_SIZE = 512 * 1024, /* in bytes */
  };
 -#define SLICE_TIME 100000000ULL /* ns */
 -
  typedef struct StreamBlockJob {
      BlockJob common;
      BlockDriverState *base;
@@ -XXX,XX +XXX,XX @@ out:
      block_job_defer_to_main_loop(&s->common, stream_complete, data);
  }
 -static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp)
 -{
 -    StreamBlockJob *s = container_of(job, StreamBlockJob, common);
 -
 -    if (speed < 0) {
 -        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
 -        return;
 -    }
 -    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
 -}
 -
  static const BlockJobDriver stream_job_driver = {
      .instance_size = sizeof(StreamBlockJob),
      .job_type      = BLOCK_JOB_TYPE_STREAM,
 -    .set_speed     = stream_set_speed,
      .start         = stream_run,
  };
 diff --git a/blockjob.c b/blockjob.c
 index XXXXXXX..XXXXXXX 100644
 --- a/blockjob.c
 +++ b/blockjob.c
@@ -XXX,XX +XXX,XX @@ static bool block_job_timer_pending(BlockJob *job)
  void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
  {
 -    Error *local_err = NULL;
      int64_t old_speed = job->speed;
 -    if (!job->driver->set_speed) {
 -        error_setg(errp, QERR_UNSUPPORTED);
 -        return;
 -    }
      if (block_job_apply_verb(job, BLOCK_JOB_VERB_SET_SPEED, errp)) {
          return;
      }
 -    job->driver->set_speed(job, speed, &local_err);
 -    if (local_err) {
 -        error_propagate(errp, local_err);
 +    if (speed < 0) {
 +        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
          return;
      }
 +    ratelimit_set_speed(&job->limit, speed, BLOCK_JOB_SLICE_TIME);
 +
      job->speed = speed;
      if (speed && speed <= old_speed) {
          return;
 --
-.13.6
+.25.3

-[Qemu-devel] [PULL 05/37] rbd: Switch to byte-based callbacks
+[PULL 15/15] qemu-storage-daemon: Fix non-string --object properties
-From: Eric Blake <eblake@redhat.com>
+After processing the option string with the keyval parser, we get a
 QDict that contains only strings. This QDict must be fed to a keyval
 visitor which converts the strings into the right data types.
-We are gradually moving away from sector-based interfaces, towards
+qmp_object_add(), however, uses the normal QObject input visitor, which
-byte-based.  Make the change for the last few sector-based callbacks
+expects a QDict where all properties already have the QType that matches
-in the rbd driver.
+the data type required by the QOM object type.
-Note that the driver was already using byte-based calls for
+Change the --object implementation in qemu-storage-daemon so that it
-performing actual I/O, so this just gets rid of a round trip
+doesn't call qmp_object_add(), but calls user_creatable_add_dict()
-of scaling; however, as I don't know if RBD is tolerant of
+directly instead and passes it a new keyval boolean that decides which
-non-sector AIO operations, I went with the conservative approach
+visitor must be used.
 of adding .bdrv_refresh_limits to override the block layer
 defaults back to the pre-patch value of 512.
-Signed-off-by: Eric Blake <eblake@redhat.com>
+Reported-by: Coiby Xu <coiby.xu@gmail.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
- block/rbd.c | 40 ++++++++++++++++++++++------------------
+ include/qom/object_interfaces.h | 6 +++++-
-file changed, 22 insertions(+), 18 deletions(-)
+ qemu-storage-daemon.c           | 4 +---
  qom/object_interfaces.c         | 8 ++++++--
  qom/qom-qmp-cmds.c              | 2 +-
 files changed, 13 insertions(+), 7 deletions(-)
-diff --git a/block/rbd.c b/block/rbd.c
+diff --git a/include/qom/object_interfaces.h b/include/qom/object_interfaces.h
 index XXXXXXX..XXXXXXX 100644
---- a/block/rbd.c
+--- a/include/qom/object_interfaces.h
-+++ b/block/rbd.c
++++ b/include/qom/object_interfaces.h
-@@ -XXX,XX +XXX,XX @@ done:
+@@ -XXX,XX +XXX,XX @@ Object *user_creatable_add_type(const char *type, const char *id,
  /**
   * user_creatable_add_dict:
   * @qdict: the object definition
 + * @keyval: if true, use a keyval visitor for processing @qdict (i.e.
 + *          assume that all @qdict values are strings); otherwise, use
 + *          the normal QObject visitor (i.e. assume all @qdict values
 + *          have the QType expected by the QOM object type)
   * @errp: if an error occurs, a pointer to an area to store the error
   *
   * Create an instance of the user creatable object that is defined by
@@ -XXX,XX +XXX,XX @@ Object *user_creatable_add_type(const char *type, const char *id,
   * ID from the key 'id'. The remaining entries in @qdict are used to
   * initialize the object properties.
   */
 -void user_creatable_add_dict(QDict *qdict, Error **errp);
 +void user_creatable_add_dict(QDict *qdict, bool keyval, Error **errp);
  /**
   * user_creatable_add_opts:
 diff --git a/qemu-storage-daemon.c b/qemu-storage-daemon.c
 index XXXXXXX..XXXXXXX 100644
 --- a/qemu-storage-daemon.c
 +++ b/qemu-storage-daemon.c
@@ -XXX,XX +XXX,XX @@ static void process_options(int argc, char *argv[])
                  QemuOpts *opts;
                  const char *type;
                  QDict *args;
 -                QObject *ret_data = NULL;
                  /* FIXME The keyval parser rejects 'help' arguments, so we must
                   * unconditionally try QemuOpts first. */
@@ -XXX,XX +XXX,XX @@ static void process_options(int argc, char *argv[])
                  qemu_opts_del(opts);
                  args = keyval_parse(optarg, "qom-type", &error_fatal);
 -                qmp_object_add(args, &ret_data, &error_fatal);
 +                user_creatable_add_dict(args, true, &error_fatal);
                  qobject_unref(args);
 -                qobject_unref(ret_data);
                  break;
              }
          default:
 diff --git a/qom/object_interfaces.c b/qom/object_interfaces.c
 index XXXXXXX..XXXXXXX 100644
 --- a/qom/object_interfaces.c
 +++ b/qom/object_interfaces.c
@@ -XXX,XX +XXX,XX @@ out:
      return obj;
  }
+-void user_creatable_add_dict(QDict *qdict, Error **errp)
-+static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp)
++void user_creatable_add_dict(QDict *qdict, bool keyval, Error **errp)
 +{
 +    /* XXX Does RBD support AIO on less than 512-byte alignment? */
 +    bs->bl.request_alignment = 512;
 +}
 +
 +
  static int qemu_rbd_set_auth(rados_t cluster, const char *secretid,
                               Error **errp)
  {
-@@ -XXX,XX +XXX,XX @@ failed:
+     Visitor *v;
-     return NULL;
+     Object *obj;
@@ -XXX,XX +XXX,XX @@ void user_creatable_add_dict(QDict *qdict, Error **errp)
      }
      qdict_del(qdict, "id");
 -    v = qobject_input_visitor_new(QOBJECT(qdict));
 +    if (keyval) {
 +        v = qobject_input_visitor_new_keyval(QOBJECT(qdict));
 +    } else {
 +        v = qobject_input_visitor_new(QOBJECT(qdict));
 +    }
      obj = user_creatable_add_type(type, id, qdict, v, errp);
      visit_free(v);
      object_unref(obj);
 diff --git a/qom/qom-qmp-cmds.c b/qom/qom-qmp-cmds.c
 index XXXXXXX..XXXXXXX 100644
 --- a/qom/qom-qmp-cmds.c
 +++ b/qom/qom-qmp-cmds.c
@@ -XXX,XX +XXX,XX @@ void qmp_object_add(QDict *qdict, QObject **ret_data, Error **errp)
          qobject_unref(pdict);
      }
 -    user_creatable_add_dict(qdict, errp);
 +    user_creatable_add_dict(qdict, false, errp);
  }
--static BlockAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs,
+ void qmp_object_del(const char *id, Error **errp)
 -                                      int64_t sector_num,
 -                                      QEMUIOVector *qiov,
 -                                      int nb_sectors,
 -                                      BlockCompletionFunc *cb,
 -                                      void *opaque)
 +static BlockAIOCB *qemu_rbd_aio_preadv(BlockDriverState *bs,
 +                                       uint64_t offset, uint64_t bytes,
 +                                       QEMUIOVector *qiov, int flags,
 +                                       BlockCompletionFunc *cb,
 +                                       void *opaque)
  {
 -    return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov,
 -                         (int64_t) nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
 +    return rbd_start_aio(bs, offset, qiov, bytes, cb, opaque,
                           RBD_AIO_READ);
  }
 -static BlockAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
 -                                       int64_t sector_num,
 -                                       QEMUIOVector *qiov,
 -                                       int nb_sectors,
 -                                       BlockCompletionFunc *cb,
 -                                       void *opaque)
 +static BlockAIOCB *qemu_rbd_aio_pwritev(BlockDriverState *bs,
 +                                        uint64_t offset, uint64_t bytes,
 +                                        QEMUIOVector *qiov, int flags,
 +                                        BlockCompletionFunc *cb,
 +                                        void *opaque)
  {
 -    return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov,
 -                         (int64_t) nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
 +    return rbd_start_aio(bs, offset, qiov, bytes, cb, opaque,
                           RBD_AIO_WRITE);
  }
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_rbd = {
      .format_name            = "rbd",
      .instance_size          = sizeof(BDRVRBDState),
      .bdrv_parse_filename    = qemu_rbd_parse_filename,
 +    .bdrv_refresh_limits    = qemu_rbd_refresh_limits,
      .bdrv_file_open         = qemu_rbd_open,
      .bdrv_close             = qemu_rbd_close,
      .bdrv_reopen_prepare    = qemu_rbd_reopen_prepare,
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_rbd = {
      .bdrv_truncate          = qemu_rbd_truncate,
      .protocol_name          = "rbd",
 -    .bdrv_aio_readv         = qemu_rbd_aio_readv,
 -    .bdrv_aio_writev        = qemu_rbd_aio_writev,
 +    .bdrv_aio_preadv        = qemu_rbd_aio_preadv,
 +    .bdrv_aio_pwritev       = qemu_rbd_aio_pwritev,
  #ifdef LIBRBD_SUPPORTS_AIO_FLUSH
      .bdrv_aio_flush         = qemu_rbd_aio_flush,
 --
-.13.6
+.25.3

-[Qemu-devel] [PULL 11/37] blockjob: Fix assertion in block_job_finalize()
+Deleted patch
-Every job gets a non-NULL job->txn on creation, but it doesn't
-necessarily keep it until it is decommissioned: Finalising a job removes
-it from its transaction. Therefore, calling 'blockdev-job-finalize' a
-second time on an already concluded job causes an assertion failure.
-Remove job->txn from the assertion in block_job_finalize() to fix this.
-block_job_do_finalize() still has the same assertion, but if a job is
-already removed from its transaction, block_job_apply_verb() will
-already error out before we run into that assertion.
-Cc: qemu-stable@nongnu.org
-Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: John Snow <jsnow@redhat.com>
----
- blockjob.c | 2 +-
-file changed, 1 insertion(+), 1 deletion(-)
-diff --git a/blockjob.c b/blockjob.c
-index XXXXXXX..XXXXXXX 100644
---- a/blockjob.c
-+++ b/blockjob.c
-@@ -XXX,XX +XXX,XX @@ void block_job_complete(BlockJob *job, Error **errp)
- void block_job_finalize(BlockJob *job, Error **errp)
- {
--    assert(job && job->id && job->txn);
-+    assert(job && job->id);
-     if (block_job_apply_verb(job, BLOCK_JOB_VERB_FINALIZE, errp)) {
-         return;
-     }
---
-.13.6

-[Qemu-devel] [PULL 13/37] blockjob: Move RateLimit to BlockJob
+Deleted patch
-Every block job has a RateLimit, and they all do the exact same thing
-with it, so it should be common infrastructure. Move the struct field
-for a start.
-Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: John Snow <jsnow@redhat.com>
----
- include/block/blockjob.h | 4 ++++
- block/backup.c           | 5 ++---
- block/commit.c           | 5 ++---
- block/mirror.c           | 6 +++---
- block/stream.c           | 5 ++---
-files changed, 13 insertions(+), 12 deletions(-)
-diff --git a/include/block/blockjob.h b/include/block/blockjob.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/block/blockjob.h
-+++ b/include/block/blockjob.h
-@@ -XXX,XX +XXX,XX @@
- #define BLOCKJOB_H
- #include "block/block.h"
-+#include "qemu/ratelimit.h"
- typedef struct BlockJobDriver BlockJobDriver;
- typedef struct BlockJobTxn BlockJobTxn;
-@@ -XXX,XX +XXX,XX @@ typedef struct BlockJob {
-     /** Speed that was set with @block_job_set_speed.  */
-     int64_t speed;
-+    /** Rate limiting data structure for implementing @speed. */
-+    RateLimit limit;
-+
-     /** The completion function that will be called when the job completes.  */
-     BlockCompletionFunc *cb;
-diff --git a/block/backup.c b/block/backup.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/backup.c
-+++ b/block/backup.c
-@@ -XXX,XX +XXX,XX @@ typedef struct BackupBlockJob {
-     /* bitmap for sync=incremental */
-     BdrvDirtyBitmap *sync_bitmap;
-     MirrorSyncMode sync_mode;
--    RateLimit limit;
-     BlockdevOnError on_source_error;
-     BlockdevOnError on_target_error;
-     CoRwlock flush_rwlock;
-@@ -XXX,XX +XXX,XX @@ static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
-         error_setg(errp, QERR_INVALID_PARAMETER, "speed");
-         return;
-     }
--    ratelimit_set_speed(&s->limit, speed, SLICE_TIME);
-+    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
- }
- static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
-@@ -XXX,XX +XXX,XX @@ static bool coroutine_fn yield_and_check(BackupBlockJob *job)
-      * (without, VM does not reboot)
-      */
-     if (job->common.speed) {
--        uint64_t delay_ns = ratelimit_calculate_delay(&job->limit,
-+        uint64_t delay_ns = ratelimit_calculate_delay(&job->common.limit,
-                                                       job->bytes_read);
-         job->bytes_read = 0;
-         block_job_sleep_ns(&job->common, delay_ns);
-diff --git a/block/commit.c b/block/commit.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/commit.c
-+++ b/block/commit.c
-@@ -XXX,XX +XXX,XX @@ enum {
- typedef struct CommitBlockJob {
-     BlockJob common;
--    RateLimit limit;
-     BlockDriverState *commit_top_bs;
-     BlockBackend *top;
-     BlockBackend *base;
-@@ -XXX,XX +XXX,XX @@ static void coroutine_fn commit_run(void *opaque)
-         block_job_progress_update(&s->common, n);
-         if (copy && s->common.speed) {
--            delay_ns = ratelimit_calculate_delay(&s->limit, n);
-+            delay_ns = ratelimit_calculate_delay(&s->common.limit, n);
-         } else {
-             delay_ns = 0;
-         }
-@@ -XXX,XX +XXX,XX @@ static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
-         error_setg(errp, QERR_INVALID_PARAMETER, "speed");
-         return;
-     }
--    ratelimit_set_speed(&s->limit, speed, SLICE_TIME);
-+    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
- }
- static const BlockJobDriver commit_job_driver = {
-diff --git a/block/mirror.c b/block/mirror.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/mirror.c
-+++ b/block/mirror.c
-@@ -XXX,XX +XXX,XX @@ typedef struct MirrorBuffer {
- typedef struct MirrorBlockJob {
-     BlockJob common;
--    RateLimit limit;
-     BlockBackend *target;
-     BlockDriverState *mirror_top_bs;
-     BlockDriverState *source;
-@@ -XXX,XX +XXX,XX @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
-         offset += io_bytes;
-         nb_chunks -= DIV_ROUND_UP(io_bytes, s->granularity);
-         if (s->common.speed) {
--            delay_ns = ratelimit_calculate_delay(&s->limit, io_bytes_acct);
-+            delay_ns = ratelimit_calculate_delay(&s->common.limit,
-+                                                 io_bytes_acct);
-         }
-     }
-     return delay_ns;
-@@ -XXX,XX +XXX,XX @@ static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
-         error_setg(errp, QERR_INVALID_PARAMETER, "speed");
-         return;
-     }
--    ratelimit_set_speed(&s->limit, speed, SLICE_TIME);
-+    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
- }
- static void mirror_complete(BlockJob *job, Error **errp)
-diff --git a/block/stream.c b/block/stream.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/stream.c
-+++ b/block/stream.c
-@@ -XXX,XX +XXX,XX @@ enum {
- typedef struct StreamBlockJob {
-     BlockJob common;
--    RateLimit limit;
-     BlockDriverState *base;
-     BlockdevOnError on_error;
-     char *backing_file_str;
-@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
-         /* Publish progress */
-         block_job_progress_update(&s->common, n);
-         if (copy && s->common.speed) {
--            delay_ns = ratelimit_calculate_delay(&s->limit, n);
-+            delay_ns = ratelimit_calculate_delay(&s->common.limit, n);
-         } else {
-             delay_ns = 0;
-         }
-@@ -XXX,XX +XXX,XX @@ static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp)
-         error_setg(errp, QERR_INVALID_PARAMETER, "speed");
-         return;
-     }
--    ratelimit_set_speed(&s->limit, speed, SLICE_TIME);
-+    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
- }
- static const BlockJobDriver stream_job_driver = {
---
-.13.6

-[Qemu-devel] [PULL 16/37] blockjob: Add block_job_driver()
+Deleted patch
-The backup block job directly accesses the driver field in BlockJob. Add
-a wrapper for getting it.
-Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: John Snow <jsnow@redhat.com>
----
- include/block/blockjob.h | 7 +++++++
- block/backup.c           | 8 +++++---
- blockjob.c               | 5 +++++
-files changed, 17 insertions(+), 3 deletions(-)
-diff --git a/include/block/blockjob.h b/include/block/blockjob.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/block/blockjob.h
-+++ b/include/block/blockjob.h
-@@ -XXX,XX +XXX,XX @@ void block_job_txn_add_job(BlockJobTxn *txn, BlockJob *job);
-  */
- bool block_job_is_internal(BlockJob *job);
-+/**
-+ * block_job_driver:
-+ *
-+ * Returns the driver associated with a block job.
-+ */
-+const BlockJobDriver *block_job_driver(BlockJob *job);
-+
- #endif
-diff --git a/block/backup.c b/block/backup.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/backup.c
-+++ b/block/backup.c
-@@ -XXX,XX +XXX,XX @@ typedef struct BackupBlockJob {
-     HBitmap *copy_bitmap;
- } BackupBlockJob;
-+static const BlockJobDriver backup_job_driver;
-+
- /* See if in-flight requests overlap and wait for them to complete */
- static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
-                                                        int64_t start,
-@@ -XXX,XX +XXX,XX @@ void backup_do_checkpoint(BlockJob *job, Error **errp)
-     BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
-     int64_t len;
--    assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);
-+    assert(block_job_driver(job) == &backup_job_driver);
-     if (backup_job->sync_mode != MIRROR_SYNC_MODE_NONE) {
-         error_setg(errp, "The backup job only supports block checkpoint in"
-@@ -XXX,XX +XXX,XX @@ void backup_wait_for_overlapping_requests(BlockJob *job, int64_t offset,
-     BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
-     int64_t start, end;
--    assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);
-+    assert(block_job_driver(job) == &backup_job_driver);
-     start = QEMU_ALIGN_DOWN(offset, backup_job->cluster_size);
-     end = QEMU_ALIGN_UP(offset + bytes, backup_job->cluster_size);
-@@ -XXX,XX +XXX,XX @@ void backup_cow_request_begin(CowRequest *req, BlockJob *job,
-     BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
-     int64_t start, end;
--    assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);
-+    assert(block_job_driver(job) == &backup_job_driver);
-     start = QEMU_ALIGN_DOWN(offset, backup_job->cluster_size);
-     end = QEMU_ALIGN_UP(offset + bytes, backup_job->cluster_size);
-diff --git a/blockjob.c b/blockjob.c
-index XXXXXXX..XXXXXXX 100644
---- a/blockjob.c
-+++ b/blockjob.c
-@@ -XXX,XX +XXX,XX @@ static bool block_job_started(BlockJob *job)
-     return job->co;
- }
-+const BlockJobDriver *block_job_driver(BlockJob *job)
-+{
-+    return job->driver;
-+}
-+
- /**
-  * All jobs must allow a pause point before entering their job proper. This
-  * ensures that jobs can be paused prior to being started, then resumed later.
---
-.13.6

-[Qemu-devel] [PULL 17/37] iotests: Split 214 off of 122
+Deleted patch
-From: Max Reitz <mreitz@redhat.com>
-Commit abd3622cc03cf41ed542126a540385f30a4c0175 added a case to 122
-regarding how the qcow2 driver handles an incorrect compressed data
-length value.  This does not really fit into 122, as that file is
-supposed to contain qemu-img convert test cases, which this case is not.
-So this patch splits it off into its own file; maybe we will even get
-more qcow2-only compression tests in the future.
-Also, that test case does not work with refcount_bits=1, so mark that
-option as unsupported.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Message-id: 20180406164108.26118-1-mreitz@redhat.com
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- tests/qemu-iotests/122     | 47 ----------------------
- tests/qemu-iotests/122.out | 33 ----------------
- tests/qemu-iotests/214     | 97 ++++++++++++++++++++++++++++++++++++++++++++++
- tests/qemu-iotests/214.out | 35 +++++++++++++++++
- tests/qemu-iotests/group   |  1 +
-files changed, 133 insertions(+), 80 deletions(-)
- create mode 100755 tests/qemu-iotests/214
- create mode 100644 tests/qemu-iotests/214.out
-diff --git a/tests/qemu-iotests/122 b/tests/qemu-iotests/122
-index XXXXXXX..XXXXXXX 100755
---- a/tests/qemu-iotests/122
-+++ b/tests/qemu-iotests/122
-@@ -XXX,XX +XXX,XX @@ $QEMU_IO -c "read -P 0    1024k 1022k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _fil
- echo
--echo "=== Corrupted size field in compressed cluster descriptor ==="
--echo
--# Create an empty image and fill half of it with compressed data.
--# The L2 entries of the two compressed clusters are located at
--# 0x800000 and 0x800008, their original values are 0x4008000000a00000
--# and 0x4008000000a00802 (5 sectors for compressed data each).
--_make_test_img 8M -o cluster_size=2M
--$QEMU_IO -c "write -c -P 0x11 0 2M" -c "write -c -P 0x11 2M 2M" "$TEST_IMG" \
--         2>&1 | _filter_qemu_io | _filter_testdir
--
--# Reduce size of compressed data to 4 sectors: this corrupts the image.
--poke_file "$TEST_IMG" $((0x800000)) "\x40\x06"
--$QEMU_IO -c "read  -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
--
--# 'qemu-img check' however doesn't see anything wrong because it
--# doesn't try to decompress the data and the refcounts are consistent.
--# TODO: update qemu-img so this can be detected.
--_check_test_img
--
--# Increase size of compressed data to the maximum (8192 sectors).
--# This makes QEMU read more data (8192 sectors instead of 5, host
--# addresses [0xa00000, 0xdfffff]), but the decompression algorithm
--# stops once we have enough to restore the uncompressed cluster, so
--# the rest of the data is ignored.
--poke_file "$TEST_IMG" $((0x800000)) "\x7f\xfe"
--# Do it also for the second compressed cluster (L2 entry at 0x800008).
--# In this case the compressed data would span 3 host clusters
--# (host addresses: [0xa00802, 0xe00801])
--poke_file "$TEST_IMG" $((0x800008)) "\x7f\xfe"
--
--# Here the image is too small so we're asking QEMU to read beyond the
--# end of the image.
--$QEMU_IO -c "read  -P 0x11  0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
--# But if we grow the image we won't be reading beyond its end anymore.
--$QEMU_IO -c "write -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
--$QEMU_IO -c "read  -P 0x11  0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
--
--# The refcount data is however wrong because due to the increased size
--# of the compressed data it now reaches the following host clusters.
--# This can be repaired by qemu-img check by increasing the refcount of
--# those clusters.
--# TODO: update qemu-img to correct the compressed cluster size instead.
--_check_test_img -r all
--$QEMU_IO -c "read  -P 0x11  0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
--$QEMU_IO -c "read  -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
--
--echo
- echo "=== Full allocation with -S 0 ==="
- echo
-diff --git a/tests/qemu-iotests/122.out b/tests/qemu-iotests/122.out
-index XXXXXXX..XXXXXXX 100644
---- a/tests/qemu-iotests/122.out
-+++ b/tests/qemu-iotests/122.out
-@@ -XXX,XX +XXX,XX @@ read 1024/1024 bytes at offset 1047552
- read 1046528/1046528 bytes at offset 1048576
-KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
--=== Corrupted size field in compressed cluster descriptor ===
--
--Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=8388608
--wrote 2097152/2097152 bytes at offset 0
--2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
--wrote 2097152/2097152 bytes at offset 2097152
--2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
--read failed: Input/output error
--No errors were found on the image.
--read 4194304/4194304 bytes at offset 0
--4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
--wrote 4194304/4194304 bytes at offset 4194304
--4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
--read 4194304/4194304 bytes at offset 0
--4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
--ERROR cluster 6 refcount=1 reference=3
--ERROR cluster 7 refcount=1 reference=2
--Repairing cluster 6 refcount=1 reference=3
--Repairing cluster 7 refcount=1 reference=2
--Repairing OFLAG_COPIED data cluster: l2_entry=8000000000c00000 refcount=3
--Repairing OFLAG_COPIED data cluster: l2_entry=8000000000e00000 refcount=2
--The following inconsistencies were found and repaired:
--
--    0 leaked clusters
--    4 corruptions
--
--Double checking the fixed image now...
--No errors were found on the image.
--read 4194304/4194304 bytes at offset 0
--4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
--read 4194304/4194304 bytes at offset 4194304
--4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
--
- === Full allocation with -S 0 ===
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
-diff --git a/tests/qemu-iotests/214 b/tests/qemu-iotests/214
-new file mode 100755
-index XXXXXXX..XXXXXXX
---- /dev/null
-+++ b/tests/qemu-iotests/214
-@@ -XXX,XX +XXX,XX @@
-+#!/bin/bash
-+#
-+# Test qcow2 image compression
-+#
-+# Copyright (C) 2018 Igalia, S.L.
-+# Author: Alberto Garcia <berto@igalia.com>
-+#
-+# This program is free software; you can redistribute it and/or modify
-+# it under the terms of the GNU General Public License as published by
-+# the Free Software Foundation; either version 2 of the License, or
-+# (at your option) any later version.
-+#
-+# This program is distributed in the hope that it will be useful,
-+# but WITHOUT ANY WARRANTY; without even the implied warranty of
-+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-+# GNU General Public License for more details.
-+#
-+# You should have received a copy of the GNU General Public License
-+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-+#
-+
-+seq=$(basename "$0")
-+echo "QA output created by $seq"
-+
-+here=$PWD
-+status=1    # failure is the default!
-+
-+_cleanup()
-+{
-+    _cleanup_test_img
-+}
-+trap "_cleanup; exit \$status" 0 1 2 3 15
-+
-+# get standard environment, filters and checks
-+. ./common.rc
-+. ./common.filter
-+
-+_supported_fmt qcow2
-+_supported_proto file
-+_supported_os Linux
-+
-+# Repairing the corrupted image requires qemu-img check to store a
-+# refcount up to 3, which requires at least two refcount bits.
-+_unsupported_imgopts 'refcount_bits=1[^0-9]'
-+
-+
-+echo
-+echo "=== Corrupted size field in compressed cluster descriptor ==="
-+echo
-+# Create an empty image and fill half of it with compressed data.
-+# The L2 entries of the two compressed clusters are located at
-+# 0x800000 and 0x800008, their original values are 0x4008000000a00000
-+# and 0x4008000000a00802 (5 sectors for compressed data each).
-+_make_test_img 8M -o cluster_size=2M
-+$QEMU_IO -c "write -c -P 0x11 0 2M" -c "write -c -P 0x11 2M 2M" "$TEST_IMG" \
-+         2>&1 | _filter_qemu_io | _filter_testdir
-+
-+# Reduce size of compressed data to 4 sectors: this corrupts the image.
-+poke_file "$TEST_IMG" $((0x800000)) "\x40\x06"
-+$QEMU_IO -c "read  -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
-+
-+# 'qemu-img check' however doesn't see anything wrong because it
-+# doesn't try to decompress the data and the refcounts are consistent.
-+# TODO: update qemu-img so this can be detected.
-+_check_test_img
-+
-+# Increase size of compressed data to the maximum (8192 sectors).
-+# This makes QEMU read more data (8192 sectors instead of 5, host
-+# addresses [0xa00000, 0xdfffff]), but the decompression algorithm
-+# stops once we have enough to restore the uncompressed cluster, so
-+# the rest of the data is ignored.
-+poke_file "$TEST_IMG" $((0x800000)) "\x7f\xfe"
-+# Do it also for the second compressed cluster (L2 entry at 0x800008).
-+# In this case the compressed data would span 3 host clusters
-+# (host addresses: [0xa00802, 0xe00801])
-+poke_file "$TEST_IMG" $((0x800008)) "\x7f\xfe"
-+
-+# Here the image is too small so we're asking QEMU to read beyond the
-+# end of the image.
-+$QEMU_IO -c "read  -P 0x11  0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
-+# But if we grow the image we won't be reading beyond its end anymore.
-+$QEMU_IO -c "write -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
-+$QEMU_IO -c "read  -P 0x11  0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
-+
-+# The refcount data is however wrong because due to the increased size
-+# of the compressed data it now reaches the following host clusters.
-+# This can be repaired by qemu-img check by increasing the refcount of
-+# those clusters.
-+# TODO: update qemu-img to correct the compressed cluster size instead.
-+_check_test_img -r all
-+$QEMU_IO -c "read  -P 0x11  0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
-+$QEMU_IO -c "read  -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
-+
-+# success, all done
-+echo '*** done'
-+rm -f $seq.full
-+status=0
-diff --git a/tests/qemu-iotests/214.out b/tests/qemu-iotests/214.out
-new file mode 100644
-index XXXXXXX..XXXXXXX
---- /dev/null
-+++ b/tests/qemu-iotests/214.out
-@@ -XXX,XX +XXX,XX @@
-+QA output created by 214
-+
-+=== Corrupted size field in compressed cluster descriptor ===
-+
-+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=8388608
-+wrote 2097152/2097152 bytes at offset 0
-+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+wrote 2097152/2097152 bytes at offset 2097152
-+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+read failed: Input/output error
-+No errors were found on the image.
-+read 4194304/4194304 bytes at offset 0
-+4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+wrote 4194304/4194304 bytes at offset 4194304
-+4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+read 4194304/4194304 bytes at offset 0
-+4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+ERROR cluster 6 refcount=1 reference=3
-+ERROR cluster 7 refcount=1 reference=2
-+Repairing cluster 6 refcount=1 reference=3
-+Repairing cluster 7 refcount=1 reference=2
-+Repairing OFLAG_COPIED data cluster: l2_entry=8000000000c00000 refcount=3
-+Repairing OFLAG_COPIED data cluster: l2_entry=8000000000e00000 refcount=2
-+The following inconsistencies were found and repaired:
-+
-+    0 leaked clusters
-+    4 corruptions
-+
-+Double checking the fixed image now...
-+No errors were found on the image.
-+read 4194304/4194304 bytes at offset 0
-+4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+read 4194304/4194304 bytes at offset 4194304
-+4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+*** done
-diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
-index XXXXXXX..XXXXXXX 100644
---- a/tests/qemu-iotests/group
-+++ b/tests/qemu-iotests/group
-@@ -XXX,XX +XXX,XX @@
-rw auto quick
-rw auto quick
-rw auto quick
-+214 rw auto
-rw auto quick
---
-2.13.6

-[Qemu-devel] [PULL 18/37] Fix error message about compressed clusters with OFLAG_COPIED
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-Compressed clusters are not supposed to have the COPIED bit set.
-"qemu-img check" detects that and prints an error message reporting
-the number of the affected host cluster. This doesn't make much sense
-because compressed clusters are not aligned to host clusters, so it
-would be better to report the offset instead. Plus, the calculation is
-wrong and it uses the raw L2 entry as if it was simply an offset.
-This patch fixes the error message and reports the offset of the
-compressed cluster.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Message-id: 0f687957feb72e80c740403191a47e607c2463fe.1523376013.git.berto@igalia.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2-refcount.c | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-refcount.c
-+++ b/block/qcow2-refcount.c
-@@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
-         case QCOW2_CLUSTER_COMPRESSED:
-             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
-             if (l2_entry & QCOW_OFLAG_COPIED) {
--                fprintf(stderr, "ERROR: cluster %" PRId64 ": "
-+                fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
-                     "copied flag must never be set for compressed "
--                    "clusters\n", l2_entry >> s->cluster_bits);
-+                    "clusters\n", l2_entry & s->cluster_offset_mask);
-                 l2_entry &= ~QCOW_OFLAG_COPIED;
-                 res->corruptions++;
-             }
---
-2.13.6

-[Qemu-devel] [PULL 21/37] docs: Document the new default sizes of the qcow2 caches
+Deleted patch
-From: Alberto Garcia <berto@igalia.com>
-We have just reduced the refcount cache size to the minimum unless
-the user explicitly requests a larger one, so we have to update the
-documentation to reflect this change.
-Signed-off-by: Alberto Garcia <berto@igalia.com>
-Message-id: c5f0bde23558dd9d33b21fffc76ac9953cc19c56.1523968389.git.berto@igalia.com
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- docs/qcow2-cache.txt | 33 ++++++++++++++++-----------------
- 1 file changed, 16 insertions(+), 17 deletions(-)
-diff --git a/docs/qcow2-cache.txt b/docs/qcow2-cache.txt
-index XXXXXXX..XXXXXXX 100644
---- a/docs/qcow2-cache.txt
-+++ b/docs/qcow2-cache.txt
-@@ -XXX,XX +XXX,XX @@ There are three options available, and all of them take bytes:
- "refcount-cache-size":   maximum size of the refcount block cache
- "cache-size":            maximum size of both caches combined
--There are two things that need to be taken into account:
-+There are a few things that need to be taken into account:
-  - Both caches must have a size that is a multiple of the cluster size
-    (or the cache entry size: see "Using smaller cache sizes" below).
-- - If you only set one of the options above, QEMU will automatically
--   adjust the others so that the L2 cache is 4 times bigger than the
--   refcount cache.
-+ - The default L2 cache size is 8 clusters or 1MB (whichever is more),
-+   and the minimum is 2 clusters (or 2 cache entries, see below).
--This means that these options are equivalent:
-+ - The default (and minimum) refcount cache size is 4 clusters.
--   -drive file=hd.qcow2,l2-cache-size=2097152
--   -drive file=hd.qcow2,refcount-cache-size=524288
--   -drive file=hd.qcow2,cache-size=2621440
-+ - If only "cache-size" is specified then QEMU will assign as much
-+   memory as possible to the L2 cache before increasing the refcount
-+   cache size.
--The reason for this 1/4 ratio is to ensure that both caches cover the
--same amount of disk space. Note however that this is only valid with
--the default value of refcount_bits (16). If you are using a different
--value you might want to calculate both cache sizes yourself since QEMU
--will always use the same 1/4 ratio.
-+Unlike L2 tables, refcount blocks are not used during normal I/O but
-+only during allocations and internal snapshots. In most cases they are
-+accessed sequentially (even during random guest I/O) so increasing the
-+refcount cache size won't have any measurable effect in performance
-+(this can change if you are using internal snapshots, so you may want
-+to think about increasing the cache size if you use them heavily).
--It's also worth mentioning that there's no strict need for both caches
--to cover the same amount of disk space. The refcount cache is used
--much less often than the L2 cache, so it's perfectly reasonable to
--keep it small.
-+Before QEMU 2.12 the refcount cache had a default size of 1/4 of the
-+L2 cache size. This resulted in unnecessarily large caches, so now the
-+refcount cache is as small as possible unless overridden by the user.
- Using smaller cache entries
---
-2.13.6

-[Qemu-devel] [PULL 22/37] iotests: Add failure matching to common.qemu
+Deleted patch
-From: Max Reitz <mreitz@redhat.com>
-Currently, common.qemu only allows matching for results indicating
-success.  The only way to fail is by provoking a timeout.  However,
-sometimes we do have a defined failure output and can match for that,
-which saves us from having to wait for the timeout in case of failure.
-Because failure can sometimes just result in a _notrun in the test, it
-is actually important to care about being able to fail quickly.
-Also, sometimes we simply do not get any specific output in case of
-success.  The only way to handle this currently would be to define an
-error message as the string to look for, which means that actual success
-results in a timeout.  This is really bad because it unnecessarily slows
-down a succeeding test.
-Therefore, this patch adds a new parameter $success_or_failure to
-_timed_wait_for and _send_qemu_cmd.  Setting this to a non-empty string
-makes both commands expect two match parameters: If the first matches,
-the function succeeds.  If the second matches, the function fails.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Message-id: 20180406151731.4285-2-mreitz@redhat.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- tests/qemu-iotests/common.qemu | 58 +++++++++++++++++++++++++++++++++++++-----
- 1 file changed, 51 insertions(+), 7 deletions(-)
-diff --git a/tests/qemu-iotests/common.qemu b/tests/qemu-iotests/common.qemu
-index XXXXXXX..XXXXXXX 100644
---- a/tests/qemu-iotests/common.qemu
-+++ b/tests/qemu-iotests/common.qemu
-@@ -XXX,XX +XXX,XX @@ _in_fd=4
- # response is not echoed out.
- # If $mismatch_only is set, only non-matching responses will
- # be echoed.
-+#
-+# If $success_or_failure is set, the meaning of the arguments is
-+# changed as follows:
-+# $2: A string to search for in the response; if found, this indicates
-+#     success and ${QEMU_STATUS[$1]} is set to 0.
-+# $3: A string to search for in the response; if found, this indicates
-+#     failure and the test is either aborted (if $qemu_error_no_exit
-+#     is not set) or ${QEMU_STATUS[$1]} is set to -1 (otherwise).
- function _timed_wait_for()
- {
-     local h=${1}
-     shift
-+    if [ -z "${success_or_failure}" ]; then
-+        success_match=${*}
-+        failure_match=
-+    else
-+        success_match=${1}
-+        failure_match=${2}
-+    fi
-+
-+    timeout=yes
-+
-     QEMU_STATUS[$h]=0
-     while IFS= read -t ${QEMU_COMM_TIMEOUT} resp <&${QEMU_OUT[$h]}
-     do
-@@ -XXX,XX +XXX,XX @@ function _timed_wait_for()
-             echo "${resp}" | _filter_testdir | _filter_qemu \
-                            | _filter_qemu_io | _filter_qmp | _filter_hmp
-         fi
--        grep -q "${*}" < <(echo "${resp}")
-+        if [ -n "${failure_match}" ]; then
-+            grep -q "${failure_match}" < <(echo "${resp}")
-+            if [ $? -eq 0 ]; then
-+                timeout=
-+                break
-+            fi
-+        fi
-+        grep -q "${success_match}" < <(echo "${resp}")
-         if [ $? -eq 0 ]; then
-             return
--        elif [ -z "${silent}" ] && [ -n "${mismatch_only}" ]; then
-+        fi
-+        if [ -z "${silent}" ] && [ -n "${mismatch_only}" ]; then
-             echo "${resp}" | _filter_testdir | _filter_qemu \
-                            | _filter_qemu_io | _filter_qmp | _filter_hmp
-         fi
-@@ -XXX,XX +XXX,XX @@ function _timed_wait_for()
-     done
-     QEMU_STATUS[$h]=-1
-     if [ -z "${qemu_error_no_exit}" ]; then
--        echo "Timeout waiting for ${*} on handle ${h}"
--        exit 1  # Timeout means the test failed
-+        if [ -n "${timeout}" ]; then
-+            echo "Timeout waiting for ${success_match} on handle ${h}"
-+        else
-+            echo "Wrong response matching ${failure_match} on handle ${h}"
-+        fi
-+        exit 1  # Timeout or wrong match mean the test failed
-     fi
- }
-@@ -XXX,XX +XXX,XX @@ function _timed_wait_for()
- # If $qemu_error_no_exit is set, then even if the expected response
- # is not seen, we will not exit.  $QEMU_STATUS[$1] will be set it -1 in
- # that case.
-+#
-+# If $success_or_failure is set, then the last two strings are the
-+# strings the response will be scanned for.  The first of the two
-+# indicates success, the latter indicates failure.  Failure is handled
-+# like a timeout.
- function _send_qemu_cmd()
- {
-     local h=${1}
-@@ -XXX,XX +XXX,XX @@ function _send_qemu_cmd()
-         use_error="no"
-     fi
-     # This array element extraction is done to accommodate pathnames with spaces
--    cmd=${@: 1:${#@}-1}
--    shift $(($# - 1))
-+    if [ -z "${success_or_failure}" ]; then
-+        cmd=${@: 1:${#@}-1}
-+        shift $(($# - 1))
-+    else
-+        cmd=${@: 1:${#@}-2}
-+        shift $(($# - 2))
-+    fi
-     while [ ${count} -gt 0 ]
-     do
-         echo "${cmd}" >&${QEMU_IN[${h}]}
-         if [ -n "${1}" ]; then
--            qemu_error_no_exit=${use_error} _timed_wait_for ${h} "${1}"
-+            if [ -z "${success_or_failure}" ]; then
-+                qemu_error_no_exit=${use_error} _timed_wait_for ${h} "${1}"
-+            else
-+                qemu_error_no_exit=${use_error} _timed_wait_for ${h} "${1}" "${2}"
-+            fi
-             if [ ${QEMU_STATUS[$h]} -eq 0 ]; then
-                 return
-             fi
---
-2.13.6

-[Qemu-devel] [PULL 23/37] iotests: Skip 181 and 201 without userfaultfd
+Deleted patch
-From: Max Reitz <mreitz@redhat.com>
-userfaultfd support depends on the host kernel, so it may not be
-available.  If so, 181 and 201 should be skipped.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Message-id: 20180406151731.4285-3-mreitz@redhat.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- tests/qemu-iotests/181 | 13 +++++++++++++
- tests/qemu-iotests/201 | 13 +++++++++++++
- 2 files changed, 26 insertions(+)
-diff --git a/tests/qemu-iotests/181 b/tests/qemu-iotests/181
-index XXXXXXX..XXXXXXX 100755
---- a/tests/qemu-iotests/181
-+++ b/tests/qemu-iotests/181
-@@ -XXX,XX +XXX,XX @@ echo
- # Enable postcopy-ram capability both on source and destination
- silent=yes
- _send_qemu_cmd $dest 'migrate_set_capability postcopy-ram on' "(qemu)"
-+
-+qemu_error_no_exit=yes success_or_failure=yes \
-+    _send_qemu_cmd $dest '' "(qemu)" "Postcopy is not supported"
-+if [ ${QEMU_STATUS[$dest]} -lt 0 ]; then
-+    _send_qemu_cmd $dest '' "(qemu)"
-+
-+    _send_qemu_cmd $src 'quit' ""
-+    _send_qemu_cmd $dest 'quit' ""
-+    wait=1 _cleanup_qemu
-+
-+    _notrun 'Postcopy is not supported'
-+fi
-+
- _send_qemu_cmd $src 'migrate_set_speed 4k' "(qemu)"
- _send_qemu_cmd $src 'migrate_set_capability postcopy-ram on' "(qemu)"
- _send_qemu_cmd $src "migrate -d unix:${MIG_SOCKET}" "(qemu)"
-diff --git a/tests/qemu-iotests/201 b/tests/qemu-iotests/201
-index XXXXXXX..XXXXXXX 100755
---- a/tests/qemu-iotests/201
-+++ b/tests/qemu-iotests/201
-@@ -XXX,XX +XXX,XX @@ echo
- silent=yes
- _send_qemu_cmd $dest 'migrate_set_capability postcopy-ram on' "(qemu)"
-+
-+qemu_error_no_exit=yes success_or_failure=yes \
-+    _send_qemu_cmd $dest '' "(qemu)" "Postcopy is not supported"
-+if [ ${QEMU_STATUS[$dest]} -lt 0 ]; then
-+    _send_qemu_cmd $dest '' "(qemu)"
-+
-+    _send_qemu_cmd $src 'quit' ""
-+    _send_qemu_cmd $dest 'quit' ""
-+    wait=1 _cleanup_qemu
-+
-+    _notrun 'Postcopy is not supported'
-+fi
-+
- _send_qemu_cmd $src 'migrate_set_capability postcopy-ram on' "(qemu)"
- _send_qemu_cmd $src "migrate -d unix:${MIG_SOCKET}" "(qemu)"
---
-2.13.6

-[Qemu-devel] [PULL 24/37] block: Add COR filter driver
+Deleted patch
-From: Max Reitz <mreitz@redhat.com>
-This adds a simple copy-on-read filter driver.  It relies on the already
-existing COR functionality in the central block layer code, which may be
-moved here once we no longer need it there.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Message-id: 20180421132929.21610-2-mreitz@redhat.com
-Reviewed-by: Alberto Garcia <berto@igalia.com>
-Reviewed-by: Kevin Wolf <kwolf@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- qapi/block-core.json |   5 +-
- block/copy-on-read.c | 171 +++++++++++++++++++++++++++++++++++++++++++++++++++
- block/Makefile.objs  |   2 +-
- 3 files changed, 176 insertions(+), 2 deletions(-)
- create mode 100644 block/copy-on-read.c
-diff --git a/qapi/block-core.json b/qapi/block-core.json
-index XXXXXXX..XXXXXXX 100644
---- a/qapi/block-core.json
-+++ b/qapi/block-core.json
-@@ -XXX,XX +XXX,XX @@
- # @vxhs: Since 2.10
- # @throttle: Since 2.11
- # @nvme: Since 2.12
-+# @copy-on-read: Since 2.13
- #
- # Since: 2.9
- ##
- { 'enum': 'BlockdevDriver',
--  'data': [ 'blkdebug', 'blkverify', 'bochs', 'cloop',
-+  'data': [ 'blkdebug', 'blkverify', 'bochs', 'cloop', 'copy-on-read',
-             'dmg', 'file', 'ftp', 'ftps', 'gluster', 'host_cdrom',
-             'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs',
-             'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed',
-@@ -XXX,XX +XXX,XX @@
-       'blkverify':  'BlockdevOptionsBlkverify',
-       'bochs':      'BlockdevOptionsGenericFormat',
-       'cloop':      'BlockdevOptionsGenericFormat',
-+      'copy-on-read':'BlockdevOptionsGenericFormat',
-       'dmg':        'BlockdevOptionsGenericFormat',
-       'file':       'BlockdevOptionsFile',
-       'ftp':        'BlockdevOptionsCurlFtp',
-@@ -XXX,XX +XXX,XX @@
-       'blkverify':      'BlockdevCreateNotSupported',
-       'bochs':          'BlockdevCreateNotSupported',
-       'cloop':          'BlockdevCreateNotSupported',
-+      'copy-on-read':   'BlockdevCreateNotSupported',
-       'dmg':            'BlockdevCreateNotSupported',
-       'file':           'BlockdevCreateOptionsFile',
-       'ftp':            'BlockdevCreateNotSupported',
-diff --git a/block/copy-on-read.c b/block/copy-on-read.c
-new file mode 100644
-index XXXXXXX..XXXXXXX
---- /dev/null
-+++ b/block/copy-on-read.c
-@@ -XXX,XX +XXX,XX @@
-+/*
-+ * Copy-on-read filter block driver
-+ *
-+ * Copyright (c) 2018 Red Hat, Inc.
-+ *
-+ * Author:
-+ *   Max Reitz <mreitz@redhat.com>
-+ *
-+ * This program is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU General Public License as
-+ * published by the Free Software Foundation; either version 2 or
-+ * (at your option) version 3 of the License.
-+ *
-+ * This program is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-+ * GNU General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU General Public License
-+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
-+ */
-+
-+#include "qemu/osdep.h"
-+#include "block/block_int.h"
-+
-+
-+static int cor_open(BlockDriverState *bs, QDict *options, int flags,
-+                    Error **errp)
-+{
-+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false,
-+                               errp);
-+    if (!bs->file) {
-+        return -EINVAL;
-+    }
-+
-+    bs->supported_write_flags = BDRV_REQ_FUA &
-+                                    bs->file->bs->supported_write_flags;
-+
-+    bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
-+                                    bs->file->bs->supported_zero_flags;
-+
-+    return 0;
-+}
-+
-+
-+static void cor_close(BlockDriverState *bs)
-+{
-+}
-+
-+
-+#define PERM_PASSTHROUGH (BLK_PERM_CONSISTENT_READ \
-+                          | BLK_PERM_WRITE \
-+                          | BLK_PERM_RESIZE)
-+#define PERM_UNCHANGED (BLK_PERM_ALL & ~PERM_PASSTHROUGH)
-+
-+static void cor_child_perm(BlockDriverState *bs, BdrvChild *c,
-+                           const BdrvChildRole *role,
-+                           BlockReopenQueue *reopen_queue,
-+                           uint64_t perm, uint64_t shared,
-+                           uint64_t *nperm, uint64_t *nshared)
-+{
-+    if (c == NULL) {
-+        *nperm = (perm & PERM_PASSTHROUGH) | BLK_PERM_WRITE_UNCHANGED;
-+        *nshared = (shared & PERM_PASSTHROUGH) | PERM_UNCHANGED;
-+        return;
-+    }
-+
-+    *nperm = (perm & PERM_PASSTHROUGH) |
-+             (c->perm & PERM_UNCHANGED);
-+    *nshared = (shared & PERM_PASSTHROUGH) |
-+               (c->shared_perm & PERM_UNCHANGED);
-+}
-+
-+
-+static int64_t cor_getlength(BlockDriverState *bs)
-+{
-+    return bdrv_getlength(bs->file->bs);
-+}
-+
-+
-+static int cor_truncate(BlockDriverState *bs, int64_t offset,
-+                        PreallocMode prealloc, Error **errp)
-+{
-+    return bdrv_truncate(bs->file, offset, prealloc, errp);
-+}
-+
-+
-+static int coroutine_fn cor_co_preadv(BlockDriverState *bs,
-+                                      uint64_t offset, uint64_t bytes,
-+                                      QEMUIOVector *qiov, int flags)
-+{
-+    return bdrv_co_preadv(bs->file, offset, bytes, qiov,
-+                          flags | BDRV_REQ_COPY_ON_READ);
-+}
-+
-+
-+static int coroutine_fn cor_co_pwritev(BlockDriverState *bs,
-+                                       uint64_t offset, uint64_t bytes,
-+                                       QEMUIOVector *qiov, int flags)
-+{
-+
-+    return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
-+}
-+
-+
-+static int coroutine_fn cor_co_pwrite_zeroes(BlockDriverState *bs,
-+                                             int64_t offset, int bytes,
-+                                             BdrvRequestFlags flags)
-+{
-+    return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
-+}
-+
-+
-+static int coroutine_fn cor_co_pdiscard(BlockDriverState *bs,
-+                                        int64_t offset, int bytes)
-+{
-+    return bdrv_co_pdiscard(bs->file->bs, offset, bytes);
-+}
-+
-+
-+static void cor_eject(BlockDriverState *bs, bool eject_flag)
-+{
-+    bdrv_eject(bs->file->bs, eject_flag);
-+}
-+
-+
-+static void cor_lock_medium(BlockDriverState *bs, bool locked)
-+{
-+    bdrv_lock_medium(bs->file->bs, locked);
-+}
-+
-+
-+static bool cor_recurse_is_first_non_filter(BlockDriverState *bs,
-+                                            BlockDriverState *candidate)
-+{
-+    return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate);
-+}
-+
-+
-+BlockDriver bdrv_copy_on_read = {
-+    .format_name                        = "copy-on-read",
-+
-+    .bdrv_open                          = cor_open,
-+    .bdrv_close                         = cor_close,
-+    .bdrv_child_perm                    = cor_child_perm,
-+
-+    .bdrv_getlength                     = cor_getlength,
-+    .bdrv_truncate                      = cor_truncate,
-+
-+    .bdrv_co_preadv                     = cor_co_preadv,
-+    .bdrv_co_pwritev                    = cor_co_pwritev,
-+    .bdrv_co_pwrite_zeroes              = cor_co_pwrite_zeroes,
-+    .bdrv_co_pdiscard                   = cor_co_pdiscard,
-+
-+    .bdrv_eject                         = cor_eject,
-+    .bdrv_lock_medium                   = cor_lock_medium,
-+
-+    .bdrv_co_block_status               = bdrv_co_block_status_from_file,
-+
-+    .bdrv_recurse_is_first_non_filter   = cor_recurse_is_first_non_filter,
-+
-+    .has_variable_length                = true,
-+    .is_filter                          = true,
-+};
-+
-+static void bdrv_copy_on_read_init(void)
-+{
-+    bdrv_register(&bdrv_copy_on_read);
-+}
-+
-+block_init(bdrv_copy_on_read_init);
-diff --git a/block/Makefile.objs b/block/Makefile.objs
-index XXXXXXX..XXXXXXX 100644
---- a/block/Makefile.objs
-+++ b/block/Makefile.objs
-@@ -XXX,XX +XXX,XX @@ block-obj-y += accounting.o dirty-bitmap.o
- block-obj-y += write-threshold.o
- block-obj-y += backup.o
- block-obj-$(CONFIG_REPLICATION) += replication.o
--block-obj-y += throttle.o
-+block-obj-y += throttle.o copy-on-read.o
- block-obj-y += crypto.o
---
-2.13.6

-[Qemu-devel] [PULL 25/37] block: BLK_PERM_WRITE includes ..._UNCHANGED
+Deleted patch
-From: Max Reitz <mreitz@redhat.com>
-Currently we never actually check whether the WRITE_UNCHANGED
-permission has been taken for unchanging writes.  But the one check that
-is commented out checks both WRITE and WRITE_UNCHANGED; and considering
-that WRITE_UNCHANGED is already documented as being weaker than WRITE,
-we should probably explicitly document WRITE to include WRITE_UNCHANGED.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Reviewed-by: Alberto Garcia <berto@igalia.com>
-Message-id: 20180421132929.21610-3-mreitz@redhat.com
-Reviewed-by: Kevin Wolf <kwolf@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- include/block/block.h | 3 +++
- 1 file changed, 3 insertions(+)
-diff --git a/include/block/block.h b/include/block/block.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/block/block.h
-+++ b/include/block/block.h
-@@ -XXX,XX +XXX,XX @@ enum {
-      * This permission (which is weaker than BLK_PERM_WRITE) is both enough and
-      * required for writes to the block node when the caller promises that
-      * the visible disk content doesn't change.
-+     *
-+     * As the BLK_PERM_WRITE permission is strictly stronger, either is
-+     * sufficient to perform an unchanging write.
-      */
-     BLK_PERM_WRITE_UNCHANGED    = 0x04,
---
-2.13.6

-[Qemu-devel] [PULL 27/37] block: Set BDRV_REQ_WRITE_UNCHANGED for COR writes
+Deleted patch
-From: Max Reitz <mreitz@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Reviewed-by: Alberto Garcia <berto@igalia.com>
-Message-id: 20180421132929.21610-5-mreitz@redhat.com
-Reviewed-by: Kevin Wolf <kwolf@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/io.c | 6 ++++--
- 1 file changed, 4 insertions(+), 2 deletions(-)
-diff --git a/block/io.c b/block/io.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/io.c
-+++ b/block/io.c
-@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
-                 /* FIXME: Should we (perhaps conditionally) be setting
-                  * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
-                  * that still correctly reads as zero? */
--                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 0);
-+                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
-+                                               BDRV_REQ_WRITE_UNCHANGED);
-             } else {
-                 /* This does not change the data on the disk, it is not
-                  * necessary to flush even in cache=writethrough mode.
-                  */
-                 ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
--                                          &local_qiov, 0);
-+                                          &local_qiov,
-+                                          BDRV_REQ_WRITE_UNCHANGED);
-             }
-             if (ret < 0) {
---
-2.13.6

-[Qemu-devel] [PULL 28/37] block/quorum: Support BDRV_REQ_WRITE_UNCHANGED
+Deleted patch
-From: Max Reitz <mreitz@redhat.com>
-We just need to forward it to quorum's children (except in case of a
-rewrite because of corruption), but for that we first have to support
-flags in child requests at all.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Reviewed-by: Alberto Garcia <berto@igalia.com>
-Message-id: 20180421132929.21610-6-mreitz@redhat.com
-Reviewed-by: Kevin Wolf <kwolf@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/quorum.c | 19 +++++++++++++------
- 1 file changed, 13 insertions(+), 6 deletions(-)
-diff --git a/block/quorum.c b/block/quorum.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/quorum.c
-+++ b/block/quorum.c
-@@ -XXX,XX +XXX,XX @@ struct QuorumAIOCB {
-     /* Request metadata */
-     uint64_t offset;
-     uint64_t bytes;
-+    int flags;
-     QEMUIOVector *qiov;         /* calling IOV */
-@@ -XXX,XX +XXX,XX @@ static bool quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b)
- static QuorumAIOCB *quorum_aio_get(BlockDriverState *bs,
-                                    QEMUIOVector *qiov,
-                                    uint64_t offset,
--                                   uint64_t bytes)
-+                                   uint64_t bytes,
-+                                   int flags)
- {
-     BDRVQuorumState *s = bs->opaque;
-     QuorumAIOCB *acb = g_new(QuorumAIOCB, 1);
-@@ -XXX,XX +XXX,XX @@ static QuorumAIOCB *quorum_aio_get(BlockDriverState *bs,
-         .bs                 = bs,
-         .offset             = offset,
-         .bytes              = bytes,
-+        .flags              = flags,
-         .qiov               = qiov,
-         .votes.compare      = quorum_sha256_compare,
-         .votes.vote_list    = QLIST_HEAD_INITIALIZER(acb.votes.vote_list),
-@@ -XXX,XX +XXX,XX @@ static void quorum_rewrite_entry(void *opaque)
-     BDRVQuorumState *s = acb->bs->opaque;
-     /* Ignore any errors, it's just a correction attempt for already
--     * corrupted data. */
-+     * corrupted data.
-+     * Mask out BDRV_REQ_WRITE_UNCHANGED because this overwrites the
-+     * area with different data from the other children. */
-     bdrv_co_pwritev(s->children[co->idx], acb->offset, acb->bytes,
--                    acb->qiov, 0);
-+                    acb->qiov, acb->flags & ~BDRV_REQ_WRITE_UNCHANGED);
-     /* Wake up the caller after the last rewrite */
-     acb->rewrite_count--;
-@@ -XXX,XX +XXX,XX @@ static int quorum_co_preadv(BlockDriverState *bs, uint64_t offset,
-                             uint64_t bytes, QEMUIOVector *qiov, int flags)
- {
-     BDRVQuorumState *s = bs->opaque;
--    QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes);
-+    QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes, flags);
-     int ret;
-     acb->is_read = true;
-@@ -XXX,XX +XXX,XX @@ static void write_quorum_entry(void *opaque)
-     sacb->bs = s->children[i]->bs;
-     sacb->ret = bdrv_co_pwritev(s->children[i], acb->offset, acb->bytes,
--                                acb->qiov, 0);
-+                                acb->qiov, acb->flags);
-     if (sacb->ret == 0) {
-         acb->success_count++;
-     } else {
-@@ -XXX,XX +XXX,XX @@ static int quorum_co_pwritev(BlockDriverState *bs, uint64_t offset,
-                              uint64_t bytes, QEMUIOVector *qiov, int flags)
- {
-     BDRVQuorumState *s = bs->opaque;
--    QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes);
-+    QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes, flags);
-     int i, ret;
-     for (i = 0; i < s->num_children; i++) {
-@@ -XXX,XX +XXX,XX @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
-     }
-     s->next_child_index = s->num_children;
-+    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED;
-+
-     g_free(opened);
-     goto exit;
---
-2.13.6

-[Qemu-devel] [PULL 30/37] iotests: Clean up wrap image in 197
+Deleted patch
-From: Max Reitz <mreitz@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Reviewed-by: Alberto Garcia <berto@igalia.com>
-Message-id: 20180421132929.21610-8-mreitz@redhat.com
-Reviewed-by: Kevin Wolf <kwolf@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- tests/qemu-iotests/197 | 1 +
- 1 file changed, 1 insertion(+)
-diff --git a/tests/qemu-iotests/197 b/tests/qemu-iotests/197
-index XXXXXXX..XXXXXXX 100755
---- a/tests/qemu-iotests/197
-+++ b/tests/qemu-iotests/197
-@@ -XXX,XX +XXX,XX @@ esac
- _cleanup()
- {
-     _cleanup_test_img
-+    rm -f "$TEST_WRAP"
-     rm -f "$BLKDBG_CONF"
- }
- trap "_cleanup; exit \$status" 0 1 2 3 15
---
-2.13.6

-[Qemu-devel] [PULL 31/37] iotests: Copy 197 for COR filter driver
+Deleted patch
-From: Max Reitz <mreitz@redhat.com>
-iotest 197 tests copy-on-read using the (now old) copy-on-read flag.
-Copy it to 215 and modify it to use the COR filter driver instead.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Message-id: 20180421132929.21610-9-mreitz@redhat.com
-Reviewed-by: Kevin Wolf <kwolf@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- tests/qemu-iotests/215     | 120 +++++++++++++++++++++++++++++++++++++++++++++
- tests/qemu-iotests/215.out |  26 ++++++++++
- tests/qemu-iotests/group   |   1 +
- 3 files changed, 147 insertions(+)
- create mode 100755 tests/qemu-iotests/215
- create mode 100644 tests/qemu-iotests/215.out
-diff --git a/tests/qemu-iotests/215 b/tests/qemu-iotests/215
-new file mode 100755
-index XXXXXXX..XXXXXXX
---- /dev/null
-+++ b/tests/qemu-iotests/215
-@@ -XXX,XX +XXX,XX @@
-+#!/bin/bash
-+#
-+# Test case for copy-on-read into qcow2, using the COR filter driver
-+#
-+# Copyright (C) 2018 Red Hat, Inc.
-+#
-+# This program is free software; you can redistribute it and/or modify
-+# it under the terms of the GNU General Public License as published by
-+# the Free Software Foundation; either version 2 of the License, or
-+# (at your option) any later version.
-+#
-+# This program is distributed in the hope that it will be useful,
-+# but WITHOUT ANY WARRANTY; without even the implied warranty of
-+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-+# GNU General Public License for more details.
-+#
-+# You should have received a copy of the GNU General Public License
-+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-+#
-+
-+seq="$(basename $0)"
-+echo "QA output created by $seq"
-+
-+here="$PWD"
-+status=1 # failure is the default!
-+
-+# get standard environment, filters and checks
-+. ./common.rc
-+. ./common.filter
-+
-+TEST_WRAP="$TEST_DIR/t.wrap.qcow2"
-+BLKDBG_CONF="$TEST_DIR/blkdebug.conf"
-+
-+# Sanity check: our use of blkdebug fails if $TEST_DIR contains spaces
-+# or other problems
-+case "$TEST_DIR" in
-+    *[^-_a-zA-Z0-9/]*)
-+        _notrun "Suspicious TEST_DIR='$TEST_DIR', cowardly refusing to run" ;;
-+esac
-+
-+_cleanup()
-+{
-+    _cleanup_test_img
-+    rm -f "$TEST_WRAP"
-+    rm -f "$BLKDBG_CONF"
-+}
-+trap "_cleanup; exit \$status" 0 1 2 3 15
-+
-+# Test is supported for any backing file; but we force qcow2 for our wrapper.
-+_supported_fmt generic
-+_supported_proto generic
-+_supported_os Linux
-+# LUKS support may be possible, but it complicates things.
-+_unsupported_fmt luks
-+
-+echo
-+echo '=== Copy-on-read ==='
-+echo
-+
-+# Prep the images
-+# VPC rounds image sizes to a specific geometry, force a specific size.
-+if [ "$IMGFMT" = "vpc" ]; then
-+    IMGOPTS=$(_optstr_add "$IMGOPTS" "force_size")
-+fi
-+_make_test_img 4G
-+$QEMU_IO -c "write -P 55 3G 1k" "$TEST_IMG" | _filter_qemu_io
-+IMGPROTO=file IMGFMT=qcow2 IMGOPTS= TEST_IMG_FILE="$TEST_WRAP" \
-+    _make_test_img -F "$IMGFMT" -b "$TEST_IMG" | _filter_img_create
-+$QEMU_IO -f qcow2 -c "write -z -u 1M 64k" "$TEST_WRAP" | _filter_qemu_io
-+
-+# Ensure that a read of two clusters, but where one is already allocated,
-+# does not re-write the allocated cluster
-+cat > "$BLKDBG_CONF" <<EOF
-+[inject-error]
-+event = "cor_write"
-+sector = "2048"
-+EOF
-+$QEMU_IO -c "open \
-+ -o driver=copy-on-read,file.driver=blkdebug,file.config=$BLKDBG_CONF,file.image.driver=qcow2 $TEST_WRAP" \
-+ -c "read -P 0 1M 128k" | _filter_qemu_io
-+
-+# Read the areas we want copied. A zero-length read should still be a
-+# no-op.  The next read is under 2G, but aligned so that rounding to
-+# clusters copies more than 2G of zeroes. The final read will pick up
-+# the non-zero data in the same cluster.  Since a 2G read may exhaust
-+# memory on some machines (particularly 32-bit), we skip the test if
-+# that fails due to memory pressure.
-+$QEMU_IO \
-+    -c "open -o driver=copy-on-read,file.driver=qcow2 $TEST_WRAP" \
-+    -c "read 0 0" \
-+    | _filter_qemu_io
-+output=$($QEMU_IO \
-+         -c "open -o driver=copy-on-read,file.driver=qcow2 $TEST_WRAP" \
-+         -c "read -P 0 1k $((2*1024*1024*1024 - 512))" \
-+         2>&1 | _filter_qemu_io)
-+case $output in
-+    *allocate*)
-+        _notrun "Insufficent memory to run test" ;;
-+    *) printf '%s\n' "$output" ;;
-+esac
-+$QEMU_IO \
-+    -c "open -o driver=copy-on-read,file.driver=qcow2 $TEST_WRAP" \
-+    -c "read -P 0 $((3*1024*1024*1024 + 1024)) 1k" \
-+    | _filter_qemu_io
-+
-+# Copy-on-read is incompatible with read-only
-+$QEMU_IO \
-+    -c "open -r -o driver=copy-on-read,file.driver=qcow2 $TEST_WRAP" \
-+    2>&1 | _filter_testdir
-+
-+# Break the backing chain, and show that images are identical, and that
-+# we properly copied over explicit zeros.
-+$QEMU_IMG rebase -u -b "" -f qcow2 "$TEST_WRAP"
-+$QEMU_IO -f qcow2 -c map "$TEST_WRAP"
-+_check_test_img
-+$QEMU_IMG compare -f $IMGFMT -F qcow2 "$TEST_IMG" "$TEST_WRAP"
-+
-+# success, all done
-+echo '*** done'
-+status=0
-diff --git a/tests/qemu-iotests/215.out b/tests/qemu-iotests/215.out
-new file mode 100644
-index XXXXXXX..XXXXXXX
---- /dev/null
-+++ b/tests/qemu-iotests/215.out
-@@ -XXX,XX +XXX,XX @@
-+QA output created by 215
-+
-+=== Copy-on-read ===
-+
-+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=4294967296
-+wrote 1024/1024 bytes at offset 3221225472
-+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+Formatting 'TEST_DIR/t.wrap.IMGFMT', fmt=IMGFMT size=4294967296 backing_file=TEST_DIR/t.IMGFMT backing_fmt=IMGFMT
-+wrote 65536/65536 bytes at offset 1048576
-+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+read 131072/131072 bytes at offset 1048576
-+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+read 0/0 bytes at offset 0
-+0 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+read 2147483136/2147483136 bytes at offset 1024
-+2 GiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+read 1024/1024 bytes at offset 3221226496
-+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-+can't open device TEST_DIR/t.wrap.qcow2: Block node is read-only
-+2 GiB (0x80010000) bytes     allocated at offset 0 bytes (0x0)
-+1023.938 MiB (0x3fff0000) bytes not allocated at offset 2 GiB (0x80010000)
-+64 KiB (0x10000) bytes     allocated at offset 3 GiB (0xc0000000)
-+1023.938 MiB (0x3fff0000) bytes not allocated at offset 3 GiB (0xc0010000)
-+No errors were found on the image.
-+Images are identical.
-+*** done
-diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
-index XXXXXXX..XXXXXXX 100644
---- a/tests/qemu-iotests/group
-+++ b/tests/qemu-iotests/group
-@@ -XXX,XX +XXX,XX @@
-rw auto quick
-rw auto quick
-rw auto
-+215 rw auto quick
-rw auto quick
---
-2.13.6

-[Qemu-devel] [PULL 33/37] qemu-img: Check post-truncation size
+Deleted patch
-From: Max Reitz <mreitz@redhat.com>
-Some block drivers (iscsi and file-posix when dealing with device files)
-do not actually support truncation, even though they provide a
-.bdrv_truncate() method and will happily return success when providing a
-new size that does not exceed the current size.  This is because these
-drivers expect the user to resize the image outside of qemu and then
-provide qemu with that information through the block_resize command
-(compare cb1b83e740384b4e0d950f3d7c81c02b8ce86c2e).
-Of course, anyone using qemu-img resize will find that behavior useless.
-So we should check the actual size of the image after the supposedly
-successful truncation took place, emit an error if nothing changed and
-emit a warning if the target size was not met.
-Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1523065
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Message-id: 20180421163957.29872-1-mreitz@redhat.com
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- qemu-img.c | 39 +++++++++++++++++++++++++++++++++++----
- 1 file changed, 35 insertions(+), 4 deletions(-)
-diff --git a/qemu-img.c b/qemu-img.c
-index XXXXXXX..XXXXXXX 100644
---- a/qemu-img.c
-+++ b/qemu-img.c
-@@ -XXX,XX +XXX,XX @@ static int img_resize(int argc, char **argv)
-     Error *err = NULL;
-     int c, ret, relative;
-     const char *filename, *fmt, *size;
--    int64_t n, total_size, current_size;
-+    int64_t n, total_size, current_size, new_size;
-     bool quiet = false;
-     BlockBackend *blk = NULL;
-     PreallocMode prealloc = PREALLOC_MODE_OFF;
-@@ -XXX,XX +XXX,XX @@ static int img_resize(int argc, char **argv)
-     }
-     ret = blk_truncate(blk, total_size, prealloc, &err);
--    if (!ret) {
--        qprintf(quiet, "Image resized.\n");
--    } else {
-+    if (ret < 0) {
-         error_report_err(err);
-+        goto out;
-+    }
-+
-+    new_size = blk_getlength(blk);
-+    if (new_size < 0) {
-+        error_report("Failed to verify truncated image length: %s",
-+                     strerror(-new_size));
-+        ret = -1;
-+        goto out;
-     }
-+
-+    /* Some block drivers implement a truncation method, but only so
-+     * the user can cause qemu to refresh the image's size from disk.
-+     * The idea is that the user resizes the image outside of qemu and
-+     * then invokes block_resize to inform qemu about it.
-+     * (This includes iscsi and file-posix for device files.)
-+     * Of course, that is not the behavior someone invoking
-+     * qemu-img resize would find useful, so we catch that behavior
-+     * here and tell the user. */
-+    if (new_size != total_size && new_size == current_size) {
-+        error_report("Image was not resized; resizing may not be supported "
-+                     "for this image");
-+        ret = -1;
-+        goto out;
-+    }
-+
-+    if (new_size != total_size) {
-+        warn_report("Image should have been resized to %" PRIi64
-+                    " bytes, but was resized to %" PRIi64 " bytes",
-+                    total_size, new_size);
-+    }
-+
-+    qprintf(quiet, "Image resized.\n");
-+
- out:
-     blk_unref(blk);
-     if (ret) {
---
-2.13.6

-[Qemu-devel] [PULL 34/37] block: Document BDRV_REQ_WRITE_UNCHANGED support
+Deleted patch
-From: Max Reitz <mreitz@redhat.com>
-Add BDRV_REQ_WRITE_UNCHANGED to the list of flags honored during pwrite
-and pwrite_zeroes, and also add a note on when you absolutely need to
-support it.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Message-id: 20180502140359.18222-1-mreitz@redhat.com
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- include/block/block_int.h | 18 ++++++++++++++++--
- 1 file changed, 16 insertions(+), 2 deletions(-)
-diff --git a/include/block/block_int.h b/include/block/block_int.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/block/block_int.h
-+++ b/include/block/block_int.h
-@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
-     /* I/O Limits */
-     BlockLimits bl;
--    /* Flags honored during pwrite (so far: BDRV_REQ_FUA) */
-+    /* Flags honored during pwrite (so far: BDRV_REQ_FUA,
-+     * BDRV_REQ_WRITE_UNCHANGED).
-+     * If a driver does not support BDRV_REQ_WRITE_UNCHANGED, those
-+     * writes will be issued as normal writes without the flag set.
-+     * This is important to note for drivers that do not explicitly
-+     * request a WRITE permission for their children and instead take
-+     * the same permissions as their parent did (this is commonly what
-+     * block filters do).  Such drivers have to be aware that the
-+     * parent may have taken a WRITE_UNCHANGED permission only and is
-+     * issuing such requests.  Drivers either must make sure that
-+     * these requests do not result in plain WRITE accesses (usually
-+     * by supporting BDRV_REQ_WRITE_UNCHANGED, and then forwarding
-+     * every incoming write request as-is, including potentially that
-+     * flag), or they have to explicitly take the WRITE permission for
-+     * their children. */
-     unsigned int supported_write_flags;
-     /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA,
--     * BDRV_REQ_MAY_UNMAP) */
-+     * BDRV_REQ_MAY_UNMAP, BDRV_REQ_WRITE_UNCHANGED) */
-     unsigned int supported_zero_flags;
-     /* the following member gives a name to every node on the bs graph. */
---
-2.13.6

-[Qemu-devel] [PULL 35/37] qemu-io: Use purely string blockdev options
+Deleted patch
-From: Max Reitz <mreitz@redhat.com>
-Currently, qemu-io only uses string-valued blockdev options (as all are
-converted directly from QemuOpts) -- with one exception: -U adds the
-force-share option as a boolean.  This in itself is already a bit
-questionable, but a real issue is that it also assumes the value already
-existing in the options QDict would be a boolean, which is wrong.
-That has the following effect:
-$ ./qemu-io -r -U --image-opts \
-    driver=file,filename=/dev/null,force-share=off
-[1]    15200 segmentation fault (core dumped)  ./qemu-io -r -U
---image-opts driver=file,filename=/dev/null,force-share=off
-Since @opts is converted from QemuOpts, the value must be a string, and
-we have to compare it as such.  Consequently, it makes sense to also set
-it as a string instead of a boolean.
-Cc: qemu-stable@nongnu.org
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Message-id: 20180502202051.15493-2-mreitz@redhat.com
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- qemu-io.c | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-diff --git a/qemu-io.c b/qemu-io.c
-index XXXXXXX..XXXXXXX 100644
---- a/qemu-io.c
-+++ b/qemu-io.c
-@@ -XXX,XX +XXX,XX @@ static int openfile(char *name, int flags, bool writethrough, bool force_share,
-             opts = qdict_new();
-         }
-         if (qdict_haskey(opts, BDRV_OPT_FORCE_SHARE)
--            && !qdict_get_bool(opts, BDRV_OPT_FORCE_SHARE)) {
-+            && strcmp(qdict_get_str(opts, BDRV_OPT_FORCE_SHARE), "on")) {
-             error_report("-U conflicts with image options");
-             qobject_unref(opts);
-             return 1;
-         }
--        qdict_put_bool(opts, BDRV_OPT_FORCE_SHARE, true);
-+        qdict_put_str(opts, BDRV_OPT_FORCE_SHARE, "on");
-     }
-     qemuio_blk = blk_new_open(name, NULL, opts, flags, &local_err);
-     if (!qemuio_blk) {
---
-2.13.6

-[Qemu-devel] [PULL 36/37] qemu-img: Use only string options in img_open_opts
+Deleted patch
-From: Max Reitz <mreitz@redhat.com>
-img_open_opts() takes a QemuOpts and converts them to a QDict, so all
-values therein are strings.  Then it may try to call qdict_get_bool(),
-however, which will fail with a segmentation fault every time:
-$ ./qemu-img info -U --image-opts \
-    driver=file,filename=/dev/null,force-share=off
-[1]    27869 segmentation fault (core dumped)  ./qemu-img info -U
---image-opts driver=file,filename=/dev/null,force-share=off
-Fix this by using qdict_get_str() and comparing the value as a string.
-Also, when adding a force-share value to the QDict, add it as a string
-so it fits the rest of the dict.
-Cc: qemu-stable@nongnu.org
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Message-id: 20180502202051.15493-3-mreitz@redhat.com
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- qemu-img.c | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-diff --git a/qemu-img.c b/qemu-img.c
-index XXXXXXX..XXXXXXX 100644
---- a/qemu-img.c
-+++ b/qemu-img.c
-@@ -XXX,XX +XXX,XX @@ static BlockBackend *img_open_opts(const char *optstr,
-     options = qemu_opts_to_qdict(opts, NULL);
-     if (force_share) {
-         if (qdict_haskey(options, BDRV_OPT_FORCE_SHARE)
--            && !qdict_get_bool(options, BDRV_OPT_FORCE_SHARE)) {
-+            && strcmp(qdict_get_str(options, BDRV_OPT_FORCE_SHARE), "on")) {
-             error_report("--force-share/-U conflicts with image options");
-             qobject_unref(options);
-             return NULL;
-         }
--        qdict_put_bool(options, BDRV_OPT_FORCE_SHARE, true);
-+        qdict_put_str(options, BDRV_OPT_FORCE_SHARE, "on");
-     }
-     blk = blk_new_open(NULL, NULL, options, flags, &local_err);
-     if (!blk) {
---
-2.13.6

-[Qemu-devel] [PULL 37/37] iotests: Add test for -U/force-share conflicts
+Deleted patch
-From: Max Reitz <mreitz@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Message-id: 20180502202051.15493-4-mreitz@redhat.com
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- tests/qemu-iotests/153     | 17 +++++++++++++++++
- tests/qemu-iotests/153.out | 16 ++++++++++++++++
- 2 files changed, 33 insertions(+)
-diff --git a/tests/qemu-iotests/153 b/tests/qemu-iotests/153
-index XXXXXXX..XXXXXXX 100755
---- a/tests/qemu-iotests/153
-+++ b/tests/qemu-iotests/153
-@@ -XXX,XX +XXX,XX @@ _run_cmd $QEMU_IO "${TEST_IMG}" -c 'write 0 512'
- _cleanup_qemu
-+echo
-+echo "== Detecting -U and force-share conflicts =="
-+
-+echo
-+echo 'No conflict:'
-+$QEMU_IMG info -U --image-opts driver=null-co,force-share=on
-+echo
-+echo 'Conflict:'
-+$QEMU_IMG info -U --image-opts driver=null-co,force-share=off
-+
-+echo
-+echo 'No conflict:'
-+$QEMU_IO -c 'open -r -U -o driver=null-co,force-share=on'
-+echo
-+echo 'Conflict:'
-+$QEMU_IO -c 'open -r -U -o driver=null-co,force-share=off'
-+
- # success, all done
- echo "*** done"
- rm -f $seq.full
-diff --git a/tests/qemu-iotests/153.out b/tests/qemu-iotests/153.out
-index XXXXXXX..XXXXXXX 100644
---- a/tests/qemu-iotests/153.out
-+++ b/tests/qemu-iotests/153.out
-@@ -XXX,XX +XXX,XX @@ Is another process using the image?
- Closing the other
- _qemu_io_wrapper TEST_DIR/t.qcow2 -c write 0 512
-+
-+== Detecting -U and force-share conflicts ==
-+
-+No conflict:
-+image: null-co://
-+file format: null-co
-+virtual size: 1.0G (1073741824 bytes)
-+disk size: unavailable
-+
-+Conflict:
-+qemu-img: --force-share/-U conflicts with image options
-+
-+No conflict:
-+
-+Conflict:
-+-U conflicts with image options
- *** done
---
-2.13.6

The following changes since commit ad1b4ec39caa5b3f17cbd8160283a03a3dcfe2ae:

Merge remote-tracking branch 'remotes/kraxel/tags/input-20180515-pull-request' into staging (2018-05-15 12:50:06 +0100)

are available in the git repository at:

git://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to 1fce860ea5eba1ca00a67911fc0b8a5d80009514:

Merge remote-tracking branch 'mreitz/tags/pull-block-2018-05-15' into queue-block (2018-05-15 16:19:53 +0200)

----------------------------------------------------------------
Block layer patches:

- Switch AIO/callback based block drivers to a byte-based interface
- Block jobs: Expose error string via query-block-jobs
- Block job cleanups and fixes
- hmp: Allow using a qdev id in block_set_io_throttle
- Copy-on-read block driver
- The qcow2 default refcount cache size has been decreased
- Various bug fixes
----------------------------------------------------------------
Alberto Garcia (5):
      hmp: Allow using a qdev id in block_set_io_throttle
      Fix error message about compressed clusters with OFLAG_COPIED
      specs/qcow2: Clarify that compressed clusters have the COPIED bit reset
      qcow2: Give the refcount cache the minimum possible size by default
      docs: Document the new default sizes of the qcow2 caches

Daniel Henrique Barboza (1):
      block-backend: simplify blk_get_aio_context

Eric Blake (7):
      block: Support byte-based aio callbacks
      file-win32: Switch to byte-based callbacks
      null: Switch to byte-based read/write
      rbd: Switch to byte-based callbacks
      vxhs: Switch to byte-based callbacks
      block: Drop last of the sector-based aio callbacks
      block: Merge .bdrv_co_writev{,_flags} in drivers

John Snow (1):
      blockjob: expose error string via query

Kevin Wolf (7):
      blockjob: Fix assertion in block_job_finalize()
      blockjob: Wrappers for progress counter access
      blockjob: Move RateLimit to BlockJob
      blockjob: Implement block_job_set_speed() centrally
      blockjob: Introduce block_job_ratelimit_get_delay()
      blockjob: Add block_job_driver()
      Merge remote-tracking branch 'mreitz/tags/pull-block-2018-05-15' into queue-block

Max Reitz (17):
      iotests: Split 214 off of 122
      iotests: Add failure matching to common.qemu
      iotests: Skip 181 and 201 without userfaultfd
      block: Add COR filter driver
      block: BLK_PERM_WRITE includes ..._UNCHANGED
      block: Add BDRV_REQ_WRITE_UNCHANGED flag
      block: Set BDRV_REQ_WRITE_UNCHANGED for COR writes
      block/quorum: Support BDRV_REQ_WRITE_UNCHANGED
      block: Support BDRV_REQ_WRITE_UNCHANGED in filters
      iotests: Clean up wrap image in 197
      iotests: Copy 197 for COR filter driver
      iotests: Add test for COR across nodes
      qemu-img: Check post-truncation size
      block: Document BDRV_REQ_WRITE_UNCHANGED support
      qemu-io: Use purely string blockdev options
      qemu-img: Use only string options in img_open_opts
      iotests: Add test for -U/force-share conflicts

From: Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com>

blk_get_aio_context checks whether the BlockDriverState bs is not NULL,
returning bdrv_get_aio_context(bs) if so and qemu_get_aio_context()
otherwise. However, bdrv_get_aio_context from block.c already does
this check itself, also returning qemu_get_aio_context()
if bs is NULL:

AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    return bs ? bs->aio_context : qemu_get_aio_context();
}

This patch simplifies blk_get_aio_context to simply call
bdrv_get_aio_context instead of replicating the same logic.

Signed-off-by: Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com>
Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/block-backend.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ void blk_op_unblock_all(BlockBackend *blk, Error *reason)
 
 AioContext *blk_get_aio_context(BlockBackend *blk)
 {
-    BlockDriverState *bs = blk_bs(blk);
-
-    if (bs) {
-        return bdrv_get_aio_context(bs);
-    } else {
-        return qemu_get_aio_context();
-    }
+    return bdrv_get_aio_context(blk_bs(blk));
 }
 
 static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
-- 
2.13.6

From: Eric Blake <eblake@redhat.com>

We are gradually moving away from sector-based interfaces, towards
byte-based.  Add new byte-based aio callbacks for read and write,
to match the fact that bdrv_aio_pdiscard is already byte-based.

Ideally, drivers should be converted to use coroutine callbacks
rather than aio; but that is not quite as trivial (and if we were
to do that conversion, the null-aio driver would disappear), so for
the short term, converting the signature but keeping things with
aio is easier.  However, we CAN declare that a driver that uses
the byte-based aio interfaces now defaults to byte-based
operations, and must explicitly provide a refresh_limits override
to stick with larger alignments (making the alignment issues more
obvious directly in the drivers touched in the next few patches).

Once all drivers are converted, the sector-based aio callbacks will
be removed; in the meantime, a FIXME comment is added due to a
slight inefficiency that will be touched up as part of that later
cleanup.

Simplify some instances of 'bs->drv' into 'drv' while touching this,
since the local variable already exists to reduce typing.

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block_int.h |  6 ++++++
 block/io.c                | 38 +++++++++++++++++++++++++++++---------
 2 files changed, 35 insertions(+), 9 deletions(-)

From: Eric Blake <eblake@redhat.com>

We are gradually moving away from sector-based interfaces, towards
byte-based.  Make the change for the last few sector-based callbacks
in the file-win32 driver.

Note that the driver was already using byte-based calls for
performing actual I/O, so this just gets rid of a round trip
of scaling; however, as I don't know if Windows is tolerant of
non-sector AIO operations, I went with the conservative approach
of modifying .bdrv_refresh_limits to override the block layer
defaults back to the pre-patch value of 512.

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/raw-aio.h |  2 +-
 block/file-win32.c      | 47 +++++++++++++++++++++++++++++------------------
 block/win32-aio.c       |  5 ++---
 3 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -XXX,XX +XXX,XX @@ void win32_aio_cleanup(QEMUWin32AIOState *aio);
 int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile);
 BlockAIOCB *win32_aio_submit(BlockDriverState *bs,
         QEMUWin32AIOState *aio, HANDLE hfile,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov,
         BlockCompletionFunc *cb, void *opaque, int type);
 void win32_aio_detach_aio_context(QEMUWin32AIOState *aio,
                                   AioContext *old_context);
diff --git a/block/file-win32.c b/block/file-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/block/file-win32.c
+++ b/block/file-win32.c
@@ -XXX,XX +XXX,XX @@ static void raw_probe_alignment(BlockDriverState *bs, Error **errp)
                          &dg.Geometry.BytesPerSector,
                          &freeClusters, &totalClusters);
         bs->bl.request_alignment = dg.Geometry.BytesPerSector;
+        return;
     }
+
+    /* XXX Does Windows support AIO on less than 512-byte alignment? */
+    bs->bl.request_alignment = 512;
 }
 
 static void raw_parse_flags(int flags, bool use_aio, int *access_flags,
@@ -XXX,XX +XXX,XX @@ fail:
     return ret;
 }
 
-static BlockAIOCB *raw_aio_readv(BlockDriverState *bs,
-                         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-                         BlockCompletionFunc *cb, void *opaque)
+static BlockAIOCB *raw_aio_preadv(BlockDriverState *bs,
+                                  uint64_t offset, uint64_t bytes,
+                                  QEMUIOVector *qiov, int flags,
+                                  BlockCompletionFunc *cb, void *opaque)
 {
     BDRVRawState *s = bs->opaque;
     if (s->aio) {
-        return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
-                                nb_sectors, cb, opaque, QEMU_AIO_READ);
+        return win32_aio_submit(bs, s->aio, s->hfile, offset, bytes, qiov,
+                                cb, opaque, QEMU_AIO_READ);
     } else {
-        return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov,
-                           nb_sectors << BDRV_SECTOR_BITS,
+        return paio_submit(bs, s->hfile, offset, qiov, bytes,
                            cb, opaque, QEMU_AIO_READ);
     }
 }
 
-static BlockAIOCB *raw_aio_writev(BlockDriverState *bs,
-                          int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-                          BlockCompletionFunc *cb, void *opaque)
+static BlockAIOCB *raw_aio_pwritev(BlockDriverState *bs,
+                                   uint64_t offset, uint64_t bytes,
+                                   QEMUIOVector *qiov, int flags,
+                                   BlockCompletionFunc *cb, void *opaque)
 {
     BDRVRawState *s = bs->opaque;
     if (s->aio) {
-        return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
-                                nb_sectors, cb, opaque, QEMU_AIO_WRITE);
+        return win32_aio_submit(bs, s->aio, s->hfile, offset, bytes, qiov,
+                                cb, opaque, QEMU_AIO_WRITE);
     } else {
-        return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov,
-                           nb_sectors << BDRV_SECTOR_BITS,
+        return paio_submit(bs, s->hfile, offset, qiov, bytes,
                            cb, opaque, QEMU_AIO_WRITE);
     }
 }
@@ -XXX,XX +XXX,XX @@ BlockDriver bdrv_file = {
     .bdrv_co_create_opts = raw_co_create_opts,
     .bdrv_has_zero_init = bdrv_has_zero_init_1,
 
-    .bdrv_aio_readv     = raw_aio_readv,
-    .bdrv_aio_writev    = raw_aio_writev,
+    .bdrv_aio_preadv    = raw_aio_preadv,
+    .bdrv_aio_pwritev   = raw_aio_pwritev,
     .bdrv_aio_flush     = raw_aio_flush,
 
     .bdrv_truncate	= raw_truncate,
@@ -XXX,XX +XXX,XX @@ static void hdev_parse_filename(const char *filename, QDict *options,
     bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
 }
 
+static void hdev_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+    /* XXX Does Windows support AIO on less than 512-byte alignment? */
+    bs->bl.request_alignment = 512;
+}
+
 static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
                      Error **errp)
 {
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_device = {
     .bdrv_probe_device	= hdev_probe_device,
     .bdrv_file_open	= hdev_open,
     .bdrv_close		= raw_close,
+    .bdrv_refresh_limits = hdev_refresh_limits,
 
-    .bdrv_aio_readv     = raw_aio_readv,
-    .bdrv_aio_writev    = raw_aio_writev,
+    .bdrv_aio_preadv    = raw_aio_preadv,
+    .bdrv_aio_pwritev   = raw_aio_pwritev,
     .bdrv_aio_flush     = raw_aio_flush,
 
     .bdrv_detach_aio_context = raw_detach_aio_context,
diff --git a/block/win32-aio.c b/block/win32-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/win32-aio.c
+++ b/block/win32-aio.c
@@ -XXX,XX +XXX,XX @@ static const AIOCBInfo win32_aiocb_info = {
 
 BlockAIOCB *win32_aio_submit(BlockDriverState *bs,
         QEMUWin32AIOState *aio, HANDLE hfile,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov,
         BlockCompletionFunc *cb, void *opaque, int type)
 {
     struct QEMUWin32AIOCB *waiocb;
-    uint64_t offset = sector_num * 512;
     DWORD rc;
 
     waiocb = qemu_aio_get(&win32_aiocb_info, bs, cb, opaque);
-    waiocb->nbytes = nb_sectors * 512;
+    waiocb->nbytes = bytes;
     waiocb->qiov = qiov;
     waiocb->is_read = (type == QEMU_AIO_READ);
 
-- 
2.13.6

From: Eric Blake <eblake@redhat.com>

We are gradually moving away from sector-based interfaces, towards
byte-based.  Make the change for the last few sector-based callbacks
in the null-co and null-aio drivers.

Note that since the null driver does nothing on writes, it trivially
supports the BDRV_REQ_FUA flag (all writes have already landed to
the same bit-bucket without needing an extra flush call).  Also, since
the null driver does just as well with byte-based requests, we can
now avoid cycles wasted on read-modify-write by taking advantage of
the block layer now defaulting the alignment to 1 instead of 512.

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/null.c | 45 +++++++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/block/null.c b/block/null.c
index XXXXXXX..XXXXXXX 100644
--- a/block/null.c
+++ b/block/null.c
@@ -XXX,XX +XXX,XX @@ static int null_file_open(BlockDriverState *bs, QDict *options, int flags,
     }
     s->read_zeroes = qemu_opt_get_bool(opts, NULL_OPT_ZEROES, false);
     qemu_opts_del(opts);
+    bs->supported_write_flags = BDRV_REQ_FUA;
     return ret;
 }
 
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int null_co_common(BlockDriverState *bs)
     return 0;
 }
 
-static coroutine_fn int null_co_readv(BlockDriverState *bs,
-                                      int64_t sector_num, int nb_sectors,
-                                      QEMUIOVector *qiov)
+static coroutine_fn int null_co_preadv(BlockDriverState *bs,
+                                       uint64_t offset, uint64_t bytes,
+                                       QEMUIOVector *qiov, int flags)
 {
     BDRVNullState *s = bs->opaque;
 
     if (s->read_zeroes) {
-        qemu_iovec_memset(qiov, 0, 0, nb_sectors * BDRV_SECTOR_SIZE);
+        qemu_iovec_memset(qiov, 0, 0, bytes);
     }
 
     return null_co_common(bs);
 }
 
-static coroutine_fn int null_co_writev(BlockDriverState *bs,
-                                       int64_t sector_num, int nb_sectors,
-                                       QEMUIOVector *qiov)
+static coroutine_fn int null_co_pwritev(BlockDriverState *bs,
+                                        uint64_t offset, uint64_t bytes,
+                                        QEMUIOVector *qiov, int flags)
 {
     return null_co_common(bs);
 }
@@ -XXX,XX +XXX,XX @@ static inline BlockAIOCB *null_aio_common(BlockDriverState *bs,
     return &acb->common;
 }
 
-static BlockAIOCB *null_aio_readv(BlockDriverState *bs,
-                                  int64_t sector_num, QEMUIOVector *qiov,
-                                  int nb_sectors,
-                                  BlockCompletionFunc *cb,
-                                  void *opaque)
+static BlockAIOCB *null_aio_preadv(BlockDriverState *bs,
+                                   uint64_t offset, uint64_t bytes,
+                                   QEMUIOVector *qiov, int flags,
+                                   BlockCompletionFunc *cb,
+                                   void *opaque)
 {
     BDRVNullState *s = bs->opaque;
 
     if (s->read_zeroes) {
-        qemu_iovec_memset(qiov, 0, 0, nb_sectors * BDRV_SECTOR_SIZE);
+        qemu_iovec_memset(qiov, 0, 0, bytes);
     }
 
     return null_aio_common(bs, cb, opaque);
 }
 
-static BlockAIOCB *null_aio_writev(BlockDriverState *bs,
-                                   int64_t sector_num, QEMUIOVector *qiov,
-                                   int nb_sectors,
-                                   BlockCompletionFunc *cb,
-                                   void *opaque)
+static BlockAIOCB *null_aio_pwritev(BlockDriverState *bs,
+                                    uint64_t offset, uint64_t bytes,
+                                    QEMUIOVector *qiov, int flags,
+                                    BlockCompletionFunc *cb,
+                                    void *opaque)
 {
     return null_aio_common(bs, cb, opaque);
 }
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_null_co = {
     .bdrv_close             = null_close,
     .bdrv_getlength         = null_getlength,
 
-    .bdrv_co_readv          = null_co_readv,
-    .bdrv_co_writev         = null_co_writev,
+    .bdrv_co_preadv         = null_co_preadv,
+    .bdrv_co_pwritev        = null_co_pwritev,
     .bdrv_co_flush_to_disk  = null_co_flush,
     .bdrv_reopen_prepare    = null_reopen_prepare,
 
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_null_aio = {
     .bdrv_close             = null_close,
     .bdrv_getlength         = null_getlength,
 
-    .bdrv_aio_readv         = null_aio_readv,
-    .bdrv_aio_writev        = null_aio_writev,
+    .bdrv_aio_preadv        = null_aio_preadv,
+    .bdrv_aio_pwritev       = null_aio_pwritev,
     .bdrv_aio_flush         = null_aio_flush,
     .bdrv_reopen_prepare    = null_reopen_prepare,
 
-- 
2.13.6

From: Eric Blake <eblake@redhat.com>

We are gradually moving away from sector-based interfaces, towards
byte-based.  Make the change for the last few sector-based callbacks
in the rbd driver.

Note that the driver was already using byte-based calls for
performing actual I/O, so this just gets rid of a round trip
of scaling; however, as I don't know if RBD is tolerant of
non-sector AIO operations, I went with the conservative approach
of adding .bdrv_refresh_limits to override the block layer
defaults back to the pre-patch value of 512.

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/rbd.c | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/block/rbd.c b/block/rbd.c
index XXXXXXX..XXXXXXX 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -XXX,XX +XXX,XX @@ done:
 }
 
 
+static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+    /* XXX Does RBD support AIO on less than 512-byte alignment? */
+    bs->bl.request_alignment = 512;
+}
+
+
 static int qemu_rbd_set_auth(rados_t cluster, const char *secretid,
                              Error **errp)
 {
@@ -XXX,XX +XXX,XX @@ failed:
     return NULL;
 }
 
-static BlockAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs,
-                                      int64_t sector_num,
-                                      QEMUIOVector *qiov,
-                                      int nb_sectors,
-                                      BlockCompletionFunc *cb,
-                                      void *opaque)
+static BlockAIOCB *qemu_rbd_aio_preadv(BlockDriverState *bs,
+                                       uint64_t offset, uint64_t bytes,
+                                       QEMUIOVector *qiov, int flags,
+                                       BlockCompletionFunc *cb,
+                                       void *opaque)
 {
-    return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov,
-                         (int64_t) nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
+    return rbd_start_aio(bs, offset, qiov, bytes, cb, opaque,
                          RBD_AIO_READ);
 }
 
-static BlockAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
-                                       int64_t sector_num,
-                                       QEMUIOVector *qiov,
-                                       int nb_sectors,
-                                       BlockCompletionFunc *cb,
-                                       void *opaque)
+static BlockAIOCB *qemu_rbd_aio_pwritev(BlockDriverState *bs,
+                                        uint64_t offset, uint64_t bytes,
+                                        QEMUIOVector *qiov, int flags,
+                                        BlockCompletionFunc *cb,
+                                        void *opaque)
 {
-    return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov,
-                         (int64_t) nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
+    return rbd_start_aio(bs, offset, qiov, bytes, cb, opaque,
                          RBD_AIO_WRITE);
 }
 
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_rbd = {
     .format_name            = "rbd",
     .instance_size          = sizeof(BDRVRBDState),
     .bdrv_parse_filename    = qemu_rbd_parse_filename,
+    .bdrv_refresh_limits    = qemu_rbd_refresh_limits,
     .bdrv_file_open         = qemu_rbd_open,
     .bdrv_close             = qemu_rbd_close,
     .bdrv_reopen_prepare    = qemu_rbd_reopen_prepare,
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_rbd = {
     .bdrv_truncate          = qemu_rbd_truncate,
     .protocol_name          = "rbd",
 
-    .bdrv_aio_readv         = qemu_rbd_aio_readv,
-    .bdrv_aio_writev        = qemu_rbd_aio_writev,
+    .bdrv_aio_preadv        = qemu_rbd_aio_preadv,
+    .bdrv_aio_pwritev       = qemu_rbd_aio_pwritev,
 
 #ifdef LIBRBD_SUPPORTS_AIO_FLUSH
     .bdrv_aio_flush         = qemu_rbd_aio_flush,
-- 
2.13.6

From: Eric Blake <eblake@redhat.com>

We are gradually moving away from sector-based interfaces, towards
byte-based.  Make the change for the last few sector-based callbacks
in the vxhs driver.

Note that the driver was already using byte-based calls for
performing actual I/O, so this just gets rid of a round trip
of scaling; however, as I don't know if VxHS is tolerant of
non-sector AIO operations, I went with the conservative approach
of adding .bdrv_refresh_limits to override the block layer
defaults back to the pre-patch value of 512.

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/vxhs.c | 43 ++++++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/block/vxhs.c b/block/vxhs.c
index XXXXXXX..XXXXXXX 100644
--- a/block/vxhs.c
+++ b/block/vxhs.c
@@ -XXX,XX +XXX,XX @@ static void vxhs_parse_filename(const char *filename, QDict *options,
     }
 }
 
+static void vxhs_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+    /* XXX Does VXHS support AIO on less than 512-byte alignment? */
+    bs->bl.request_alignment = 512;
+}
+
 static int vxhs_init_and_ref(void)
 {
     if (vxhs_ref++ == 0) {
@@ -XXX,XX +XXX,XX @@ static const AIOCBInfo vxhs_aiocb_info = {
  * and is passed to QNIO. When QNIO completes the work,
  * it will be passed back through the callback.
  */
-static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num,
-                               QEMUIOVector *qiov, int nb_sectors,
+static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, uint64_t offset,
+                               QEMUIOVector *qiov, uint64_t size,
                                BlockCompletionFunc *cb, void *opaque,
                                VDISKAIOCmd iodir)
 {
     VXHSAIOCB *acb = NULL;
     BDRVVXHSState *s = bs->opaque;
-    size_t size;
-    uint64_t offset;
     int iio_flags = 0;
     int ret = 0;
     void *dev_handle = s->vdisk_hostinfo.dev_handle;
 
-    offset = sector_num * BDRV_SECTOR_SIZE;
-    size = nb_sectors * BDRV_SECTOR_SIZE;
     acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque);
 
     /*
@@ -XXX,XX +XXX,XX @@ static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num,
     switch (iodir) {
     case VDISK_AIO_WRITE:
             ret = iio_writev(dev_handle, acb, qiov->iov, qiov->niov,
-                             offset, (uint64_t)size, iio_flags);
+                             offset, size, iio_flags);
             break;
     case VDISK_AIO_READ:
             ret = iio_readv(dev_handle, acb, qiov->iov, qiov->niov,
-                            offset, (uint64_t)size, iio_flags);
+                            offset, size, iio_flags);
             break;
     default:
             trace_vxhs_aio_rw_invalid(iodir);
@@ -XXX,XX +XXX,XX @@ errout:
     return NULL;
 }
 
-static BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs,
-                                   int64_t sector_num, QEMUIOVector *qiov,
-                                   int nb_sectors,
+static BlockAIOCB *vxhs_aio_preadv(BlockDriverState *bs,
+                                   uint64_t offset, uint64_t bytes,
+                                   QEMUIOVector *qiov, int flags,
                                    BlockCompletionFunc *cb, void *opaque)
 {
-    return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, cb,
-                       opaque, VDISK_AIO_READ);
+    return vxhs_aio_rw(bs, offset, qiov, bytes, cb, opaque, VDISK_AIO_READ);
 }
 
-static BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs,
-                                   int64_t sector_num, QEMUIOVector *qiov,
-                                   int nb_sectors,
-                                   BlockCompletionFunc *cb, void *opaque)
+static BlockAIOCB *vxhs_aio_pwritev(BlockDriverState *bs,
+                                    uint64_t offset, uint64_t bytes,
+                                    QEMUIOVector *qiov, int flags,
+                                    BlockCompletionFunc *cb, void *opaque)
 {
-    return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors,
-                       cb, opaque, VDISK_AIO_WRITE);
+    return vxhs_aio_rw(bs, offset, qiov, bytes, cb, opaque, VDISK_AIO_WRITE);
 }
 
 static void vxhs_close(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_vxhs = {
     .instance_size                = sizeof(BDRVVXHSState),
     .bdrv_file_open               = vxhs_open,
     .bdrv_parse_filename          = vxhs_parse_filename,
+    .bdrv_refresh_limits          = vxhs_refresh_limits,
     .bdrv_close                   = vxhs_close,
     .bdrv_getlength               = vxhs_getlength,
-    .bdrv_aio_readv               = vxhs_aio_readv,
-    .bdrv_aio_writev              = vxhs_aio_writev,
+    .bdrv_aio_preadv              = vxhs_aio_preadv,
+    .bdrv_aio_pwritev             = vxhs_aio_pwritev,
 };
 
 static void bdrv_vxhs_init(void)
-- 
2.13.6

From: Eric Blake <eblake@redhat.com>

We are gradually moving away from sector-based interfaces, towards
byte-based.  Now that all drivers with aio callbacks are using the
byte-based interfaces, we can remove the sector-based versions.

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block_int.h |  6 ----
 block/io.c                | 84 ++++++++++++++++++++---------------------------
 2 files changed, 36 insertions(+), 54 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
     void (*bdrv_refresh_filename)(BlockDriverState *bs, QDict *options);
 
     /* aio */
-    BlockAIOCB *(*bdrv_aio_readv)(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockCompletionFunc *cb, void *opaque);
     BlockAIOCB *(*bdrv_aio_preadv)(BlockDriverState *bs,
         uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags,
         BlockCompletionFunc *cb, void *opaque);
-    BlockAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockCompletionFunc *cb, void *opaque);
     BlockAIOCB *(*bdrv_aio_pwritev)(BlockDriverState *bs,
         uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags,
         BlockCompletionFunc *cb, void *opaque);
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
         return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
     }
 
-    /* FIXME - no need to calculate these if .bdrv_aio_preadv exists */
-    sector_num = offset >> BDRV_SECTOR_BITS;
-    nb_sectors = bytes >> BDRV_SECTOR_BITS;
-
-    if (!drv->bdrv_aio_preadv) {
-        assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-        assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-        assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
-    }
-
-    if (drv->bdrv_co_readv) {
-        return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
-    } else {
+    if (drv->bdrv_aio_preadv) {
         BlockAIOCB *acb;
         CoroutineIOCompletion co = {
             .coroutine = qemu_coroutine_self(),
         };
 
-        if (drv->bdrv_aio_preadv) {
-            acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
-                                       bdrv_co_io_em_complete, &co);
-        } else {
-            acb = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
-                                      bdrv_co_io_em_complete, &co);
-        }
+        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
+                                   bdrv_co_io_em_complete, &co);
         if (acb == NULL) {
             return -EIO;
         } else {
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
             return co.ret;
         }
     }
+
+    sector_num = offset >> BDRV_SECTOR_BITS;
+    nb_sectors = bytes >> BDRV_SECTOR_BITS;
+
+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
+    assert(drv->bdrv_co_readv);
+
+    return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
 }
 
 static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
         goto emulate_flags;
     }
 
-    /* FIXME - no need to calculate these if .bdrv_aio_pwritev exists */
-    sector_num = offset >> BDRV_SECTOR_BITS;
-    nb_sectors = bytes >> BDRV_SECTOR_BITS;
-
-    if (!drv->bdrv_aio_pwritev) {
-        assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-        assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-        assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
-    }
-
-    if (drv->bdrv_co_writev_flags) {
-        ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
-                                        flags & bs->supported_write_flags);
-        flags &= ~bs->supported_write_flags;
-    } else if (drv->bdrv_co_writev) {
-        assert(!bs->supported_write_flags);
-        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
-    } else {
+    if (drv->bdrv_aio_pwritev) {
         BlockAIOCB *acb;
         CoroutineIOCompletion co = {
             .coroutine = qemu_coroutine_self(),
         };
 
-        if (drv->bdrv_aio_pwritev) {
-            acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
-                                        flags & bs->supported_write_flags,
-                                        bdrv_co_io_em_complete, &co);
-            flags &= ~bs->supported_write_flags;
-        } else {
-            assert(!bs->supported_write_flags);
-            acb = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
-                                       bdrv_co_io_em_complete, &co);
-        }
+        acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
+                                    flags & bs->supported_write_flags,
+                                    bdrv_co_io_em_complete, &co);
+        flags &= ~bs->supported_write_flags;
         if (acb == NULL) {
             ret = -EIO;
         } else {
             qemu_coroutine_yield();
             ret = co.ret;
         }
+        goto emulate_flags;
+    }
+
+    sector_num = offset >> BDRV_SECTOR_BITS;
+    nb_sectors = bytes >> BDRV_SECTOR_BITS;
+
+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
+
+    if (drv->bdrv_co_writev_flags) {
+        ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
+                                        flags & bs->supported_write_flags);
+        flags &= ~bs->supported_write_flags;
+    } else {
+        assert(drv->bdrv_co_writev);
+        assert(!bs->supported_write_flags);
+        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
     }
 
 emulate_flags:
-- 
2.13.6

From: Eric Blake <eblake@redhat.com>

We have too many driver callback interfaces; simplify the mess
somewhat by merging the flags parameter of .bdrv_co_writev_flags()
into .bdrv_co_writev().  Note that as long as a driver doesn't set
.supported_write_flags, the flags argument will be 0 and behavior is
identical.  Also note that the public function bdrv_co_writev() still
lacks a flags argument; so the driver signature is thus intentionally
slightly different.  But that's not the end of the world, nor the first
time that the driver interface differs slightly from the public
interface.

Ideally, we should be rewriting all of these drivers to use modern
byte-based interfaces.  But that's a more invasive patch to write
and audit, compared to the simplification done here.

diff --git a/include/block/block_int.h b/include/block/block_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
     int coroutine_fn (*bdrv_co_preadv)(BlockDriverState *bs,
         uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags);
     int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
-    int coroutine_fn (*bdrv_co_writev_flags)(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags);
     /**
      * @offset: position in bytes to write at
diff --git a/block/gluster.c b/block/gluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs,
 static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs,
                                                int64_t sector_num,
                                                int nb_sectors,
-                                               QEMUIOVector *qiov)
+                                               QEMUIOVector *qiov,
+                                               int flags)
 {
+    assert(!flags);
     return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1);
 }
 
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
     assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
 
-    if (drv->bdrv_co_writev_flags) {
-        ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
-                                        flags & bs->supported_write_flags);
-        flags &= ~bs->supported_write_flags;
-    } else {
-        assert(drv->bdrv_co_writev);
-        assert(!bs->supported_write_flags);
-        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
-    }
+    assert(drv->bdrv_co_writev);
+    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
+                              flags & bs->supported_write_flags);
+    flags &= ~bs->supported_write_flags;
 
 emulate_flags:
     if (ret == 0 && (flags & BDRV_REQ_FUA)) {
diff --git a/block/iscsi.c b/block/iscsi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -XXX,XX +XXX,XX @@ static inline bool iscsi_allocmap_is_valid(IscsiLun *iscsilun,
 }
 
 static int coroutine_fn
-iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
-                      QEMUIOVector *iov, int flags)
+iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
+                QEMUIOVector *iov, int flags)
 {
     IscsiLun *iscsilun = bs->opaque;
     struct IscsiTask iTask;
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_iscsi = {
     .bdrv_co_pdiscard      = iscsi_co_pdiscard,
     .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes,
     .bdrv_co_readv         = iscsi_co_readv,
-    .bdrv_co_writev_flags  = iscsi_co_writev_flags,
+    .bdrv_co_writev        = iscsi_co_writev,
     .bdrv_co_flush_to_disk = iscsi_co_flush,
 
 #ifdef __linux__
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_iser = {
     .bdrv_co_pdiscard      = iscsi_co_pdiscard,
     .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes,
     .bdrv_co_readv         = iscsi_co_readv,
-    .bdrv_co_writev_flags  = iscsi_co_writev_flags,
+    .bdrv_co_writev        = iscsi_co_writev,
     .bdrv_co_flush_to_disk = iscsi_co_flush,
 
 #ifdef __linux__
diff --git a/block/parallels.c b/block/parallels.c
index XXXXXXX..XXXXXXX 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn parallels_co_block_status(BlockDriverState *bs,
 }
 
 static coroutine_fn int parallels_co_writev(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+                                            int64_t sector_num, int nb_sectors,
+                                            QEMUIOVector *qiov, int flags)
 {
     BDRVParallelsState *s = bs->opaque;
     uint64_t bytes_done = 0;
     QEMUIOVector hd_qiov;
     int ret = 0;
 
+    assert(!flags);
     qemu_iovec_init(&hd_qiov, qiov->niov);
 
     while (nb_sectors > 0) {
diff --git a/block/qcow.c b/block/qcow.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
 }
 
 static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
-                          int nb_sectors, QEMUIOVector *qiov)
+                                       int nb_sectors, QEMUIOVector *qiov,
+                                       int flags)
 {
     BDRVQcowState *s = bs->opaque;
     int index_in_cluster;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
     uint8_t *buf;
     void *orig_buf;
 
+    assert(!flags);
     s->cluster_cache_offset = -1; /* disable compressed cache */
 
     /* We must always copy the iov when encrypting, so we
@@ -XXX,XX +XXX,XX @@ qcow_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
     if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
         /* could not compress: write normal cluster */
         ret = qcow_co_writev(bs, offset >> BDRV_SECTOR_BITS,
-                             bytes >> BDRV_SECTOR_BITS, qiov);
+                             bytes >> BDRV_SECTOR_BITS, qiov, 0);
         if (ret < 0) {
             goto fail;
         }
diff --git a/block/qed.c b/block/qed.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_qed_co_readv(BlockDriverState *bs,
 
 static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs,
                                            int64_t sector_num, int nb_sectors,
-                                           QEMUIOVector *qiov)
+                                           QEMUIOVector *qiov, int flags)
 {
+    assert(!flags);
     return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE);
 }
 
diff --git a/block/replication.c b/block/replication.c
index XXXXXXX..XXXXXXX 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -XXX,XX +XXX,XX @@ out:
 static coroutine_fn int replication_co_writev(BlockDriverState *bs,
                                               int64_t sector_num,
                                               int remaining_sectors,
-                                              QEMUIOVector *qiov)
+                                              QEMUIOVector *qiov,
+                                              int flags)
 {
     BDRVReplicationState *s = bs->opaque;
     QEMUIOVector hd_qiov;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int replication_co_writev(BlockDriverState *bs,
     int ret;
     int64_t n;
 
+    assert(!flags);
     ret = replication_get_io_status(s);
     if (ret < 0) {
         goto out;
diff --git a/block/sheepdog.c b/block/sheepdog.c
index XXXXXXX..XXXXXXX 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static void sd_aio_complete(SheepdogAIOCB *acb)
 }
 
 static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
-                        int nb_sectors, QEMUIOVector *qiov)
+                                     int nb_sectors, QEMUIOVector *qiov,
+                                     int flags)
 {
     SheepdogAIOCB acb;
     int ret;
     int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE;
     BDRVSheepdogState *s = bs->opaque;
 
+    assert(!flags);
     if (offset > s->inode.vdi_size) {
         ret = sd_truncate(bs, offset, PREALLOC_MODE_OFF, NULL);
         if (ret < 0) {
diff --git a/block/ssh.c b/block/ssh.c
index XXXXXXX..XXXXXXX 100644
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -XXX,XX +XXX,XX @@ static int ssh_write(BDRVSSHState *s, BlockDriverState *bs,
 
 static coroutine_fn int ssh_co_writev(BlockDriverState *bs,
                                       int64_t sector_num,
-                                      int nb_sectors, QEMUIOVector *qiov)
+                                      int nb_sectors, QEMUIOVector *qiov,
+                                      int flags)
 {
     BDRVSSHState *s = bs->opaque;
     int ret;
 
+    assert(!flags);
     qemu_co_mutex_lock(&s->lock);
     ret = ssh_write(s, bs, sector_num * BDRV_SECTOR_SIZE,
                     nb_sectors * BDRV_SECTOR_SIZE, qiov);
diff --git a/block/vhdx.c b/block/vhdx.c
index XXXXXXX..XXXXXXX 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -XXX,XX +XXX,XX @@ int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s)
 }
 
 static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
-                                      int nb_sectors, QEMUIOVector *qiov)
+                                       int nb_sectors, QEMUIOVector *qiov,
+                                       int flags)
 {
     int ret = -ENOTSUP;
     BDRVVHDXState *s = bs->opaque;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
     uint64_t bat_prior_offset = 0;
     bool bat_update = false;
 
+    assert(!flags);
     qemu_iovec_init(&hd_qiov, qiov->niov);
 
     qemu_co_mutex_lock(&s->lock);
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

The QMP version of this command can take a qdev ID since 7a9877a02635,
but the HMP version is still using the deprecated block device name so
there's no way to refer to a block device added like this:

  -blockdev node-name=disk0,driver=qcow2,file.driver=file,file.filename=hd.qcow2
  -device virtio-blk-pci,id=virtio-blk-pci0,drive=disk0

This patch works around this problem by using the specified name as a
qdev ID if the block device name is not found.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hmp.c           | 14 ++++++++++++--
 hmp-commands.hx |  3 ++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/hmp.c b/hmp.c
index XXXXXXX..XXXXXXX 100644
--- a/hmp.c
+++ b/hmp.c
@@ -XXX,XX +XXX,XX @@ void hmp_change(Monitor *mon, const QDict *qdict)
 void hmp_block_set_io_throttle(Monitor *mon, const QDict *qdict)
 {
     Error *err = NULL;
+    char *device = (char *) qdict_get_str(qdict, "device");
     BlockIOThrottle throttle = {
-        .has_device = true,
-        .device = (char *) qdict_get_str(qdict, "device"),
         .bps = qdict_get_int(qdict, "bps"),
         .bps_rd = qdict_get_int(qdict, "bps_rd"),
         .bps_wr = qdict_get_int(qdict, "bps_wr"),
@@ -XXX,XX +XXX,XX @@ void hmp_block_set_io_throttle(Monitor *mon, const QDict *qdict)
         .iops_wr = qdict_get_int(qdict, "iops_wr"),
     };
 
+    /* qmp_block_set_io_throttle has separate parameters for the
+     * (deprecated) block device name and the qdev ID but the HMP
+     * version has only one, so we must decide which one to pass. */
+    if (blk_by_name(device)) {
+        throttle.has_device = true;
+        throttle.device = device;
+    } else {
+        throttle.has_id = true;
+        throttle.id = device;
+    }
+
     qmp_block_set_io_throttle(&throttle, &err);
     hmp_handle_error(mon, &err);
 }
diff --git a/hmp-commands.hx b/hmp-commands.hx
index XXXXXXX..XXXXXXX 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -XXX,XX +XXX,XX @@ ETEXI
 STEXI
 @item block_set_io_throttle @var{device} @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr}
 @findex block_set_io_throttle
-Change I/O throttle limits for a block drive to @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr}
+Change I/O throttle limits for a block drive to @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr}.
+@var{device} can be a block device name, a qdev ID or a QOM path.
 ETEXI
 
     {
-- 
2.13.6

From: John Snow <jsnow@redhat.com>

When we've reached the concluded state, we need to expose the error
state if applicable. Add a new optional 'error' field to BlockJobInfo
for this purpose.

This should be sufficient for determining if a job completed
successfully or not after concluding; if we want to discriminate
based on how it failed more mechanically, we can always add an
explicit return code enumeration later.

I didn't bother to make it only show up if we are in the concluded
state; I don't think it's necessary.

Cc: qemu-stable@nongnu.org
Signed-off-by: John Snow <jsnow@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qapi/block-core.json | 6 +++++-
 blockjob.c           | 2 ++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -XXX,XX +XXX,XX @@
 # @auto-dismiss: Job will dismiss itself when CONCLUDED, moving to the NULL
 #                state and disappearing from the query list. (since 2.12)
 #
+# @error: Error information if the job did not complete successfully.
+#         Not set if the job completed successfully. (since 2.12.1)
+#
 # Since: 1.1
 ##
 { 'struct': 'BlockJobInfo',
@@ -XXX,XX +XXX,XX @@
            'offset': 'int', 'busy': 'bool', 'paused': 'bool', 'speed': 'int',
            'io-status': 'BlockDeviceIoStatus', 'ready': 'bool',
            'status': 'BlockJobStatus',
-           'auto-finalize': 'bool', 'auto-dismiss': 'bool' } }
+           'auto-finalize': 'bool', 'auto-dismiss': 'bool',
+           '*error': 'str' } }
 
 ##
 # @query-block-jobs:
diff --git a/blockjob.c b/blockjob.c
index XXXXXXX..XXXXXXX 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -XXX,XX +XXX,XX @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
     info->status    = job->status;
     info->auto_finalize = job->auto_finalize;
     info->auto_dismiss  = job->auto_dismiss;
+    info->has_error = job->ret != 0;
+    info->error     = job->ret ? g_strdup(strerror(-job->ret)) : NULL;
     return info;
 }
 
-- 
2.13.6

Every job gets a non-NULL job->txn on creation, but it doesn't
necessarily keep it until it is decommissioned: Finalising a job removes
it from its transaction. Therefore, calling 'blockdev-job-finalize' a
second time on an already concluded job causes an assertion failure.

Remove job->txn from the assertion in block_job_finalize() to fix this.
block_job_do_finalize() still has the same assertion, but if a job is
already removed from its transaction, block_job_apply_verb() will
already error out before we run into that assertion.

Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
---
 blockjob.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/blockjob.c b/blockjob.c
index XXXXXXX..XXXXXXX 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -XXX,XX +XXX,XX @@ void block_job_complete(BlockJob *job, Error **errp)
 
 void block_job_finalize(BlockJob *job, Error **errp)
 {
-    assert(job && job->id && job->txn);
+    assert(job && job->id);
     if (block_job_apply_verb(job, BLOCK_JOB_VERB_FINALIZE, errp)) {
         return;
     }
-- 
2.13.6

Block job drivers are not expected to mess with the internals of the
BlockJob object, so provide wrapper functions for one of the cases where
they still do it: Updating the progress counter.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
---
 include/block/blockjob.h | 19 +++++++++++++++++++
 block/backup.c           | 22 +++++++++++++---------
 block/commit.c           | 16 ++++++++--------
 block/mirror.c           | 11 +++++------
 block/stream.c           | 14 ++++++++------
 blockjob.c               | 10 ++++++++++
 6 files changed, 63 insertions(+), 29 deletions(-)

diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -XXX,XX +XXX,XX @@ void block_job_finalize(BlockJob *job, Error **errp);
 void block_job_dismiss(BlockJob **job, Error **errp);
 
 /**
+ * block_job_progress_update:
+ * @job: The job that has made progress
+ * @done: How much progress the job made
+ *
+ * Updates the progress counter of the job.
+ */
+void block_job_progress_update(BlockJob *job, uint64_t done);
+
+/**
+ * block_job_progress_set_remaining:
+ * @job: The job whose expected progress end value is set
+ * @remaining: Expected end value of the progress counter of the job
+ *
+ * Sets the expected end value of the progress counter of a job so that a
+ * completion percentage can be calculated when the progress is updated.
+ */
+void block_job_progress_set_remaining(BlockJob *job, uint64_t remaining);
+
+/**
  * block_job_query:
  * @job: The job to get information about.
  *
diff --git a/block/backup.c b/block/backup.c
index XXXXXXX..XXXXXXX 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -XXX,XX +XXX,XX @@ typedef struct BackupBlockJob {
     BlockdevOnError on_source_error;
     BlockdevOnError on_target_error;
     CoRwlock flush_rwlock;
+    uint64_t len;
     uint64_t bytes_read;
     int64_t cluster_size;
     bool compress;
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
 
         trace_backup_do_cow_process(job, start);
 
-        n = MIN(job->cluster_size, job->common.len - start);
+        n = MIN(job->cluster_size, job->len - start);
 
         if (!bounce_buffer) {
             bounce_buffer = blk_blockalign(blk, job->cluster_size);
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
          * offset field is an opaque progress value, it is not a disk offset.
          */
         job->bytes_read += n;
-        job->common.offset += n;
+        block_job_progress_update(&job->common, n);
     }
 
 out:
@@ -XXX,XX +XXX,XX @@ void backup_do_checkpoint(BlockJob *job, Error **errp)
         return;
     }
 
-    len = DIV_ROUND_UP(backup_job->common.len, backup_job->cluster_size);
+    len = DIV_ROUND_UP(backup_job->len, backup_job->cluster_size);
     hbitmap_set(backup_job->copy_bitmap, 0, len);
 }
 
@@ -XXX,XX +XXX,XX @@ static void backup_incremental_init_copy_bitmap(BackupBlockJob *job)
         bdrv_set_dirty_iter(dbi, next_cluster * job->cluster_size);
     }
 
-    job->common.offset = job->common.len -
-                         hbitmap_count(job->copy_bitmap) * job->cluster_size;
+    /* TODO block_job_progress_set_remaining() would make more sense */
+    block_job_progress_update(&job->common,
+        job->len - hbitmap_count(job->copy_bitmap) * job->cluster_size);
 
     bdrv_dirty_iter_free(dbi);
 }
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn backup_run(void *opaque)
     QLIST_INIT(&job->inflight_reqs);
     qemu_co_rwlock_init(&job->flush_rwlock);
 
-    nb_clusters = DIV_ROUND_UP(job->common.len, job->cluster_size);
+    nb_clusters = DIV_ROUND_UP(job->len, job->cluster_size);
+    block_job_progress_set_remaining(&job->common, job->len);
+
     job->copy_bitmap = hbitmap_alloc(nb_clusters, 0);
     if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
         backup_incremental_init_copy_bitmap(job);
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn backup_run(void *opaque)
         ret = backup_run_incremental(job);
     } else {
         /* Both FULL and TOP SYNC_MODE's require copying.. */
-        for (offset = 0; offset < job->common.len;
+        for (offset = 0; offset < job->len;
              offset += job->cluster_size) {
             bool error_is_read;
             int alloced = 0;
@@ -XXX,XX +XXX,XX @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
         goto error;
     }
 
-    /* job->common.len is fixed, so we can't allow resize */
+    /* job->len is fixed, so we can't allow resize */
     job = block_job_create(job_id, &backup_job_driver, txn, bs,
                            BLK_PERM_CONSISTENT_READ,
                            BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
@@ -XXX,XX +XXX,XX @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
     /* Required permissions are already taken with target's blk_new() */
     block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL,
                        &error_abort);
-    job->common.len = len;
+    job->len = len;
 
     return &job->common;
 
diff --git a/block/commit.c b/block/commit.c
index XXXXXXX..XXXXXXX 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn commit_run(void *opaque)
     int64_t n = 0; /* bytes */
     void *buf = NULL;
     int bytes_written = 0;
-    int64_t base_len;
+    int64_t len, base_len;
 
-    ret = s->common.len = blk_getlength(s->top);
-
-    if (s->common.len < 0) {
+    ret = len = blk_getlength(s->top);
+    if (len < 0) {
         goto out;
     }
+    block_job_progress_set_remaining(&s->common, len);
 
     ret = base_len = blk_getlength(s->base);
     if (base_len < 0) {
         goto out;
     }
 
-    if (base_len < s->common.len) {
-        ret = blk_truncate(s->base, s->common.len, PREALLOC_MODE_OFF, NULL);
+    if (base_len < len) {
+        ret = blk_truncate(s->base, len, PREALLOC_MODE_OFF, NULL);
         if (ret) {
             goto out;
         }
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn commit_run(void *opaque)
 
     buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);
 
-    for (offset = 0; offset < s->common.len; offset += n) {
+    for (offset = 0; offset < len; offset += n) {
         bool copy;
 
         /* Note that even when no rate limit is applied we need to yield
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn commit_run(void *opaque)
             }
         }
         /* Publish progress */
-        s->common.offset += n;
+        block_job_progress_update(&s->common, n);
 
         if (copy && s->common.speed) {
             delay_ns = ratelimit_calculate_delay(&s->limit, n);
diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ static void mirror_iteration_done(MirrorOp *op, int ret)
             bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
         }
         if (!s->initial_zeroing_ongoing) {
-            s->common.offset += op->bytes;
+            block_job_progress_update(&s->common, op->bytes);
         }
     }
     qemu_iovec_destroy(&op->qiov);
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_run(void *opaque)
         block_job_pause_point(&s->common);
 
         cnt = bdrv_get_dirty_count(s->dirty_bitmap);
-        /* s->common.offset contains the number of bytes already processed so
-         * far, cnt is the number of dirty bytes remaining and
-         * s->bytes_in_flight is the number of bytes currently being
-         * processed; together those are the current total operation length */
-        s->common.len = s->common.offset + s->bytes_in_flight + cnt;
+        /* cnt is the number of dirty bytes remaining and s->bytes_in_flight is
+         * the number of bytes currently being processed; together those are
+         * the current remaining operation length */
+        block_job_progress_set_remaining(&s->common, s->bytes_in_flight + cnt);
 
         /* Note that even when no rate limit is applied we need to yield
          * periodically with no pending I/O so that bdrv_drain_all() returns.
diff --git a/block/stream.c b/block/stream.c
index XXXXXXX..XXXXXXX 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
     BlockBackend *blk = s->common.blk;
     BlockDriverState *bs = blk_bs(blk);
     BlockDriverState *base = s->base;
+    int64_t len;
     int64_t offset = 0;
     uint64_t delay_ns = 0;
     int error = 0;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
         goto out;
     }
 
-    s->common.len = bdrv_getlength(bs);
-    if (s->common.len < 0) {
-        ret = s->common.len;
+    len = bdrv_getlength(bs);
+    if (len < 0) {
+        ret = len;
         goto out;
     }
+    block_job_progress_set_remaining(&s->common, len);
 
     buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE);
 
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
         bdrv_enable_copy_on_read(bs);
     }
 
-    for ( ; offset < s->common.len; offset += n) {
+    for ( ; offset < len; offset += n) {
         bool copy;
 
         /* Note that even when no rate limit is applied we need to yield
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
 
             /* Finish early if end of backing file has been reached */
             if (ret == 0 && n == 0) {
-                n = s->common.len - offset;
+                n = len - offset;
             }
 
             copy = (ret == 1);
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
         ret = 0;
 
         /* Publish progress */
-        s->common.offset += n;
+        block_job_progress_update(&s->common, n);
         if (copy && s->common.speed) {
             delay_ns = ratelimit_calculate_delay(&s->limit, n);
         } else {
diff --git a/blockjob.c b/blockjob.c
index XXXXXXX..XXXXXXX 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -XXX,XX +XXX,XX @@ int block_job_complete_sync(BlockJob *job, Error **errp)
     return block_job_finish_sync(job, &block_job_complete, errp);
 }
 
+void block_job_progress_update(BlockJob *job, uint64_t done)
+{
+    job->offset += done;
+}
+
+void block_job_progress_set_remaining(BlockJob *job, uint64_t remaining)
+{
+    job->len = job->offset + remaining;
+}
+
 BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
 {
     BlockJobInfo *info;
-- 
2.13.6

Every block job has a RateLimit, and they all do the exact same thing
with it, so it should be common infrastructure. Move the struct field
for a start.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
---
 include/block/blockjob.h | 4 ++++
 block/backup.c           | 5 ++---
 block/commit.c           | 5 ++---
 block/mirror.c           | 6 +++---
 block/stream.c           | 5 ++---
 5 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -XXX,XX +XXX,XX @@
 #define BLOCKJOB_H
 
 #include "block/block.h"
+#include "qemu/ratelimit.h"
 
 typedef struct BlockJobDriver BlockJobDriver;
 typedef struct BlockJobTxn BlockJobTxn;
@@ -XXX,XX +XXX,XX @@ typedef struct BlockJob {
     /** Speed that was set with @block_job_set_speed.  */
     int64_t speed;
 
+    /** Rate limiting data structure for implementing @speed. */
+    RateLimit limit;
+
     /** The completion function that will be called when the job completes.  */
     BlockCompletionFunc *cb;
 
diff --git a/block/backup.c b/block/backup.c
index XXXXXXX..XXXXXXX 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -XXX,XX +XXX,XX @@ typedef struct BackupBlockJob {
     /* bitmap for sync=incremental */
     BdrvDirtyBitmap *sync_bitmap;
     MirrorSyncMode sync_mode;
-    RateLimit limit;
     BlockdevOnError on_source_error;
     BlockdevOnError on_target_error;
     CoRwlock flush_rwlock;
@@ -XXX,XX +XXX,XX @@ static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
         error_setg(errp, QERR_INVALID_PARAMETER, "speed");
         return;
     }
-    ratelimit_set_speed(&s->limit, speed, SLICE_TIME);
+    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
 }
 
 static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
@@ -XXX,XX +XXX,XX @@ static bool coroutine_fn yield_and_check(BackupBlockJob *job)
      * (without, VM does not reboot)
      */
     if (job->common.speed) {
-        uint64_t delay_ns = ratelimit_calculate_delay(&job->limit,
+        uint64_t delay_ns = ratelimit_calculate_delay(&job->common.limit,
                                                       job->bytes_read);
         job->bytes_read = 0;
         block_job_sleep_ns(&job->common, delay_ns);
diff --git a/block/commit.c b/block/commit.c
index XXXXXXX..XXXXXXX 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -XXX,XX +XXX,XX @@ enum {
 
 typedef struct CommitBlockJob {
     BlockJob common;
-    RateLimit limit;
     BlockDriverState *commit_top_bs;
     BlockBackend *top;
     BlockBackend *base;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn commit_run(void *opaque)
         block_job_progress_update(&s->common, n);
 
         if (copy && s->common.speed) {
-            delay_ns = ratelimit_calculate_delay(&s->limit, n);
+            delay_ns = ratelimit_calculate_delay(&s->common.limit, n);
         } else {
             delay_ns = 0;
         }
@@ -XXX,XX +XXX,XX @@ static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
         error_setg(errp, QERR_INVALID_PARAMETER, "speed");
         return;
     }
-    ratelimit_set_speed(&s->limit, speed, SLICE_TIME);
+    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
 }
 
 static const BlockJobDriver commit_job_driver = {
diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ typedef struct MirrorBuffer {
 
 typedef struct MirrorBlockJob {
     BlockJob common;
-    RateLimit limit;
     BlockBackend *target;
     BlockDriverState *mirror_top_bs;
     BlockDriverState *source;
@@ -XXX,XX +XXX,XX @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
         offset += io_bytes;
         nb_chunks -= DIV_ROUND_UP(io_bytes, s->granularity);
         if (s->common.speed) {
-            delay_ns = ratelimit_calculate_delay(&s->limit, io_bytes_acct);
+            delay_ns = ratelimit_calculate_delay(&s->common.limit,
+                                                 io_bytes_acct);
         }
     }
     return delay_ns;
@@ -XXX,XX +XXX,XX @@ static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
         error_setg(errp, QERR_INVALID_PARAMETER, "speed");
         return;
     }
-    ratelimit_set_speed(&s->limit, speed, SLICE_TIME);
+    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
 }
 
 static void mirror_complete(BlockJob *job, Error **errp)
diff --git a/block/stream.c b/block/stream.c
index XXXXXXX..XXXXXXX 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -XXX,XX +XXX,XX @@ enum {
 
 typedef struct StreamBlockJob {
     BlockJob common;
-    RateLimit limit;
     BlockDriverState *base;
     BlockdevOnError on_error;
     char *backing_file_str;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
         /* Publish progress */
         block_job_progress_update(&s->common, n);
         if (copy && s->common.speed) {
-            delay_ns = ratelimit_calculate_delay(&s->limit, n);
+            delay_ns = ratelimit_calculate_delay(&s->common.limit, n);
         } else {
             delay_ns = 0;
         }
@@ -XXX,XX +XXX,XX @@ static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp)
         error_setg(errp, QERR_INVALID_PARAMETER, "speed");
         return;
     }
-    ratelimit_set_speed(&s->limit, speed, SLICE_TIME);
+    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
 }
 
 static const BlockJobDriver stream_job_driver = {
-- 
2.13.6

All block job drivers support .set_speed and all of them duplicate the
same code to implement it. Move that code to blockjob.c and remove the
now useless callback.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
---
 include/block/blockjob.h     |  2 ++
 include/block/blockjob_int.h |  3 ---
 block/backup.c               | 13 -------------
 block/commit.c               | 14 --------------
 block/mirror.c               | 26 ++++++--------------------
 block/stream.c               | 14 --------------
 blockjob.c                   | 12 ++++--------
 7 files changed, 12 insertions(+), 72 deletions(-)

diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -XXX,XX +XXX,XX @@
 #include "block/block.h"
 #include "qemu/ratelimit.h"
 
+#define BLOCK_JOB_SLICE_TIME 100000000ULL /* ns */
+
 typedef struct BlockJobDriver BlockJobDriver;
 typedef struct BlockJobTxn BlockJobTxn;
 
diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/blockjob_int.h
+++ b/include/block/blockjob_int.h
@@ -XXX,XX +XXX,XX @@ struct BlockJobDriver {
     /** String describing the operation, part of query-block-jobs QMP API */
     BlockJobType job_type;
 
-    /** Optional callback for job types that support setting a speed limit */
-    void (*set_speed)(BlockJob *job, int64_t speed, Error **errp);
-
     /** Mandatory: Entrypoint for the Coroutine. */
     CoroutineEntry *start;
 
diff --git a/block/backup.c b/block/backup.c
index XXXXXXX..XXXXXXX 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/error-report.h"
 
 #define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)
-#define SLICE_TIME 100000000ULL /* ns */
 
 typedef struct BackupBlockJob {
     BlockJob common;
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn backup_before_write_notify(
     return backup_do_cow(job, req->offset, req->bytes, NULL, true);
 }
 
-static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
-{
-    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
-
-    if (speed < 0) {
-        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
-        return;
-    }
-    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
-}
-
 static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
 {
     BdrvDirtyBitmap *bm;
@@ -XXX,XX +XXX,XX @@ static const BlockJobDriver backup_job_driver = {
     .instance_size          = sizeof(BackupBlockJob),
     .job_type               = BLOCK_JOB_TYPE_BACKUP,
     .start                  = backup_run,
-    .set_speed              = backup_set_speed,
     .commit                 = backup_commit,
     .abort                  = backup_abort,
     .clean                  = backup_clean,
diff --git a/block/commit.c b/block/commit.c
index XXXXXXX..XXXXXXX 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -XXX,XX +XXX,XX @@ enum {
     COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
 };
 
-#define SLICE_TIME 100000000ULL /* ns */
-
 typedef struct CommitBlockJob {
     BlockJob common;
     BlockDriverState *commit_top_bs;
@@ -XXX,XX +XXX,XX @@ out:
     block_job_defer_to_main_loop(&s->common, commit_complete, data);
 }
 
-static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
-{
-    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
-
-    if (speed < 0) {
-        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
-        return;
-    }
-    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
-}
-
 static const BlockJobDriver commit_job_driver = {
     .instance_size = sizeof(CommitBlockJob),
     .job_type      = BLOCK_JOB_TYPE_COMMIT,
-    .set_speed     = commit_set_speed,
     .start         = commit_run,
 };
 
diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/ratelimit.h"
 #include "qemu/bitmap.h"
 
-#define SLICE_TIME    100000000ULL /* ns */
 #define MAX_IN_FLIGHT 16
 #define MAX_IO_BYTES (1 << 20) /* 1 Mb */
 #define DEFAULT_MIRROR_BUF_SIZE (MAX_IN_FLIGHT * MAX_IO_BYTES)
@@ -XXX,XX +XXX,XX @@ static void mirror_throttle(MirrorBlockJob *s)
 {
     int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 
-    if (now - s->last_pause_ns > SLICE_TIME) {
+    if (now - s->last_pause_ns > BLOCK_JOB_SLICE_TIME) {
         s->last_pause_ns = now;
         block_job_sleep_ns(&s->common, 0);
     } else {
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_run(void *opaque)
 
         /* Note that even when no rate limit is applied we need to yield
          * periodically with no pending I/O so that bdrv_drain_all() returns.
-         * We do so every SLICE_TIME nanoseconds, or when there is an error,
-         * or when the source is clean, whichever comes first.
-         */
+         * We do so every BLOCK_JOB_SLICE_TIME nanoseconds, or when there is
+         * an error, or when the source is clean, whichever comes first. */
         delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns;
-        if (delta < SLICE_TIME &&
+        if (delta < BLOCK_JOB_SLICE_TIME &&
             s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
             if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                 (cnt == 0 && s->in_flight > 0)) {
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_run(void *opaque)
         ret = 0;
 
         if (s->synced && !should_complete) {
-            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
+            delay_ns = (s->in_flight == 0 &&
+                        cnt == 0 ? BLOCK_JOB_SLICE_TIME : 0);
         }
         trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
         block_job_sleep_ns(&s->common, delay_ns);
@@ -XXX,XX +XXX,XX @@ immediate_exit:
     block_job_defer_to_main_loop(&s->common, mirror_exit, data);
 }
 
-static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
-{
-    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
-
-    if (speed < 0) {
-        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
-        return;
-    }
-    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
-}
-
 static void mirror_complete(BlockJob *job, Error **errp)
 {
     MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
@@ -XXX,XX +XXX,XX @@ static void mirror_drain(BlockJob *job)
 static const BlockJobDriver mirror_job_driver = {
     .instance_size          = sizeof(MirrorBlockJob),
     .job_type               = BLOCK_JOB_TYPE_MIRROR,
-    .set_speed              = mirror_set_speed,
     .start                  = mirror_run,
     .complete               = mirror_complete,
     .pause                  = mirror_pause,
@@ -XXX,XX +XXX,XX @@ static const BlockJobDriver mirror_job_driver = {
 static const BlockJobDriver commit_active_job_driver = {
     .instance_size          = sizeof(MirrorBlockJob),
     .job_type               = BLOCK_JOB_TYPE_COMMIT,
-    .set_speed              = mirror_set_speed,
     .start                  = mirror_run,
     .complete               = mirror_complete,
     .pause                  = mirror_pause,
diff --git a/block/stream.c b/block/stream.c
index XXXXXXX..XXXXXXX 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -XXX,XX +XXX,XX @@ enum {
     STREAM_BUFFER_SIZE = 512 * 1024, /* in bytes */
 };
 
-#define SLICE_TIME 100000000ULL /* ns */
-
 typedef struct StreamBlockJob {
     BlockJob common;
     BlockDriverState *base;
@@ -XXX,XX +XXX,XX @@ out:
     block_job_defer_to_main_loop(&s->common, stream_complete, data);
 }
 
-static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp)
-{
-    StreamBlockJob *s = container_of(job, StreamBlockJob, common);
-
-    if (speed < 0) {
-        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
-        return;
-    }
-    ratelimit_set_speed(&s->common.limit, speed, SLICE_TIME);
-}
-
 static const BlockJobDriver stream_job_driver = {
     .instance_size = sizeof(StreamBlockJob),
     .job_type      = BLOCK_JOB_TYPE_STREAM,
-    .set_speed     = stream_set_speed,
     .start         = stream_run,
 };
 
diff --git a/blockjob.c b/blockjob.c
index XXXXXXX..XXXXXXX 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -XXX,XX +XXX,XX @@ static bool block_job_timer_pending(BlockJob *job)
 
 void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
 {
-    Error *local_err = NULL;
     int64_t old_speed = job->speed;
 
-    if (!job->driver->set_speed) {
-        error_setg(errp, QERR_UNSUPPORTED);
-        return;
-    }
     if (block_job_apply_verb(job, BLOCK_JOB_VERB_SET_SPEED, errp)) {
         return;
     }
-    job->driver->set_speed(job, speed, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
+    if (speed < 0) {
+        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
         return;
     }
 
+    ratelimit_set_speed(&job->limit, speed, BLOCK_JOB_SLICE_TIME);
+
     job->speed = speed;
     if (speed && speed <= old_speed) {
         return;
-- 
2.13.6

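This gets rid of more direct accesses to BlockJob fields from the job
drivers: they now call block_job_ratelimit_get_delay() instead of
reading the speed and rate limit fields themselves.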

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
---
 include/block/blockjob_int.h |  8 ++++++++
 block/backup.c               | 18 +++++++-----------
 block/commit.c               |  4 ++--
 block/mirror.c               |  5 +----
 block/stream.c               |  4 ++--
 blockjob.c                   |  9 +++++++++
 6 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/blockjob_int.h
+++ b/include/block/blockjob_int.h
@@ -XXX,XX +XXX,XX @@ void block_job_sleep_ns(BlockJob *job, int64_t ns);
 void block_job_yield(BlockJob *job);
 
 /**
+ * block_job_ratelimit_get_delay:
+ *
+ * Calculate and return delay for the next request in ns. See the documentation
+ * of ratelimit_calculate_delay() for details.
+ */
+int64_t block_job_ratelimit_get_delay(BlockJob *job, uint64_t n);
+
+/**
  * block_job_early_fail:
  * @bs: The block device.
  *
diff --git a/block/backup.c b/block/backup.c
index XXXXXXX..XXXXXXX 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -XXX,XX +XXX,XX @@ static void backup_complete(BlockJob *job, void *opaque)
 
 static bool coroutine_fn yield_and_check(BackupBlockJob *job)
 {
+    uint64_t delay_ns;
+
     if (block_job_is_cancelled(&job->common)) {
         return true;
     }
 
-    /* we need to yield so that bdrv_drain_all() returns.
-     * (without, VM does not reboot)
-     */
-    if (job->common.speed) {
-        uint64_t delay_ns = ratelimit_calculate_delay(&job->common.limit,
-                                                      job->bytes_read);
-        job->bytes_read = 0;
-        block_job_sleep_ns(&job->common, delay_ns);
-    } else {
-        block_job_sleep_ns(&job->common, 0);
-    }
+    /* We need to yield even for delay_ns = 0 so that bdrv_drain_all() can
+     * return. Without a yield, the VM would not reboot. */
+    delay_ns = block_job_ratelimit_get_delay(&job->common, job->bytes_read);
+    job->bytes_read = 0;
+    block_job_sleep_ns(&job->common, delay_ns);
 
     if (block_job_is_cancelled(&job->common)) {
         return true;
diff --git a/block/commit.c b/block/commit.c
index XXXXXXX..XXXXXXX 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn commit_run(void *opaque)
         /* Publish progress */
         block_job_progress_update(&s->common, n);
 
-        if (copy && s->common.speed) {
-            delay_ns = ratelimit_calculate_delay(&s->common.limit, n);
+        if (copy) {
+            delay_ns = block_job_ratelimit_get_delay(&s->common, n);
         } else {
             delay_ns = 0;
         }
diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
         assert(io_bytes);
         offset += io_bytes;
         nb_chunks -= DIV_ROUND_UP(io_bytes, s->granularity);
-        if (s->common.speed) {
-            delay_ns = ratelimit_calculate_delay(&s->common.limit,
-                                                 io_bytes_acct);
-        }
+        delay_ns = block_job_ratelimit_get_delay(&s->common, io_bytes_acct);
     }
     return delay_ns;
 }
diff --git a/block/stream.c b/block/stream.c
index XXXXXXX..XXXXXXX 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn stream_run(void *opaque)
 
         /* Publish progress */
         block_job_progress_update(&s->common, n);
-        if (copy && s->common.speed) {
-            delay_ns = ratelimit_calculate_delay(&s->common.limit, n);
+        if (copy) {
+            delay_ns = block_job_ratelimit_get_delay(&s->common, n);
         } else {
             delay_ns = 0;
         }
diff --git a/blockjob.c b/blockjob.c
index XXXXXXX..XXXXXXX 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -XXX,XX +XXX,XX @@ void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
     block_job_enter_cond(job, block_job_timer_pending);
 }
 
+int64_t block_job_ratelimit_get_delay(BlockJob *job, uint64_t n)
+{
+    if (!job->speed) {
+        return 0;
+    }
+
+    return ratelimit_calculate_delay(&job->limit, n);
+}
+
 void block_job_complete(BlockJob *job, Error **errp)
 {
     /* Should not be reachable via external interface for internal jobs */
-- 
2.13.6

The backup block job directly accesses the driver field in BlockJob. Add
a wrapper for getting it.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
---
 include/block/blockjob.h | 7 +++++++
 block/backup.c           | 8 +++++---
 blockjob.c               | 5 +++++
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -XXX,XX +XXX,XX @@ void block_job_txn_add_job(BlockJobTxn *txn, BlockJob *job);
  */
 bool block_job_is_internal(BlockJob *job);
 
+/**
+ * block_job_driver:
+ *
+ * Returns the driver associated with a block job.
+ */
+const BlockJobDriver *block_job_driver(BlockJob *job);
+
 #endif
diff --git a/block/backup.c b/block/backup.c
index XXXXXXX..XXXXXXX 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -XXX,XX +XXX,XX @@ typedef struct BackupBlockJob {
     HBitmap *copy_bitmap;
 } BackupBlockJob;
 
+static const BlockJobDriver backup_job_driver;
+
 /* See if in-flight requests overlap and wait for them to complete */
 static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
                                                        int64_t start,
@@ -XXX,XX +XXX,XX @@ void backup_do_checkpoint(BlockJob *job, Error **errp)
     BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
     int64_t len;
 
-    assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);
+    assert(block_job_driver(job) == &backup_job_driver);
 
     if (backup_job->sync_mode != MIRROR_SYNC_MODE_NONE) {
         error_setg(errp, "The backup job only supports block checkpoint in"
@@ -XXX,XX +XXX,XX @@ void backup_wait_for_overlapping_requests(BlockJob *job, int64_t offset,
     BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
     int64_t start, end;
 
-    assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);
+    assert(block_job_driver(job) == &backup_job_driver);
 
     start = QEMU_ALIGN_DOWN(offset, backup_job->cluster_size);
     end = QEMU_ALIGN_UP(offset + bytes, backup_job->cluster_size);
@@ -XXX,XX +XXX,XX @@ void backup_cow_request_begin(CowRequest *req, BlockJob *job,
     BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
     int64_t start, end;
 
-    assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);
+    assert(block_job_driver(job) == &backup_job_driver);
 
     start = QEMU_ALIGN_DOWN(offset, backup_job->cluster_size);
     end = QEMU_ALIGN_UP(offset + bytes, backup_job->cluster_size);
diff --git a/blockjob.c b/blockjob.c
index XXXXXXX..XXXXXXX 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -XXX,XX +XXX,XX @@ static bool block_job_started(BlockJob *job)
     return job->co;
 }
 
+const BlockJobDriver *block_job_driver(BlockJob *job)
+{
+    return job->driver;
+}
+
 /**
  * All jobs must allow a pause point before entering their job proper. This
  * ensures that jobs can be paused prior to being started, then resumed later.
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

Commit abd3622cc03cf41ed542126a540385f30a4c0175 added a case to 122
regarding how the qcow2 driver handles an incorrect compressed data
length value.  This does not really fit into 122, as that file is
supposed to contain qemu-img convert test cases, which this case is not.
So this patch splits it off into its own file; maybe we will even get
more qcow2-only compression tests in the future.

Also, that test case does not work with refcount_bits=1, so mark that
option as unsupported.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20180406164108.26118-1-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/122     | 47 ----------------------
 tests/qemu-iotests/122.out | 33 ----------------
 tests/qemu-iotests/214     | 97 ++++++++++++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/214.out | 35 +++++++++++++++++
 tests/qemu-iotests/group   |  1 +
 5 files changed, 133 insertions(+), 80 deletions(-)
 create mode 100755 tests/qemu-iotests/214
 create mode 100644 tests/qemu-iotests/214.out

diff --git a/tests/qemu-iotests/122 b/tests/qemu-iotests/122
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/122
+++ b/tests/qemu-iotests/122
@@ -XXX,XX +XXX,XX @@ $QEMU_IO -c "read -P 0    1024k 1022k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _fil
 
 
 echo
-echo "=== Corrupted size field in compressed cluster descriptor ==="
-echo
-# Create an empty image and fill half of it with compressed data.
-# The L2 entries of the two compressed clusters are located at
-# 0x800000 and 0x800008, their original values are 0x4008000000a00000
-# and 0x4008000000a00802 (5 sectors for compressed data each).
-_make_test_img 8M -o cluster_size=2M
-$QEMU_IO -c "write -c -P 0x11 0 2M" -c "write -c -P 0x11 2M 2M" "$TEST_IMG" \
-         2>&1 | _filter_qemu_io | _filter_testdir
-
-# Reduce size of compressed data to 4 sectors: this corrupts the image.
-poke_file "$TEST_IMG" $((0x800000)) "\x40\x06"
-$QEMU_IO -c "read  -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
-
-# 'qemu-img check' however doesn't see anything wrong because it
-# doesn't try to decompress the data and the refcounts are consistent.
-# TODO: update qemu-img so this can be detected.
-_check_test_img
-
-# Increase size of compressed data to the maximum (8192 sectors).
-# This makes QEMU read more data (8192 sectors instead of 5, host
-# addresses [0xa00000, 0xdfffff]), but the decompression algorithm
-# stops once we have enough to restore the uncompressed cluster, so
-# the rest of the data is ignored.
-poke_file "$TEST_IMG" $((0x800000)) "\x7f\xfe"
-# Do it also for the second compressed cluster (L2 entry at 0x800008).
-# In this case the compressed data would span 3 host clusters
-# (host addresses: [0xa00802, 0xe00801])
-poke_file "$TEST_IMG" $((0x800008)) "\x7f\xfe"
-
-# Here the image is too small so we're asking QEMU to read beyond the
-# end of the image.
-$QEMU_IO -c "read  -P 0x11  0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
-# But if we grow the image we won't be reading beyond its end anymore.
-$QEMU_IO -c "write -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
-$QEMU_IO -c "read  -P 0x11  0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
-
-# The refcount data is however wrong because due to the increased size
-# of the compressed data it now reaches the following host clusters.
-# This can be repaired by qemu-img check by increasing the refcount of
-# those clusters.
-# TODO: update qemu-img to correct the compressed cluster size instead.
-_check_test_img -r all
-$QEMU_IO -c "read  -P 0x11  0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
-$QEMU_IO -c "read  -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
-
-echo
 echo "=== Full allocation with -S 0 ==="
 echo
 
diff --git a/tests/qemu-iotests/122.out b/tests/qemu-iotests/122.out
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/122.out
+++ b/tests/qemu-iotests/122.out
@@ -XXX,XX +XXX,XX @@ read 1024/1024 bytes at offset 1047552
 read 1046528/1046528 bytes at offset 1048576
 1022 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 
-=== Corrupted size field in compressed cluster descriptor ===
-
-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=8388608
-wrote 2097152/2097152 bytes at offset 0
-2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 2097152/2097152 bytes at offset 2097152
-2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-read failed: Input/output error
-No errors were found on the image.
-read 4194304/4194304 bytes at offset 0
-4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 4194304/4194304 bytes at offset 4194304
-4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-read 4194304/4194304 bytes at offset 0
-4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-ERROR cluster 6 refcount=1 reference=3
-ERROR cluster 7 refcount=1 reference=2
-Repairing cluster 6 refcount=1 reference=3
-Repairing cluster 7 refcount=1 reference=2
-Repairing OFLAG_COPIED data cluster: l2_entry=8000000000c00000 refcount=3
-Repairing OFLAG_COPIED data cluster: l2_entry=8000000000e00000 refcount=2
-The following inconsistencies were found and repaired:
-
-    0 leaked clusters
-    4 corruptions
-
-Double checking the fixed image now...
-No errors were found on the image.
-read 4194304/4194304 bytes at offset 0
-4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-read 4194304/4194304 bytes at offset 4194304
-4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-
 === Full allocation with -S 0 ===
 
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
diff --git a/tests/qemu-iotests/214 b/tests/qemu-iotests/214
new file mode 100755
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/qemu-iotests/214
@@ -XXX,XX +XXX,XX @@
+#!/bin/bash
+#
+# Test qcow2 image compression
+#
+# Copyright (C) 2018 Igalia, S.L.
+# Author: Alberto Garcia <berto@igalia.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+seq=$(basename "$0")
+echo "QA output created by $seq"
+
+here=$PWD
+status=1	# failure is the default!
+
+_cleanup()
+{
+    _cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt qcow2
+_supported_proto file
+_supported_os Linux
+
+# Repairing the corrupted image requires qemu-img check to store a
+# refcount up to 3, which requires at least two refcount bits.
+_unsupported_imgopts 'refcount_bits=1[^0-9]'
+
+
+echo
+echo "=== Corrupted size field in compressed cluster descriptor ==="
+echo
+# Create an empty image and fill half of it with compressed data.
+# The L2 entries of the two compressed clusters are located at
+# 0x800000 and 0x800008, their original values are 0x4008000000a00000
+# and 0x4008000000a00802 (5 sectors for compressed data each).
+_make_test_img 8M -o cluster_size=2M
+$QEMU_IO -c "write -c -P 0x11 0 2M" -c "write -c -P 0x11 2M 2M" "$TEST_IMG" \
+         2>&1 | _filter_qemu_io | _filter_testdir
+
+# Reduce size of compressed data to 4 sectors: this corrupts the image.
+poke_file "$TEST_IMG" $((0x800000)) "\x40\x06"
+$QEMU_IO -c "read  -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+# 'qemu-img check' however doesn't see anything wrong because it
+# doesn't try to decompress the data and the refcounts are consistent.
+# TODO: update qemu-img so this can be detected.
+_check_test_img
+
+# Increase size of compressed data to the maximum (8192 sectors).
+# This makes QEMU read more data (8192 sectors instead of 5, host
+# addresses [0xa00000, 0xdfffff]), but the decompression algorithm
+# stops once we have enough to restore the uncompressed cluster, so
+# the rest of the data is ignored.
+poke_file "$TEST_IMG" $((0x800000)) "\x7f\xfe"
+# Do it also for the second compressed cluster (L2 entry at 0x800008).
+# In this case the compressed data would span 3 host clusters
+# (host addresses: [0xa00802, 0xe00801])
+poke_file "$TEST_IMG" $((0x800008)) "\x7f\xfe"
+
+# Here the image is too small so we're asking QEMU to read beyond the
+# end of the image.
+$QEMU_IO -c "read  -P 0x11  0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+# But if we grow the image we won't be reading beyond its end anymore.
+$QEMU_IO -c "write -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read  -P 0x11  0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+# The refcount data is however wrong because due to the increased size
+# of the compressed data it now reaches the following host clusters.
+# This can be repaired by qemu-img check by increasing the refcount of
+# those clusters.
+# TODO: update qemu-img to correct the compressed cluster size instead.
+_check_test_img -r all
+$QEMU_IO -c "read  -P 0x11  0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "read  -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+# success, all done
+echo '*** done'
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/214.out b/tests/qemu-iotests/214.out
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/qemu-iotests/214.out
@@ -XXX,XX +XXX,XX @@
+QA output created by 214
+
+=== Corrupted size field in compressed cluster descriptor ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=8388608
+wrote 2097152/2097152 bytes at offset 0
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 2097152
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read failed: Input/output error
+No errors were found on the image.
+read 4194304/4194304 bytes at offset 0
+4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 4194304/4194304 bytes at offset 4194304
+4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4194304/4194304 bytes at offset 0
+4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+ERROR cluster 6 refcount=1 reference=3
+ERROR cluster 7 refcount=1 reference=2
+Repairing cluster 6 refcount=1 reference=3
+Repairing cluster 7 refcount=1 reference=2
+Repairing OFLAG_COPIED data cluster: l2_entry=8000000000c00000 refcount=3
+Repairing OFLAG_COPIED data cluster: l2_entry=8000000000e00000 refcount=2
+The following inconsistencies were found and repaired:
+
+    0 leaked clusters
+    4 corruptions
+
+Double checking the fixed image now...
+No errors were found on the image.
+read 4194304/4194304 bytes at offset 0
+4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4194304/4194304 bytes at offset 4194304
+4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -XXX,XX +XXX,XX @@
 211 rw auto quick
 212 rw auto quick
 213 rw auto quick
+214 rw auto
 218 rw auto quick
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

Compressed clusters are not supposed to have the COPIED bit set.
"qemu-img check" detects that and prints an error message reporting
the number of the affected host cluster. This doesn't make much sense
because compressed clusters are not aligned to host clusters, so it
would be better to report the offset instead. Plus, the calculation is
wrong: it uses the raw L2 entry as if it were simply an offset.

This patch fixes the error message and reports the offset of the
compressed cluster.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Message-id: 0f687957feb72e80c740403191a47e607c2463fe.1523376013.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-refcount.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
         case QCOW2_CLUSTER_COMPRESSED:
             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
             if (l2_entry & QCOW_OFLAG_COPIED) {
-                fprintf(stderr, "ERROR: cluster %" PRId64 ": "
+                fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
                     "copied flag must never be set for compressed "
-                    "clusters\n", l2_entry >> s->cluster_bits);
+                    "clusters\n", l2_entry & s->cluster_offset_mask);
                 l2_entry &= ~QCOW_OFLAG_COPIED;
                 res->corruptions++;
             }
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

Compressed clusters are not supposed to have the COPIED bit set, but
this is not made explicit in the specs, so let's document it.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Message-id: 74552e1d6e858d3159cb0c0e188e80bc9248e337.1523376013.git.berto@igalia.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 docs/interop/qcow2.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/interop/qcow2.txt b/docs/interop/qcow2.txt
index XXXXXXX..XXXXXXX 100644
--- a/docs/interop/qcow2.txt
+++ b/docs/interop/qcow2.txt
@@ -XXX,XX +XXX,XX @@ L2 table entry:
               62:   0 for standard clusters
                     1 for compressed clusters
 
-              63:   0 for a cluster that is unused or requires COW, 1 if its
-                    refcount is exactly one. This information is only accurate
-                    in L2 tables that are reachable from the active L1
-                    table.
+              63:   0 for clusters that are unused, compressed or require COW.
+                    1 for standard clusters whose refcount is exactly one.
+                    This information is only accurate in L2 tables
+                    that are reachable from the active L1 table.
 
 Standard Cluster Descriptor:
 
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

The L2 and refcount caches have default sizes that can be overridden
using the l2-cache-size and refcount-cache-size parameters (an
additional parameter named cache-size sets the combined size of both
caches).

Unless forced by one of the aforementioned parameters, QEMU will set
the unspecified sizes so that the L2 cache is 4 times larger than the
refcount cache.

This is based on the premise that the refcount metadata needs to be
only a fourth of the L2 metadata to cover the same amount of disk
space. This is incorrect for two reasons:

a) The amount of disk covered by an L2 table depends solely on the
    cluster size, but in the case of a refcount block it depends on
    the cluster size *and* the width of each refcount entry.
    The 4/1 ratio is only valid with 16-bit entries (the default).

b) When we talk about disk space and L2 tables we are talking about
    guest space (L2 tables map guest clusters to host clusters),
    whereas refcount blocks are used for host clusters (including
    L1/L2 tables and the refcount blocks themselves). On a fully
    populated (and uncompressed) qcow2 file, image size > virtual size
    so there are more refcount entries than L2 entries.

Problem (a) could be fixed by adjusting the algorithm to take into
account the refcount entry width. Problem (b) could be fixed by
slightly increasing the refcount cache size to account for the
clusters used for qcow2 metadata.

However this patch takes a completely different approach and instead
of keeping a ratio between both cache sizes it assigns as much as
possible to the L2 cache and the remainder to the refcount cache.

The reason is that L2 tables are used for every single I/O request
from the guest and the effect of increasing the cache is significant
and clearly measurable. Refcount blocks are however only used for
cluster allocation and internal snapshots and in practice are accessed
sequentially in most cases, so the effect of increasing the cache is
negligible (even when doing random writes from the guest).

So, make the refcount cache as small as possible unless the user
explicitly asks for a larger one.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 9695182c2eb11b77cb319689a1ebaa4e7c9d6591.1523968389.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2.h              |  4 ----
 block/qcow2.c              | 31 +++++++++++++++++++------------
 tests/qemu-iotests/137.out |  2 +-
 3 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -XXX,XX +XXX,XX @@
 #define DEFAULT_L2_CACHE_CLUSTERS 8 /* clusters */
 #define DEFAULT_L2_CACHE_BYTE_SIZE 1048576 /* bytes */
 
-/* The refblock cache needs only a fourth of the L2 cache size to cover as many
- * clusters */
-#define DEFAULT_L2_REFCOUNT_SIZE_RATIO 4
-
 #define DEFAULT_CLUSTER_SIZE 65536
 
 
diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
         } else if (refcount_cache_size_set) {
             *l2_cache_size = combined_cache_size - *refcount_cache_size;
         } else {
-            *refcount_cache_size = combined_cache_size
-                                 / (DEFAULT_L2_REFCOUNT_SIZE_RATIO + 1);
-            *l2_cache_size = combined_cache_size - *refcount_cache_size;
+            uint64_t virtual_disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
+            uint64_t max_l2_cache = virtual_disk_size / (s->cluster_size / 8);
+            uint64_t min_refcount_cache =
+                (uint64_t) MIN_REFCOUNT_CACHE_SIZE * s->cluster_size;
+
+            /* Assign as much memory as possible to the L2 cache, and
+             * use the remainder for the refcount cache */
+            if (combined_cache_size >= max_l2_cache + min_refcount_cache) {
+                *l2_cache_size = max_l2_cache;
+                *refcount_cache_size = combined_cache_size - *l2_cache_size;
+            } else {
+                *refcount_cache_size =
+                    MIN(combined_cache_size, min_refcount_cache);
+                *l2_cache_size = combined_cache_size - *refcount_cache_size;
+            }
         }
     } else {
-        if (!l2_cache_size_set && !refcount_cache_size_set) {
+        if (!l2_cache_size_set) {
             *l2_cache_size = MAX(DEFAULT_L2_CACHE_BYTE_SIZE,
                                  (uint64_t)DEFAULT_L2_CACHE_CLUSTERS
                                  * s->cluster_size);
-            *refcount_cache_size = *l2_cache_size
-                                 / DEFAULT_L2_REFCOUNT_SIZE_RATIO;
-        } else if (!l2_cache_size_set) {
-            *l2_cache_size = *refcount_cache_size
-                           * DEFAULT_L2_REFCOUNT_SIZE_RATIO;
-        } else if (!refcount_cache_size_set) {
-            *refcount_cache_size = *l2_cache_size
-                                 / DEFAULT_L2_REFCOUNT_SIZE_RATIO;
+        }
+        if (!refcount_cache_size_set) {
+            *refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE * s->cluster_size;
         }
     }
 
diff --git a/tests/qemu-iotests/137.out b/tests/qemu-iotests/137.out
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/137.out
+++ b/tests/qemu-iotests/137.out
@@ -XXX,XX +XXX,XX @@ refcount-cache-size may not exceed cache-size
 L2 cache size too big
 L2 cache entry size must be a power of two between 512 and the cluster size (65536)
 L2 cache entry size must be a power of two between 512 and the cluster size (65536)
-L2 cache size too big
+Refcount cache size too big
 Conflicting values for qcow2 options 'overlap-check' ('constant') and 'overlap-check.template' ('all')
 Unsupported value 'blubb' for qcow2 option 'overlap-check'. Allowed are any of the following: none, constant, cached, all
 Unsupported value 'blubb' for qcow2 option 'overlap-check'. Allowed are any of the following: none, constant, cached, all
-- 
2.13.6

From: Alberto Garcia <berto@igalia.com>

We have just reduced the refcount cache size to the minimum unless
the user explicitly requests a larger one, so we have to update the
documentation to reflect this change.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Message-id: c5f0bde23558dd9d33b21fffc76ac9953cc19c56.1523968389.git.berto@igalia.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 docs/qcow2-cache.txt | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/docs/qcow2-cache.txt b/docs/qcow2-cache.txt
index XXXXXXX..XXXXXXX 100644
--- a/docs/qcow2-cache.txt
+++ b/docs/qcow2-cache.txt
@@ -XXX,XX +XXX,XX @@ There are three options available, and all of them take bytes:
 "refcount-cache-size":   maximum size of the refcount block cache
 "cache-size":            maximum size of both caches combined
 
-There are two things that need to be taken into account:
+There are a few things that need to be taken into account:
 
  - Both caches must have a size that is a multiple of the cluster size
    (or the cache entry size: see "Using smaller cache sizes" below).
 
- - If you only set one of the options above, QEMU will automatically
-   adjust the others so that the L2 cache is 4 times bigger than the
-   refcount cache.
+ - The default L2 cache size is 8 clusters or 1MB (whichever is more),
+   and the minimum is 2 clusters (or 2 cache entries, see below).
 
-This means that these options are equivalent:
+ - The default (and minimum) refcount cache size is 4 clusters.
 
-   -drive file=hd.qcow2,l2-cache-size=2097152
-   -drive file=hd.qcow2,refcount-cache-size=524288
-   -drive file=hd.qcow2,cache-size=2621440
+ - If only "cache-size" is specified then QEMU will assign as much
+   memory as possible to the L2 cache before increasing the refcount
+   cache size.
 
-The reason for this 1/4 ratio is to ensure that both caches cover the
-same amount of disk space. Note however that this is only valid with
-the default value of refcount_bits (16). If you are using a different
-value you might want to calculate both cache sizes yourself since QEMU
-will always use the same 1/4 ratio.
+Unlike L2 tables, refcount blocks are not used during normal I/O but
+only during allocations and internal snapshots. In most cases they are
+accessed sequentially (even during random guest I/O) so increasing the
+refcount cache size won't have any measurable effect in performance
+(this can change if you are using internal snapshots, so you may want
+to think about increasing the cache size if you use them heavily).
 
-It's also worth mentioning that there's no strict need for both caches
-to cover the same amount of disk space. The refcount cache is used
-much less often than the L2 cache, so it's perfectly reasonable to
-keep it small.
+Before QEMU 2.12 the refcount cache had a default size of 1/4 of the
+L2 cache size. This resulted in unnecessarily large caches, so now the
+refcount cache is as small as possible unless overridden by the user.
 
 
 Using smaller cache entries
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

Currently, common.qemu only allows matching for results indicating
success.  The only way to fail is by provoking a timeout.  However,
sometimes we do have a defined failure output and can match for that,
which saves us from having to wait for the timeout in case of failure.
Because failure can sometimes just result in a _notrun in the test, it
is actually important to care about being able to fail quickly.

Also, sometimes we simply do not get any specific output in case of
success.  The only way to handle this currently would be to define an
error message as the string to look for, which means that actual success
results in a timeout.  This is really bad because it unnecessarily slows
down a succeeding test.

Therefore, this patch adds a new parameter $success_or_failure to
_timed_wait_for and _send_qemu_cmd.  Setting this to a non-empty string
makes both commands expect two match parameters: If the first matches,
the function succeeds.  If the second matches, the function fails.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20180406151731.4285-2-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/common.qemu | 58 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 51 insertions(+), 7 deletions(-)

diff --git a/tests/qemu-iotests/common.qemu b/tests/qemu-iotests/common.qemu
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/common.qemu
+++ b/tests/qemu-iotests/common.qemu
@@ -XXX,XX +XXX,XX @@ _in_fd=4
 # response is not echoed out.
 # If $mismatch_only is set, only non-matching responses will
 # be echoed.
+#
+# If $success_or_failure is set, the meaning of the arguments is
+# changed as follows:
+# $2: A string to search for in the response; if found, this indicates
+#     success and ${QEMU_STATUS[$1]} is set to 0.
+# $3: A string to search for in the response; if found, this indicates
+#     failure and the test is either aborted (if $qemu_error_no_exit
+#     is not set) or ${QEMU_STATUS[$1]} is set to -1 (otherwise).
 function _timed_wait_for()
 {
     local h=${1}
     shift
 
+    if [ -z "${success_or_failure}" ]; then
+        success_match=${*}
+        failure_match=
+    else
+        success_match=${1}
+        failure_match=${2}
+    fi
+
+    timeout=yes
+
     QEMU_STATUS[$h]=0
     while IFS= read -t ${QEMU_COMM_TIMEOUT} resp <&${QEMU_OUT[$h]}
     do
@@ -XXX,XX +XXX,XX @@ function _timed_wait_for()
             echo "${resp}" | _filter_testdir | _filter_qemu \
                            | _filter_qemu_io | _filter_qmp | _filter_hmp
         fi
-        grep -q "${*}" < <(echo "${resp}")
+        if [ -n "${failure_match}" ]; then
+            grep -q "${failure_match}" < <(echo "${resp}")
+            if [ $? -eq 0 ]; then
+                timeout=
+                break
+            fi
+        fi
+        grep -q "${success_match}" < <(echo "${resp}")
         if [ $? -eq 0 ]; then
             return
-        elif [ -z "${silent}" ] && [ -n "${mismatch_only}" ]; then
+        fi
+        if [ -z "${silent}" ] && [ -n "${mismatch_only}" ]; then
             echo "${resp}" | _filter_testdir | _filter_qemu \
                            | _filter_qemu_io | _filter_qmp | _filter_hmp
         fi
@@ -XXX,XX +XXX,XX @@ function _timed_wait_for()
     done
     QEMU_STATUS[$h]=-1
     if [ -z "${qemu_error_no_exit}" ]; then
-        echo "Timeout waiting for ${*} on handle ${h}"
-        exit 1  # Timeout means the test failed
+        if [ -n "${timeout}" ]; then
+            echo "Timeout waiting for ${success_match} on handle ${h}"
+        else
+            echo "Wrong response matching ${failure_match} on handle ${h}"
+        fi
+        exit 1  # Timeout or wrong match mean the test failed
     fi
 }
 
@@ -XXX,XX +XXX,XX @@ function _timed_wait_for()
 # If $qemu_error_no_exit is set, then even if the expected response
 # is not seen, we will not exit.  $QEMU_STATUS[$1] will be set it -1 in
 # that case.
+#
+# If $success_or_failure is set, then the last two strings are the
+# strings the response will be scanned for.  The first of the two
+# indicates success, the latter indicates failure.  Failure is handled
+# like a timeout.
 function _send_qemu_cmd()
 {
     local h=${1}
@@ -XXX,XX +XXX,XX @@ function _send_qemu_cmd()
         use_error="no"
     fi
     # This array element extraction is done to accommodate pathnames with spaces
-    cmd=${@: 1:${#@}-1}
-    shift $(($# - 1))
+    if [ -z "${success_or_failure}" ]; then
+        cmd=${@: 1:${#@}-1}
+        shift $(($# - 1))
+    else
+        cmd=${@: 1:${#@}-2}
+        shift $(($# - 2))
+    fi
 
     while [ ${count} -gt 0 ]
     do
         echo "${cmd}" >&${QEMU_IN[${h}]}
         if [ -n "${1}" ]; then
-            qemu_error_no_exit=${use_error} _timed_wait_for ${h} "${1}"
+            if [ -z "${success_or_failure}" ]; then
+                qemu_error_no_exit=${use_error} _timed_wait_for ${h} "${1}"
+            else
+                qemu_error_no_exit=${use_error} _timed_wait_for ${h} "${1}" "${2}"
+            fi
             if [ ${QEMU_STATUS[$h]} -eq 0 ]; then
                 return
             fi
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

userfaultfd support depends on the host kernel, so it may not be
available.  If so, 181 and 201 should be skipped.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20180406151731.4285-3-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/181 | 13 +++++++++++++
 tests/qemu-iotests/201 | 13 +++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/tests/qemu-iotests/181 b/tests/qemu-iotests/181
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/181
+++ b/tests/qemu-iotests/181
@@ -XXX,XX +XXX,XX @@ echo
 # Enable postcopy-ram capability both on source and destination
 silent=yes
 _send_qemu_cmd $dest 'migrate_set_capability postcopy-ram on' "(qemu)"
+
+qemu_error_no_exit=yes success_or_failure=yes \
+    _send_qemu_cmd $dest '' "(qemu)" "Postcopy is not supported"
+if [ ${QEMU_STATUS[$dest]} -lt 0 ]; then
+    _send_qemu_cmd $dest '' "(qemu)"
+
+    _send_qemu_cmd $src 'quit' ""
+    _send_qemu_cmd $dest 'quit' ""
+    wait=1 _cleanup_qemu
+
+    _notrun 'Postcopy is not supported'
+fi
+
 _send_qemu_cmd $src 'migrate_set_speed 4k' "(qemu)"
 _send_qemu_cmd $src 'migrate_set_capability postcopy-ram on' "(qemu)"
 _send_qemu_cmd $src "migrate -d unix:${MIG_SOCKET}" "(qemu)"
diff --git a/tests/qemu-iotests/201 b/tests/qemu-iotests/201
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/201
+++ b/tests/qemu-iotests/201
@@ -XXX,XX +XXX,XX @@ echo
 
 silent=yes
 _send_qemu_cmd $dest 'migrate_set_capability postcopy-ram on' "(qemu)"
+
+qemu_error_no_exit=yes success_or_failure=yes \
+    _send_qemu_cmd $dest '' "(qemu)" "Postcopy is not supported"
+if [ ${QEMU_STATUS[$dest]} -lt 0 ]; then
+    _send_qemu_cmd $dest '' "(qemu)"
+
+    _send_qemu_cmd $src 'quit' ""
+    _send_qemu_cmd $dest 'quit' ""
+    wait=1 _cleanup_qemu
+
+    _notrun 'Postcopy is not supported'
+fi
+
 _send_qemu_cmd $src 'migrate_set_capability postcopy-ram on' "(qemu)"
 _send_qemu_cmd $src "migrate -d unix:${MIG_SOCKET}" "(qemu)"
 
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

This adds a simple copy-on-read filter driver.  It relies on the already
existing COR functionality in the central block layer code, which may be
moved here once we no longer need it there.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20180421132929.21610-2-mreitz@redhat.com
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qapi/block-core.json |   5 +-
 block/copy-on-read.c | 171 +++++++++++++++++++++++++++++++++++++++++++++++++++
 block/Makefile.objs  |   2 +-
 3 files changed, 176 insertions(+), 2 deletions(-)
 create mode 100644 block/copy-on-read.c

diff --git a/qapi/block-core.json b/qapi/block-core.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -XXX,XX +XXX,XX @@
 # @vxhs: Since 2.10
 # @throttle: Since 2.11
 # @nvme: Since 2.12
+# @copy-on-read: Since 2.13
 #
 # Since: 2.9
 ##
 { 'enum': 'BlockdevDriver',
-  'data': [ 'blkdebug', 'blkverify', 'bochs', 'cloop',
+  'data': [ 'blkdebug', 'blkverify', 'bochs', 'cloop', 'copy-on-read',
             'dmg', 'file', 'ftp', 'ftps', 'gluster', 'host_cdrom',
             'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs',
             'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed',
@@ -XXX,XX +XXX,XX @@
       'blkverify':  'BlockdevOptionsBlkverify',
       'bochs':      'BlockdevOptionsGenericFormat',
       'cloop':      'BlockdevOptionsGenericFormat',
+      'copy-on-read':'BlockdevOptionsGenericFormat',
       'dmg':        'BlockdevOptionsGenericFormat',
       'file':       'BlockdevOptionsFile',
       'ftp':        'BlockdevOptionsCurlFtp',
@@ -XXX,XX +XXX,XX @@
       'blkverify':      'BlockdevCreateNotSupported',
       'bochs':          'BlockdevCreateNotSupported',
       'cloop':          'BlockdevCreateNotSupported',
+      'copy-on-read':   'BlockdevCreateNotSupported',
       'dmg':            'BlockdevCreateNotSupported',
       'file':           'BlockdevCreateOptionsFile',
       'ftp':            'BlockdevCreateNotSupported',
diff --git a/block/copy-on-read.c b/block/copy-on-read.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/block/copy-on-read.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Copy-on-read filter block driver
+ *
+ * Copyright (c) 2018 Red Hat, Inc.
+ *
+ * Author:
+ *   Max Reitz <mreitz@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "block/block_int.h"
+
+
+static int cor_open(BlockDriverState *bs, QDict *options, int flags,
+                    Error **errp)
+{
+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false,
+                               errp);
+    if (!bs->file) {
+        return -EINVAL;
+    }
+
+    bs->supported_write_flags = BDRV_REQ_FUA &
+                                    bs->file->bs->supported_write_flags;
+
+    bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
+                                    bs->file->bs->supported_zero_flags;
+
+    return 0;
+}
+
+
+static void cor_close(BlockDriverState *bs)
+{
+}
+
+
+#define PERM_PASSTHROUGH (BLK_PERM_CONSISTENT_READ \
+                          | BLK_PERM_WRITE \
+                          | BLK_PERM_RESIZE)
+#define PERM_UNCHANGED (BLK_PERM_ALL & ~PERM_PASSTHROUGH)
+
+static void cor_child_perm(BlockDriverState *bs, BdrvChild *c,
+                           const BdrvChildRole *role,
+                           BlockReopenQueue *reopen_queue,
+                           uint64_t perm, uint64_t shared,
+                           uint64_t *nperm, uint64_t *nshared)
+{
+    if (c == NULL) {
+        *nperm = (perm & PERM_PASSTHROUGH) | BLK_PERM_WRITE_UNCHANGED;
+        *nshared = (shared & PERM_PASSTHROUGH) | PERM_UNCHANGED;
+        return;
+    }
+
+    *nperm = (perm & PERM_PASSTHROUGH) |
+             (c->perm & PERM_UNCHANGED);
+    *nshared = (shared & PERM_PASSTHROUGH) |
+               (c->shared_perm & PERM_UNCHANGED);
+}
+
+
+static int64_t cor_getlength(BlockDriverState *bs)
+{
+    return bdrv_getlength(bs->file->bs);
+}
+
+
+static int cor_truncate(BlockDriverState *bs, int64_t offset,
+                        PreallocMode prealloc, Error **errp)
+{
+    return bdrv_truncate(bs->file, offset, prealloc, errp);
+}
+
+
+static int coroutine_fn cor_co_preadv(BlockDriverState *bs,
+                                      uint64_t offset, uint64_t bytes,
+                                      QEMUIOVector *qiov, int flags)
+{
+    return bdrv_co_preadv(bs->file, offset, bytes, qiov,
+                          flags | BDRV_REQ_COPY_ON_READ);
+}
+
+
+static int coroutine_fn cor_co_pwritev(BlockDriverState *bs,
+                                       uint64_t offset, uint64_t bytes,
+                                       QEMUIOVector *qiov, int flags)
+{
+
+    return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
+}
+
+
+static int coroutine_fn cor_co_pwrite_zeroes(BlockDriverState *bs,
+                                             int64_t offset, int bytes,
+                                             BdrvRequestFlags flags)
+{
+    return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
+}
+
+
+static int coroutine_fn cor_co_pdiscard(BlockDriverState *bs,
+                                        int64_t offset, int bytes)
+{
+    return bdrv_co_pdiscard(bs->file->bs, offset, bytes);
+}
+
+
+static void cor_eject(BlockDriverState *bs, bool eject_flag)
+{
+    bdrv_eject(bs->file->bs, eject_flag);
+}
+
+
+static void cor_lock_medium(BlockDriverState *bs, bool locked)
+{
+    bdrv_lock_medium(bs->file->bs, locked);
+}
+
+
+static bool cor_recurse_is_first_non_filter(BlockDriverState *bs,
+                                            BlockDriverState *candidate)
+{
+    return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate);
+}
+
+
+BlockDriver bdrv_copy_on_read = {
+    .format_name                        = "copy-on-read",
+
+    .bdrv_open                          = cor_open,
+    .bdrv_close                         = cor_close,
+    .bdrv_child_perm                    = cor_child_perm,
+
+    .bdrv_getlength                     = cor_getlength,
+    .bdrv_truncate                      = cor_truncate,
+
+    .bdrv_co_preadv                     = cor_co_preadv,
+    .bdrv_co_pwritev                    = cor_co_pwritev,
+    .bdrv_co_pwrite_zeroes              = cor_co_pwrite_zeroes,
+    .bdrv_co_pdiscard                   = cor_co_pdiscard,
+
+    .bdrv_eject                         = cor_eject,
+    .bdrv_lock_medium                   = cor_lock_medium,
+
+    .bdrv_co_block_status               = bdrv_co_block_status_from_file,
+
+    .bdrv_recurse_is_first_non_filter   = cor_recurse_is_first_non_filter,
+
+    .has_variable_length                = true,
+    .is_filter                          = true,
+};
+
+static void bdrv_copy_on_read_init(void)
+{
+    bdrv_register(&bdrv_copy_on_read);
+}
+
+block_init(bdrv_copy_on_read_init);
diff --git a/block/Makefile.objs b/block/Makefile.objs
index XXXXXXX..XXXXXXX 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -XXX,XX +XXX,XX @@ block-obj-y += accounting.o dirty-bitmap.o
 block-obj-y += write-threshold.o
 block-obj-y += backup.o
 block-obj-$(CONFIG_REPLICATION) += replication.o
-block-obj-y += throttle.o
+block-obj-y += throttle.o copy-on-read.o
 
 block-obj-y += crypto.o
 
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

Currently we never actually check whether the WRITE_UNCHANGED
permission has been taken for unchanging writes.  But the one check that
is commented out checks both WRITE and WRITE_UNCHANGED; and considering
that WRITE_UNCHANGED is already documented as being weaker than WRITE,
we should probably explicitly document WRITE to include WRITE_UNCHANGED.

diff --git a/include/block/block.h b/include/block/block.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ enum {
      * This permission (which is weaker than BLK_PERM_WRITE) is both enough and
      * required for writes to the block node when the caller promises that
      * the visible disk content doesn't change.
+     *
+     * As the BLK_PERM_WRITE permission is strictly stronger, either is
+     * sufficient to perform an unchanging write.
      */
     BLK_PERM_WRITE_UNCHANGED    = 0x04,
 
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

The new BDRV_REQ_WRITE_UNCHANGED flag signifies that a write request
will not change the visible disk content.  With this flag set, it is
sufficient to have the BLK_PERM_WRITE_UNCHANGED permission instead of
BLK_PERM_WRITE.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-id: 20180421132929.21610-4-mreitz@redhat.com
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 include/block/block.h | 6 +++++-
 block/io.c            | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ typedef enum {
     BDRV_REQ_FUA                = 0x10,
     BDRV_REQ_WRITE_COMPRESSED   = 0x20,
 
+    /* Signifies that this write request will not change the visible disk
+     * content. */
+    BDRV_REQ_WRITE_UNCHANGED    = 0x40,
+
     /* Mask of valid flags */
-    BDRV_REQ_MASK               = 0x3f,
+    BDRV_REQ_MASK               = 0x7f,
 } BdrvRequestFlags;
 
 typedef struct BlockSizes {
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
     assert(!waited || !req->serialising);
     assert(req->overlap_offset <= offset);
     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
-    assert(child->perm & BLK_PERM_WRITE);
+    if (flags & BDRV_REQ_WRITE_UNCHANGED) {
+        assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
+    } else {
+        assert(child->perm & BLK_PERM_WRITE);
+    }
     assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
 
     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
                 /* FIXME: Should we (perhaps conditionally) be setting
                  * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
                  * that still correctly reads as zero? */
-                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 0);
+                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
+                                               BDRV_REQ_WRITE_UNCHANGED);
             } else {
                 /* This does not change the data on the disk, it is not
                  * necessary to flush even in cache=writethrough mode.
                  */
                 ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
-                                          &local_qiov, 0);
+                                          &local_qiov,
+                                          BDRV_REQ_WRITE_UNCHANGED);
             }
 
             if (ret < 0) {
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

We just need to forward BDRV_REQ_WRITE_UNCHANGED to quorum's children
(except in case of a rewrite because of corruption), but for that we
first have to support flags in child requests at all.

diff --git a/block/quorum.c b/block/quorum.c
index XXXXXXX..XXXXXXX 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -XXX,XX +XXX,XX @@ struct QuorumAIOCB {
     /* Request metadata */
     uint64_t offset;
     uint64_t bytes;
+    int flags;
 
     QEMUIOVector *qiov;         /* calling IOV */
 
@@ -XXX,XX +XXX,XX @@ static bool quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b)
 static QuorumAIOCB *quorum_aio_get(BlockDriverState *bs,
                                    QEMUIOVector *qiov,
                                    uint64_t offset,
-                                   uint64_t bytes)
+                                   uint64_t bytes,
+                                   int flags)
 {
     BDRVQuorumState *s = bs->opaque;
     QuorumAIOCB *acb = g_new(QuorumAIOCB, 1);
@@ -XXX,XX +XXX,XX @@ static QuorumAIOCB *quorum_aio_get(BlockDriverState *bs,
         .bs                 = bs,
         .offset             = offset,
         .bytes              = bytes,
+        .flags              = flags,
         .qiov               = qiov,
         .votes.compare      = quorum_sha256_compare,
         .votes.vote_list    = QLIST_HEAD_INITIALIZER(acb.votes.vote_list),
@@ -XXX,XX +XXX,XX @@ static void quorum_rewrite_entry(void *opaque)
     BDRVQuorumState *s = acb->bs->opaque;
 
     /* Ignore any errors, it's just a correction attempt for already
-     * corrupted data. */
+     * corrupted data.
+     * Mask out BDRV_REQ_WRITE_UNCHANGED because this overwrites the
+     * area with different data from the other children. */
     bdrv_co_pwritev(s->children[co->idx], acb->offset, acb->bytes,
-                    acb->qiov, 0);
+                    acb->qiov, acb->flags & ~BDRV_REQ_WRITE_UNCHANGED);
 
     /* Wake up the caller after the last rewrite */
     acb->rewrite_count--;
@@ -XXX,XX +XXX,XX @@ static int quorum_co_preadv(BlockDriverState *bs, uint64_t offset,
                             uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
     BDRVQuorumState *s = bs->opaque;
-    QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes);
+    QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes, flags);
     int ret;
 
     acb->is_read = true;
@@ -XXX,XX +XXX,XX @@ static void write_quorum_entry(void *opaque)
 
     sacb->bs = s->children[i]->bs;
     sacb->ret = bdrv_co_pwritev(s->children[i], acb->offset, acb->bytes,
-                                acb->qiov, 0);
+                                acb->qiov, acb->flags);
     if (sacb->ret == 0) {
         acb->success_count++;
     } else {
@@ -XXX,XX +XXX,XX @@ static int quorum_co_pwritev(BlockDriverState *bs, uint64_t offset,
                              uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
     BDRVQuorumState *s = bs->opaque;
-    QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes);
+    QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes, flags);
     int i, ret;
 
     for (i = 0; i < s->num_children; i++) {
@@ -XXX,XX +XXX,XX @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
     }
     s->next_child_index = s->num_children;
 
+    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED;
+
     g_free(opened);
     goto exit;
 
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

Update the rest of the filter drivers to support
BDRV_REQ_WRITE_UNCHANGED.  They already forward write request flags to
their children, so we just have to announce support for it.

This patch does not cover the replication driver because that currently
does not support flags at all, and because it just grabs the WRITE
permission for its children when it can, so we should be fine just
submitting the incoming WRITE_UNCHANGED requests as normal writes.

It also does not cover format drivers for similar reasons.  They all use
bdrv_format_default_perms() as their .bdrv_child_perm() implementation
so they just always grab the WRITE permission for their file children
whenever possible.  In addition, it often would be difficult to
ascertain whether incoming unchanging writes end up as unchanging writes
in their files.  So we just leave them as normal potentially changing
writes.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-id: 20180421132929.21610-7-mreitz@redhat.com
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/blkdebug.c     |  9 +++++----
 block/blkreplay.c    |  3 +++
 block/blkverify.c    |  3 +++
 block/copy-on-read.c | 10 ++++++----
 block/mirror.c       |  2 ++
 block/raw-format.c   |  9 +++++----
 block/throttle.c     |  6 ++++--
 7 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index XXXXXXX..XXXXXXX 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -XXX,XX +XXX,XX @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
         goto out;
     }
 
-    bs->supported_write_flags = BDRV_REQ_FUA &
-        bs->file->bs->supported_write_flags;
-    bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
-        bs->file->bs->supported_zero_flags;
+    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
+        (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
+    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
+        ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
+            bs->file->bs->supported_zero_flags);
     ret = -EINVAL;
 
     /* Set alignment overrides */
diff --git a/block/blkreplay.c b/block/blkreplay.c
index XXXXXXX..XXXXXXX 100755
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -XXX,XX +XXX,XX @@ static int blkreplay_open(BlockDriverState *bs, QDict *options, int flags,
         goto fail;
     }
 
+    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED;
+    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED;
+
     ret = 0;
 fail:
     return ret;
diff --git a/block/blkverify.c b/block/blkverify.c
index XXXXXXX..XXXXXXX 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -XXX,XX +XXX,XX @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags,
         goto fail;
     }
 
+    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED;
+    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED;
+
     ret = 0;
 fail:
     qemu_opts_del(opts);
diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index XXXXXXX..XXXXXXX 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -XXX,XX +XXX,XX @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags,
         return -EINVAL;
     }
 
-    bs->supported_write_flags = BDRV_REQ_FUA &
-                                    bs->file->bs->supported_write_flags;
+    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
+                                (BDRV_REQ_FUA &
+                                    bs->file->bs->supported_write_flags);
 
-    bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
-                                    bs->file->bs->supported_zero_flags;
+    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
+                               ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
+                                    bs->file->bs->supported_zero_flags);
 
     return 0;
 }
diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
         mirror_top_bs->implicit = true;
     }
     mirror_top_bs->total_sectors = bs->total_sectors;
+    mirror_top_bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED;
+    mirror_top_bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED;
     bdrv_set_aio_context(mirror_top_bs, bdrv_get_aio_context(bs));
 
     /* bdrv_append takes ownership of the mirror_top_bs reference, need to keep
diff --git a/block/raw-format.c b/block/raw-format.c
index XXXXXXX..XXXXXXX 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -XXX,XX +XXX,XX @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     bs->sg = bs->file->bs->sg;
-    bs->supported_write_flags = BDRV_REQ_FUA &
-        bs->file->bs->supported_write_flags;
-    bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
-        bs->file->bs->supported_zero_flags;
+    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
+        (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
+    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
+        ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
+            bs->file->bs->supported_zero_flags);
 
     if (bs->probed && !bdrv_is_read_only(bs)) {
         fprintf(stderr,
diff --git a/block/throttle.c b/block/throttle.c
index XXXXXXX..XXXXXXX 100644
--- a/block/throttle.c
+++ b/block/throttle.c
@@ -XXX,XX +XXX,XX @@ static int throttle_open(BlockDriverState *bs, QDict *options,
     if (!bs->file) {
         return -EINVAL;
     }
-    bs->supported_write_flags = bs->file->bs->supported_write_flags;
-    bs->supported_zero_flags = bs->file->bs->supported_zero_flags;
+    bs->supported_write_flags = bs->file->bs->supported_write_flags |
+                                BDRV_REQ_WRITE_UNCHANGED;
+    bs->supported_zero_flags = bs->file->bs->supported_zero_flags |
+                               BDRV_REQ_WRITE_UNCHANGED;
 
     return throttle_configure_tgm(bs, tgm, options, errp);
 }
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

iotest 197 tests copy-on-read using the (now old) copy-on-read flag.
Copy it to 215 and modify it to use the COR filter driver instead.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20180421132929.21610-9-mreitz@redhat.com
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/215     | 120 +++++++++++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/215.out |  26 ++++++++++
 tests/qemu-iotests/group   |   1 +
 3 files changed, 147 insertions(+)
 create mode 100755 tests/qemu-iotests/215
 create mode 100644 tests/qemu-iotests/215.out

diff --git a/tests/qemu-iotests/215 b/tests/qemu-iotests/215
new file mode 100755
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/qemu-iotests/215
@@ -XXX,XX +XXX,XX @@
+#!/bin/bash
+#
+# Test case for copy-on-read into qcow2, using the COR filter driver
+#
+# Copyright (C) 2018 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+seq="$(basename $0)"
+echo "QA output created by $seq"
+
+here="$PWD"
+status=1 # failure is the default!
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+TEST_WRAP="$TEST_DIR/t.wrap.qcow2"
+BLKDBG_CONF="$TEST_DIR/blkdebug.conf"
+
+# Sanity check: our use of blkdebug fails if $TEST_DIR contains spaces
+# or other problems
+case "$TEST_DIR" in
+    *[^-_a-zA-Z0-9/]*)
+        _notrun "Suspicious TEST_DIR='$TEST_DIR', cowardly refusing to run" ;;
+esac
+
+_cleanup()
+{
+    _cleanup_test_img
+    rm -f "$TEST_WRAP"
+    rm -f "$BLKDBG_CONF"
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# Test is supported for any backing file; but we force qcow2 for our wrapper.
+_supported_fmt generic
+_supported_proto generic
+_supported_os Linux
+# LUKS support may be possible, but it complicates things.
+_unsupported_fmt luks
+
+echo
+echo '=== Copy-on-read ==='
+echo
+
+# Prep the images
+# VPC rounds image sizes to a specific geometry, force a specific size.
+if [ "$IMGFMT" = "vpc" ]; then
+    IMGOPTS=$(_optstr_add "$IMGOPTS" "force_size")
+fi
+_make_test_img 4G
+$QEMU_IO -c "write -P 55 3G 1k" "$TEST_IMG" | _filter_qemu_io
+IMGPROTO=file IMGFMT=qcow2 IMGOPTS= TEST_IMG_FILE="$TEST_WRAP" \
+    _make_test_img -F "$IMGFMT" -b "$TEST_IMG" | _filter_img_create
+$QEMU_IO -f qcow2 -c "write -z -u 1M 64k" "$TEST_WRAP" | _filter_qemu_io
+
+# Ensure that a read of two clusters, but where one is already allocated,
+# does not re-write the allocated cluster
+cat > "$BLKDBG_CONF" <<EOF
+[inject-error]
+event = "cor_write"
+sector = "2048"
+EOF
+$QEMU_IO -c "open \
+ -o driver=copy-on-read,file.driver=blkdebug,file.config=$BLKDBG_CONF,file.image.driver=qcow2 $TEST_WRAP" \
+ -c "read -P 0 1M 128k" | _filter_qemu_io
+
+# Read the areas we want copied. A zero-length read should still be a
+# no-op.  The next read is under 2G, but aligned so that rounding to
+# clusters copies more than 2G of zeroes. The final read will pick up
+# the non-zero data in the same cluster.  Since a 2G read may exhaust
+# memory on some machines (particularly 32-bit), we skip the test if
+# that fails due to memory pressure.
+$QEMU_IO \
+    -c "open -o driver=copy-on-read,file.driver=qcow2 $TEST_WRAP" \
+    -c "read 0 0" \
+    | _filter_qemu_io
+output=$($QEMU_IO \
+         -c "open -o driver=copy-on-read,file.driver=qcow2 $TEST_WRAP" \
+         -c "read -P 0 1k $((2*1024*1024*1024 - 512))" \
+         2>&1 | _filter_qemu_io)
+case $output in
+    *allocate*)
+        _notrun "Insufficent memory to run test" ;;
+    *) printf '%s\n' "$output" ;;
+esac
+$QEMU_IO \
+    -c "open -o driver=copy-on-read,file.driver=qcow2 $TEST_WRAP" \
+    -c "read -P 0 $((3*1024*1024*1024 + 1024)) 1k" \
+    | _filter_qemu_io
+
+# Copy-on-read is incompatible with read-only
+$QEMU_IO \
+    -c "open -r -o driver=copy-on-read,file.driver=qcow2 $TEST_WRAP" \
+    2>&1 | _filter_testdir
+
+# Break the backing chain, and show that images are identical, and that
+# we properly copied over explicit zeros.
+$QEMU_IMG rebase -u -b "" -f qcow2 "$TEST_WRAP"
+$QEMU_IO -f qcow2 -c map "$TEST_WRAP"
+_check_test_img
+$QEMU_IMG compare -f $IMGFMT -F qcow2 "$TEST_IMG" "$TEST_WRAP"
+
+# success, all done
+echo '*** done'
+status=0
diff --git a/tests/qemu-iotests/215.out b/tests/qemu-iotests/215.out
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/qemu-iotests/215.out
@@ -XXX,XX +XXX,XX @@
+QA output created by 215
+
+=== Copy-on-read ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=4294967296
+wrote 1024/1024 bytes at offset 3221225472
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+Formatting 'TEST_DIR/t.wrap.IMGFMT', fmt=IMGFMT size=4294967296 backing_file=TEST_DIR/t.IMGFMT backing_fmt=IMGFMT
+wrote 65536/65536 bytes at offset 1048576
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 131072/131072 bytes at offset 1048576
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 0/0 bytes at offset 0
+0 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 2147483136/2147483136 bytes at offset 1024
+2 GiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 3221226496
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+can't open device TEST_DIR/t.wrap.qcow2: Block node is read-only
+2 GiB (0x80010000) bytes     allocated at offset 0 bytes (0x0)
+1023.938 MiB (0x3fff0000) bytes not allocated at offset 2 GiB (0x80010000)
+64 KiB (0x10000) bytes     allocated at offset 3 GiB (0xc0000000)
+1023.938 MiB (0x3fff0000) bytes not allocated at offset 3 GiB (0xc0010000)
+No errors were found on the image.
+Images are identical.
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -XXX,XX +XXX,XX @@
 212 rw auto quick
 213 rw auto quick
 214 rw auto
+215 rw auto quick
 218 rw auto quick
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

COR across nodes (that is, you have some filter node between the
actual COR target and the node that performs the COR) cannot reliably
work together with the permission system when there is no explicit COR
node that can request the WRITE_UNCHANGED permission for its child.
This is because COR (currently) sneaks its requests by the usual
permission checks, so it can work without a WRITE* permission; but if
there is a filter node in between, that will re-issue the request, which
then passes through the usual check -- and if nobody has requested a
WRITE_UNCHANGED permission, that check will fail.

There is no real direct fix apart from hoping that there is someone who
has requested that permission; in case of just the qemu-io HMP command
(and no guest device), however, that is not the case.  The real real fix
is to implement the copy-on-read flag through an implicitly added COR
node.  Such a node can request the necessary permissions as shown in
this test.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20180421132929.21610-10-mreitz@redhat.com
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/216     | 115 +++++++++++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/216.out |  28 +++++++++++
 tests/qemu-iotests/group   |   1 +
 3 files changed, 144 insertions(+)
 create mode 100755 tests/qemu-iotests/216
 create mode 100644 tests/qemu-iotests/216.out

diff --git a/tests/qemu-iotests/216 b/tests/qemu-iotests/216
new file mode 100755
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/qemu-iotests/216
@@ -XXX,XX +XXX,XX @@
+#!/usr/bin/env python
+#
+# Copy-on-read tests using a COR filter node
+#
+# Copyright (C) 2018 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# Creator/Owner: Max Reitz <mreitz@redhat.com>
+
+import iotests
+from iotests import log, qemu_img_pipe, qemu_io, filter_qemu_io
+
+# Need backing file support
+iotests.verify_image_format(supported_fmts=['qcow2', 'qcow', 'qed', 'vmdk'])
+iotests.verify_platform(['linux'])
+
+log('')
+log('=== Copy-on-read across nodes ===')
+log('')
+
+# The old copy-on-read mechanism without a filter node cannot request
+# WRITE_UNCHANGED permissions for its child.  Therefore it just tries
+# to sneak its write by the usual permission system and holds its
+# fingers crossed.  However, that sneaking does not work so well when
+# there is a filter node in the way: That will receive the write
+# request and re-issue a new one to its child, which this time is a
+# proper write request that will make the permission system cough --
+# unless there is someone at the top (like a guest device) that has
+# requested write permissions.
+#
+# A COR filter node, however, can request the proper permissions for
+# its child and therefore is not hit by this issue.
+
+with iotests.FilePath('base.img') as base_img_path, \
+     iotests.FilePath('top.img') as top_img_path, \
+     iotests.VM() as vm:
+
+    log('--- Setting up images ---')
+    log('')
+
+    qemu_img_pipe('create', '-f', iotests.imgfmt, base_img_path, '64M')
+
+    log(filter_qemu_io(qemu_io(base_img_path, '-c', 'write -P 1 0M 1M')))
+
+    qemu_img_pipe('create', '-f', iotests.imgfmt, '-b', base_img_path,
+                  top_img_path)
+
+    log(filter_qemu_io(qemu_io(top_img_path,  '-c', 'write -P 2 1M 1M')))
+
+    log('')
+    log('--- Doing COR ---')
+    log('')
+
+    # Compare with e.g. the following:
+    #   vm.add_drive_raw('if=none,node-name=node0,copy-on-read=on,driver=raw,' \
+    #                    'file.driver=%s,file.file.filename=%s' %
+    #                       (iotests.imgfmt, top_img_path))
+    # (Remove the blockdev-add instead.)
+    # ((Not tested here because it hits an assertion in the permission
+    #   system.))
+
+    vm.launch()
+
+    log(vm.qmp('blockdev-add',
+                    node_name='node0',
+                    driver='copy-on-read',
+                    file={
+                        'driver': 'raw',
+                        'file': {
+                            'driver': 'copy-on-read',
+                            'file': {
+                                'driver': 'raw',
+                                'file': {
+                                    'driver': iotests.imgfmt,
+                                    'file': {
+                                        'driver': 'file',
+                                        'filename': top_img_path
+                                    },
+                                    'backing': {
+                                        'driver': iotests.imgfmt,
+                                        'file': {
+                                            'driver': 'file',
+                                            'filename': base_img_path
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }))
+
+    # Trigger COR
+    log(vm.qmp('human-monitor-command',
+               command_line='qemu-io node0 "read 0 64M"'))
+
+    vm.shutdown()
+
+    log('')
+    log('--- Checking COR result ---')
+    log('')
+
+    log(filter_qemu_io(qemu_io(base_img_path, '-c', 'discard 0 64M')))
+    log(filter_qemu_io(qemu_io(top_img_path,  '-c', 'read -P 1 0M 1M')))
+    log(filter_qemu_io(qemu_io(top_img_path,  '-c', 'read -P 2 1M 1M')))
diff --git a/tests/qemu-iotests/216.out b/tests/qemu-iotests/216.out
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/qemu-iotests/216.out
@@ -XXX,XX +XXX,XX @@
+
+=== Copy-on-read across nodes ===
+
+--- Setting up images ---
+
+wrote 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+wrote 1048576/1048576 bytes at offset 1048576
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+
+--- Doing COR ---
+
+{u'return': {}}
+{u'return': u''}
+
+--- Checking COR result ---
+
+discard 67108864/67108864 bytes at offset 0
+64 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+read 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+read 1048576/1048576 bytes at offset 1048576
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -XXX,XX +XXX,XX @@
 213 rw auto quick
 214 rw auto
 215 rw auto quick
+216 rw auto quick
 218 rw auto quick
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

Some block drivers (iscsi and file-posix when dealing with device files)
do not actually support truncation, even though they provide a
.bdrv_truncate() method and will happily return success when given a
new size that does not exceed the current size.  This is because these
drivers expect the user to resize the image outside of qemu and then
provide qemu with that information through the block_resize command
(compare cb1b83e740384b4e0d950f3d7c81c02b8ce86c2e).

Of course, anyone using qemu-img resize will find that behavior useless.
So we should check the actual size of the image after the supposedly
successful truncation took place, emit an error if nothing changed and
emit a warning if the target size was not met.

Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1523065
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20180421163957.29872-1-mreitz@redhat.com
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qemu-img.c | 39 +++++++++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -XXX,XX +XXX,XX @@ static int img_resize(int argc, char **argv)
     Error *err = NULL;
     int c, ret, relative;
     const char *filename, *fmt, *size;
-    int64_t n, total_size, current_size;
+    int64_t n, total_size, current_size, new_size;
     bool quiet = false;
     BlockBackend *blk = NULL;
     PreallocMode prealloc = PREALLOC_MODE_OFF;
@@ -XXX,XX +XXX,XX @@ static int img_resize(int argc, char **argv)
     }
 
     ret = blk_truncate(blk, total_size, prealloc, &err);
-    if (!ret) {
-        qprintf(quiet, "Image resized.\n");
-    } else {
+    if (ret < 0) {
         error_report_err(err);
+        goto out;
+    }
+
+    new_size = blk_getlength(blk);
+    if (new_size < 0) {
+        error_report("Failed to verify truncated image length: %s",
+                     strerror(-new_size));
+        ret = -1;
+        goto out;
     }
+
+    /* Some block drivers implement a truncation method, but only so
+     * the user can cause qemu to refresh the image's size from disk.
+     * The idea is that the user resizes the image outside of qemu and
+     * then invokes block_resize to inform qemu about it.
+     * (This includes iscsi and file-posix for device files.)
+     * Of course, that is not the behavior someone invoking
+     * qemu-img resize would find useful, so we catch that behavior
+     * here and tell the user. */
+    if (new_size != total_size && new_size == current_size) {
+        error_report("Image was not resized; resizing may not be supported "
+                     "for this image");
+        ret = -1;
+        goto out;
+    }
+
+    if (new_size != total_size) {
+        warn_report("Image should have been resized to %" PRIi64
+                    " bytes, but was resized to %" PRIi64 " bytes",
+                    total_size, new_size);
+    }
+
+    qprintf(quiet, "Image resized.\n");
+
 out:
     blk_unref(blk);
     if (ret) {
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

Add BDRV_REQ_WRITE_UNCHANGED to the list of flags honored during pwrite
and pwrite_zeroes, and also add a note on when you absolutely need to
support it.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20180502140359.18222-1-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 include/block/block_int.h | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
     /* I/O Limits */
     BlockLimits bl;
 
-    /* Flags honored during pwrite (so far: BDRV_REQ_FUA) */
+    /* Flags honored during pwrite (so far: BDRV_REQ_FUA,
+     * BDRV_REQ_WRITE_UNCHANGED).
+     * If a driver does not support BDRV_REQ_WRITE_UNCHANGED, those
+     * writes will be issued as normal writes without the flag set.
+     * This is important to note for drivers that do not explicitly
+     * request a WRITE permission for their children and instead take
+     * the same permissions as their parent did (this is commonly what
+     * block filters do).  Such drivers have to be aware that the
+     * parent may have taken a WRITE_UNCHANGED permission only and is
+     * issuing such requests.  Drivers either must make sure that
+     * these requests do not result in plain WRITE accesses (usually
+     * by supporting BDRV_REQ_WRITE_UNCHANGED, and then forwarding
+     * every incoming write request as-is, including potentially that
+     * flag), or they have to explicitly take the WRITE permission for
+     * their children. */
     unsigned int supported_write_flags;
     /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA,
-     * BDRV_REQ_MAY_UNMAP) */
+     * BDRV_REQ_MAY_UNMAP, BDRV_REQ_WRITE_UNCHANGED) */
     unsigned int supported_zero_flags;
 
     /* the following member gives a name to every node on the bs graph. */
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

Currently, qemu-io only uses string-valued blockdev options (as all are
converted directly from QemuOpts) -- with one exception: -U adds the
force-share option as a boolean.  This in itself is already a bit
questionable, but a real issue is that it also assumes the value already
existing in the options QDict would be a boolean, which is wrong.

That has the following effect:

$ ./qemu-io -r -U --image-opts \
    driver=file,filename=/dev/null,force-share=off
[1]    15200 segmentation fault (core dumped)  ./qemu-io -r -U
--image-opts driver=file,filename=/dev/null,force-share=off

Since @opts is converted from QemuOpts, the value must be a string, and
we have to compare it as such.  Consequently, it makes sense to also set
it as a string instead of a boolean.

Cc: qemu-stable@nongnu.org
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20180502202051.15493-2-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qemu-io.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/qemu-io.c b/qemu-io.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -XXX,XX +XXX,XX @@ static int openfile(char *name, int flags, bool writethrough, bool force_share,
             opts = qdict_new();
         }
         if (qdict_haskey(opts, BDRV_OPT_FORCE_SHARE)
-            && !qdict_get_bool(opts, BDRV_OPT_FORCE_SHARE)) {
+            && strcmp(qdict_get_str(opts, BDRV_OPT_FORCE_SHARE), "on")) {
             error_report("-U conflicts with image options");
             qobject_unref(opts);
             return 1;
         }
-        qdict_put_bool(opts, BDRV_OPT_FORCE_SHARE, true);
+        qdict_put_str(opts, BDRV_OPT_FORCE_SHARE, "on");
     }
     qemuio_blk = blk_new_open(name, NULL, opts, flags, &local_err);
     if (!qemuio_blk) {
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

img_open_opts() takes a QemuOpts and converts them to a QDict, so all
values therein are strings.  Then it may try to call qdict_get_bool(),
however, which will fail with a segmentation fault every time:

$ ./qemu-img info -U --image-opts \
    driver=file,filename=/dev/null,force-share=off
[1]    27869 segmentation fault (core dumped)  ./qemu-img info -U
--image-opts driver=file,filename=/dev/null,force-share=off

Fix this by using qdict_get_str() and comparing the value as a string.
Also, when adding a force-share value to the QDict, add it as a string
so it fits the rest of the dict.

Cc: qemu-stable@nongnu.org
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20180502202051.15493-3-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qemu-img.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -XXX,XX +XXX,XX @@ static BlockBackend *img_open_opts(const char *optstr,
     options = qemu_opts_to_qdict(opts, NULL);
     if (force_share) {
         if (qdict_haskey(options, BDRV_OPT_FORCE_SHARE)
-            && !qdict_get_bool(options, BDRV_OPT_FORCE_SHARE)) {
+            && strcmp(qdict_get_str(options, BDRV_OPT_FORCE_SHARE), "on")) {
             error_report("--force-share/-U conflicts with image options");
             qobject_unref(options);
             return NULL;
         }
-        qdict_put_bool(options, BDRV_OPT_FORCE_SHARE, true);
+        qdict_put_str(options, BDRV_OPT_FORCE_SHARE, "on");
     }
     blk = blk_new_open(NULL, NULL, options, flags, &local_err);
     if (!blk) {
-- 
2.13.6

From: Max Reitz <mreitz@redhat.com>

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20180502202051.15493-4-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/153     | 17 +++++++++++++++++
 tests/qemu-iotests/153.out | 16 ++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/tests/qemu-iotests/153 b/tests/qemu-iotests/153
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/153
+++ b/tests/qemu-iotests/153
@@ -XXX,XX +XXX,XX @@ _run_cmd $QEMU_IO "${TEST_IMG}" -c 'write 0 512'
 
 _cleanup_qemu
 
+echo
+echo "== Detecting -U and force-share conflicts =="
+
+echo
+echo 'No conflict:'
+$QEMU_IMG info -U --image-opts driver=null-co,force-share=on
+echo
+echo 'Conflict:'
+$QEMU_IMG info -U --image-opts driver=null-co,force-share=off
+
+echo
+echo 'No conflict:'
+$QEMU_IO -c 'open -r -U -o driver=null-co,force-share=on'
+echo
+echo 'Conflict:'
+$QEMU_IO -c 'open -r -U -o driver=null-co,force-share=off'
+
 # success, all done
 echo "*** done"
 rm -f $seq.full
diff --git a/tests/qemu-iotests/153.out b/tests/qemu-iotests/153.out
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/153.out
+++ b/tests/qemu-iotests/153.out
@@ -XXX,XX +XXX,XX @@ Is another process using the image?
 Closing the other
 
 _qemu_io_wrapper TEST_DIR/t.qcow2 -c write 0 512
+
+== Detecting -U and force-share conflicts ==
+
+No conflict:
+image: null-co://
+file format: null-co
+virtual size: 1.0G (1073741824 bytes)
+disk size: unavailable
+
+Conflict:
+qemu-img: --force-share/-U conflicts with image options
+
+No conflict:
+
+Conflict:
+-U conflicts with image options
 *** done
-- 
2.13.6

The following changes since commit 16aaacb307ed607b9780c12702c44f0fe52edc7e:

Merge remote-tracking branch 'remotes/cohuck/tags/s390x-20200430' into staging (2020-04-30 14:00:36 +0100)

are available in the Git repository at:

git://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to eaae29ef89d498d0eac553c77b554f310a47f809:

qemu-storage-daemon: Fix non-string --object properties (2020-04-30 17:51:07 +0200)

----------------------------------------------------------------
Block layer patches:

- Fix resize (extending) of short overlays
- nvme: introduce PMR support from NVMe 1.4 spec
- qemu-storage-daemon: Fix non-string --object properties

----------------------------------------------------------------
Alberto Garcia (1):
      qcow2: Add incompatibility note between backing files and raw external data files

Andrzej Jakowski (1):
      nvme: introduce PMR support from NVMe 1.4 spec

Kevin Wolf (12):
      block: Add flags to BlockDriver.bdrv_co_truncate()
      block: Add flags to bdrv(_co)_truncate()
      block-backend: Add flags to blk_truncate()
      qcow2: Support BDRV_REQ_ZERO_WRITE for truncate
      raw-format: Support BDRV_REQ_ZERO_WRITE for truncate
      file-posix: Support BDRV_REQ_ZERO_WRITE for truncate
      block: truncate: Don't make backing file data visible
      iotests: Filter testfiles out in filter_img_info()
      iotests: Test committing to short backing file
      qcow2: Forward ZERO_WRITE flag for full preallocation
      qom: Factor out user_creatable_add_dict()
      qemu-storage-daemon: Fix non-string --object properties

Paolo Bonzini (1):
      qemu-iotests: allow qcow2 external discarded clusters to contain stale data

From: Alberto Garcia <berto@igalia.com>

Backing files and raw external data files are mutually exclusive.
The documentation of the raw external data bit (in autoclear_features)
already indicates that, but we should also mention it on the other
side.

Suggested-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Alberto Garcia <berto@igalia.com>
Message-Id: <20200410121816.8334-1-berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 docs/interop/qcow2.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/interop/qcow2.txt b/docs/interop/qcow2.txt
index XXXXXXX..XXXXXXX 100644
--- a/docs/interop/qcow2.txt
+++ b/docs/interop/qcow2.txt
@@ -XXX,XX +XXX,XX @@ The first cluster of a qcow2 image contains the file header:
                     is stored (NB: The string is not null terminated). 0 if the
                     image doesn't have a backing file.
 
+                    Note: backing files are incompatible with raw external data
+                    files (auto-clear feature bit 1).
+
          16 - 19:   backing_file_size
                     Length of the backing file name in bytes. Must not be
                     longer than 1023 bytes. Undefined if the image doesn't have
-- 
2.25.3

From: Paolo Bonzini <pbonzini@redhat.com>

Test 244 checks the expected behavior of qcow2 external data files
with respect to zero and discarded clusters.  Filesystems however
are free to ignore discard requests, and this seems to be the
case for overlayfs.  Relax the tests to skip checks on the
external data file for discarded areas, which implies not using
qemu-img compare in the data_file_raw=on case.

This fixes docker tests on RHEL8.

Cc: Kevin Wolf <kwolf@redhat.com>
Cc: qemu-block@nongnu.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20200409191006.24429-1-pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/244     | 10 ++++++++--
 tests/qemu-iotests/244.out |  9 ++++++---
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/tests/qemu-iotests/244 b/tests/qemu-iotests/244
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/244
+++ b/tests/qemu-iotests/244
@@ -XXX,XX +XXX,XX @@ $QEMU_IO -c 'read -P 0 0 1M' \
 echo
 $QEMU_IO -c 'read -P 0 0 1M' \
          -c 'read -P 0x11 1M 1M' \
-         -c 'read -P 0 2M 2M' \
          -c 'read -P 0x11 4M 1M' \
          -c 'read -P 0 5M 1M' \
          -f raw "$TEST_IMG.data" |
@@ -XXX,XX +XXX,XX @@ $QEMU_IO -c 'read -P 0 0 1M' \
          -f $IMGFMT "$TEST_IMG" |
          _filter_qemu_io
 
+# Discarded clusters are only marked as such in the qcow2 metadata, but
+# they can contain stale data in the external data file.  Instead, zero
+# clusters must be zeroed in the external data file too.
 echo
-$QEMU_IMG compare "$TEST_IMG" "$TEST_IMG.data"
+$QEMU_IO -c 'read -P 0 0 1M' \
+         -c 'read -P 0x11 1M 1M' \
+         -c 'read -P 0 3M 3M' \
+         -f raw "$TEST_IMG".data |
+         _filter_qemu_io
 
 echo -n "qcow2 file size after I/O: "
 du -b $TEST_IMG | cut -f1
diff --git a/tests/qemu-iotests/244.out b/tests/qemu-iotests/244.out
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/244.out
+++ b/tests/qemu-iotests/244.out
@@ -XXX,XX +XXX,XX @@ read 1048576/1048576 bytes at offset 0
 1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 read 1048576/1048576 bytes at offset 1048576
 1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-read 2097152/2097152 bytes at offset 2097152
-2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 read 1048576/1048576 bytes at offset 4194304
 1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 read 1048576/1048576 bytes at offset 5242880
@@ -XXX,XX +XXX,XX @@ read 1048576/1048576 bytes at offset 1048576
 read 4194304/4194304 bytes at offset 2097152
 4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 
-Images are identical.
+read 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1048576/1048576 bytes at offset 1048576
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 3145728/3145728 bytes at offset 3145728
+3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 qcow2 file size after I/O: 327680
 
 === bdrv_co_block_status test for file and offset=0 ===
-- 
2.25.3

This adds a new BdrvRequestFlags parameter to the .bdrv_co_truncate()
driver callbacks, and a supported_truncate_flags field in
BlockDriverState that allows drivers to advertise support for request
flags in the context of truncate.

For now, we always pass 0 and no drivers declare support for any flag.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-Id: <20200424125448.63318-2-kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block_int.h   | 10 +++++++++-
 block/crypto.c              |  3 ++-
 block/file-posix.c          |  2 +-
 block/file-win32.c          |  2 +-
 block/gluster.c             |  1 +
 block/io.c                  |  8 +++++++-
 block/iscsi.c               |  2 +-
 block/nfs.c                 |  3 ++-
 block/qcow2.c               |  2 +-
 block/qed.c                 |  1 +
 block/raw-format.c          |  2 +-
 block/rbd.c                 |  1 +
 block/sheepdog.c            |  4 ++--
 block/ssh.c                 |  2 +-
 tests/test-block-iothread.c |  3 ++-
 15 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
      */
     int coroutine_fn (*bdrv_co_truncate)(BlockDriverState *bs, int64_t offset,
                                          bool exact, PreallocMode prealloc,
-                                         Error **errp);
+                                         BdrvRequestFlags flags, Error **errp);
 
     int64_t (*bdrv_getlength)(BlockDriverState *bs);
     bool has_variable_length;
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
     /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA,
      * BDRV_REQ_MAY_UNMAP, BDRV_REQ_WRITE_UNCHANGED) */
     unsigned int supported_zero_flags;
+    /*
+     * Flags honoured during truncate (so far: BDRV_REQ_ZERO_WRITE).
+     *
+     * If BDRV_REQ_ZERO_WRITE is given, the truncate operation must make sure
+     * that any added space reads as all zeros. If this can't be guaranteed,
+     * the operation must fail.
+     */
+    unsigned int supported_truncate_flags;
 
     /* the following member gives a name to every node on the bs graph. */
     char node_name[32];
diff --git a/block/crypto.c b/block/crypto.c
index XXXXXXX..XXXXXXX 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -XXX,XX +XXX,XX @@ static int block_crypto_co_create_generic(BlockDriverState *bs,
 
 static int coroutine_fn
 block_crypto_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
-                         PreallocMode prealloc, Error **errp)
+                         PreallocMode prealloc, BdrvRequestFlags flags,
+                         Error **errp)
 {
     BlockCrypto *crypto = bs->opaque;
     uint64_t payload_offset =
diff --git a/block/file-posix.c b/block/file-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -XXX,XX +XXX,XX @@ raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
 
 static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
                                         bool exact, PreallocMode prealloc,
-                                        Error **errp)
+                                        BdrvRequestFlags flags, Error **errp)
 {
     BDRVRawState *s = bs->opaque;
     struct stat st;
diff --git a/block/file-win32.c b/block/file-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/block/file-win32.c
+++ b/block/file-win32.c
@@ -XXX,XX +XXX,XX @@ static void raw_close(BlockDriverState *bs)
 
 static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
                                         bool exact, PreallocMode prealloc,
-                                        Error **errp)
+                                        BdrvRequestFlags flags, Error **errp)
 {
     BDRVRawState *s = bs->opaque;
     LONG low, high;
diff --git a/block/gluster.c b/block/gluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qemu_gluster_co_truncate(BlockDriverState *bs,
                                                  int64_t offset,
                                                  bool exact,
                                                  PreallocMode prealloc,
+                                                 BdrvRequestFlags flags,
                                                  Error **errp)
 {
     BDRVGlusterState *s = bs->opaque;
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
     BlockDriverState *bs = child->bs;
     BlockDriver *drv = bs->drv;
     BdrvTrackedRequest req;
+    BdrvRequestFlags flags = 0;
     int64_t old_size, new_bytes;
     int ret;
 
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
     }
 
     if (drv->bdrv_co_truncate) {
-        ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, errp);
+        if (flags & ~bs->supported_truncate_flags) {
+            error_setg(errp, "Block driver does not support requested flags");
+            ret = -ENOTSUP;
+            goto out;
+        }
+        ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp);
     } else if (bs->file && drv->is_filter) {
         ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, errp);
     } else {
diff --git a/block/iscsi.c b/block/iscsi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -XXX,XX +XXX,XX @@ static void iscsi_reopen_commit(BDRVReopenState *reopen_state)
 
 static int coroutine_fn iscsi_co_truncate(BlockDriverState *bs, int64_t offset,
                                           bool exact, PreallocMode prealloc,
-                                          Error **errp)
+                                          BdrvRequestFlags flags, Error **errp)
 {
     IscsiLun *iscsilun = bs->opaque;
     int64_t cur_length;
diff --git a/block/nfs.c b/block/nfs.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -XXX,XX +XXX,XX @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs)
 
 static int coroutine_fn
 nfs_file_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
-                     PreallocMode prealloc, Error **errp)
+                     PreallocMode prealloc, BdrvRequestFlags flags,
+                     Error **errp)
 {
     NFSClient *client = bs->opaque;
     int ret;
diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ fail:
 
 static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
                                           bool exact, PreallocMode prealloc,
-                                          Error **errp)
+                                          BdrvRequestFlags flags, Error **errp)
 {
     BDRVQcow2State *s = bs->opaque;
     uint64_t old_length;
diff --git a/block/qed.c b/block/qed.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_qed_co_truncate(BlockDriverState *bs,
                                              int64_t offset,
                                              bool exact,
                                              PreallocMode prealloc,
+                                             BdrvRequestFlags flags,
                                              Error **errp)
 {
     BDRVQEDState *s = bs->opaque;
diff --git a/block/raw-format.c b/block/raw-format.c
index XXXXXXX..XXXXXXX 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -XXX,XX +XXX,XX @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 
 static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
                                         bool exact, PreallocMode prealloc,
-                                        Error **errp)
+                                        BdrvRequestFlags flags, Error **errp)
 {
     BDRVRawState *s = bs->opaque;
 
diff --git a/block/rbd.c b/block/rbd.c
index XXXXXXX..XXXXXXX 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qemu_rbd_co_truncate(BlockDriverState *bs,
                                              int64_t offset,
                                              bool exact,
                                              PreallocMode prealloc,
+                                             BdrvRequestFlags flags,
                                              Error **errp)
 {
     int r;
diff --git a/block/sheepdog.c b/block/sheepdog.c
index XXXXXXX..XXXXXXX 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static int64_t sd_getlength(BlockDriverState *bs)
 
 static int coroutine_fn sd_co_truncate(BlockDriverState *bs, int64_t offset,
                                        bool exact, PreallocMode prealloc,
-                                       Error **errp)
+                                       BdrvRequestFlags flags, Error **errp)
 {
     BDRVSheepdogState *s = bs->opaque;
     int ret, fd;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
 
     assert(!flags);
     if (offset > s->inode.vdi_size) {
-        ret = sd_co_truncate(bs, offset, false, PREALLOC_MODE_OFF, NULL);
+        ret = sd_co_truncate(bs, offset, false, PREALLOC_MODE_OFF, 0, NULL);
         if (ret < 0) {
             return ret;
         }
diff --git a/block/ssh.c b/block/ssh.c
index XXXXXXX..XXXXXXX 100644
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -XXX,XX +XXX,XX @@ static int64_t ssh_getlength(BlockDriverState *bs)
 
 static int coroutine_fn ssh_co_truncate(BlockDriverState *bs, int64_t offset,
                                         bool exact, PreallocMode prealloc,
-                                        Error **errp)
+                                        BdrvRequestFlags flags, Error **errp)
 {
     BDRVSSHState *s = bs->opaque;
 
diff --git a/tests/test-block-iothread.c b/tests/test-block-iothread.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-block-iothread.c
+++ b/tests/test-block-iothread.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_test_co_pdiscard(BlockDriverState *bs,
 
 static int coroutine_fn
 bdrv_test_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
-                      PreallocMode prealloc, Error **errp)
+                      PreallocMode prealloc, BdrvRequestFlags flags,
+                      Error **errp)
 {
     return 0;
 }
-- 
2.25.3

Now that block drivers can support flags for .bdrv_co_truncate, expose
the parameter in the node level interfaces bdrv_co_truncate() and
bdrv_truncate().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-Id: <20200424125448.63318-3-kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block.h       |  5 +++--
 block/block-backend.c       |  2 +-
 block/crypto.c              |  2 +-
 block/io.c                  | 12 +++++++-----
 block/parallels.c           |  6 +++---
 block/qcow.c                |  4 ++--
 block/qcow2-refcount.c      |  2 +-
 block/qcow2.c               | 15 +++++++++------
 block/raw-format.c          |  2 +-
 block/vhdx-log.c            |  2 +-
 block/vhdx.c                |  2 +-
 block/vmdk.c                |  2 +-
 tests/test-block-iothread.c |  6 +++---
 13 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
 void bdrv_refresh_filename(BlockDriverState *bs);
 
 int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
-                                  PreallocMode prealloc, Error **errp);
+                                  PreallocMode prealloc, BdrvRequestFlags flags,
+                                  Error **errp);
 int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
-                  PreallocMode prealloc, Error **errp);
+                  PreallocMode prealloc, BdrvRequestFlags flags, Error **errp);
 
 int64_t bdrv_nb_sectors(BlockDriverState *bs);
 int64_t bdrv_getlength(BlockDriverState *bs);
diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ int blk_truncate(BlockBackend *blk, int64_t offset, bool exact,
         return -ENOMEDIUM;
     }
 
-    return bdrv_truncate(blk->root, offset, exact, prealloc, errp);
+    return bdrv_truncate(blk->root, offset, exact, prealloc, 0, errp);
 }
 
 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
diff --git a/block/crypto.c b/block/crypto.c
index XXXXXXX..XXXXXXX 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -XXX,XX +XXX,XX @@ block_crypto_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
 
     offset += payload_offset;
 
-    return bdrv_co_truncate(bs->file, offset, exact, prealloc, errp);
+    return bdrv_co_truncate(bs->file, offset, exact, prealloc, 0, errp);
 }
 
 static void block_crypto_close(BlockDriverState *bs)
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_parent_cb_resize(BlockDriverState *bs)
  * 'offset' bytes in length.
  */
 int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
-                                  PreallocMode prealloc, Error **errp)
+                                  PreallocMode prealloc, BdrvRequestFlags flags,
+                                  Error **errp)
 {
     BlockDriverState *bs = child->bs;
     BlockDriver *drv = bs->drv;
     BdrvTrackedRequest req;
-    BdrvRequestFlags flags = 0;
     int64_t old_size, new_bytes;
     int ret;
 
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
         }
         ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp);
     } else if (bs->file && drv->is_filter) {
-        ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, errp);
+        ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
     } else {
         error_setg(errp, "Image format driver does not support resize");
         ret = -ENOTSUP;
@@ -XXX,XX +XXX,XX @@ typedef struct TruncateCo {
     int64_t offset;
     bool exact;
     PreallocMode prealloc;
+    BdrvRequestFlags flags;
     Error **errp;
     int ret;
 } TruncateCo;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_truncate_co_entry(void *opaque)
 {
     TruncateCo *tco = opaque;
     tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->exact,
-                                tco->prealloc, tco->errp);
+                                tco->prealloc, tco->flags, tco->errp);
     aio_wait_kick();
 }
 
 int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
-                  PreallocMode prealloc, Error **errp)
+                  PreallocMode prealloc, BdrvRequestFlags flags, Error **errp)
 {
     Coroutine *co;
     TruncateCo tco = {
@@ -XXX,XX +XXX,XX @@ int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
         .offset     = offset,
         .exact      = exact,
         .prealloc   = prealloc,
+        .flags      = flags,
         .errp       = errp,
         .ret        = NOT_DONE,
     };
diff --git a/block/parallels.c b/block/parallels.c
index XXXXXXX..XXXXXXX 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -XXX,XX +XXX,XX @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num,
         } else {
             ret = bdrv_truncate(bs->file,
                                 (s->data_end + space) << BDRV_SECTOR_BITS,
-                                false, PREALLOC_MODE_OFF, NULL);
+                                false, PREALLOC_MODE_OFF, 0, NULL);
         }
         if (ret < 0) {
             return ret;
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs,
              * That means we have to pass exact=true.
              */
             ret = bdrv_truncate(bs->file, res->image_end_offset, true,
-                                PREALLOC_MODE_OFF, &local_err);
+                                PREALLOC_MODE_OFF, 0, &local_err);
             if (ret < 0) {
                 error_report_err(local_err);
                 res->check_errors++;
@@ -XXX,XX +XXX,XX @@ static void parallels_close(BlockDriverState *bs)
 
         /* errors are ignored, so we might as well pass exact=true */
         bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS, true,
-                      PREALLOC_MODE_OFF, NULL);
+                      PREALLOC_MODE_OFF, 0, NULL);
     }
 
     g_free(s->bat_dirty_bmap);
diff --git a/block/qcow.c b/block/qcow.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -XXX,XX +XXX,XX @@ static int get_cluster_offset(BlockDriverState *bs,
                     return -E2BIG;
                 }
                 ret = bdrv_truncate(bs->file, cluster_offset + s->cluster_size,
-                                    false, PREALLOC_MODE_OFF, NULL);
+                                    false, PREALLOC_MODE_OFF, 0, NULL);
                 if (ret < 0) {
                     return ret;
                 }
@@ -XXX,XX +XXX,XX @@ static int qcow_make_empty(BlockDriverState *bs)
             l1_length) < 0)
         return -1;
     ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length, false,
-                        PREALLOC_MODE_OFF, NULL);
+                        PREALLOC_MODE_OFF, 0, NULL);
     if (ret < 0)
         return ret;
 
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -XXX,XX +XXX,XX @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
                 }
 
                 ret = bdrv_truncate(bs->file, offset + s->cluster_size, false,
-                                    PREALLOC_MODE_OFF, &local_err);
+                                    PREALLOC_MODE_OFF, 0, &local_err);
                 if (ret < 0) {
                     error_report_err(local_err);
                     goto resize_fail;
diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn preallocate_co(BlockDriverState *bs, uint64_t offset,
             mode = PREALLOC_MODE_OFF;
         }
         ret = bdrv_co_truncate(s->data_file, host_offset + cur_bytes, false,
-                               mode, errp);
+                               mode, 0, errp);
         if (ret < 0) {
             return ret;
         }
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
              * always fulfilled, so there is no need to pass it on.)
              */
             bdrv_co_truncate(bs->file, (last_cluster + 1) * s->cluster_size,
-                             false, PREALLOC_MODE_OFF, &local_err);
+                             false, PREALLOC_MODE_OFF, 0, &local_err);
             if (local_err) {
                 warn_reportf_err(local_err,
                                  "Failed to truncate the tail of the image: ");
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
              * file should be resized to the exact target size, too,
              * so we pass @exact here.
              */
-            ret = bdrv_co_truncate(s->data_file, offset, exact, prealloc, errp);
+            ret = bdrv_co_truncate(s->data_file, offset, exact, prealloc, 0,
+                                   errp);
             if (ret < 0) {
                 goto fail;
             }
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
         new_file_size = allocation_start +
                         nb_new_data_clusters * s->cluster_size;
         /* Image file grows, so @exact does not matter */
-        ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, errp);
+        ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, 0,
+                               errp);
         if (ret < 0) {
             error_prepend(errp, "Failed to resize underlying file: ");
             qcow2_free_clusters(bs, allocation_start,
@@ -XXX,XX +XXX,XX @@ qcow2_co_pwritev_compressed_part(BlockDriverState *bs,
         if (len < 0) {
             return len;
         }
-        return bdrv_co_truncate(bs->file, len, false, PREALLOC_MODE_OFF, NULL);
+        return bdrv_co_truncate(bs->file, len, false, PREALLOC_MODE_OFF, 0,
+                                NULL);
     }
 
     if (offset_into_cluster(s, offset)) {
@@ -XXX,XX +XXX,XX @@ static int make_completely_empty(BlockDriverState *bs)
     }
 
     ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size, false,
-                        PREALLOC_MODE_OFF, &local_err);
+                        PREALLOC_MODE_OFF, 0, &local_err);
     if (ret < 0) {
         error_report_err(local_err);
         goto fail;
diff --git a/block/raw-format.c b/block/raw-format.c
index XXXXXXX..XXXXXXX 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
 
     s->size = offset;
     offset += s->offset;
-    return bdrv_co_truncate(bs->file, offset, exact, prealloc, errp);
+    return bdrv_co_truncate(bs->file, offset, exact, prealloc, 0, errp);
 }
 
 static void raw_eject(BlockDriverState *bs, bool eject_flag)
diff --git a/block/vhdx-log.c b/block/vhdx-log.c
index XXXXXXX..XXXXXXX 100644
--- a/block/vhdx-log.c
+++ b/block/vhdx-log.c
@@ -XXX,XX +XXX,XX @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s,
                     goto exit;
                 }
                 ret = bdrv_truncate(bs->file, new_file_size, false,
-                                    PREALLOC_MODE_OFF, NULL);
+                                    PREALLOC_MODE_OFF, 0, NULL);
                 if (ret < 0) {
                     goto exit;
                 }
diff --git a/block/vhdx.c b/block/vhdx.c
index XXXXXXX..XXXXXXX 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -XXX,XX +XXX,XX @@ static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s,
     }
 
     return bdrv_truncate(bs->file, *new_offset + s->block_size, false,
-                         PREALLOC_MODE_OFF, NULL);
+                         PREALLOC_MODE_OFF, 0, NULL);
 }
 
 /*
diff --git a/block/vmdk.c b/block/vmdk.c
index XXXXXXX..XXXXXXX 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -XXX,XX +XXX,XX @@ vmdk_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
             }
             length = QEMU_ALIGN_UP(length, BDRV_SECTOR_SIZE);
             ret = bdrv_truncate(s->extents[i].file, length, false,
-                                PREALLOC_MODE_OFF, NULL);
+                                PREALLOC_MODE_OFF, 0, NULL);
             if (ret < 0) {
                 return ret;
             }
diff --git a/tests/test-block-iothread.c b/tests/test-block-iothread.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-block-iothread.c
+++ b/tests/test-block-iothread.c
@@ -XXX,XX +XXX,XX @@ static void test_sync_op_truncate(BdrvChild *c)
     int ret;
 
     /* Normal success path */
-    ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, NULL);
+    ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, 0, NULL);
     g_assert_cmpint(ret, ==, 0);
 
     /* Early error: Negative offset */
-    ret = bdrv_truncate(c, -2, false, PREALLOC_MODE_OFF, NULL);
+    ret = bdrv_truncate(c, -2, false, PREALLOC_MODE_OFF, 0, NULL);
     g_assert_cmpint(ret, ==, -EINVAL);
 
     /* Error: Read-only image */
     c->bs->read_only = true;
     c->bs->open_flags &= ~BDRV_O_RDWR;
 
-    ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, NULL);
+    ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, 0, NULL);
     g_assert_cmpint(ret, ==, -EACCES);
 
     c->bs->read_only = false;
-- 
2.25.3

Now that the node level interface bdrv_truncate() supports passing request
flags to the block driver, expose this on the BlockBackend level, too.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-Id: <20200424125448.63318-4-kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/sysemu/block-backend.h | 2 +-
 block.c                        | 3 ++-
 block/block-backend.c          | 4 ++--
 block/commit.c                 | 4 ++--
 block/crypto.c                 | 2 +-
 block/mirror.c                 | 2 +-
 block/qcow2.c                  | 4 ++--
 block/qed.c                    | 2 +-
 block/vdi.c                    | 2 +-
 block/vhdx.c                   | 4 ++--
 block/vmdk.c                   | 6 +++---
 block/vpc.c                    | 2 +-
 blockdev.c                     | 2 +-
 qemu-img.c                     | 2 +-
 qemu-io-cmds.c                 | 2 +-
 15 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index XXXXXXX..XXXXXXX 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -XXX,XX +XXX,XX @@ int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
 int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf,
                           int bytes);
 int blk_truncate(BlockBackend *blk, int64_t offset, bool exact,
-                 PreallocMode prealloc, Error **errp);
+                 PreallocMode prealloc, BdrvRequestFlags flags, Error **errp);
 int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes);
 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
                      int64_t pos, int size);
diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ static int64_t create_file_fallback_truncate(BlockBackend *blk,
     int64_t size;
     int ret;
 
-    ret = blk_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, &local_err);
+    ret = blk_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, 0,
+                       &local_err);
     if (ret < 0 && ret != -ENOTSUP) {
         error_propagate(errp, local_err);
         return ret;
diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf,
 }
 
 int blk_truncate(BlockBackend *blk, int64_t offset, bool exact,
-                 PreallocMode prealloc, Error **errp)
+                 PreallocMode prealloc, BdrvRequestFlags flags, Error **errp)
 {
     if (!blk_is_available(blk)) {
         error_setg(errp, "No medium inserted");
         return -ENOMEDIUM;
     }
 
-    return bdrv_truncate(blk->root, offset, exact, prealloc, 0, errp);
+    return bdrv_truncate(blk->root, offset, exact, prealloc, flags, errp);
 }
 
 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
diff --git a/block/commit.c b/block/commit.c
index XXXXXXX..XXXXXXX 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn commit_run(Job *job, Error **errp)
     }
 
     if (base_len < len) {
-        ret = blk_truncate(s->base, len, false, PREALLOC_MODE_OFF, NULL);
+        ret = blk_truncate(s->base, len, false, PREALLOC_MODE_OFF, 0, NULL);
         if (ret) {
             goto out;
         }
@@ -XXX,XX +XXX,XX @@ int bdrv_commit(BlockDriverState *bs)
      * grow the backing file image if possible.  If not possible,
      * we must return an error */
     if (length > backing_length) {
-        ret = blk_truncate(backing, length, false, PREALLOC_MODE_OFF,
+        ret = blk_truncate(backing, length, false, PREALLOC_MODE_OFF, 0,
                            &local_err);
         if (ret < 0) {
             error_report_err(local_err);
diff --git a/block/crypto.c b/block/crypto.c
index XXXXXXX..XXXXXXX 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -XXX,XX +XXX,XX @@ static ssize_t block_crypto_init_func(QCryptoBlock *block,
      * which will be used by the crypto header
      */
     return blk_truncate(data->blk, data->size + headerlen, false,
-                        data->prealloc, errp);
+                        data->prealloc, 0, errp);
 }
 
 
diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
 
         if (s->bdev_length > base_length) {
             ret = blk_truncate(s->target, s->bdev_length, false,
-                               PREALLOC_MODE_OFF, NULL);
+                               PREALLOC_MODE_OFF, 0, NULL);
             if (ret < 0) {
                 goto immediate_exit;
             }
diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
 
     /* Okay, now that we have a valid image, let's give it the right size */
     ret = blk_truncate(blk, qcow2_opts->size, false, qcow2_opts->preallocation,
-                       errp);
+                       0, errp);
     if (ret < 0) {
         error_prepend(errp, "Could not resize image: ");
         goto out;
@@ -XXX,XX +XXX,XX @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
          * Amending image options should ensure that the image has
          * exactly the given new values, so pass exact=true here.
          */
-        ret = blk_truncate(blk, new_size, true, PREALLOC_MODE_OFF, errp);
+        ret = blk_truncate(blk, new_size, true, PREALLOC_MODE_OFF, 0, errp);
         blk_unref(blk);
         if (ret < 0) {
             return ret;
diff --git a/block/qed.c b/block/qed.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_qed_co_create(BlockdevCreateOptions *opts,
      * The QED format associates file length with allocation status,
      * so a new file (which is empty) must have a length of 0.
      */
-    ret = blk_truncate(blk, 0, true, PREALLOC_MODE_OFF, errp);
+    ret = blk_truncate(blk, 0, true, PREALLOC_MODE_OFF, 0, errp);
     if (ret < 0) {
         goto out;
     }
diff --git a/block/vdi.c b/block/vdi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn vdi_co_do_create(BlockdevCreateOptions *create_options,
 
     if (image_type == VDI_TYPE_STATIC) {
         ret = blk_truncate(blk, offset + blocks * block_size, false,
-                           PREALLOC_MODE_OFF, errp);
+                           PREALLOC_MODE_OFF, 0, errp);
         if (ret < 0) {
             error_prepend(errp, "Failed to statically allocate file");
             goto exit;
diff --git a/block/vhdx.c b/block/vhdx.c
index XXXXXXX..XXXXXXX 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -XXX,XX +XXX,XX @@ static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s,
         /* All zeroes, so we can just extend the file - the end of the BAT
          * is the furthest thing we have written yet */
         ret = blk_truncate(blk, data_file_offset, false, PREALLOC_MODE_OFF,
-                           errp);
+                           0, errp);
         if (ret < 0) {
             goto exit;
         }
     } else if (type == VHDX_TYPE_FIXED) {
         ret = blk_truncate(blk, data_file_offset + image_size, false,
-                           PREALLOC_MODE_OFF, errp);
+                           PREALLOC_MODE_OFF, 0, errp);
         if (ret < 0) {
             goto exit;
         }
diff --git a/block/vmdk.c b/block/vmdk.c
index XXXXXXX..XXXXXXX 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -XXX,XX +XXX,XX @@ static int vmdk_init_extent(BlockBackend *blk,
     int gd_buf_size;
 
     if (flat) {
-        ret = blk_truncate(blk, filesize, false, PREALLOC_MODE_OFF, errp);
+        ret = blk_truncate(blk, filesize, false, PREALLOC_MODE_OFF, 0, errp);
         goto exit;
     }
     magic = cpu_to_be32(VMDK4_MAGIC);
@@ -XXX,XX +XXX,XX @@ static int vmdk_init_extent(BlockBackend *blk,
     }
 
     ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9, false,
-                       PREALLOC_MODE_OFF, errp);
+                       PREALLOC_MODE_OFF, 0, errp);
     if (ret < 0) {
         goto exit;
     }
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn vmdk_co_do_create(int64_t size,
     /* bdrv_pwrite write padding zeros to align to sector, we don't need that
      * for description file */
     if (desc_offset == 0) {
-        ret = blk_truncate(blk, desc_len, false, PREALLOC_MODE_OFF, errp);
+        ret = blk_truncate(blk, desc_len, false, PREALLOC_MODE_OFF, 0, errp);
         if (ret < 0) {
             goto exit;
         }
diff --git a/block/vpc.c b/block/vpc.c
index XXXXXXX..XXXXXXX 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -XXX,XX +XXX,XX @@ static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
     /* Add footer to total size */
     total_size += HEADER_SIZE;
 
-    ret = blk_truncate(blk, total_size, false, PREALLOC_MODE_OFF, errp);
+    ret = blk_truncate(blk, total_size, false, PREALLOC_MODE_OFF, 0, errp);
     if (ret < 0) {
         return ret;
     }
diff --git a/blockdev.c b/blockdev.c
index XXXXXXX..XXXXXXX 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -XXX,XX +XXX,XX @@ void qmp_block_resize(bool has_device, const char *device,
     }
 
     bdrv_drained_begin(bs);
-    ret = blk_truncate(blk, size, false, PREALLOC_MODE_OFF, errp);
+    ret = blk_truncate(blk, size, false, PREALLOC_MODE_OFF, 0, errp);
     bdrv_drained_end(bs);
 
 out:
diff --git a/qemu-img.c b/qemu-img.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -XXX,XX +XXX,XX @@ static int img_resize(int argc, char **argv)
      * resizing, so pass @exact=true.  It is of no use to report
      * success when the image has not actually been resized.
      */
-    ret = blk_truncate(blk, total_size, true, prealloc, &err);
+    ret = blk_truncate(blk, total_size, true, prealloc, 0, &err);
     if (!ret) {
         qprintf(quiet, "Image resized.\n");
     } else {
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -XXX,XX +XXX,XX @@ static int truncate_f(BlockBackend *blk, int argc, char **argv)
      * exact=true.  It is better to err on the "emit more errors" side
      * than to be overly permissive.
      */
-    ret = blk_truncate(blk, offset, true, PREALLOC_MODE_OFF, &local_err);
+    ret = blk_truncate(blk, offset, true, PREALLOC_MODE_OFF, 0, &local_err);
     if (ret < 0) {
         error_report_err(local_err);
         return ret;
-- 
2.25.3

If BDRV_REQ_ZERO_WRITE is set and we're extending the image, calling
qcow2_cluster_zeroize() with flags=0 does the right thing: It doesn't
undo any previous preallocation, but just adds the zero flag to all
relevant L2 entries. If an external data file is in use, a write_zeroes
request to the data file is made instead.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Message-Id: <20200424125448.63318-5-kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-cluster.c |  2 +-
 block/qcow2.c         | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ int qcow2_cluster_zeroize(BlockDriverState *bs, uint64_t offset,
     /* Caller must pass aligned values, except at image end */
     assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
     assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
-           end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
+           end_offset >= bs->total_sectors << BDRV_SECTOR_BITS);
 
     /* The zero flag is only supported by version 3 and newer */
     if (s->qcow_version < 3) {
diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
 
     bs->supported_zero_flags = header.version >= 3 ?
                                BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK : 0;
+    bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
 
     /* Repair image if dirty */
     if (!(flags & (BDRV_O_CHECK | BDRV_O_INACTIVE)) && !bs->read_only &&
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
         g_assert_not_reached();
     }
 
+    if ((flags & BDRV_REQ_ZERO_WRITE) && offset > old_length) {
+        uint64_t zero_start = QEMU_ALIGN_UP(old_length, s->cluster_size);
+
+        /*
+         * Use zero clusters as much as we can. qcow2_cluster_zeroize()
+         * requires a cluster-aligned start. The end may be unaligned if it is
+         * at the end of the image (which it is here).
+         */
+        ret = qcow2_cluster_zeroize(bs, zero_start, offset - zero_start, 0);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to zero out new clusters");
+            goto fail;
+        }
+
+        /* Write explicit zeros for the unaligned head */
+        if (zero_start > old_length) {
+            uint64_t len = zero_start - old_length;
+            uint8_t *buf = qemu_blockalign0(bs, len);
+            QEMUIOVector qiov;
+            qemu_iovec_init_buf(&qiov, buf, len);
+
+            qemu_co_mutex_unlock(&s->lock);
+            ret = qcow2_co_pwritev_part(bs, old_length, len, &qiov, 0, 0);
+            qemu_co_mutex_lock(&s->lock);
+
+            qemu_vfree(buf);
+            if (ret < 0) {
+                error_setg_errno(errp, -ret, "Failed to zero out the new area");
+                goto fail;
+            }
+        }
+    }
+
     if (prealloc != PREALLOC_MODE_OFF) {
         /* Flush metadata before actually changing the image size */
         ret = qcow2_write_caches(bs);
-- 
2.25.3

The raw format driver can simply forward the flag and let its bs->file
child take care of actually providing the zeros.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20200424125448.63318-6-kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/raw-format.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/block/raw-format.c b/block/raw-format.c
index XXXXXXX..XXXXXXX 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
 
     s->size = offset;
     offset += s->offset;
-    return bdrv_co_truncate(bs->file, offset, exact, prealloc, 0, errp);
+    return bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
 }
 
 static void raw_eject(BlockDriverState *bs, bool eject_flag)
@@ -XXX,XX +XXX,XX @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
     bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
         ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
             bs->file->bs->supported_zero_flags);
+    bs->supported_truncate_flags = bs->file->bs->supported_truncate_flags &
+                                   BDRV_REQ_ZERO_WRITE;
 
     if (bs->probed && !bdrv_is_read_only(bs)) {
         bdrv_refresh_filename(bs->file->bs);
-- 
2.25.3

For regular files, we always get BDRV_REQ_ZERO_WRITE behaviour from the
OS, so we can advertise the flag and just ignore it.

diff --git a/block/file-posix.c b/block/file-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -XXX,XX +XXX,XX @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
 #endif
 
     bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
+    if (S_ISREG(st.st_mode)) {
+        /* When extending regular files, we get zeros from the OS */
+        bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
+    }
     ret = 0;
 fail:
     if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
-- 
2.25.3

When extending the size of an image that has a backing file larger than
its old size, make sure that the backing file data doesn't become
visible in the guest, but the added area is properly zeroed out.

Consider the following scenario where the overlay is shorter than its
backing file:

    base.qcow2:     AAAAAAAA
    overlay.qcow2:  BBBB

When resizing (extending) overlay.qcow2, the new blocks must not stay
unallocated, because that would make the additional As from base.qcow2
visible (as happened before this patch). Instead, the new area should
read as zeros.

A similar case happens with the various variants of a commit job when an
intermediate file is short (- for unallocated):

    base.qcow2:     A-A-AAAA
    mid.qcow2:      BB-B
    top.qcow2:      C--C--C-

After committing top.qcow2 to mid.qcow2, the following happens:

    mid.qcow2:      CB-C00C0 (correct result)
    mid.qcow2:      CB-C--C- (before this fix)

Without the fix, blocks that previously read as zeros on top.qcow2
suddenly turn into A.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20200424125448.63318-8-kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/io.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
         goto out;
     }
 
+    /*
+     * If the image has a backing file that is large enough that it would
+     * provide data for the new area, we cannot leave it unallocated because
+     * then the backing file content would become visible. Instead, zero-fill
+     * the new area.
+     *
+     * Note that if the image has a backing file, but was opened without the
+     * backing file, taking care of keeping things consistent with that backing
+     * file is the user's responsibility.
+     */
+    if (new_bytes && bs->backing) {
+        int64_t backing_len;
+
+        backing_len = bdrv_getlength(backing_bs(bs));
+        if (backing_len < 0) {
+            ret = backing_len;
+            error_setg_errno(errp, -ret, "Could not get backing file size");
+            goto out;
+        }
+
+        if (backing_len > old_size) {
+            flags |= BDRV_REQ_ZERO_WRITE;
+        }
+    }
+
     if (drv->bdrv_co_truncate) {
         if (flags & ~bs->supported_truncate_flags) {
             error_setg(errp, "Block driver does not support requested flags");
-- 
2.25.3

We want to keep TEST_IMG for the full path of the main test image, but
filter_testfiles() must be called for other test images before replacing
other things like the image format because the test directory path could
contain the format as a substring.

Insert a filter_testfiles() call between both.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20200424125448.63318-9-kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/iotests.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -XXX,XX +XXX,XX @@ def filter_img_info(output, filename):
     for line in output.split('\n'):
         if 'disk size' in line or 'actual-size' in line:
             continue
-        line = line.replace(filename, 'TEST_IMG') \
-                   .replace(imgfmt, 'IMGFMT')
+        line = line.replace(filename, 'TEST_IMG')
+        line = filter_testfiles(line)
+        line = line.replace(imgfmt, 'IMGFMT')
         line = re.sub('iters: [0-9]+', 'iters: XXX', line)
         line = re.sub('uuid: [-a-f0-9]+', 'uuid: XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX', line)
         line = re.sub('cid: [0-9]+', 'cid: XXXXXXXXXX', line)
-- 
2.25.3

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Message-Id: <20200424125448.63318-10-kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/274     | 155 +++++++++++++++++++++
 tests/qemu-iotests/274.out | 268 +++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/group   |   1 +
 3 files changed, 424 insertions(+)
 create mode 100755 tests/qemu-iotests/274
 create mode 100644 tests/qemu-iotests/274.out

diff --git a/tests/qemu-iotests/274 b/tests/qemu-iotests/274
new file mode 100755
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/qemu-iotests/274
@@ -XXX,XX +XXX,XX @@
+#!/usr/bin/env python3
+#
+# Copyright (C) 2019 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# Creator/Owner: Kevin Wolf <kwolf@redhat.com>
+#
+# Some tests for short backing files and short overlays
+
+import iotests
+
+iotests.verify_image_format(supported_fmts=['qcow2'])
+iotests.verify_platform(['linux'])
+
+size_short = 1 * 1024 * 1024
+size_long = 2 * 1024 * 1024
+size_diff = size_long - size_short
+
+def create_chain() -> None:
+    iotests.qemu_img_log('create', '-f', iotests.imgfmt, base,
+                         str(size_long))
+    iotests.qemu_img_log('create', '-f', iotests.imgfmt, '-b', base, mid,
+                         str(size_short))
+    iotests.qemu_img_log('create', '-f', iotests.imgfmt, '-b', mid, top,
+                         str(size_long))
+
+    iotests.qemu_io_log('-c', 'write -P 1 0 %d' % size_long, base)
+
+def create_vm() -> iotests.VM:
+    vm = iotests.VM()
+    vm.add_blockdev('file,filename=%s,node-name=base-file' % base)
+    vm.add_blockdev('%s,file=base-file,node-name=base' % iotests.imgfmt)
+    vm.add_blockdev('file,filename=%s,node-name=mid-file' % mid)
+    vm.add_blockdev('%s,file=mid-file,node-name=mid,backing=base'
+                    % iotests.imgfmt)
+    vm.add_drive(top, 'backing=mid,node-name=top')
+    return vm
+
+with iotests.FilePath('base') as base, \
+     iotests.FilePath('mid') as mid, \
+     iotests.FilePath('top') as top:
+
+    iotests.log('== Commit tests ==')
+
+    create_chain()
+
+    iotests.log('=== Check visible data ===')
+
+    iotests.qemu_io_log('-c', 'read -P 1 0 %d' % size_short, top)
+    iotests.qemu_io_log('-c', 'read -P 0 %d %d' % (size_short, size_diff), top)
+
+    iotests.log('=== Checking allocation status ===')
+
+    iotests.qemu_io_log('-c', 'alloc 0 %d' % size_short,
+                        '-c', 'alloc %d %d' % (size_short, size_diff),
+                        base)
+
+    iotests.qemu_io_log('-c', 'alloc 0 %d' % size_short,
+                        '-c', 'alloc %d %d' % (size_short, size_diff),
+                        mid)
+
+    iotests.qemu_io_log('-c', 'alloc 0 %d' % size_short,
+                        '-c', 'alloc %d %d' % (size_short, size_diff),
+                        top)
+
+    iotests.log('=== Checking map ===')
+
+    iotests.qemu_img_log('map', '--output=json', base)
+    iotests.qemu_img_log('map', '--output=human', base)
+    iotests.qemu_img_log('map', '--output=json', mid)
+    iotests.qemu_img_log('map', '--output=human', mid)
+    iotests.qemu_img_log('map', '--output=json', top)
+    iotests.qemu_img_log('map', '--output=human', top)
+
+    iotests.log('=== Testing qemu-img commit (top -> mid) ===')
+
+    iotests.qemu_img_log('commit', top)
+    iotests.img_info_log(mid)
+    iotests.qemu_io_log('-c', 'read -P 1 0 %d' % size_short, mid)
+    iotests.qemu_io_log('-c', 'read -P 0 %d %d' % (size_short, size_diff), mid)
+
+    iotests.log('=== Testing HMP commit (top -> mid) ===')
+
+    create_chain()
+    with create_vm() as vm:
+        vm.launch()
+        vm.qmp_log('human-monitor-command', command_line='commit drive0')
+
+    iotests.img_info_log(mid)
+    iotests.qemu_io_log('-c', 'read -P 1 0 %d' % size_short, mid)
+    iotests.qemu_io_log('-c', 'read -P 0 %d %d' % (size_short, size_diff), mid)
+
+    iotests.log('=== Testing QMP active commit (top -> mid) ===')
+
+    create_chain()
+    with create_vm() as vm:
+        vm.launch()
+        vm.qmp_log('block-commit', device='top', base_node='mid',
+                   job_id='job0', auto_dismiss=False)
+        vm.run_job('job0', wait=5)
+
+    iotests.img_info_log(mid)
+    iotests.qemu_io_log('-c', 'read -P 1 0 %d' % size_short, mid)
+    iotests.qemu_io_log('-c', 'read -P 0 %d %d' % (size_short, size_diff), mid)
+
+
+    iotests.log('== Resize tests ==')
+
+    # Use different sizes for different allocation modes:
+    #
+    # We want to have at least one test where 32 bit truncation in the size of
+    # the overlapping area becomes visible. This is covered by the
+    # prealloc='off' case (1G to 6G is an overlap of 5G).
+    #
+    # However, we can only do this for modes that don't preallocate data
+    # because otherwise we might run out of space on the test host.
+    #
+    # We also want to test some unaligned combinations.
+    for (prealloc, base_size, top_size_old, top_size_new, off) in [
+            ('off',       '6G',    '1G',   '8G',   '5G'),
+            ('metadata', '32G',   '30G',  '33G',  '31G'),
+            ('falloc',   '10M',    '5M',  '15M',   '9M'),
+            ('full',     '16M',    '8M',  '12M',  '11M'),
+            ('off',      '384k', '253k', '512k', '253k'),
+            ('off',      '400k', '256k', '512k', '336k'),
+            ('off',      '512k', '256k', '500k', '436k')]:
+
+        iotests.log('=== preallocation=%s ===' % prealloc)
+        iotests.qemu_img_log('create', '-f', iotests.imgfmt, base, base_size)
+        iotests.qemu_img_log('create', '-f', iotests.imgfmt, '-b', base, top,
+                             top_size_old)
+        iotests.qemu_io_log('-c', 'write -P 1 %s 64k' % off, base)
+
+        # After this, top_size_old to base_size should be allocated/zeroed.
+        #
+        # In theory, leaving base_size to top_size_new unallocated would be
+        # correct, but in practice, if we zero out anything, we zero out
+        # everything up to top_size_new.
+        iotests.qemu_img_log('resize', '-f', iotests.imgfmt,
+                             '--preallocation', prealloc, top, top_size_new)
+        iotests.qemu_io_log('-c', 'read -P 0 %s 64k' % off, top)
+        iotests.qemu_io_log('-c', 'map', top)
+        iotests.qemu_img_log('map', '--output=json', top)
diff --git a/tests/qemu-iotests/274.out b/tests/qemu-iotests/274.out
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/qemu-iotests/274.out
@@ -XXX,XX +XXX,XX @@
+== Commit tests ==
+Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=2097152 cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+Formatting 'TEST_DIR/PID-mid', fmt=qcow2 size=1048576 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=2097152 backing_file=TEST_DIR/PID-mid cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+wrote 2097152/2097152 bytes at offset 0
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+=== Check visible data ===
+read 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+read 1048576/1048576 bytes at offset 1048576
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+=== Checking allocation status ===
+1048576/1048576 bytes allocated at offset 0 bytes
+1048576/1048576 bytes allocated at offset 1 MiB
+
+0/1048576 bytes allocated at offset 0 bytes
+0/0 bytes allocated at offset 1 MiB
+
+0/1048576 bytes allocated at offset 0 bytes
+0/1048576 bytes allocated at offset 1 MiB
+
+=== Checking map ===
+[{ "start": 0, "length": 2097152, "depth": 0, "zero": false, "data": true, "offset": 327680}]
+
+Offset          Length          Mapped to       File
+0               0x200000        0x50000         TEST_DIR/PID-base
+
+[{ "start": 0, "length": 1048576, "depth": 1, "zero": false, "data": true, "offset": 327680}]
+
+Offset          Length          Mapped to       File
+0               0x100000        0x50000         TEST_DIR/PID-base
+
+[{ "start": 0, "length": 1048576, "depth": 2, "zero": false, "data": true, "offset": 327680},
+{ "start": 1048576, "length": 1048576, "depth": 0, "zero": true, "data": false}]
+
+Offset          Length          Mapped to       File
+0               0x100000        0x50000         TEST_DIR/PID-base
+
+=== Testing qemu-img commit (top -> mid) ===
+Image committed.
+
+image: TEST_IMG
+file format: IMGFMT
+virtual size: 2 MiB (2097152 bytes)
+cluster_size: 65536
+backing file: TEST_DIR/PID-base
+Format specific information:
+    compat: 1.1
+    lazy refcounts: false
+    refcount bits: 16
+    corrupt: false
+
+read 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+read 1048576/1048576 bytes at offset 1048576
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+=== Testing HMP commit (top -> mid) ===
+Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=2097152 cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+Formatting 'TEST_DIR/PID-mid', fmt=qcow2 size=1048576 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=2097152 backing_file=TEST_DIR/PID-mid cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+wrote 2097152/2097152 bytes at offset 0
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+{"execute": "human-monitor-command", "arguments": {"command-line": "commit drive0"}}
+{"return": ""}
+image: TEST_IMG
+file format: IMGFMT
+virtual size: 2 MiB (2097152 bytes)
+cluster_size: 65536
+backing file: TEST_DIR/PID-base
+Format specific information:
+    compat: 1.1
+    lazy refcounts: false
+    refcount bits: 16
+    corrupt: false
+
+read 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+read 1048576/1048576 bytes at offset 1048576
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+=== Testing QMP active commit (top -> mid) ===
+Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=2097152 cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+Formatting 'TEST_DIR/PID-mid', fmt=qcow2 size=1048576 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=2097152 backing_file=TEST_DIR/PID-mid cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+wrote 2097152/2097152 bytes at offset 0
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+{"execute": "block-commit", "arguments": {"auto-dismiss": false, "base-node": "mid", "device": "top", "job-id": "job0"}}
+{"return": {}}
+{"execute": "job-complete", "arguments": {"id": "job0"}}
+{"return": {}}
+{"data": {"device": "job0", "len": 0, "offset": 0, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_READY", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
+{"data": {"device": "job0", "len": 0, "offset": 0, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
+{"execute": "job-dismiss", "arguments": {"id": "job0"}}
+{"return": {}}
+image: TEST_IMG
+file format: IMGFMT
+virtual size: 2 MiB (2097152 bytes)
+cluster_size: 65536
+backing file: TEST_DIR/PID-base
+Format specific information:
+    compat: 1.1
+    lazy refcounts: false
+    refcount bits: 16
+    corrupt: false
+
+read 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+read 1048576/1048576 bytes at offset 1048576
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== Resize tests ==
+=== preallocation=off ===
+Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=6442450944 cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=1073741824 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+wrote 65536/65536 bytes at offset 5368709120
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+Image resized.
+
+read 65536/65536 bytes at offset 5368709120
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+1 GiB (0x40000000) bytes not allocated at offset 0 bytes (0x0)
+7 GiB (0x1c0000000) bytes     allocated at offset 1 GiB (0x40000000)
+
+[{ "start": 0, "length": 1073741824, "depth": 1, "zero": true, "data": false},
+{ "start": 1073741824, "length": 7516192768, "depth": 0, "zero": true, "data": false}]
+
+=== preallocation=metadata ===
+Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=34359738368 cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=32212254720 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+wrote 65536/65536 bytes at offset 33285996544
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+Image resized.
+
+read 65536/65536 bytes at offset 33285996544
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+30 GiB (0x780000000) bytes not allocated at offset 0 bytes (0x0)
+3 GiB (0xc0000000) bytes     allocated at offset 30 GiB (0x780000000)
+
+[{ "start": 0, "length": 32212254720, "depth": 1, "zero": true, "data": false},
+{ "start": 32212254720, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 327680},
+{ "start": 32749125632, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 537264128},
+{ "start": 33285996544, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 1074200576},
+{ "start": 33822867456, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 1611137024},
+{ "start": 34359738368, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 2148139008},
+{ "start": 34896609280, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 2685075456}]
+
+=== preallocation=falloc ===
+Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=10485760 cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=5242880 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+wrote 65536/65536 bytes at offset 9437184
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+Image resized.
+
+read 65536/65536 bytes at offset 9437184
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+5 MiB (0x500000) bytes not allocated at offset 0 bytes (0x0)
+10 MiB (0xa00000) bytes     allocated at offset 5 MiB (0x500000)
+
+[{ "start": 0, "length": 5242880, "depth": 1, "zero": true, "data": false},
+{ "start": 5242880, "length": 10485760, "depth": 0, "zero": true, "data": false, "offset": 327680}]
+
+=== preallocation=full ===
+Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=16777216 cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=8388608 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+wrote 65536/65536 bytes at offset 11534336
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+Image resized.
+
+read 65536/65536 bytes at offset 11534336
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+8 MiB (0x800000) bytes not allocated at offset 0 bytes (0x0)
+4 MiB (0x400000) bytes     allocated at offset 8 MiB (0x800000)
+
+[{ "start": 0, "length": 8388608, "depth": 1, "zero": true, "data": false},
+{ "start": 8388608, "length": 4194304, "depth": 0, "zero": true, "data": false, "offset": 327680}]
+
+=== preallocation=off ===
+Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=393216 cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=259072 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+wrote 65536/65536 bytes at offset 259072
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+Image resized.
+
+read 65536/65536 bytes at offset 259072
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+192 KiB (0x30000) bytes not allocated at offset 0 bytes (0x0)
+320 KiB (0x50000) bytes     allocated at offset 192 KiB (0x30000)
+
+[{ "start": 0, "length": 196608, "depth": 1, "zero": true, "data": false},
+{ "start": 196608, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680},
+{ "start": 262144, "length": 262144, "depth": 0, "zero": true, "data": false}]
+
+=== preallocation=off ===
+Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=409600 cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=262144 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+wrote 65536/65536 bytes at offset 344064
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+Image resized.
+
+read 65536/65536 bytes at offset 344064
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+256 KiB (0x40000) bytes not allocated at offset 0 bytes (0x0)
+256 KiB (0x40000) bytes     allocated at offset 256 KiB (0x40000)
+
+[{ "start": 0, "length": 262144, "depth": 1, "zero": true, "data": false},
+{ "start": 262144, "length": 262144, "depth": 0, "zero": true, "data": false}]
+
+=== preallocation=off ===
+Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=524288 cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=262144 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+wrote 65536/65536 bytes at offset 446464
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+Image resized.
+
+read 65536/65536 bytes at offset 446464
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+256 KiB (0x40000) bytes not allocated at offset 0 bytes (0x0)
+244 KiB (0x3d000) bytes     allocated at offset 256 KiB (0x40000)
+
+[{ "start": 0, "length": 262144, "depth": 1, "zero": true, "data": false},
+{ "start": 262144, "length": 249856, "depth": 0, "zero": true, "data": false}]
+
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -XXX,XX +XXX,XX @@
 270 rw backing quick
 272 rw
 273 backing quick
+274 rw backing
 277 rw quick
 279 rw backing quick
 280 rw migration quick
-- 
2.25.3

The BDRV_REQ_ZERO_WRITE flag is currently implemented such that the
image is first possibly preallocated and then the zero flag is added to
all clusters. This means that a copy-on-write operation may be needed
when writing to these clusters, despite having used preallocation,
negating one of the major benefits of preallocation.

Instead, try to forward the BDRV_REQ_ZERO_WRITE to the protocol driver,
and if the protocol driver can ensure that the new area reads as zeros,
we can skip setting the zero flag in the qcow2 layer.

Unfortunately, the same approach doesn't work for metadata
preallocation, so we'll still set the zero flag there.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-Id: <20200424142701.67053-1-kwolf@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2.c              | 22 +++++++++++++++++++---
 tests/qemu-iotests/274.out |  4 ++--
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
         /* Allocate the data area */
         new_file_size = allocation_start +
                         nb_new_data_clusters * s->cluster_size;
-        /* Image file grows, so @exact does not matter */
-        ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, 0,
-                               errp);
+        /*
+         * Image file grows, so @exact does not matter.
+         *
+         * If we need to zero out the new area, try first whether the protocol
+         * driver can already take care of this.
+         */
+        if (flags & BDRV_REQ_ZERO_WRITE) {
+            ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc,
+                                   BDRV_REQ_ZERO_WRITE, NULL);
+            if (ret >= 0) {
+                flags &= ~BDRV_REQ_ZERO_WRITE;
+            }
+        } else {
+            ret = -1;
+        }
+        if (ret < 0) {
+            ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, 0,
+                                   errp);
+        }
         if (ret < 0) {
             error_prepend(errp, "Failed to resize underlying file: ");
             qcow2_free_clusters(bs, allocation_start,
diff --git a/tests/qemu-iotests/274.out b/tests/qemu-iotests/274.out
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/274.out
+++ b/tests/qemu-iotests/274.out
@@ -XXX,XX +XXX,XX @@ read 65536/65536 bytes at offset 9437184
 10 MiB (0xa00000) bytes     allocated at offset 5 MiB (0x500000)
 
 [{ "start": 0, "length": 5242880, "depth": 1, "zero": true, "data": false},
-{ "start": 5242880, "length": 10485760, "depth": 0, "zero": true, "data": false, "offset": 327680}]
+{ "start": 5242880, "length": 10485760, "depth": 0, "zero": false, "data": true, "offset": 327680}]
 
 === preallocation=full ===
 Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=16777216 cluster_size=65536 lazy_refcounts=off refcount_bits=16
@@ -XXX,XX +XXX,XX @@ read 65536/65536 bytes at offset 11534336
 4 MiB (0x400000) bytes     allocated at offset 8 MiB (0x800000)
 
 [{ "start": 0, "length": 8388608, "depth": 1, "zero": true, "data": false},
-{ "start": 8388608, "length": 4194304, "depth": 0, "zero": true, "data": false, "offset": 327680}]
+{ "start": 8388608, "length": 4194304, "depth": 0, "zero": false, "data": true, "offset": 327680}]
 
 === preallocation=off ===
 Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=393216 cluster_size=65536 lazy_refcounts=off refcount_bits=16
-- 
2.25.3

From: Andrzej Jakowski <andrzej.jakowski@linux.intel.com>

This patch introduces support for the Persistent Memory Region (PMR) that
has been defined as part of the NVMe 1.4 spec. The user can now specify a
pmrdev option that should point to a HostMemoryBackend. The pmrdev memory
region will subsequently be exposed as PCI BAR 2 in the emulated NVMe
device. The guest OS can perform MMIO reads and writes to the PMR region,
whose contents will stay persistent across system reboots.

Signed-off-by: Andrzej Jakowski <andrzej.jakowski@linux.intel.com>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20200330164656.9348-1-andrzej.jakowski@linux.intel.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/block/nvme.h        |   2 +
 include/block/nvme.h   | 172 +++++++++++++++++++++++++++++++++++++++++
 hw/block/nvme.c        | 109 ++++++++++++++++++++++++++
 hw/block/Makefile.objs |   2 +-
 hw/block/trace-events  |   4 +
 5 files changed, 288 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -XXX,XX +XXX,XX @@ typedef struct NvmeCtrl {
     uint64_t    timestamp_set_qemu_clock_ms;    /* QEMU clock time */
 
     char            *serial;
+    HostMemoryBackend *pmrdev;
+
     NvmeNamespace   *namespaces;
     NvmeSQueue      **sq;
     NvmeCQueue      **cq;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -XXX,XX +XXX,XX @@ typedef struct NvmeBar {
     uint64_t    acq;
     uint32_t    cmbloc;
     uint32_t    cmbsz;
+    uint8_t     padding[3520]; /* not used by QEMU */
+    uint32_t    pmrcap;
+    uint32_t    pmrctl;
+    uint32_t    pmrsts;
+    uint32_t    pmrebs;
+    uint32_t    pmrswtp;
+    uint32_t    pmrmsc;
 } NvmeBar;
 
 enum NvmeCapShift {
@@ -XXX,XX +XXX,XX @@ enum NvmeCapShift {
     CAP_CSS_SHIFT      = 37,
     CAP_MPSMIN_SHIFT   = 48,
     CAP_MPSMAX_SHIFT   = 52,
+    CAP_PMR_SHIFT      = 56,
 };
 
 enum NvmeCapMask {
@@ -XXX,XX +XXX,XX @@ enum NvmeCapMask {
     CAP_CSS_MASK       = 0xff,
     CAP_MPSMIN_MASK    = 0xf,
     CAP_MPSMAX_MASK    = 0xf,
+    CAP_PMR_MASK       = 0x1,
 };
 
 #define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
@@ -XXX,XX +XXX,XX @@ enum NvmeCapMask {
                                                            << CAP_MPSMIN_SHIFT)
 #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
                                                             << CAP_MPSMAX_SHIFT)
+#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\
+                                                            << CAP_PMR_SHIFT)
 
 enum NvmeCcShift {
     CC_EN_SHIFT     = 0,
@@ -XXX,XX +XXX,XX @@ enum NvmeCmbszMask {
 #define NVME_CMBSZ_GETSIZE(cmbsz) \
     (NVME_CMBSZ_SZ(cmbsz) * (1 << (12 + 4 * NVME_CMBSZ_SZU(cmbsz))))
 
+enum NvmePmrcapShift {
+    PMRCAP_RDS_SHIFT      = 3,
+    PMRCAP_WDS_SHIFT      = 4,
+    PMRCAP_BIR_SHIFT      = 5,
+    PMRCAP_PMRTU_SHIFT    = 8,
+    PMRCAP_PMRWBM_SHIFT   = 10,
+    PMRCAP_PMRTO_SHIFT    = 16,
+    PMRCAP_CMSS_SHIFT     = 24,
+};
+
+enum NvmePmrcapMask {
+    PMRCAP_RDS_MASK      = 0x1,
+    PMRCAP_WDS_MASK      = 0x1,
+    PMRCAP_BIR_MASK      = 0x7,
+    PMRCAP_PMRTU_MASK    = 0x3,
+    PMRCAP_PMRWBM_MASK   = 0xf,
+    PMRCAP_PMRTO_MASK    = 0xff,
+    PMRCAP_CMSS_MASK     = 0x1,
+};
+
+#define NVME_PMRCAP_RDS(pmrcap)    \
+    ((pmrcap >> PMRCAP_RDS_SHIFT)   & PMRCAP_RDS_MASK)
+#define NVME_PMRCAP_WDS(pmrcap)    \
+    ((pmrcap >> PMRCAP_WDS_SHIFT)   & PMRCAP_WDS_MASK)
+#define NVME_PMRCAP_BIR(pmrcap)    \
+    ((pmrcap >> PMRCAP_BIR_SHIFT)   & PMRCAP_BIR_MASK)
+#define NVME_PMRCAP_PMRTU(pmrcap)    \
+    ((pmrcap >> PMRCAP_PMRTU_SHIFT)   & PMRCAP_PMRTU_MASK)
+#define NVME_PMRCAP_PMRWBM(pmrcap)    \
+    ((pmrcap >> PMRCAP_PMRWBM_SHIFT)   & PMRCAP_PMRWBM_MASK)
+#define NVME_PMRCAP_PMRTO(pmrcap)    \
+    ((pmrcap >> PMRCAP_PMRTO_SHIFT)   & PMRCAP_PMRTO_MASK)
+#define NVME_PMRCAP_CMSS(pmrcap)    \
+    ((pmrcap >> PMRCAP_CMSS_SHIFT)   & PMRCAP_CMSS_MASK)
+
+#define NVME_PMRCAP_SET_RDS(pmrcap, val)   \
+    (pmrcap |= (uint64_t)(val & PMRCAP_RDS_MASK) << PMRCAP_RDS_SHIFT)
+#define NVME_PMRCAP_SET_WDS(pmrcap, val)   \
+    (pmrcap |= (uint64_t)(val & PMRCAP_WDS_MASK) << PMRCAP_WDS_SHIFT)
+#define NVME_PMRCAP_SET_BIR(pmrcap, val)   \
+    (pmrcap |= (uint64_t)(val & PMRCAP_BIR_MASK) << PMRCAP_BIR_SHIFT)
+#define NVME_PMRCAP_SET_PMRTU(pmrcap, val)   \
+    (pmrcap |= (uint64_t)(val & PMRCAP_PMRTU_MASK) << PMRCAP_PMRTU_SHIFT)
+#define NVME_PMRCAP_SET_PMRWBM(pmrcap, val)   \
+    (pmrcap |= (uint64_t)(val & PMRCAP_PMRWBM_MASK) << PMRCAP_PMRWBM_SHIFT)
+#define NVME_PMRCAP_SET_PMRTO(pmrcap, val)   \
+    (pmrcap |= (uint64_t)(val & PMRCAP_PMRTO_MASK) << PMRCAP_PMRTO_SHIFT)
+#define NVME_PMRCAP_SET_CMSS(pmrcap, val)   \
+    (pmrcap |= (uint64_t)(val & PMRCAP_CMSS_MASK) << PMRCAP_CMSS_SHIFT)
+
+enum NvmePmrctlShift {
+    PMRCTL_EN_SHIFT   = 0,
+};
+
+enum NvmePmrctlMask {
+    PMRCTL_EN_MASK   = 0x1,
+};
+
+#define NVME_PMRCTL_EN(pmrctl)  ((pmrctl >> PMRCTL_EN_SHIFT)   & PMRCTL_EN_MASK)
+
+#define NVME_PMRCTL_SET_EN(pmrctl, val)   \
+    (pmrctl |= (uint64_t)(val & PMRCTL_EN_MASK) << PMRCTL_EN_SHIFT)
+
+enum NvmePmrstsShift {
+    PMRSTS_ERR_SHIFT    = 0,
+    PMRSTS_NRDY_SHIFT   = 8,
+    PMRSTS_HSTS_SHIFT   = 9,
+    PMRSTS_CBAI_SHIFT   = 12,
+};
+
+enum NvmePmrstsMask {
+    PMRSTS_ERR_MASK    = 0xff,
+    PMRSTS_NRDY_MASK   = 0x1,
+    PMRSTS_HSTS_MASK   = 0x7,
+    PMRSTS_CBAI_MASK   = 0x1,
+};
+
+#define NVME_PMRSTS_ERR(pmrsts)     \
+    ((pmrsts >> PMRSTS_ERR_SHIFT)   & PMRSTS_ERR_MASK)
+#define NVME_PMRSTS_NRDY(pmrsts)    \
+    ((pmrsts >> PMRSTS_NRDY_SHIFT)   & PMRSTS_NRDY_MASK)
+#define NVME_PMRSTS_HSTS(pmrsts)    \
+    ((pmrsts >> PMRSTS_HSTS_SHIFT)   & PMRSTS_HSTS_MASK)
+#define NVME_PMRSTS_CBAI(pmrsts)    \
+    ((pmrsts >> PMRSTS_CBAI_SHIFT)   & PMRSTS_CBAI_MASK)
+
+#define NVME_PMRSTS_SET_ERR(pmrsts, val)   \
+    (pmrsts |= (uint64_t)(val & PMRSTS_ERR_MASK) << PMRSTS_ERR_SHIFT)
+#define NVME_PMRSTS_SET_NRDY(pmrsts, val)   \
+    (pmrsts |= (uint64_t)(val & PMRSTS_NRDY_MASK) << PMRSTS_NRDY_SHIFT)
+#define NVME_PMRSTS_SET_HSTS(pmrsts, val)   \
+    (pmrsts |= (uint64_t)(val & PMRSTS_HSTS_MASK) << PMRSTS_HSTS_SHIFT)
+#define NVME_PMRSTS_SET_CBAI(pmrsts, val)   \
+    (pmrsts |= (uint64_t)(val & PMRSTS_CBAI_MASK) << PMRSTS_CBAI_SHIFT)
+
+enum NvmePmrebsShift {
+    PMREBS_PMRSZU_SHIFT   = 0,
+    PMREBS_RBB_SHIFT      = 4,
+    PMREBS_PMRWBZ_SHIFT   = 8,
+};
+
+enum NvmePmrebsMask {
+    PMREBS_PMRSZU_MASK   = 0xf,
+    PMREBS_RBB_MASK      = 0x1,
+    PMREBS_PMRWBZ_MASK   = 0xffffff,
+};
+
+#define NVME_PMREBS_PMRSZU(pmrebs)  \
+    ((pmrebs >> PMREBS_PMRSZU_SHIFT)   & PMREBS_PMRSZU_MASK)
+#define NVME_PMREBS_RBB(pmrebs)     \
+    ((pmrebs >> PMREBS_RBB_SHIFT)   & PMREBS_RBB_MASK)
+#define NVME_PMREBS_PMRWBZ(pmrebs)  \
+    ((pmrebs >> PMREBS_PMRWBZ_SHIFT)   & PMREBS_PMRWBZ_MASK)
+
+#define NVME_PMREBS_SET_PMRSZU(pmrebs, val)   \
+    (pmrebs |= (uint64_t)(val & PMREBS_PMRSZU_MASK) << PMREBS_PMRSZU_SHIFT)
+#define NVME_PMREBS_SET_RBB(pmrebs, val)   \
+    (pmrebs |= (uint64_t)(val & PMREBS_RBB_MASK) << PMREBS_RBB_SHIFT)
+#define NVME_PMREBS_SET_PMRWBZ(pmrebs, val)   \
+    (pmrebs |= (uint64_t)(val & PMREBS_PMRWBZ_MASK) << PMREBS_PMRWBZ_SHIFT)
+
+enum NvmePmrswtpShift {
+    PMRSWTP_PMRSWTU_SHIFT   = 0,
+    PMRSWTP_PMRSWTV_SHIFT   = 8,
+};
+
+enum NvmePmrswtpMask {
+    PMRSWTP_PMRSWTU_MASK   = 0xf,
+    PMRSWTP_PMRSWTV_MASK   = 0xffffff,
+};
+
+#define NVME_PMRSWTP_PMRSWTU(pmrswtp)   \
+    ((pmrswtp >> PMRSWTP_PMRSWTU_SHIFT)   & PMRSWTP_PMRSWTU_MASK)
+#define NVME_PMRSWTP_PMRSWTV(pmrswtp)   \
+    ((pmrswtp >> PMRSWTP_PMRSWTV_SHIFT)   & PMRSWTP_PMRSWTV_MASK)
+
+#define NVME_PMRSWTP_SET_PMRSWTU(pmrswtp, val)   \
+    (pmrswtp |= (uint64_t)(val & PMRSWTP_PMRSWTU_MASK) << PMRSWTP_PMRSWTU_SHIFT)
+#define NVME_PMRSWTP_SET_PMRSWTV(pmrswtp, val)   \
+    (pmrswtp |= (uint64_t)(val & PMRSWTP_PMRSWTV_MASK) << PMRSWTP_PMRSWTV_SHIFT)
+
+enum NvmePmrmscShift {
+    PMRMSC_CMSE_SHIFT   = 1,
+    PMRMSC_CBA_SHIFT    = 12,
+};
+
+enum NvmePmrmscMask {
+    PMRMSC_CMSE_MASK   = 0x1,
+    PMRMSC_CBA_MASK    = 0xfffffffffffff,
+};
+
+#define NVME_PMRMSC_CMSE(pmrmsc)    \
+    ((pmrmsc >> PMRMSC_CMSE_SHIFT)   & PMRMSC_CMSE_MASK)
+#define NVME_PMRMSC_CBA(pmrmsc)     \
+    ((pmrmsc >> PMRMSC_CBA_SHIFT)   & PMRMSC_CBA_MASK)
+
+#define NVME_PMRMSC_SET_CMSE(pmrmsc, val)   \
+    (pmrmsc |= (uint64_t)(val & PMRMSC_CMSE_MASK) << PMRMSC_CMSE_SHIFT)
+#define NVME_PMRMSC_SET_CBA(pmrmsc, val)   \
+    (pmrmsc |= (uint64_t)(val & PMRMSC_CBA_MASK) << PMRMSC_CBA_SHIFT)
+
 typedef struct NvmeCmd {
     uint8_t     opcode;
     uint8_t     fuse;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -XXX,XX +XXX,XX @@
  *      -drive file=<file>,if=none,id=<drive_id>
  *      -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>, \
  *              cmb_size_mb=<cmb_size_mb[optional]>, \
+ *              [pmrdev=<mem_backend_file_id>,] \
  *              num_queues=<N[optional]>
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ *
+ * cmb_size_mb= and pmrdev= options are mutually exclusive due to the limited
+ * number of available BARs. cmb_size_mb= takes precedence over pmrdev= when
+ * both are provided.
+ * PMR emulation is enabled by pointing pmrdev= to a memory-backend-file object.
+ * For example:
+ * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
+ *  size=<size> .... -device nvme,...,pmrdev=<mem_id>
  */
 
 #include "qemu/osdep.h"
@@ -XXX,XX +XXX,XX @@
 #include "sysemu/sysemu.h"
 #include "qapi/error.h"
 #include "qapi/visitor.h"
+#include "sysemu/hostmem.h"
 #include "sysemu/block-backend.h"
+#include "exec/ram_addr.h"
 
 #include "qemu/log.h"
 #include "qemu/module.h"
@@ -XXX,XX +XXX,XX @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
         NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
                        "invalid write to read only CMBSZ, ignored");
         return;
+    case 0xE00: /* PMRCAP */
+        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrcap_readonly,
+                       "invalid write to PMRCAP register, ignored");
+        return;
+    case 0xE04: /* TODO PMRCTL */
+        break;
+    case 0xE08: /* PMRSTS */
+        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrsts_readonly,
+                       "invalid write to PMRSTS register, ignored");
+        return;
+    case 0xE0C: /* PMREBS */
+        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrebs_readonly,
+                       "invalid write to PMREBS register, ignored");
+        return;
+    case 0xE10: /* PMRSWTP */
+        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrswtp_readonly,
+                       "invalid write to PMRSWTP register, ignored");
+        return;
+    case 0xE14: /* TODO PMRMSC */
+         break;
     default:
         NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
                        "invalid MMIO write,"
@@ -XXX,XX +XXX,XX @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
     }
 
     if (addr < sizeof(n->bar)) {
+        /*
+         * When PMRWBM bit 1 is set, a read
+         * from PMRSTS should ensure prior writes
+         * made it to persistent media
+         */
+        if (addr == 0xE08 &&
+            (NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02)) {
+            qemu_ram_writeback(n->pmrdev->mr.ram_block,
+                               0, n->pmrdev->size);
+        }
         memcpy(&val, ptr + addr, size);
     } else {
         NVME_GUEST_ERR(nvme_ub_mmiord_invalid_ofs,
@@ -XXX,XX +XXX,XX @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
         error_setg(errp, "serial property not set");
         return;
     }
+
+    if (!n->cmb_size_mb && n->pmrdev) {
+        if (host_memory_backend_is_mapped(n->pmrdev)) {
+            char *path = object_get_canonical_path_component(OBJECT(n->pmrdev));
+            error_setg(errp, "can't use already busy memdev: %s", path);
+            g_free(path);
+            return;
+        }
+
+        if (!is_power_of_2(n->pmrdev->size)) {
+            error_setg(errp, "pmr backend size needs to be power of 2 in size");
+            return;
+        }
+
+        host_memory_backend_set_mapped(n->pmrdev, true);
+    }
+
     blkconf_blocksizes(&n->conf);
     if (!blkconf_apply_backend_options(&n->conf, blk_is_read_only(n->conf.blk),
                                        false, errp)) {
@@ -XXX,XX +XXX,XX @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
             PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 |
             PCI_BASE_ADDRESS_MEM_PREFETCH, &n->ctrl_mem);
 
+    } else if (n->pmrdev) {
+        /* Controller Capabilities register */
+        NVME_CAP_SET_PMRS(n->bar.cap, 1);
+
+        /* PMR Capabilities register */
+        n->bar.pmrcap = 0;
+        NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 0);
+        NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 0);
+        NVME_PMRCAP_SET_BIR(n->bar.pmrcap, 2);
+        NVME_PMRCAP_SET_PMRTU(n->bar.pmrcap, 0);
+        /* Turn on bit 1 support */
+        NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02);
+        NVME_PMRCAP_SET_PMRTO(n->bar.pmrcap, 0);
+        NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 0);
+
+        /* PMR Control register */
+        n->bar.pmrctl = 0;
+        NVME_PMRCTL_SET_EN(n->bar.pmrctl, 0);
+
+        /* PMR Status register */
+        n->bar.pmrsts = 0;
+        NVME_PMRSTS_SET_ERR(n->bar.pmrsts, 0);
+        NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 0);
+        NVME_PMRSTS_SET_HSTS(n->bar.pmrsts, 0);
+        NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 0);
+
+        /* PMR Elasticity Buffer Size register */
+        n->bar.pmrebs = 0;
+        NVME_PMREBS_SET_PMRSZU(n->bar.pmrebs, 0);
+        NVME_PMREBS_SET_RBB(n->bar.pmrebs, 0);
+        NVME_PMREBS_SET_PMRWBZ(n->bar.pmrebs, 0);
+
+        /* PMR Sustained Write Throughput register */
+        n->bar.pmrswtp = 0;
+        NVME_PMRSWTP_SET_PMRSWTU(n->bar.pmrswtp, 0);
+        NVME_PMRSWTP_SET_PMRSWTV(n->bar.pmrswtp, 0);
+
+        /* PMR Memory Space Control register */
+        n->bar.pmrmsc = 0;
+        NVME_PMRMSC_SET_CMSE(n->bar.pmrmsc, 0);
+        NVME_PMRMSC_SET_CBA(n->bar.pmrmsc, 0);
+
+        pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap),
+            PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 |
+            PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmrdev->mr);
     }
 
     for (i = 0; i < n->num_namespaces; i++) {
@@ -XXX,XX +XXX,XX @@ static void nvme_exit(PCIDevice *pci_dev)
     if (n->cmb_size_mb) {
         g_free(n->cmbuf);
     }
+
+    if (n->pmrdev) {
+        host_memory_backend_set_mapped(n->pmrdev, false);
+    }
     msix_uninit_exclusive_bar(pci_dev);
 }
 
 static Property nvme_props[] = {
     DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf),
+    DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmrdev, TYPE_MEMORY_BACKEND,
+                     HostMemoryBackend *),
     DEFINE_PROP_STRING("serial", NvmeCtrl, serial),
     DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, cmb_size_mb, 0),
     DEFINE_PROP_UINT32("num_queues", NvmeCtrl, num_queues, 64),
diff --git a/hw/block/Makefile.objs b/hw/block/Makefile.objs
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/Makefile.objs
+++ b/hw/block/Makefile.objs
@@ -XXX,XX +XXX,XX @@ common-obj-$(CONFIG_PFLASH_CFI02) += pflash_cfi02.o
 common-obj-$(CONFIG_XEN) += xen-block.o
 common-obj-$(CONFIG_ECC) += ecc.o
 common-obj-$(CONFIG_ONENAND) += onenand.o
-common-obj-$(CONFIG_NVME_PCI) += nvme.o
 common-obj-$(CONFIG_SWIM) += swim.o
 
 common-obj-$(CONFIG_SH4) += tc58128.o
 
 obj-$(CONFIG_VIRTIO_BLK) += virtio-blk.o
 obj-$(CONFIG_VHOST_USER_BLK) += vhost-user-blk.o
+obj-$(CONFIG_NVME_PCI) += nvme.o
 
 obj-y += dataplane/
diff --git a/hw/block/trace-events b/hw/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -XXX,XX +XXX,XX @@ nvme_ub_mmiowr_ssreset_w1c_unsupported(void) "attempted to W1C CSTS.NSSRO but CA
 nvme_ub_mmiowr_ssreset_unsupported(void) "attempted NVM subsystem reset but CAP.NSSRS is zero (not supported)"
 nvme_ub_mmiowr_cmbloc_reserved(void) "invalid write to reserved CMBLOC when CMBSZ is zero, ignored"
 nvme_ub_mmiowr_cmbsz_readonly(void) "invalid write to read only CMBSZ, ignored"
+nvme_ub_mmiowr_pmrcap_readonly(void) "invalid write to read only PMRCAP, ignored"
+nvme_ub_mmiowr_pmrsts_readonly(void) "invalid write to read only PMRSTS, ignored"
+nvme_ub_mmiowr_pmrebs_readonly(void) "invalid write to read only PMREBS, ignored"
+nvme_ub_mmiowr_pmrswtp_readonly(void) "invalid write to read only PMRSWTP, ignored"
 nvme_ub_mmiowr_invalid(uint64_t offset, uint64_t data) "invalid MMIO write, offset=0x%"PRIx64", data=0x%"PRIx64""
 nvme_ub_mmiord_misaligned32(uint64_t offset) "MMIO read not 32-bit aligned, offset=0x%"PRIx64""
 nvme_ub_mmiord_toosmall(uint64_t offset) "MMIO read smaller than 32-bits, offset=0x%"PRIx64""
-- 
2.25.3

The QMP handler qmp_object_add() and the implementation of --object in
qemu-storage-daemon can share most of the code. Currently,
qemu-storage-daemon calls qmp_object_add(), but this is not correct
because different visitors need to be used.

As a first step towards a fix, make qmp_object_add() a wrapper around a
new function user_creatable_add_dict() that can get an additional
parameter. The handling of "props" is only required for compatibility
and not required for the qemu-storage-daemon command line, so it stays
in qmp_object_add().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/qom/object_interfaces.h | 12 ++++++++++++
 qom/object_interfaces.c         | 27 +++++++++++++++++++++++++++
 qom/qom-qmp-cmds.c              | 24 +-----------------------
 3 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/include/qom/object_interfaces.h b/include/qom/object_interfaces.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qom/object_interfaces.h
+++ b/include/qom/object_interfaces.h
@@ -XXX,XX +XXX,XX @@ Object *user_creatable_add_type(const char *type, const char *id,
                                 const QDict *qdict,
                                 Visitor *v, Error **errp);
 
+/**
+ * user_creatable_add_dict:
+ * @qdict: the object definition
+ * @errp: if an error occurs, a pointer to an area to store the error
+ *
+ * Create an instance of the user creatable object that is defined by
+ * @qdict.  The object type is taken from the QDict key 'qom-type', its
+ * ID from the key 'id'. The remaining entries in @qdict are used to
+ * initialize the object properties.
+ */
+void user_creatable_add_dict(QDict *qdict, Error **errp);
+
 /**
  * user_creatable_add_opts:
  * @opts: the object definition
diff --git a/qom/object_interfaces.c b/qom/object_interfaces.c
index XXXXXXX..XXXXXXX 100644
--- a/qom/object_interfaces.c
+++ b/qom/object_interfaces.c
@@ -XXX,XX +XXX,XX @@
 #include "qapi/qmp/qerror.h"
 #include "qapi/qmp/qjson.h"
 #include "qapi/qmp/qstring.h"
+#include "qapi/qobject-input-visitor.h"
 #include "qom/object_interfaces.h"
 #include "qemu/help_option.h"
 #include "qemu/module.h"
@@ -XXX,XX +XXX,XX @@ out:
     return obj;
 }
 
+void user_creatable_add_dict(QDict *qdict, Error **errp)
+{
+    Visitor *v;
+    Object *obj;
+    g_autofree char *type = NULL;
+    g_autofree char *id = NULL;
+
+    type = g_strdup(qdict_get_try_str(qdict, "qom-type"));
+    if (!type) {
+        error_setg(errp, QERR_MISSING_PARAMETER, "qom-type");
+        return;
+    }
+    qdict_del(qdict, "qom-type");
+
+    id = g_strdup(qdict_get_try_str(qdict, "id"));
+    if (!id) {
+        error_setg(errp, QERR_MISSING_PARAMETER, "id");
+        return;
+    }
+    qdict_del(qdict, "id");
+
+    v = qobject_input_visitor_new(QOBJECT(qdict));
+    obj = user_creatable_add_type(type, id, qdict, v, errp);
+    visit_free(v);
+    object_unref(obj);
+}
 
 Object *user_creatable_add_opts(QemuOpts *opts, Error **errp)
 {
diff --git a/qom/qom-qmp-cmds.c b/qom/qom-qmp-cmds.c
index XXXXXXX..XXXXXXX 100644
--- a/qom/qom-qmp-cmds.c
+++ b/qom/qom-qmp-cmds.c
@@ -XXX,XX +XXX,XX @@
 #include "qapi/qapi-commands-qom.h"
 #include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qerror.h"
-#include "qapi/qobject-input-visitor.h"
 #include "qemu/cutils.h"
 #include "qom/object_interfaces.h"
 #include "qom/qom-qobject.h"
@@ -XXX,XX +XXX,XX @@ void qmp_object_add(QDict *qdict, QObject **ret_data, Error **errp)
 {
     QObject *props;
     QDict *pdict;
-    Visitor *v;
-    Object *obj;
-    g_autofree char *type = NULL;
-    g_autofree char *id = NULL;
-
-    type = g_strdup(qdict_get_try_str(qdict, "qom-type"));
-    if (!type) {
-        error_setg(errp, QERR_MISSING_PARAMETER, "qom-type");
-        return;
-    }
-    qdict_del(qdict, "qom-type");
-
-    id = g_strdup(qdict_get_try_str(qdict, "id"));
-    if (!id) {
-        error_setg(errp, QERR_MISSING_PARAMETER, "id");
-        return;
-    }
-    qdict_del(qdict, "id");
 
     props = qdict_get(qdict, "props");
     if (props) {
@@ -XXX,XX +XXX,XX @@ void qmp_object_add(QDict *qdict, QObject **ret_data, Error **errp)
         qobject_unref(pdict);
     }
 
-    v = qobject_input_visitor_new(QOBJECT(qdict));
-    obj = user_creatable_add_type(type, id, qdict, v, errp);
-    visit_free(v);
-    object_unref(obj);
+    user_creatable_add_dict(qdict, errp);
 }
 
 void qmp_object_del(const char *id, Error **errp)
-- 
2.25.3

After processing the option string with the keyval parser, we get a
QDict that contains only strings. This QDict must be fed to a keyval
visitor which converts the strings into the right data types.

qmp_object_add(), however, uses the normal QObject input visitor, which
expects a QDict where all properties already have the QType that matches
the data type required by the QOM object type.

Change the --object implementation in qemu-storage-daemon so that it
doesn't call qmp_object_add(), but calls user_creatable_add_dict()
directly instead and passes it a new keyval boolean that decides which
visitor must be used.

Reported-by: Coiby Xu <coiby.xu@gmail.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/qom/object_interfaces.h | 6 +++++-
 qemu-storage-daemon.c           | 4 +---
 qom/object_interfaces.c         | 8 ++++++--
 qom/qom-qmp-cmds.c              | 2 +-
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/include/qom/object_interfaces.h b/include/qom/object_interfaces.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qom/object_interfaces.h
+++ b/include/qom/object_interfaces.h
@@ -XXX,XX +XXX,XX @@ Object *user_creatable_add_type(const char *type, const char *id,
 /**
  * user_creatable_add_dict:
  * @qdict: the object definition
+ * @keyval: if true, use a keyval visitor for processing @qdict (i.e.
+ *          assume that all @qdict values are strings); otherwise, use
+ *          the normal QObject visitor (i.e. assume all @qdict values
+ *          have the QType expected by the QOM object type)
  * @errp: if an error occurs, a pointer to an area to store the error
  *
  * Create an instance of the user creatable object that is defined by
@@ -XXX,XX +XXX,XX @@ Object *user_creatable_add_type(const char *type, const char *id,
  * ID from the key 'id'. The remaining entries in @qdict are used to
  * initialize the object properties.
  */
-void user_creatable_add_dict(QDict *qdict, Error **errp);
+void user_creatable_add_dict(QDict *qdict, bool keyval, Error **errp);
 
 /**
  * user_creatable_add_opts:
diff --git a/qemu-storage-daemon.c b/qemu-storage-daemon.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-storage-daemon.c
+++ b/qemu-storage-daemon.c
@@ -XXX,XX +XXX,XX @@ static void process_options(int argc, char *argv[])
                 QemuOpts *opts;
                 const char *type;
                 QDict *args;
-                QObject *ret_data = NULL;
 
                 /* FIXME The keyval parser rejects 'help' arguments, so we must
                  * unconditionall try QemuOpts first. */
@@ -XXX,XX +XXX,XX @@ static void process_options(int argc, char *argv[])
                 qemu_opts_del(opts);
 
                 args = keyval_parse(optarg, "qom-type", &error_fatal);
-                qmp_object_add(args, &ret_data, &error_fatal);
+                user_creatable_add_dict(args, true, &error_fatal);
                 qobject_unref(args);
-                qobject_unref(ret_data);
                 break;
             }
         default:
diff --git a/qom/object_interfaces.c b/qom/object_interfaces.c
index XXXXXXX..XXXXXXX 100644
--- a/qom/object_interfaces.c
+++ b/qom/object_interfaces.c
@@ -XXX,XX +XXX,XX @@ out:
     return obj;
 }
 
-void user_creatable_add_dict(QDict *qdict, Error **errp)
+void user_creatable_add_dict(QDict *qdict, bool keyval, Error **errp)
 {
     Visitor *v;
     Object *obj;
@@ -XXX,XX +XXX,XX @@ void user_creatable_add_dict(QDict *qdict, Error **errp)
     }
     qdict_del(qdict, "id");
 
-    v = qobject_input_visitor_new(QOBJECT(qdict));
+    if (keyval) {
+        v = qobject_input_visitor_new_keyval(QOBJECT(qdict));
+    } else {
+        v = qobject_input_visitor_new(QOBJECT(qdict));
+    }
     obj = user_creatable_add_type(type, id, qdict, v, errp);
     visit_free(v);
     object_unref(obj);
diff --git a/qom/qom-qmp-cmds.c b/qom/qom-qmp-cmds.c
index XXXXXXX..XXXXXXX 100644
--- a/qom/qom-qmp-cmds.c
+++ b/qom/qom-qmp-cmds.c
@@ -XXX,XX +XXX,XX @@ void qmp_object_add(QDict *qdict, QObject **ret_data, Error **errp)
         qobject_unref(pdict);
     }
 
-    user_creatable_add_dict(qdict, errp);
+    user_creatable_add_dict(qdict, false, errp);
 }
 
 void qmp_object_del(const char *id, Error **errp)
-- 
2.25.3