Series comparison

-[Qemu-devel] [PULL 00/16] Block patches
+[PULL 0/9] Block patches
-The following changes since commit dd25f97c66a75d1508f1d4c6478ed2c95bec428f:
+The following changes since commit 67f17e23baca5dd545fe98b01169cc351a70fe35:
-  Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20190913' into staging (2019-09-16 10:15:15 +0100)
+  Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging (2020-03-06 17:15:36 +0000)
 are available in the Git repository at:
-  https://github.com/XanClic/qemu.git tags/pull-block-2019-09-16
+  https://github.com/stefanha/qemu.git tags/block-pull-request
-for you to fetch changes up to 1825cc0783ccf0ec5d9f0b225a99b340bdd4c68f:
+for you to fetch changes up to d37d0e365afb6825a90d8356fc6adcc1f58f40f3:
-  qemu-iotests: Add test for bz #1745922 (2019-09-16 15:37:12 +0200)
+  aio-posix: remove idle poll handlers to improve scalability (2020-03-09 16:45:16 +0000)
 ----------------------------------------------------------------
-Block patches:
+Pull request
 - Fix for block jobs when used with I/O threads
 - Fix for a corruption when using qcow2's LUKS encryption mode
 - cURL fix
 - check-block.sh cleanups (for make check)
 - Refactoring
 ----------------------------------------------------------------
-Max Reitz (7):
-  curl: Keep pointer to the CURLState in CURLSocket
-  curl: Keep *socket until the end of curl_sock_cb()
-  curl: Check completion in curl_multi_do()
-  curl: Pass CURLSocket to curl_multi_do()
-  curl: Report only ready sockets
-  curl: Handle success in multi_check_completion
-  curl: Check curl_multi_add_handle()'s return code
-Maxim Levitsky (3):
+Stefan Hajnoczi (9):
-  block/qcow2: Fix corruption introduced by commit 8ac0f15f335
+  qemu/queue.h: clear linked list pointers on remove
-  block/qcow2: refactor encryption code
+  aio-posix: remove confusing QLIST_SAFE_REMOVE()
-  qemu-iotests: Add test for bz #1745922
+  aio-posix: completely stop polling when disabled
   aio-posix: move RCU_READ_LOCK() into run_poll_handlers()
   aio-posix: extract ppoll(2) and epoll(7) fd monitoring
   aio-posix: simplify FDMonOps->update() prototype
   aio-posix: add io_uring fd monitoring implementation
   aio-posix: support userspace polling of fd monitoring
   aio-posix: remove idle poll handlers to improve scalability
-Nir Soffer (2):
+ MAINTAINERS           |   2 +
-  block: Use QEMU_IS_ALIGNED
+ configure             |   5 +
-  block: Remove unused masks
+ include/block/aio.h   |  71 ++++++-
+ include/qemu/queue.h  |  19 +-
-Sergio Lopez (1):
+ util/Makefile.objs    |   3 +
-  blockjob: update nodes head while removing all bdrv
+ util/aio-posix.c      | 451 ++++++++++++++----------------------------
+ util/aio-posix.h      |  81 ++++++++
-Thomas Huth (2):
+ util/fdmon-epoll.c    | 155 +++++++++++++++
-  tests/qemu-iotests/check: Replace "tests" with "iotests" in final
+ util/fdmon-io_uring.c | 332 +++++++++++++++++++++++++++++++
-    status text
+ util/fdmon-poll.c     | 107 ++++++++++
-  tests/Makefile: Do not print the name of the check-block.sh shell
+ util/trace-events     |   2 +
-    script
+files changed, 915 insertions(+), 313 deletions(-)
+ create mode 100644 util/aio-posix.h
-Vladimir Sementsov-Ogievskiy (1):
+ create mode 100644 util/fdmon-epoll.c
-  tests/qemu-iotests: Fix qemu-io related output in 026.out.nocache
+ create mode 100644 util/fdmon-io_uring.c
+ create mode 100644 util/fdmon-poll.c
  tests/Makefile.include             |   2 +-
  block/qcow2.h                      |   8 +-
  include/block/block.h              |   2 -
  block/bochs.c                      |   4 +-
  block/cloop.c                      |   4 +-
  block/curl.c                       | 133 ++++++++++-------------
  block/dmg.c                        |   4 +-
  block/io.c                         |   8 +-
  block/qcow2-cluster.c              |  40 +++----
  block/qcow2-threads.c              |  63 ++++++++---
  block/qcow2.c                      |   9 +-
  block/vvfat.c                      |   8 +-
  blockjob.c                         |  17 ++-
  migration/block.c                  |   2 +-
  qemu-img.c                         |   2 +-
  tests/qemu-iotests/026.out.nocache | 168 ++++++++++++++---------------
  tests/qemu-iotests/263             |  91 ++++++++++++++++
  tests/qemu-iotests/263.out         |  40 +++++++
  tests/qemu-iotests/check           |   8 +-
  tests/qemu-iotests/group           |   1 +
 files changed, 380 insertions(+), 234 deletions(-)
  create mode 100755 tests/qemu-iotests/263
  create mode 100644 tests/qemu-iotests/263.out
 --
-.21.0
+.24.1

-[Qemu-devel] [PULL 01/16] block: Use QEMU_IS_ALIGNED
+Deleted patch
-From: Nir Soffer <nirsof@gmail.com>
-Replace instances of:
-    (n & (BDRV_SECTOR_SIZE - 1)) == 0
-And:
-   (n & ~BDRV_SECTOR_MASK) == 0
-With:
-    QEMU_IS_ALIGNED(n, BDRV_SECTOR_SIZE)
-Which reveals the intent of the code better, and makes it easier to
-locate the code checking alignment.
-Signed-off-by: Nir Soffer <nsoffer@redhat.com>
-Message-id: 20190827185913.27427-2-nsoffer@redhat.com
-Reviewed-by: John Snow <jsnow@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/bochs.c         | 4 ++--
- block/cloop.c         | 4 ++--
- block/dmg.c           | 4 ++--
- block/io.c            | 8 ++++----
- block/qcow2-cluster.c | 4 ++--
- block/qcow2.c         | 4 ++--
- block/vvfat.c         | 8 ++++----
- qemu-img.c            | 2 +-
-files changed, 19 insertions(+), 19 deletions(-)
-diff --git a/block/bochs.c b/block/bochs.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/bochs.c
-+++ b/block/bochs.c
-@@ -XXX,XX +XXX,XX @@ bochs_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-     QEMUIOVector local_qiov;
-     int ret;
--    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
--    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
-+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
-     qemu_iovec_init(&local_qiov, qiov->niov);
-     qemu_co_mutex_lock(&s->lock);
-diff --git a/block/cloop.c b/block/cloop.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/cloop.c
-+++ b/block/cloop.c
-@@ -XXX,XX +XXX,XX @@ cloop_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-     int nb_sectors = bytes >> BDRV_SECTOR_BITS;
-     int ret, i;
--    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
--    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
-+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
-     qemu_co_mutex_lock(&s->lock);
-diff --git a/block/dmg.c b/block/dmg.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/dmg.c
-+++ b/block/dmg.c
-@@ -XXX,XX +XXX,XX @@ dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-     int nb_sectors = bytes >> BDRV_SECTOR_BITS;
-     int ret, i;
--    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
--    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
-+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
-     qemu_co_mutex_lock(&s->lock);
-diff --git a/block/io.c b/block/io.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/io.c
-+++ b/block/io.c
-@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
-     sector_num = offset >> BDRV_SECTOR_BITS;
-     nb_sectors = bytes >> BDRV_SECTOR_BITS;
--    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
--    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
-+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
-     assert(bytes <= BDRV_REQUEST_MAX_BYTES);
-     assert(drv->bdrv_co_readv);
-@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
-     sector_num = offset >> BDRV_SECTOR_BITS;
-     nb_sectors = bytes >> BDRV_SECTOR_BITS;
--    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
--    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
-+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
-     assert(bytes <= BDRV_REQUEST_MAX_BYTES);
-     assert(drv->bdrv_co_writev);
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
-+++ b/block/qcow2-cluster.c
-@@ -XXX,XX +XXX,XX @@ static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs,
- {
-     if (bytes && bs->encrypted) {
-         BDRVQcow2State *s = bs->opaque;
--        assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0);
--        assert((bytes & ~BDRV_SECTOR_MASK) == 0);
-+        assert(QEMU_IS_ALIGNED(offset_in_cluster, BDRV_SECTOR_SIZE));
-+        assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
-         assert(s->crypto);
-         if (qcow2_co_encrypt(bs, cluster_offset,
-                              src_cluster_offset + offset_in_cluster,
-diff --git a/block/qcow2.c b/block/qcow2.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2.c
-+++ b/block/qcow2.c
-@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_preadv_part(BlockDriverState *bs,
-                     goto fail;
-                 }
--                assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
--                assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-+                assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
-+                assert(QEMU_IS_ALIGNED(cur_bytes, BDRV_SECTOR_SIZE));
-                 if (qcow2_co_decrypt(bs, cluster_offset, offset,
-                                      cluster_data, cur_bytes) < 0) {
-                     ret = -EIO;
-diff --git a/block/vvfat.c b/block/vvfat.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/vvfat.c
-+++ b/block/vvfat.c
-@@ -XXX,XX +XXX,XX @@ vvfat_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-     int nb_sectors = bytes >> BDRV_SECTOR_BITS;
-     void *buf;
--    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
--    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
-+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
-     buf = g_try_malloc(bytes);
-     if (bytes && buf == NULL) {
-@@ -XXX,XX +XXX,XX @@ vvfat_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-     int nb_sectors = bytes >> BDRV_SECTOR_BITS;
-     void *buf;
--    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
--    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
-+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
-     buf = g_try_malloc(bytes);
-     if (bytes && buf == NULL) {
-diff --git a/qemu-img.c b/qemu-img.c
-index XXXXXXX..XXXXXXX 100644
---- a/qemu-img.c
-+++ b/qemu-img.c
-@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
-             int64_t sval;
-             sval = cvtnum(optarg);
--            if (sval < 0 || sval & (BDRV_SECTOR_SIZE - 1) ||
-+            if (sval < 0 || !QEMU_IS_ALIGNED(sval, BDRV_SECTOR_SIZE) ||
-                 sval / BDRV_SECTOR_SIZE > MAX_BUF_SECTORS) {
-                 error_report("Invalid buffer size for sparse output specified. "
-                     "Valid sizes are multiples of %llu up to %llu. Select "
---
-.21.0

-[Qemu-devel] [PULL 02/16] block: Remove unused masks
+Deleted patch
-From: Nir Soffer <nirsof@gmail.com>
-Replace confusing usage:
-    ~BDRV_SECTOR_MASK
-With more clear:
-    (BDRV_SECTOR_SIZE - 1)
-Remove BDRV_SECTOR_MASK and the unused BDRV_BLOCK_OFFSET_MASK which was
-its last user.
-Signed-off-by: Nir Soffer <nsoffer@redhat.com>
-Message-id: 20190827185913.27427-3-nsoffer@redhat.com
-Reviewed-by: Juan Quintela <quintela@redhat.com>
-Reviewed-by: John Snow <jsnow@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- include/block/block.h | 2 --
- migration/block.c     | 2 +-
-files changed, 1 insertion(+), 3 deletions(-)
-diff --git a/include/block/block.h b/include/block/block.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/block/block.h
-+++ b/include/block/block.h
-@@ -XXX,XX +XXX,XX @@ typedef struct HDGeometry {
- #define BDRV_SECTOR_BITS   9
- #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
--#define BDRV_SECTOR_MASK   ~(BDRV_SECTOR_SIZE - 1)
- #define BDRV_REQUEST_MAX_SECTORS MIN(SIZE_MAX >> BDRV_SECTOR_BITS, \
-                                      INT_MAX >> BDRV_SECTOR_BITS)
-@@ -XXX,XX +XXX,XX @@ typedef struct HDGeometry {
- #define BDRV_BLOCK_ALLOCATED    0x10
- #define BDRV_BLOCK_EOF          0x20
- #define BDRV_BLOCK_RECURSE      0x40
--#define BDRV_BLOCK_OFFSET_MASK  BDRV_SECTOR_MASK
- typedef QSIMPLEQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;
-diff --git a/migration/block.c b/migration/block.c
-index XXXXXXX..XXXXXXX 100644
---- a/migration/block.c
-+++ b/migration/block.c
-@@ -XXX,XX +XXX,XX @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
-     do {
-         addr = qemu_get_be64(f);
--        flags = addr & ~BDRV_SECTOR_MASK;
-+        flags = addr & (BDRV_SECTOR_SIZE - 1);
-         addr >>= BDRV_SECTOR_BITS;
-         if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
---
-.21.0

-[Qemu-devel] [PULL 03/16] tests/qemu-iotests/check: Replace "tests" with "iotests" in final status text
+Deleted patch
-From: Thomas Huth <thuth@redhat.com>
-When running "make check -j8" or something similar, the iotests are
-running in parallel with the other tests. So when they are printing
-out "Passed all xx tests" or a similar status message at the end,
-it might not be quite clear that this message belongs to the iotests,
-since the output might be mixed with the other tests. Thus change the
-word "tests" here to "iotests" instead to avoid confusion.
-Signed-off-by: Thomas Huth <thuth@redhat.com>
-Message-id: 20190906113920.11271-1-thuth@redhat.com
-Reviewed-by: John Snow <jsnow@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- tests/qemu-iotests/check | 8 ++++----
-file changed, 4 insertions(+), 4 deletions(-)
-diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check
-index XXXXXXX..XXXXXXX 100755
---- a/tests/qemu-iotests/check
-+++ b/tests/qemu-iotests/check
-@@ -XXX,XX +XXX,XX @@ END        { if (NR > 0) {
-         if [ ! -z "$n_bad" -a $n_bad != 0 ]
-         then
-             echo "Failures:$bad"
--            echo "Failed $n_bad of $try tests"
-+            echo "Failed $n_bad of $try iotests"
-             echo "Failures:$bad" | fmt >>check.log
--            echo "Failed $n_bad of $try tests" >>check.log
-+            echo "Failed $n_bad of $try iotests" >>check.log
-         else
--            echo "Passed all $try tests"
--            echo "Passed all $try tests" >>check.log
-+            echo "Passed all $try iotests"
-+            echo "Passed all $try iotests" >>check.log
-         fi
-         needwrap=false
-     fi
---
-.21.0

-[Qemu-devel] [PULL 14/16] block/qcow2: Fix corruption introduced by commit 8ac0f15f335
+[PULL 1/9] qemu/queue.h: clear linked list pointers on remove
-From: Maxim Levitsky <mlevitsk@redhat.com>
+Do not leave stale linked list pointers around after removal.  It's
 safer to set them to NULL so that use-after-removal results in an
 immediate segfault.
-This fixes subtle corruption introduced by luks threaded encryption
+The RCU queue removal macros are unchanged since nodes may still be
-in commit 8ac0f15f335
+traversed after removal.
-Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1745922
+Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 Link: https://lore.kernel.org/r/20200224103406.1894923-2-stefanha@redhat.com
 Message-Id: <20200224103406.1894923-2-stefanha@redhat.com>
 ---
  include/qemu/queue.h | 19 +++++++++++++++----
 file changed, 15 insertions(+), 4 deletions(-)
-The corruption happens when we do a write that
+diff --git a/include/qemu/queue.h b/include/qemu/queue.h
-   * writes to two or more unallocated clusters at once
+index XXXXXXX..XXXXXXX 100644
-   * doesn't fully cover the first sector
+--- a/include/qemu/queue.h
-   * doesn't fully cover the last sector
++++ b/include/qemu/queue.h
-   * uses luks encryption
+@@ -XXX,XX +XXX,XX @@ struct {                                                                \
                  (elm)->field.le_next->field.le_prev =                   \
                      (elm)->field.le_prev;                               \
          *(elm)->field.le_prev = (elm)->field.le_next;                   \
 +        (elm)->field.le_next = NULL;                                    \
 +        (elm)->field.le_prev = NULL;                                    \
  } while (/*CONSTCOND*/0)
  /*
@@ -XXX,XX +XXX,XX @@ struct {                                                                \
  } while (/*CONSTCOND*/0)
  #define QSLIST_REMOVE_HEAD(head, field) do {                             \
 -        (head)->slh_first = (head)->slh_first->field.sle_next;          \
 +        typeof((head)->slh_first) elm = (head)->slh_first;               \
 +        (head)->slh_first = elm->field.sle_next;                         \
 +        elm->field.sle_next = NULL;                                      \
  } while (/*CONSTCOND*/0)
  #define QSLIST_REMOVE_AFTER(slistelm, field) do {                       \
 -        (slistelm)->field.sle_next =                                    \
 -            QSLIST_NEXT(QSLIST_NEXT((slistelm), field), field);         \
 +        typeof(slistelm) next = (slistelm)->field.sle_next;             \
 +        (slistelm)->field.sle_next = next->field.sle_next;              \
 +        next->field.sle_next = NULL;                                    \
  } while (/*CONSTCOND*/0)
  #define QSLIST_REMOVE(head, elm, type, field) do {                      \
@@ -XXX,XX +XXX,XX @@ struct {                                                                \
          while (curelm->field.sle_next != (elm))                         \
              curelm = curelm->field.sle_next;                            \
          curelm->field.sle_next = curelm->field.sle_next->field.sle_next; \
 +        (elm)->field.sle_next = NULL;                                   \
      }                                                                   \
  } while (/*CONSTCOND*/0)
@@ -XXX,XX +XXX,XX @@ struct {                                                                \
  } while (/*CONSTCOND*/0)
  #define QSIMPLEQ_REMOVE_HEAD(head, field) do {                          \
 -    if (((head)->sqh_first = (head)->sqh_first->field.sqe_next) == NULL)\
 +    typeof((head)->sqh_first) elm = (head)->sqh_first;                  \
 +    if (((head)->sqh_first = elm->field.sqe_next) == NULL)              \
          (head)->sqh_last = &(head)->sqh_first;                          \
 +    elm->field.sqe_next = NULL;                                         \
  } while (/*CONSTCOND*/0)
  #define QSIMPLEQ_SPLIT_AFTER(head, elm, field, removed) do {            \
@@ -XXX,XX +XXX,XX @@ struct {                                                                \
          if ((curelm->field.sqe_next =                                   \
              curelm->field.sqe_next->field.sqe_next) == NULL)            \
                  (head)->sqh_last = &(curelm)->field.sqe_next;           \
 +        (elm)->field.sqe_next = NULL;                                   \
      }                                                                   \
  } while (/*CONSTCOND*/0)
@@ -XXX,XX +XXX,XX @@ union {                                                                 \
              (head)->tqh_circ.tql_prev = (elm)->field.tqe_circ.tql_prev; \
          (elm)->field.tqe_circ.tql_prev->tql_next = (elm)->field.tqe_next; \
          (elm)->field.tqe_circ.tql_prev = NULL;                          \
 +        (elm)->field.tqe_circ.tql_next = NULL;                          \
 +        (elm)->field.tqe_next = NULL;                                   \
  } while (/*CONSTCOND*/0)
  /* remove @left, @right and all elements in between from @head */
 --
 .24.1
-In this case, when allocating the new clusters we COW both areas
-prior to the write and after the write, and we encrypt them.
-The above mentioned commit accidentally made it so we encrypt the
-second COW area using the physical cluster offset of the first area.
-The problem is that offset_in_cluster in do_perform_cow_encrypt
-can be larger than the cluster size, thus cluster_offset
-will no longer point to the start of the cluster at which encrypted
-area starts.
-Next patch in this series will refactor the code to avoid all these
-assumptions.
-In the bug report, this was triggered by rebasing a LUKS image onto a new,
-zero-filled base, which performs a lot of such writes and causes some files
-with zero areas to contain garbage there instead.
-But as described above, it can happen elsewhere as well.
-Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
-Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
-Message-id: 20190915203655.21638-2-mlevitsk@redhat.com
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/qcow2-cluster.c | 7 ++++---
-file changed, 4 insertions(+), 3 deletions(-)
-diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
-+++ b/block/qcow2-cluster.c
-@@ -XXX,XX +XXX,XX @@ static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs,
-         assert(QEMU_IS_ALIGNED(offset_in_cluster, BDRV_SECTOR_SIZE));
-         assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
-         assert(s->crypto);
--        if (qcow2_co_encrypt(bs, cluster_offset,
--                             src_cluster_offset + offset_in_cluster,
--                             buffer, bytes) < 0) {
-+        if (qcow2_co_encrypt(bs,
-+                start_of_cluster(s, cluster_offset + offset_in_cluster),
-+                src_cluster_offset + offset_in_cluster,
-+                buffer, bytes) < 0) {
-             return false;
-         }
-     }
---
-.21.0

-[Qemu-devel] [PULL 04/16] tests/Makefile: Do not print the name of the check-block.sh shell script
+[PULL 2/9] aio-posix: remove confusing QLIST_SAFE_REMOVE()
-From: Thomas Huth <thuth@redhat.com>
+QLIST_SAFE_REMOVE() is confusing here because the node must be on the
 list.  We actually just wanted to clear the linked list pointers when
 removing it from the list.  QLIST_REMOVE() now does this, so switch to
 it.
-The check script is already printing out which iotest is currently
+Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
-running, so printing out the name of the check-block.sh shell script
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-looks superfluous here.
+Link: https://lore.kernel.org/r/20200224103406.1894923-3-stefanha@redhat.com
+Message-Id: <20200224103406.1894923-3-stefanha@redhat.com>
 Signed-off-by: Thomas Huth <thuth@redhat.com>
 Message-id: 20190906113534.10907-1-thuth@redhat.com
 Acked-by: John Snow <jsnow@redhat.com>
 Reviewed-by: Max Reitz <mreitz@redhat.com>
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
- tests/Makefile.include | 2 +-
+ util/aio-posix.c | 2 +-
 file changed, 1 insertion(+), 1 deletion(-)
-diff --git a/tests/Makefile.include b/tests/Makefile.include
+diff --git a/util/aio-posix.c b/util/aio-posix.c
 index XXXXXXX..XXXXXXX 100644
---- a/tests/Makefile.include
+--- a/util/aio-posix.c
-+++ b/tests/Makefile.include
++++ b/util/aio-posix.c
-@@ -XXX,XX +XXX,XX @@ QEMU_IOTESTS_HELPERS-$(call land,$(CONFIG_SOFTMMU),$(CONFIG_LINUX)) = tests/qemu
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_ready_handlers(AioContext *ctx,
- check-tests/check-block.sh: tests/check-block.sh qemu-img$(EXESUF) \
+     AioHandler *node;
-         qemu-io$(EXESUF) qemu-nbd$(EXESUF) $(QEMU_IOTESTS_HELPERS-y) \
-         $(patsubst %,%/all,$(filter %-softmmu,$(TARGET_DIRS)))
+     while ((node = QLIST_FIRST(ready_list))) {
--    $<
+-        QLIST_SAFE_REMOVE(node, node_ready);
-+    @$<
++        QLIST_REMOVE(node, node_ready);
+         progress = aio_dispatch_handler(ctx, node) || progress;
- .PHONY: $(patsubst %, check-%, $(check-qapi-schema-y))
+     }
- $(patsubst %, check-%, $(check-qapi-schema-y)): check-%.json: $(SRC_PATH)/%.json
 --
-.21.0
+.24.1

-[Qemu-devel] [PULL 05/16] tests/qemu-iotests: Fix qemu-io related output in 026.out.nocache
+Deleted patch
-From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
-qemu-io now prefixes its error and warnings with "qemu-io:".
-b9986b08787019e fixed a lot of iotests output but forgot about
-.out.nocache. Fix it too.
-Fixes: 99e98d7c9fc1a1639fad ("qemu-io: Use error_[gs]et_progname()")
-Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
-Message-id: 20190816153015.447957-2-vsementsov@virtuozzo.com
-Reviewed-by: John Snow <jsnow@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- tests/qemu-iotests/026.out.nocache | 168 ++++++++++++++---------------
-file changed, 84 insertions(+), 84 deletions(-)
-diff --git a/tests/qemu-iotests/026.out.nocache b/tests/qemu-iotests/026.out.nocache
-index XXXXXXX..XXXXXXX 100644
---- a/tests/qemu-iotests/026.out.nocache
-+++ b/tests/qemu-iotests/026.out.nocache
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: l1_update; errno: 5; imm: off; once: off; write
--Failed to flush the L2 table cache: Input/output error
--Failed to flush the refcount block cache: Input/output error
-+qemu-io: Failed to flush the L2 table cache: Input/output error
-+qemu-io: Failed to flush the refcount block cache: Input/output error
- write failed: Input/output error
-leaked clusters were found on the image.
-@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: l1_update; errno: 5; imm: off; once: off; write -b
--Failed to flush the L2 table cache: Input/output error
--Failed to flush the refcount block cache: Input/output error
-+qemu-io: Failed to flush the L2 table cache: Input/output error
-+qemu-io: Failed to flush the refcount block cache: Input/output error
- write failed: Input/output error
-leaked clusters were found on the image.
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: l1_update; errno: 28; imm: off; once: off; write
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
-leaked clusters were found on the image.
-@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: l1_update; errno: 28; imm: off; once: off; write -b
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
-leaked clusters were found on the image.
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: l2_update; errno: 5; imm: off; once: off; write
--Failed to flush the L2 table cache: Input/output error
--Failed to flush the refcount block cache: Input/output error
-+qemu-io: Failed to flush the L2 table cache: Input/output error
-+qemu-io: Failed to flush the refcount block cache: Input/output error
- wrote 131072/131072 bytes at offset 0
-KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: l2_update; errno: 5; imm: off; once: off; write -b
--Failed to flush the L2 table cache: Input/output error
--Failed to flush the refcount block cache: Input/output error
-+qemu-io: Failed to flush the L2 table cache: Input/output error
-+qemu-io: Failed to flush the refcount block cache: Input/output error
- wrote 131072/131072 bytes at offset 0
-KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: l2_update; errno: 28; imm: off; once: off; write
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- wrote 131072/131072 bytes at offset 0
-KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: l2_update; errno: 28; imm: off; once: off; write -b
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- wrote 131072/131072 bytes at offset 0
-KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: l2_alloc_write; errno: 5; imm: off; once: off; write
--Failed to flush the L2 table cache: Input/output error
--Failed to flush the refcount block cache: Input/output error
-+qemu-io: Failed to flush the L2 table cache: Input/output error
-+qemu-io: Failed to flush the refcount block cache: Input/output error
- write failed: Input/output error
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: l2_alloc_write; errno: 5; imm: off; once: off; write -b
--Failed to flush the L2 table cache: Input/output error
--Failed to flush the refcount block cache: Input/output error
-+qemu-io: Failed to flush the L2 table cache: Input/output error
-+qemu-io: Failed to flush the refcount block cache: Input/output error
- write failed: Input/output error
-leaked clusters were found on the image.
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: l2_alloc_write; errno: 28; imm: off; once: off; write
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: l2_alloc_write; errno: 28; imm: off; once: off; write -b
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
-leaked clusters were found on the image.
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: write_aio; errno: 5; imm: off; once: off; write
--Failed to flush the L2 table cache: Input/output error
--Failed to flush the refcount block cache: Input/output error
-+qemu-io: Failed to flush the L2 table cache: Input/output error
-+qemu-io: Failed to flush the refcount block cache: Input/output error
- write failed: Input/output error
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: write_aio; errno: 5; imm: off; once: off; write -b
--Failed to flush the L2 table cache: Input/output error
--Failed to flush the refcount block cache: Input/output error
-+qemu-io: Failed to flush the L2 table cache: Input/output error
-+qemu-io: Failed to flush the refcount block cache: Input/output error
- write failed: Input/output error
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: write_aio; errno: 28; imm: off; once: off; write
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: write_aio; errno: 28; imm: off; once: off; write -b
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_load; errno: 5; imm: off; once: off; write
--Failed to flush the L2 table cache: Input/output error
--Failed to flush the refcount block cache: Input/output error
-+qemu-io: Failed to flush the L2 table cache: Input/output error
-+qemu-io: Failed to flush the refcount block cache: Input/output error
- write failed: Input/output error
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_load; errno: 5; imm: off; once: off; write -b
--Failed to flush the L2 table cache: Input/output error
--Failed to flush the refcount block cache: Input/output error
-+qemu-io: Failed to flush the L2 table cache: Input/output error
-+qemu-io: Failed to flush the refcount block cache: Input/output error
- write failed: Input/output error
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_load; errno: 28; imm: off; once: off; write
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_load; errno: 28; imm: off; once: off; write -b
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_update_part; errno: 5; imm: off; once: off; write
--Failed to flush the L2 table cache: Input/output error
--Failed to flush the refcount block cache: Input/output error
-+qemu-io: Failed to flush the L2 table cache: Input/output error
-+qemu-io: Failed to flush the refcount block cache: Input/output error
- write failed: Input/output error
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_update_part; errno: 5; imm: off; once: off; write -b
--Failed to flush the L2 table cache: Input/output error
--Failed to flush the refcount block cache: Input/output error
-+qemu-io: Failed to flush the L2 table cache: Input/output error
-+qemu-io: Failed to flush the refcount block cache: Input/output error
- write failed: Input/output error
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_update_part; errno: 28; imm: off; once: off; write
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_update_part; errno: 28; imm: off; once: off; write -b
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_alloc; errno: 5; imm: off; once: off; write
--Failed to flush the L2 table cache: Input/output error
--Failed to flush the refcount block cache: Input/output error
-+qemu-io: Failed to flush the L2 table cache: Input/output error
-+qemu-io: Failed to flush the refcount block cache: Input/output error
- write failed: Input/output error
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_alloc; errno: 5; imm: off; once: off; write -b
--Failed to flush the L2 table cache: Input/output error
--Failed to flush the refcount block cache: Input/output error
-+qemu-io: Failed to flush the L2 table cache: Input/output error
-+qemu-io: Failed to flush the refcount block cache: Input/output error
- write failed: Input/output error
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_alloc; errno: 28; imm: off; once: off; write
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_alloc; errno: 28; imm: off; once: off; write -b
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_alloc_hookup; errno: 28; imm: off; once: off; write
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
-leaked clusters were found on the image.
-@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_alloc_hookup; errno: 28; imm: off; once: off; write -b
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
-leaked clusters were found on the image.
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_alloc_write; errno: 28; imm: off; once: off; write
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_alloc_write; errno: 28; imm: off; once: off; write -b
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_alloc_write_blocks; errno: 28; imm: off; once: off; write
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
-leaked clusters were found on the image.
-@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_alloc_write_blocks; errno: 28; imm: off; once: off; write -b
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
-leaked clusters were found on the image.
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_alloc_write_table; errno: 28; imm: off; once: off; write
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
-leaked clusters were found on the image.
-@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_alloc_write_table; errno: 28; imm: off; once: off; write -b
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
-leaked clusters were found on the image.
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_alloc_switch_table; errno: 28; imm: off; once: off; write
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
-leaked clusters were found on the image.
-@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: refblock_alloc_switch_table; errno: 28; imm: off; once: off; write -b
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
-leaked clusters were found on the image.
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: l1_grow_write_table; errno: 5; imm: off; once: off
--Failed to flush the L2 table cache: Input/output error
--Failed to flush the refcount block cache: Input/output error
-+qemu-io: Failed to flush the L2 table cache: Input/output error
-+qemu-io: Failed to flush the refcount block cache: Input/output error
- write failed: Input/output error
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: l1_grow_write_table; errno: 28; imm: off; once: off
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
- No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: l1_grow_activate_table; errno: 5; imm: off; once: off
--Failed to flush the L2 table cache: Input/output error
--Failed to flush the refcount block cache: Input/output error
-+qemu-io: Failed to flush the L2 table cache: Input/output error
-+qemu-io: Failed to flush the refcount block cache: Input/output error
- write failed: Input/output error
-leaked clusters were found on the image.
-@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
- Event: l1_grow_activate_table; errno: 28; imm: off; once: off
--Failed to flush the L2 table cache: No space left on device
--Failed to flush the refcount block cache: No space left on device
-+qemu-io: Failed to flush the L2 table cache: No space left on device
-+qemu-io: Failed to flush the refcount block cache: No space left on device
- write failed: No space left on device
-leaked clusters were found on the image.
---
-.21.0

-[Qemu-devel] [PULL 06/16] curl: Keep pointer to the CURLState in CURLSocket
+Deleted patch
-A follow-up patch will make curl_multi_do() and curl_multi_read() take a
-CURLSocket instead of the CURLState.  They still need the latter,
-though, so add a pointer to it to the former.
-Cc: qemu-stable@nongnu.org
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: John Snow <jsnow@redhat.com>
-Message-id: 20190910124136.10565-2-mreitz@redhat.com
-Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/curl.c | 3 +++
-file changed, 3 insertions(+)
-diff --git a/block/curl.c b/block/curl.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/curl.c
-+++ b/block/curl.c
-@@ -XXX,XX +XXX,XX @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle,
- #define CURL_BLOCK_OPT_TIMEOUT_DEFAULT 5
- struct BDRVCURLState;
-+struct CURLState;
- static bool libcurl_initialized;
-@@ -XXX,XX +XXX,XX @@ typedef struct CURLAIOCB {
- typedef struct CURLSocket {
-     int fd;
-+    struct CURLState *state;
-     QLIST_ENTRY(CURLSocket) next;
- } CURLSocket;
-@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
-     if (!socket) {
-         socket = g_new0(CURLSocket, 1);
-         socket->fd = fd;
-+        socket->state = state;
-         QLIST_INSERT_HEAD(&state->sockets, socket, next);
-     }
-     socket = NULL;
---
-.21.0

-[Qemu-devel] [PULL 07/16] curl: Keep *socket until the end of curl_sock_cb()
+[PULL 3/9] aio-posix: completely stop polling when disabled
-This does not really change anything, but it makes the code a bit easier
+One iteration of polling is always performed even when polling is
-to follow once we use @socket as the opaque pointer for
+disabled.  This is done because:
-aio_set_fd_handler().
+. Userspace polling is cheaper than making a syscall.  We might get
    lucky.
 . We must poll once more after polling has stopped in case an event
    occurred while stopping polling.
-Cc: qemu-stable@nongnu.org
+However, there are downsides:
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+. Polling becomes a bottleneck when the number of event sources is very
-Message-id: 20190910124136.10565-3-mreitz@redhat.com
+   high.  It's more efficient to monitor fds in that case.
-Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
+. A high-frequency polling event source can starve non-polling event
-Reviewed-by: John Snow <jsnow@redhat.com>
+   sources because ppoll(2)/epoll(7) is never invoked.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
 This patch removes the forced polling iteration so that poll_ns=0 really
 means no polling.
 IOPS increases from 10k to 60k when the guest has 100
 virtio-blk-pci,num-queues=32 devices and 1 virtio-blk-pci,num-queues=1
 device because the large number of event sources being polled slows down
 the event loop.
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 Link: https://lore.kernel.org/r/20200305170806.1313245-2-stefanha@redhat.com
 Message-Id: <20200305170806.1313245-2-stefanha@redhat.com>
 ---
- block/curl.c | 10 +++++-----
+ util/aio-posix.c | 22 +++++++++++++++-------
-file changed, 5 insertions(+), 5 deletions(-)
+file changed, 15 insertions(+), 7 deletions(-)
-diff --git a/block/curl.c b/block/curl.c
+diff --git a/util/aio-posix.c b/util/aio-posix.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/curl.c
+--- a/util/aio-posix.c
-+++ b/block/curl.c
++++ b/util/aio-posix.c
-@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
+@@ -XXX,XX +XXX,XX @@ void aio_set_event_notifier_poll(AioContext *ctx,
+                     (IOHandler *)io_poll_end);
-     QLIST_FOREACH(socket, &state->sockets, next) {
+ }
-         if (socket->fd == fd) {
--            if (action == CURL_POLL_REMOVE) {
+-static void poll_set_started(AioContext *ctx, bool started)
--                QLIST_REMOVE(socket, next);
++static bool poll_set_started(AioContext *ctx, bool started)
--                g_free(socket);
+ {
--            }
+     AioHandler *node;
-             break;
++    bool progress = false;
      if (started == ctx->poll_started) {
 -        return;
 +        return false;
      }
      ctx->poll_started = started;
@@ -XXX,XX +XXX,XX @@ static void poll_set_started(AioContext *ctx, bool started)
          if (fn) {
              fn(node->opaque);
          }
 +
 +        /* Poll one last time in case ->io_poll_end() raced with the event */
 +        if (!started) {
 +            progress = node->io_poll(node->opaque) || progress;
 +        }
      }
      qemu_lockcnt_dec(&ctx->list_lock);
 +
 +    return progress;
  }
@@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
          }
      }
-@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
-         socket->state = state;
+-    poll_set_started(ctx, false);
-         QLIST_INSERT_HEAD(&state->sockets, socket, next);
++    if (poll_set_started(ctx, false)) {
-     }
++        *timeout = 0;
--    socket = NULL;
++        return true;
      trace_curl_sock_cb(action, (int)fd);
      switch (action) {
@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
              break;
      }
 +    if (action == CURL_POLL_REMOVE) {
 +        QLIST_REMOVE(socket, next);
 +        g_free(socket);
 +    }
-+
-     return 0;
+-    /* Even if we don't run busy polling, try polling once in case it can make
 -     * progress and the caller will be able to avoid ppoll(2)/epoll_wait(2).
 -     */
 -    return run_poll_handlers_once(ctx, timeout);
 +    return false;
  }
+ bool aio_poll(AioContext *ctx, bool blocking)
 --
-.21.0
+.24.1

-[Qemu-devel] [PULL 08/16] curl: Check completion in curl_multi_do()
+Deleted patch
-While it is more likely that transfers complete after some file
-descriptor has data ready to read, we probably should not rely on it.
-Better be safe than sorry and call curl_multi_check_completion() in
-curl_multi_do(), too, just like it is done in curl_multi_read().
-With this change, curl_multi_do() and curl_multi_read() are actually the
-same, so drop curl_multi_read() and use curl_multi_do() as the sole FD
-handler.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Message-id: 20190910124136.10565-4-mreitz@redhat.com
-Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
-Reviewed-by: John Snow <jsnow@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/curl.c | 14 ++------------
-file changed, 2 insertions(+), 12 deletions(-)
-diff --git a/block/curl.c b/block/curl.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/curl.c
-+++ b/block/curl.c
-@@ -XXX,XX +XXX,XX @@ typedef struct BDRVCURLState {
- static void curl_clean_state(CURLState *s);
- static void curl_multi_do(void *arg);
--static void curl_multi_read(void *arg);
- #ifdef NEED_CURL_TIMER_CALLBACK
- /* Called from curl_multi_do_locked, with s->mutex held.  */
-@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
-     switch (action) {
-         case CURL_POLL_IN:
-             aio_set_fd_handler(s->aio_context, fd, false,
--                               curl_multi_read, NULL, NULL, state);
-+                               curl_multi_do, NULL, NULL, state);
-             break;
-         case CURL_POLL_OUT:
-             aio_set_fd_handler(s->aio_context, fd, false,
-@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
-             break;
-         case CURL_POLL_INOUT:
-             aio_set_fd_handler(s->aio_context, fd, false,
--                               curl_multi_read, curl_multi_do, NULL, state);
-+                               curl_multi_do, curl_multi_do, NULL, state);
-             break;
-         case CURL_POLL_REMOVE:
-             aio_set_fd_handler(s->aio_context, fd, false,
-@@ -XXX,XX +XXX,XX @@ static void curl_multi_do(void *arg)
- {
-     CURLState *s = (CURLState *)arg;
--    qemu_mutex_lock(&s->s->mutex);
--    curl_multi_do_locked(s);
--    qemu_mutex_unlock(&s->s->mutex);
--}
--
--static void curl_multi_read(void *arg)
--{
--    CURLState *s = (CURLState *)arg;
--
-     qemu_mutex_lock(&s->s->mutex);
-     curl_multi_do_locked(s);
-     curl_multi_check_completion(s->s);
---
-.21.0

-[Qemu-devel] [PULL 12/16] curl: Check curl_multi_add_handle()'s return code
+[PULL 4/9] aio-posix: move RCU_READ_LOCK() into run_poll_handlers()
-If we had done that all along, debugging would have been much simpler.
+Now that run_poll_handlers_once() is only called by run_poll_handlers()
-(Also, I/O errors are better than hangs.)
+we can improve the CPU time profile by moving the expensive
 RCU_READ_LOCK() out of the polling loop.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+This reduces run_poll_handlers() from 40% CPU to 10% CPU in perf's
-Message-id: 20190910124136.10565-8-mreitz@redhat.com
+sampling profiler output.
-Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
-Reviewed-by: John Snow <jsnow@redhat.com>
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Link: https://lore.kernel.org/r/20200305170806.1313245-3-stefanha@redhat.com
 Message-Id: <20200305170806.1313245-3-stefanha@redhat.com>
 ---
- block/curl.c | 8 +++++++-
+ util/aio-posix.c | 20 ++++++++++----------
-file changed, 7 insertions(+), 1 deletion(-)
+file changed, 10 insertions(+), 10 deletions(-)
-diff --git a/block/curl.c b/block/curl.c
+diff --git a/util/aio-posix.c b/util/aio-posix.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/curl.c
+--- a/util/aio-posix.c
-+++ b/block/curl.c
++++ b/util/aio-posix.c
-@@ -XXX,XX +XXX,XX @@ static void curl_setup_preadv(BlockDriverState *bs, CURLAIOCB *acb)
+@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
-     trace_curl_setup_preadv(acb->bytes, start, state->range);
+     bool progress = false;
-     curl_easy_setopt(state->curl, CURLOPT_RANGE, state->range);
+     AioHandler *node;
--    curl_multi_add_handle(s->multi, state->curl);
+-    /*
-+    if (curl_multi_add_handle(s->multi, state->curl) != CURLM_OK) {
+-     * Optimization: ->io_poll() handlers often contain RCU read critical
-+        state->acb[0] = NULL;
+-     * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
-+        acb->ret = -EIO;
+-     * -> rcu_read_lock() -> ... sequences with expensive memory
 -     * synchronization primitives.  Make the entire polling loop an RCU
 -     * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
 -     * are cheap.
 -     */
 -    RCU_READ_LOCK_GUARD();
 -
      QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
          if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
              aio_node_check(ctx, node->is_external) &&
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
      trace_run_poll_handlers_begin(ctx, max_ns, *timeout);
 +    /*
 +     * Optimization: ->io_poll() handlers often contain RCU read critical
 +     * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
 +     * -> rcu_read_lock() -> ... sequences with expensive memory
 +     * synchronization primitives.  Make the entire polling loop an RCU
 +     * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
 +     * are cheap.
 +     */
 +    RCU_READ_LOCK_GUARD();
 +
-+        curl_clean_state(state);
+     start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
-+        goto out;
+     do {
-+    }
+         progress = run_poll_handlers_once(ctx, timeout);
      /* Tell curl it needs to kick things off */
      curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
 --
-.21.0
+.24.1

-[Qemu-devel] [PULL 15/16] block/qcow2: refactor encryption code
+[PULL 5/9] aio-posix: extract ppoll(2) and epoll(7) fd monitoring
-From: Maxim Levitsky <mlevitsk@redhat.com>
+The ppoll(2) and epoll(7) file descriptor monitoring implementations are
 mixed with the core util/aio-posix.c code.  Before adding another
 implementation for Linux io_uring, extract out the existing
 ones so there is a clear interface and the core code is simpler.
-* Change the qcow2_co_{encrypt|decrypt} to just receive full host and
+The new interface is AioContext->fdmon_ops, a pointer to a FDMonOps
-  guest offsets and use this function directly instead of calling
+struct.  See the patch for details.
   do_perform_cow_encrypt (which is removed by that patch).
-* Adjust qcow2_co_encdec to take full host and guest offsets as well.
+Semantic changes:
 . ppoll(2) now reflects events from pollfds[] back into AioHandlers
    while we're still on the clock for adaptive polling.  This was
    already happening for epoll(7), so if it's really an issue then we'll
    need to fix both in the future.
 . epoll(7)'s fallback to ppoll(2) while external events are disabled
    was broken when the number of fds exceeded the epoll(7) upgrade
    threshold.  I guess this code path simply wasn't tested and no one
    noticed the bug.  I didn't go out of my way to fix it but the correct
    code is simpler than preserving the bug.
-* Document the qcow2_co_{encrypt|decrypt} arguments
+I also took some liberties in removing the unnecessary
-  to prevent the bug fixed in former commit from hopefully
+AioContext->epoll_available (just check AioContext->epollfd != -1
-  happening again.
+instead) and AioContext->epoll_enabled (it's implicit if our
 AioContext->fdmon_ops callbacks are being invoked) fields.
-Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Message-id: 20190915203655.21638-3-mlevitsk@redhat.com
+Link: https://lore.kernel.org/r/20200305170806.1313245-4-stefanha@redhat.com
-Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
+Message-Id: <20200305170806.1313245-4-stefanha@redhat.com>
 [mreitz: Let perform_cow() return the error value returned by
          qcow2_co_encrypt(), as proposed by Vladimir]
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
- block/qcow2.h         |  8 +++---
+ MAINTAINERS         |   2 +
- block/qcow2-cluster.c | 41 +++++++++-------------------
+ include/block/aio.h |  36 +++++-
- block/qcow2-threads.c | 63 +++++++++++++++++++++++++++++++++----------
+ util/Makefile.objs  |   2 +
- block/qcow2.c         |  5 ++--
+ util/aio-posix.c    | 286 ++------------------------------------------
-files changed, 69 insertions(+), 48 deletions(-)
+ util/aio-posix.h    |  61 ++++++++++
  util/fdmon-epoll.c  | 151 +++++++++++++++++++++++
  util/fdmon-poll.c   | 104 ++++++++++++++++
 files changed, 366 insertions(+), 276 deletions(-)
  create mode 100644 util/aio-posix.h
  create mode 100644 util/fdmon-epoll.c
  create mode 100644 util/fdmon-poll.c
-diff --git a/block/qcow2.h b/block/qcow2.h
+diff --git a/MAINTAINERS b/MAINTAINERS
 index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2.h
+--- a/MAINTAINERS
-+++ b/block/qcow2.h
++++ b/MAINTAINERS
-@@ -XXX,XX +XXX,XX @@ ssize_t coroutine_fn
+@@ -XXX,XX +XXX,XX @@ L: qemu-block@nongnu.org
- qcow2_co_decompress(BlockDriverState *bs, void *dest, size_t dest_size,
+ S: Supported
-                     const void *src, size_t src_size);
+ F: util/async.c
- int coroutine_fn
+ F: util/aio-*.c
--qcow2_co_encrypt(BlockDriverState *bs, uint64_t file_cluster_offset,
++F: util/aio-*.h
--                 uint64_t offset, void *buf, size_t len);
++F: util/fdmon-*.c
-+qcow2_co_encrypt(BlockDriverState *bs, uint64_t host_offset,
+ F: block/io.c
-+                 uint64_t guest_offset, void *buf, size_t len);
+ F: migration/block*
- int coroutine_fn
+ F: include/block/aio.h
--qcow2_co_decrypt(BlockDriverState *bs, uint64_t file_cluster_offset,
+diff --git a/include/block/aio.h b/include/block/aio.h
 -                 uint64_t offset, void *buf, size_t len);
 +qcow2_co_decrypt(BlockDriverState *bs, uint64_t host_offset,
 +                 uint64_t guest_offset, void *buf, size_t len);
  #endif
 diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/qcow2-cluster.c
+--- a/include/block/aio.h
-+++ b/block/qcow2-cluster.c
++++ b/include/block/aio.h
-@@ -XXX,XX +XXX,XX @@ static int coroutine_fn do_perform_cow_read(BlockDriverState *bs,
+@@ -XXX,XX +XXX,XX @@ struct ThreadPool;
-     return 0;
+ struct LinuxAioState;
  struct LuringState;
 +/* Callbacks for file descriptor monitoring implementations */
 +typedef struct {
 +    /*
 +     * update:
 +     * @ctx: the AioContext
 +     * @node: the handler
 +     * @is_new: true if the file descriptor is not yet being monitored
 +     *
 +     * Add/remove/modify a monitored file descriptor.  There are three cases:
 +     * 1. node->pfd.events == 0 means remove the file descriptor.
 +     * 2. !is_new means modify an already monitored file descriptor.
 +     * 3. is_new means add a new file descriptor.
 +     *
 +     * Called with ctx->list_lock acquired.
 +     */
 +    void (*update)(AioContext *ctx, AioHandler *node, bool is_new);
 +
 +    /*
 +     * wait:
 +     * @ctx: the AioContext
 +     * @ready_list: list for handlers that become ready
 +     * @timeout: maximum duration to wait, in nanoseconds
 +     *
 +     * Wait for file descriptors to become ready and place them on ready_list.
 +     *
 +     * Called with ctx->list_lock incremented but not locked.
 +     *
 +     * Returns: number of ready file descriptors.
 +     */
 +    int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
 +} FDMonOps;
 +
  /*
   * Each aio_bh_poll() call carves off a slice of the BH list, so that newly
   * scheduled BHs are not processed until the next aio_bh_poll() call.  All
@@ -XXX,XX +XXX,XX @@ struct AioContext {
      /* epoll(7) state used when built with CONFIG_EPOLL */
      int epollfd;
 -    bool epoll_enabled;
 -    bool epoll_available;
 +
 +    const FDMonOps *fdmon_ops;
  };
  /**
 diff --git a/util/Makefile.objs b/util/Makefile.objs
 index XXXXXXX..XXXXXXX 100644
 --- a/util/Makefile.objs
 +++ b/util/Makefile.objs
@@ -XXX,XX +XXX,XX @@ util-obj-y += aiocb.o async.o aio-wait.o thread-pool.o qemu-timer.o
  util-obj-y += main-loop.o
  util-obj-$(call lnot,$(CONFIG_ATOMIC64)) += atomic64.o
  util-obj-$(CONFIG_POSIX) += aio-posix.o
 +util-obj-$(CONFIG_POSIX) += fdmon-poll.o
 +util-obj-$(CONFIG_EPOLL_CREATE1) += fdmon-epoll.o
  util-obj-$(CONFIG_POSIX) += compatfd.o
  util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
  util-obj-$(CONFIG_POSIX) += mmap-alloc.o
 diff --git a/util/aio-posix.c b/util/aio-posix.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/aio-posix.c
 +++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/sockets.h"
  #include "qemu/cutils.h"
  #include "trace.h"
 -#ifdef CONFIG_EPOLL_CREATE1
 -#include <sys/epoll.h>
 -#endif
 +#include "aio-posix.h"
 -struct AioHandler
 -{
 -    GPollFD pfd;
 -    IOHandler *io_read;
 -    IOHandler *io_write;
 -    AioPollFn *io_poll;
 -    IOHandler *io_poll_begin;
 -    IOHandler *io_poll_end;
 -    void *opaque;
 -    bool is_external;
 -    QLIST_ENTRY(AioHandler) node;
 -    QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
 -    QLIST_ENTRY(AioHandler) node_deleted;
 -};
 -
 -/* Add a handler to a ready list */
 -static void add_ready_handler(AioHandlerList *ready_list,
 -                              AioHandler *node,
 -                              int revents)
 +void aio_add_ready_handler(AioHandlerList *ready_list,
 +                           AioHandler *node,
 +                           int revents)
  {
      QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
      node->pfd.revents = revents;
      QLIST_INSERT_HEAD(ready_list, node, node_ready);
  }
--static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs,
+-#ifdef CONFIG_EPOLL_CREATE1
--                                                uint64_t src_cluster_offset,
+-
--                                                uint64_t cluster_offset,
+-/* The fd number threshold to switch to epoll */
--                                                unsigned offset_in_cluster,
+-#define EPOLL_ENABLE_THRESHOLD 64
--                                                uint8_t *buffer,
+-
--                                                unsigned bytes)
+-static void aio_epoll_disable(AioContext *ctx)
 -{
--    if (bytes && bs->encrypted) {
+-    ctx->epoll_enabled = false;
--        BDRVQcow2State *s = bs->opaque;
+-    if (!ctx->epoll_available) {
--        assert(QEMU_IS_ALIGNED(offset_in_cluster, BDRV_SECTOR_SIZE));
+-        return;
--        assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
+-    }
--        assert(s->crypto);
+-    ctx->epoll_available = false;
--        if (qcow2_co_encrypt(bs,
+-    close(ctx->epollfd);
--                start_of_cluster(s, cluster_offset + offset_in_cluster),
+-}
--                src_cluster_offset + offset_in_cluster,
+-
--                buffer, bytes) < 0) {
+-static inline int epoll_events_from_pfd(int pfd_events)
 -{
 -    return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
 -           (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
 -           (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
 -           (pfd_events & G_IO_ERR ? EPOLLERR : 0);
 -}
 -
 -static bool aio_epoll_try_enable(AioContext *ctx)
 -{
 -    AioHandler *node;
 -    struct epoll_event event;
 -
 -    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
 -        int r;
 -        if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) {
 -            continue;
 -        }
 -        event.events = epoll_events_from_pfd(node->pfd.events);
 -        event.data.ptr = node;
 -        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
 -        if (r) {
 -            return false;
 -        }
 -    }
+-    ctx->epoll_enabled = true;
 -    return true;
 -}
 -
- static int coroutine_fn do_perform_cow_write(BlockDriverState *bs,
+-static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
-                                              uint64_t cluster_offset,
+-{
-                                              unsigned offset_in_cluster,
+-    struct epoll_event event;
-@@ -XXX,XX +XXX,XX @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
+-    int r;
+-    int ctl;
-     /* Encrypt the data if necessary before writing it */
+-
-     if (bs->encrypted) {
+-    if (!ctx->epoll_enabled) {
--        if (!do_perform_cow_encrypt(bs, m->offset, m->alloc_offset,
+-        return;
--                                    start->offset, start_buffer,
+-    }
--                                    start->nb_bytes) ||
+-    if (!node->pfd.events) {
--            !do_perform_cow_encrypt(bs, m->offset, m->alloc_offset,
+-        ctl = EPOLL_CTL_DEL;
--                                    end->offset, end_buffer, end->nb_bytes)) {
+-    } else {
--            ret = -EIO;
+-        event.data.ptr = node;
-+        ret = qcow2_co_encrypt(bs,
+-        event.events = epoll_events_from_pfd(node->pfd.events);
-+                               m->alloc_offset + start->offset,
+-        ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
-+                               m->offset + start->offset,
+-    }
-+                               start_buffer, start->nb_bytes);
+-
-+        if (ret < 0) {
+-    r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
-+            goto fail;
+-    if (r) {
-+        }
+-        aio_epoll_disable(ctx);
-+
+-    }
-+        ret = qcow2_co_encrypt(bs,
+-}
-+                               m->alloc_offset + end->offset,
+-
-+                               m->offset + end->offset,
+-static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
-+                               end_buffer, end->nb_bytes);
+-                     int64_t timeout)
-+        if (ret < 0) {
+-{
-             goto fail;
+-    GPollFD pfd = {
 -        .fd = ctx->epollfd,
 -        .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
 -    };
 -    AioHandler *node;
 -    int i, ret = 0;
 -    struct epoll_event events[128];
 -
 -    if (timeout > 0) {
 -        ret = qemu_poll_ns(&pfd, 1, timeout);
 -        if (ret > 0) {
 -            timeout = 0;
 -        }
 -    }
 -    if (timeout <= 0 || ret > 0) {
 -        ret = epoll_wait(ctx->epollfd, events,
 -                         ARRAY_SIZE(events),
 -                         timeout);
 -        if (ret <= 0) {
 -            goto out;
 -        }
 -        for (i = 0; i < ret; i++) {
 -            int ev = events[i].events;
 -            int revents = (ev & EPOLLIN ? G_IO_IN : 0) |
 -                          (ev & EPOLLOUT ? G_IO_OUT : 0) |
 -                          (ev & EPOLLHUP ? G_IO_HUP : 0) |
 -                          (ev & EPOLLERR ? G_IO_ERR : 0);
 -
 -            node = events[i].data.ptr;
 -            add_ready_handler(ready_list, node, revents);
 -        }
 -    }
 -out:
 -    return ret;
 -}
 -
 -static bool aio_epoll_enabled(AioContext *ctx)
 -{
 -    /* Fall back to ppoll when external clients are disabled. */
 -    return !aio_external_disabled(ctx) && ctx->epoll_enabled;
 -}
 -
 -static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
 -                                 unsigned npfd, int64_t timeout)
 -{
 -    if (!ctx->epoll_available) {
 -        return false;
 -    }
 -    if (aio_epoll_enabled(ctx)) {
 -        return true;
 -    }
 -    if (npfd >= EPOLL_ENABLE_THRESHOLD) {
 -        if (aio_epoll_try_enable(ctx)) {
 -            return true;
 -        } else {
 -            aio_epoll_disable(ctx);
 -        }
 -    }
 -    return false;
 -}
 -
 -#else
 -
 -static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
 -{
 -}
 -
 -static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
 -                     int64_t timeout)
 -{
 -    assert(false);
 -}
 -
 -static bool aio_epoll_enabled(AioContext *ctx)
 -{
 -    return false;
 -}
 -
 -static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
 -                          unsigned npfd, int64_t timeout)
 -{
 -    return false;
 -}
 -
 -#endif
 -
  static AioHandler *find_aio_handler(AioContext *ctx, int fd)
  {
      AioHandler *node;
@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
                 atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
      if (new_node) {
 -        aio_epoll_update(ctx, new_node, is_new);
 +        ctx->fdmon_ops->update(ctx, new_node, is_new);
      } else if (node) {
          /* Unregister deleted fd_handler */
 -        aio_epoll_update(ctx, node, false);
 +        ctx->fdmon_ops->update(ctx, node, false);
      }
      qemu_lockcnt_unlock(&ctx->list_lock);
      aio_notify(ctx);
@@ -XXX,XX +XXX,XX @@ void aio_dispatch(AioContext *ctx)
      timerlistgroup_run_timers(&ctx->tlg);
  }
 -/* These thread-local variables are used only in a small part of aio_poll
 - * around the call to the poll() system call.  In particular they are not
 - * used while aio_poll is performing callbacks, which makes it much easier
 - * to think about reentrancy!
 - *
 - * Stack-allocated arrays would be perfect but they have size limitations;
 - * heap allocation is expensive enough that we want to reuse arrays across
 - * calls to aio_poll().  And because poll() has to be called without holding
 - * any lock, the arrays cannot be stored in AioContext.  Thread-local data
 - * has none of the disadvantages of these three options.
 - */
 -static __thread GPollFD *pollfds;
 -static __thread AioHandler **nodes;
 -static __thread unsigned npfd, nalloc;
 -static __thread Notifier pollfds_cleanup_notifier;
 -
 -static void pollfds_cleanup(Notifier *n, void *unused)
 -{
 -    g_assert(npfd == 0);
 -    g_free(pollfds);
 -    g_free(nodes);
 -    nalloc = 0;
 -}
 -
 -static void add_pollfd(AioHandler *node)
 -{
 -    if (npfd == nalloc) {
 -        if (nalloc == 0) {
 -            pollfds_cleanup_notifier.notify = pollfds_cleanup;
 -            qemu_thread_atexit_add(&pollfds_cleanup_notifier);
 -            nalloc = 8;
 -        } else {
 -            g_assert(nalloc <= INT_MAX);
 -            nalloc *= 2;
 -        }
 -        pollfds = g_renew(GPollFD, pollfds, nalloc);
 -        nodes = g_renew(AioHandler *, nodes, nalloc);
 -    }
 -    nodes[npfd] = node;
 -    pollfds[npfd] = (GPollFD) {
 -        .fd = node->pfd.fd,
 -        .events = node->pfd.events,
 -    };
 -    npfd++;
 -}
 -
  static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
  {
      bool progress = false;
@@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
  bool aio_poll(AioContext *ctx, bool blocking)
  {
      AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
 -    AioHandler *node;
 -    int i;
      int ret = 0;
      bool progress;
      int64_t timeout;
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
       * system call---a single round of run_poll_handlers_once suffices.
       */
      if (timeout || atomic_read(&ctx->poll_disable_cnt)) {
 -        assert(npfd == 0);
 -
 -        /* fill pollfds */
 -
 -        if (!aio_epoll_enabled(ctx)) {
 -            QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
 -                if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
 -                    && aio_node_check(ctx, node->is_external)) {
 -                    add_pollfd(node);
 -                }
 -            }
 -        }
 -
 -        /* wait until next event */
 -        if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
 -            npfd = 0; /* pollfds[] is not being used */
 -            ret = aio_epoll(ctx, &ready_list, timeout);
 -        } else  {
 -            ret = qemu_poll_ns(pollfds, npfd, timeout);
 -        }
 +        ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
      }
      if (blocking) {
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
          }
      }
-diff --git a/block/qcow2-threads.c b/block/qcow2-threads.c
-index XXXXXXX..XXXXXXX 100644
+-    /* if we have any readable fds, dispatch event */
---- a/block/qcow2-threads.c
+-    if (ret > 0) {
-+++ b/block/qcow2-threads.c
+-        for (i = 0; i < npfd; i++) {
-@@ -XXX,XX +XXX,XX @@ static int qcow2_encdec_pool_func(void *opaque)
+-            int revents = pollfds[i].revents;
 -
 -            if (revents) {
 -                add_ready_handler(&ready_list, nodes[i], revents);
 -            }
 -        }
 -    }
 -
 -    npfd = 0;
 -
      progress |= aio_bh_poll(ctx);
      if (ret > 0) {
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
  void aio_context_setup(AioContext *ctx)
  {
 -#ifdef CONFIG_EPOLL_CREATE1
 -    assert(!ctx->epollfd);
 -    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
 -    if (ctx->epollfd == -1) {
 -        fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno));
 -        ctx->epoll_available = false;
 -    } else {
 -        ctx->epoll_available = true;
 -    }
 -#endif
 +    ctx->fdmon_ops = &fdmon_poll_ops;
 +    ctx->epollfd = -1;
 +
 +    fdmon_epoll_setup(ctx);
  }
- static int coroutine_fn
+ void aio_context_destroy(AioContext *ctx)
 -qcow2_co_encdec(BlockDriverState *bs, uint64_t file_cluster_offset,
 -                  uint64_t offset, void *buf, size_t len, Qcow2EncDecFunc func)
 +qcow2_co_encdec(BlockDriverState *bs, uint64_t host_offset,
 +                uint64_t guest_offset, void *buf, size_t len,
 +                Qcow2EncDecFunc func)
  {
-     BDRVQcow2State *s = bs->opaque;
+-#ifdef CONFIG_EPOLL_CREATE1
-     Qcow2EncDecData arg = {
+-    aio_epoll_disable(ctx);
-         .block = s->crypto,
+-#endif
--        .offset = s->crypt_physical_offset ?
++    fdmon_epoll_disable(ctx);
 -                      file_cluster_offset + offset_into_cluster(s, offset) :
 -                      offset,
 +        .offset = s->crypt_physical_offset ? host_offset : guest_offset,
          .buf = buf,
          .len = len,
          .func = func,
      };
 -    return qcow2_co_process(bs, qcow2_encdec_pool_func, &arg);
 +    assert(QEMU_IS_ALIGNED(guest_offset, BDRV_SECTOR_SIZE));
 +    assert(QEMU_IS_ALIGNED(host_offset, BDRV_SECTOR_SIZE));
 +    assert(QEMU_IS_ALIGNED(len, BDRV_SECTOR_SIZE));
 +    assert(s->crypto);
 +
 +    return len == 0 ? 0 : qcow2_co_process(bs, qcow2_encdec_pool_func, &arg);
  }
+ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
+diff --git a/util/aio-posix.h b/util/aio-posix.h
+new file mode 100644
+index XXXXXXX..XXXXXXX
+--- /dev/null
++++ b/util/aio-posix.h
+@@ -XXX,XX +XXX,XX @@
 +/*
-+ * qcow2_co_encrypt()
++ * AioContext POSIX event loop implementation internal APIs
 + *
-+ * Encrypts one or more contiguous aligned sectors
++ * Copyright IBM, Corp. 2008
 + * Copyright Red Hat, Inc. 2020
 + *
-+ * @host_offset - underlying storage offset of the first sector of the
++ * Authors:
-+ * data to be encrypted
++ *  Anthony Liguori   <aliguori@us.ibm.com>
 + *
-+ * @guest_offset - guest (virtual) offset of the first sector of the
++ * This work is licensed under the terms of the GNU GPL, version 2.  See
-+ * data to be encrypted
++ * the COPYING file in the top-level directory.
 + *
-+ * @buf - buffer with the data to encrypt, that after encryption
++ * Contributions after 2012-01-13 are licensed under the terms of the
-+ *        will be written to the underlying storage device at
++ * GNU GPL, version 2 or (at your option) any later version.
-+ *        @host_offset
++ */
 +
 +#ifndef AIO_POSIX_H
 +#define AIO_POSIX_H
 +
 +#include "block/aio.h"
 +
 +struct AioHandler {
 +    GPollFD pfd; /* fd, requested events, and revents set when ready */
 +    IOHandler *io_read;
 +    IOHandler *io_write;
 +    AioPollFn *io_poll;
 +    IOHandler *io_poll_begin;
 +    IOHandler *io_poll_end;
 +    void *opaque;
 +    bool is_external; /* passed to aio_node_check() by the poll(2) code */
 +    QLIST_ENTRY(AioHandler) node; /* linkage in ctx->aio_handlers */
 +    QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
 +    QLIST_ENTRY(AioHandler) node_deleted; /* if inserted, fdmon skips node */
 +};
 +
 +/* Add a handler to a ready list */
 +void aio_add_ready_handler(AioHandlerList *ready_list, AioHandler *node,
 +                           int revents);
 +
 +extern const FDMonOps fdmon_poll_ops;
 +
 +#ifdef CONFIG_EPOLL_CREATE1
 +bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd);
 +void fdmon_epoll_setup(AioContext *ctx);
 +void fdmon_epoll_disable(AioContext *ctx);
 +#else
 +static inline bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
 +{
 +    return false;
 +}
 +
 +static inline void fdmon_epoll_setup(AioContext *ctx)
 +{
 +}
 +
 +static inline void fdmon_epoll_disable(AioContext *ctx)
 +{
 +}
 +#endif /* !CONFIG_EPOLL_CREATE1 */
 +
 +#endif /* AIO_POSIX_H */
 diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/util/fdmon-epoll.c
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: GPL-2.0-or-later */
 +/*
 + * epoll(7) file descriptor monitoring
 + */
 +
 +#include "qemu/osdep.h"
 +#include <sys/epoll.h>
 +#include "qemu/rcu_queue.h"
 +#include "aio-posix.h"
 +
 +/* The fd number threshold to switch to epoll */
 +#define EPOLL_ENABLE_THRESHOLD 64
 +
 +void fdmon_epoll_disable(AioContext *ctx)
 +{
 +    if (ctx->epollfd >= 0) { /* an epoll instance is currently open */
 +        close(ctx->epollfd);
 +        ctx->epollfd = -1; /* fdmon_epoll_try_upgrade() now returns false */
 +    }
 +
 +    /*
 +     * Switch back to the poll(2) implementation.  This is done even when no
 +     * epoll instance was open.
 +     */
 +    ctx->fdmon_ops = &fdmon_poll_ops;
 +}
 +
 +static inline int epoll_events_from_pfd(int pfd_events)
 +{ /* Map the G_IO_IN/OUT/HUP/ERR bits to EPOLL*; other bits are dropped */
 +    return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
 +           (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
 +           (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
 +           (pfd_events & G_IO_ERR ? EPOLLERR : 0);
 +}
 +
 +static void fdmon_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
 +{
 +    struct epoll_event event; /* left unset on the EPOLL_CTL_DEL path */
 +    int r;
 +    int ctl;
 +
 +    if (!node->pfd.events) { /* no events requested: stop monitoring the fd */
 +        ctl = EPOLL_CTL_DEL;
 +    } else {
 +        event.data.ptr = node; /* fdmon_epoll_wait() reads the handler back */
 +        event.events = epoll_events_from_pfd(node->pfd.events);
 +        ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
 +    }
 +
 +    r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
 +    if (r) { /* any epoll_ctl() failure drops the context back to poll(2) */
 +        fdmon_epoll_disable(ctx);
 +    }
 +}
 +
 +static int fdmon_epoll_wait(AioContext *ctx, AioHandlerList *ready_list,
 +                            int64_t timeout)
 +{
 +    GPollFD pfd = { /* the epoll fd itself, polled below for timed waits */
 +        .fd = ctx->epollfd,
 +        .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
 +    };
 +    AioHandler *node;
 +    int i, ret = 0;
 +    struct epoll_event events[128]; /* at most 128 events fetched per call */
 +
 +    /*
 +     * Fall back while external clients are disabled: the whole wait is
 +     * delegated to the poll(2) implementation.
 +     */
 +    if (atomic_read(&ctx->external_disable_cnt)) {
 +        return fdmon_poll_ops.wait(ctx, ready_list, timeout);
 +    }
 +
 +    if (timeout > 0) { /* timed wait: block in qemu_poll_ns() on the epoll fd */
 +        ret = qemu_poll_ns(&pfd, 1, timeout);
 +        if (ret > 0) {
 +            timeout = 0; /* events pending: epoll_wait() must not block */
 +        }
 +    }
 +    if (timeout <= 0 || ret > 0) { /* epoll_wait() only sees timeout <= 0 */
 +        ret = epoll_wait(ctx->epollfd, events,
 +                         ARRAY_SIZE(events),
 +                         timeout);
 +        if (ret <= 0) { /* nothing ready or error: returned to caller as is */
 +            goto out;
 +        }
 +        for (i = 0; i < ret; i++) {
 +            int ev = events[i].events;
 +            int revents = (ev & EPOLLIN ? G_IO_IN : 0) |
 +                          (ev & EPOLLOUT ? G_IO_OUT : 0) |
 +                          (ev & EPOLLHUP ? G_IO_HUP : 0) |
 +                          (ev & EPOLLERR ? G_IO_ERR : 0);
 +
 +            node = events[i].data.ptr; /* handler stored at epoll_ctl() time */
 +            aio_add_ready_handler(ready_list, node, revents);
 +        }
 +    }
 +out:
 +    return ret;
 +}
 +
 +static const FDMonOps fdmon_epoll_ops = {
 +    .update = fdmon_epoll_update,
 +    .wait = fdmon_epoll_wait,
 +};
 +
 +static bool fdmon_epoll_try_enable(AioContext *ctx)
 +{
 +    AioHandler *node;
 +    struct epoll_event event;
 +
 +    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
 +        int r;
 +        if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) {
 +            continue; /* deleted or event-less handlers are not registered */
 +        }
 +        event.events = epoll_events_from_pfd(node->pfd.events);
 +        event.data.ptr = node;
 +        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
 +        if (r) {
 +            return false; /* fdmon_epoll_try_upgrade() then disables epoll */
 +        }
 +    }
 +
 +    ctx->fdmon_ops = &fdmon_epoll_ops; /* every handler registered: switch */
 +    return true;
 +}
 +
 +bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
 +{
 +    if (ctx->epollfd < 0) { /* no epoll instance: setup failed or disabled */
 +        return false;
 +    }
 +
 +    /* Do not upgrade while external clients are disabled */
 +    if (atomic_read(&ctx->external_disable_cnt)) {
 +        return false;
 +    }
 +
 +    if (npfd >= EPOLL_ENABLE_THRESHOLD) { /* enough fds for epoll to pay off */
 +        if (fdmon_epoll_try_enable(ctx)) {
 +            return true;
 +        } else {
 +            fdmon_epoll_disable(ctx); /* closes epollfd, so no later retry */
 +        }
 +    }
 +    return false;
 +}
 +
 +void fdmon_epoll_setup(AioContext *ctx)
 +{
 +    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
 +    if (ctx->epollfd == -1) { /* non-fatal: try_upgrade sees epollfd < 0 */
 +        /* Terminate the diagnostic so it does not run into later output */
 +        fprintf(stderr, "Failed to create epoll instance: %s\n",
 +                strerror(errno));
 +    }
 +}
 diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/util/fdmon-poll.c
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: GPL-2.0-or-later */
 +/*
 + * poll(2) file descriptor monitoring
 + *
-+ * @len - length of the buffer (must be a BDRV_SECTOR_SIZE multiple)
++ * Uses ppoll(2) when available, g_poll() otherwise.
 + */
 +
 +#include "qemu/osdep.h"
 +#include "aio-posix.h"
 +#include "qemu/rcu_queue.h"
 +
 +/*
 + * These thread-local variables are used only in fdmon_poll_wait() around the
 + * call to the poll() system call.  In particular they are not used while
 + * aio_poll is performing callbacks, which makes it much easier to think about
 + * reentrancy!
 + *
-+ * Depending on the encryption method, @host_offset and/or @guest_offset
++ * Stack-allocated arrays would be perfect but they have size limitations;
-+ * may be used for generating the initialization vector for
++ * heap allocation is expensive enough that we want to reuse arrays across
-+ * encryption.
++ * calls to aio_poll().  And because poll() has to be called without holding
-+ *
++ * any lock, the arrays cannot be stored in AioContext.  Thread-local data
-+ * Note that while the whole range must be aligned on sectors, it
++ * has none of the disadvantages of these three options.
 + * does not have to be aligned on clusters and can also cross cluster
 + * boundaries
 + */
- int coroutine_fn
++static __thread GPollFD *pollfds;
--qcow2_co_encrypt(BlockDriverState *bs, uint64_t file_cluster_offset,
++static __thread AioHandler **nodes;
--                 uint64_t offset, void *buf, size_t len)
++static __thread unsigned npfd, nalloc;
-+qcow2_co_encrypt(BlockDriverState *bs, uint64_t host_offset,
++static __thread Notifier pollfds_cleanup_notifier;
-+                 uint64_t guest_offset, void *buf, size_t len)
++
- {
++static void pollfds_cleanup(Notifier *n, void *unused)
--    return qcow2_co_encdec(bs, file_cluster_offset, offset, buf, len,
++{
--                             qcrypto_block_encrypt);
++    g_assert(npfd == 0);
-+    return qcow2_co_encdec(bs, host_offset, guest_offset, buf, len,
++    g_free(pollfds);
-+                           qcrypto_block_encrypt);
++    g_free(nodes);
- }
++    nalloc = 0;
++}
-+/*
++
-+ * qcow2_co_decrypt()
++static void add_pollfd(AioHandler *node)
-+ *
++{
-+ * Decrypts one or more contiguous aligned sectors
++    if (npfd == nalloc) {
-+ * Similar to qcow2_co_encrypt
++        if (nalloc == 0) {
-+ */
++            pollfds_cleanup_notifier.notify = pollfds_cleanup;
- int coroutine_fn
++            qemu_thread_atexit_add(&pollfds_cleanup_notifier);
--qcow2_co_decrypt(BlockDriverState *bs, uint64_t file_cluster_offset,
++            nalloc = 8;
--                 uint64_t offset, void *buf, size_t len)
++        } else {
-+qcow2_co_decrypt(BlockDriverState *bs, uint64_t host_offset,
++            g_assert(nalloc <= INT_MAX);
-+                 uint64_t guest_offset, void *buf, size_t len)
++            nalloc *= 2;
- {
++        }
--    return qcow2_co_encdec(bs, file_cluster_offset, offset, buf, len,
++        pollfds = g_renew(GPollFD, pollfds, nalloc);
--                             qcrypto_block_decrypt);
++        nodes = g_renew(AioHandler *, nodes, nalloc);
-+    return qcow2_co_encdec(bs, host_offset, guest_offset, buf, len,
++    }
-+                           qcrypto_block_decrypt);
++    nodes[npfd] = node;
- }
++    pollfds[npfd] = (GPollFD) {
-diff --git a/block/qcow2.c b/block/qcow2.c
++        .fd = node->pfd.fd,
-index XXXXXXX..XXXXXXX 100644
++        .events = node->pfd.events,
---- a/block/qcow2.c
++    };
-+++ b/block/qcow2.c
++    npfd++;
-@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_preadv_part(BlockDriverState *bs,
++}
++
-                 assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
++static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list,
-                 assert(QEMU_IS_ALIGNED(cur_bytes, BDRV_SECTOR_SIZE));
++                            int64_t timeout)
--                if (qcow2_co_decrypt(bs, cluster_offset, offset,
++{
-+                if (qcow2_co_decrypt(bs, cluster_offset + offset_in_cluster,
++    AioHandler *node;
-+                                     offset,
++    int ret;
-                                      cluster_data, cur_bytes) < 0) {
++
-                     ret = -EIO;
++    assert(npfd == 0);
-                     goto fail;
++
-@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_pwritev_part(
++    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
-             qemu_iovec_to_buf(qiov, qiov_offset + bytes_done,
++        if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
-                               cluster_data, cur_bytes);
++                && aio_node_check(ctx, node->is_external)) {
++            add_pollfd(node);
--            if (qcow2_co_encrypt(bs, cluster_offset, offset,
++        }
-+            if (qcow2_co_encrypt(bs, cluster_offset + offset_in_cluster, offset,
++    }
-                                  cluster_data, cur_bytes) < 0) {
++
-                 ret = -EIO;
++    /* epoll(7) is faster above a certain number of fds */
-                 goto out_unlocked;
++    if (fdmon_epoll_try_upgrade(ctx, npfd)) {
 +        return ctx->fdmon_ops->wait(ctx, ready_list, timeout);
 +    }
 +
 +    ret = qemu_poll_ns(pollfds, npfd, timeout);
 +    if (ret > 0) {
 +        int i;
 +
 +        for (i = 0; i < npfd; i++) {
 +            int revents = pollfds[i].revents;
 +
 +            if (revents) {
 +                aio_add_ready_handler(ready_list, nodes[i], revents);
 +            }
 +        }
 +    }
 +
 +    npfd = 0;
 +    return ret;
 +}
 +
 +static void fdmon_poll_update(AioContext *ctx, AioHandler *node, bool is_new)
 +{
 +    /* Do nothing, AioHandler already contains the state we'll need */
 +}
 +
 +const FDMonOps fdmon_poll_ops = {
 +    .update = fdmon_poll_update,
 +    .wait = fdmon_poll_wait,
 +};
 --
-.21.0
+.24.1

-[Qemu-devel] [PULL 10/16] curl: Report only ready sockets
+[PULL 6/9] aio-posix: simplify FDMonOps->update() prototype
-Instead of reporting all sockets to cURL, only report the one that has
+The AioHandler *node, bool is_new arguments are more complicated to
-caused curl_multi_do_locked() to be called.  This lets us get rid of the
+think about than simply being given AioHandler *old_node, AioHandler
-QLIST_FOREACH_SAFE() list, which was actually wrong: SAFE foreaches are
+*new_node.
 only safe when the current element is removed in each iteration.  If it
 possible for the list to be concurrently modified, we cannot guarantee
 that only the current element will be removed.  Therefore, we must not
 use QLIST_FOREACH_SAFE() here.
-Fixes: ff5ca1664af85b24a4180d595ea6873fd3deac57
+Furthermore, the new Linux io_uring file descriptor monitoring mechanism
-Cc: qemu-stable@nongnu.org
+added by the next patch requires access to both the old and the new
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+nodes.  Make this change now in preparation.
-Message-id: 20190910124136.10565-6-mreitz@redhat.com
-Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Reviewed-by: John Snow <jsnow@redhat.com>
+Link: https://lore.kernel.org/r/20200305170806.1313245-5-stefanha@redhat.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Message-Id: <20200305170806.1313245-5-stefanha@redhat.com>
 ---
- block/curl.c | 17 ++++++-----------
+ include/block/aio.h | 13 ++++++-------
-file changed, 6 insertions(+), 11 deletions(-)
+ util/aio-posix.c    |  7 +------
  util/fdmon-epoll.c  | 21 ++++++++++++---------
  util/fdmon-poll.c   |  4 +++-
 files changed, 22 insertions(+), 23 deletions(-)
-diff --git a/block/curl.c b/block/curl.c
+diff --git a/include/block/aio.h b/include/block/aio.h
 index XXXXXXX..XXXXXXX 100644
---- a/block/curl.c
+--- a/include/block/aio.h
-+++ b/block/curl.c
++++ b/include/block/aio.h
-@@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s)
+@@ -XXX,XX +XXX,XX @@ typedef struct {
      /*
       * update:
       * @ctx: the AioContext
 -     * @node: the handler
 -     * @is_new: is the file descriptor already being monitored?
 +     * @old_node: the existing handler or NULL if this file descriptor is being
 +     *            monitored for the first time
 +     * @new_node: the new handler or NULL if this file descriptor is being
 +     *            removed
       *
 -     * Add/remove/modify a monitored file descriptor.  There are three cases:
 -     * 1. node->pfd.events == 0 means remove the file descriptor.
 -     * 2. !is_new means modify an already monitored file descriptor.
 -     * 3. is_new means add a new file descriptor.
 +     * Add/remove/modify a monitored file descriptor.
       *
       * Called with ctx->list_lock acquired.
       */
 -    void (*update)(AioContext *ctx, AioHandler *node, bool is_new);
 +    void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node);
      /*
       * wait:
 diff --git a/util/aio-posix.c b/util/aio-posix.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/aio-posix.c
 +++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
      atomic_set(&ctx->poll_disable_cnt,
                 atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
 -    if (new_node) {
 -        ctx->fdmon_ops->update(ctx, new_node, is_new);
 -    } else if (node) {
 -        /* Unregister deleted fd_handler */
 -        ctx->fdmon_ops->update(ctx, node, false);
 -    }
 +    ctx->fdmon_ops->update(ctx, node, new_node);
      qemu_lockcnt_unlock(&ctx->list_lock);
      aio_notify(ctx);
 diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/fdmon-epoll.c
 +++ b/util/fdmon-epoll.c
@@ -XXX,XX +XXX,XX @@ static inline int epoll_events_from_pfd(int pfd_events)
             (pfd_events & G_IO_ERR ? EPOLLERR : 0);
  }
- /* Called with s->mutex held.  */
+-static void fdmon_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
--static void curl_multi_do_locked(CURLSocket *ready_socket)
++static void fdmon_epoll_update(AioContext *ctx,
-+static void curl_multi_do_locked(CURLSocket *socket)
++                               AioHandler *old_node,
 +                               AioHandler *new_node)
  {
--    CURLSocket *socket, *next_socket;
+-    struct epoll_event event;
--    CURLState *s = ready_socket->state;
++    struct epoll_event event = {
-+    BDRVCURLState *s = socket->state->s;
++        .data.ptr = new_node,
-     int running;
++        .events = new_node ? epoll_events_from_pfd(new_node->pfd.events) : 0,
 +    };
      int r;
+-    int ctl;
--    if (!s->s->multi) {
-+    if (!s->multi) {
+-    if (!node->pfd.events) {
-         return;
+-        ctl = EPOLL_CTL_DEL;
 +    if (!new_node) {
 +        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, old_node->pfd.fd, &event);
 +    } else if (!old_node) {
 +        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, new_node->pfd.fd, &event);
      } else {
 -        event.data.ptr = node;
 -        event.events = epoll_events_from_pfd(node->pfd.events);
 -        ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
 +        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, new_node->pfd.fd, &event);
      }
--    /* Need to use _SAFE because curl_multi_socket_action() may trigger
+-    r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
--     * curl_sock_cb() which might modify this list */
+     if (r) {
--    QLIST_FOREACH_SAFE(socket, &s->sockets, next, next_socket) {
+         fdmon_epoll_disable(ctx);
--        do {
+     }
--            r = curl_multi_socket_action(s->s->multi, socket->fd, 0, &running);
+diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c
--        } while (r == CURLM_CALL_MULTI_PERFORM);
+index XXXXXXX..XXXXXXX 100644
--    }
+--- a/util/fdmon-poll.c
-+    do {
++++ b/util/fdmon-poll.c
-+        r = curl_multi_socket_action(s->multi, socket->fd, 0, &running);
+@@ -XXX,XX +XXX,XX @@ static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list,
-+    } while (r == CURLM_CALL_MULTI_PERFORM);
+     return ret;
  }
- static void curl_multi_do(void *arg)
+-static void fdmon_poll_update(AioContext *ctx, AioHandler *node, bool is_new)
 +static void fdmon_poll_update(AioContext *ctx,
 +                              AioHandler *old_node,
 +                              AioHandler *new_node)
  {
      /* Do nothing, AioHandler already contains the state we'll need */
  }
 --
-.21.0
+.24.1

-[Qemu-devel] [PULL 16/16] qemu-iotests: Add test for bz #1745922
+[PULL 7/9] aio-posix: add io_uring fd monitoring implementation
-From: Maxim Levitsky <mlevitsk@redhat.com>
+The recent Linux io_uring API has several advantages over ppoll(2) and
 epoll(2).  Details are given in the source code.
-Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Add an io_uring implementation and make it the default on Linux.
-Tested-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
+Performance is the same as with epoll(7) but later patches add
-Message-id: 20190915203655.21638-4-mlevitsk@redhat.com
+optimizations that take advantage of io_uring.
-Reviewed-by: Max Reitz <mreitz@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+It is necessary to change how aio_set_fd_handler() deals with deleting
 AioHandlers since removing monitored file descriptors is asynchronous in
 io_uring.  fdmon_io_uring_remove() marks the AioHandler deleted and
 aio_set_fd_handler() will let it handle deletion in that case.
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 Link: https://lore.kernel.org/r/20200305170806.1313245-6-stefanha@redhat.com
 Message-Id: <20200305170806.1313245-6-stefanha@redhat.com>
 ---
- tests/qemu-iotests/263     | 91 ++++++++++++++++++++++++++++++++++++++
+ configure             |   5 +
- tests/qemu-iotests/263.out | 40 +++++++++++++++++
+ include/block/aio.h   |   9 ++
- tests/qemu-iotests/group   |  1 +
+ util/Makefile.objs    |   1 +
-files changed, 132 insertions(+)
+ util/aio-posix.c      |  20 ++-
- create mode 100755 tests/qemu-iotests/263
+ util/aio-posix.h      |  20 ++-
- create mode 100644 tests/qemu-iotests/263.out
+ util/fdmon-io_uring.c | 326 ++++++++++++++++++++++++++++++++++++++++++
 files changed, 376 insertions(+), 5 deletions(-)
  create mode 100644 util/fdmon-io_uring.c
-diff --git a/tests/qemu-iotests/263 b/tests/qemu-iotests/263
+diff --git a/configure b/configure
-new file mode 100755
+index XXXXXXX..XXXXXXX 100755
-index XXXXXXX..XXXXXXX
+--- a/configure
---- /dev/null
++++ b/configure
-+++ b/tests/qemu-iotests/263
+@@ -XXX,XX +XXX,XX @@ if test "$linux_io_uring" != "no" ; then
      linux_io_uring_cflags=$($pkg_config --cflags liburing)
      linux_io_uring_libs=$($pkg_config --libs liburing)
      linux_io_uring=yes
 +
 +    # io_uring is used in libqemuutil.a where per-file -libs variables are not
 +    # seen by programs linking the archive.  It's not ideal, but just add the
 +    # library dependency globally.
 +    LIBS="$linux_io_uring_libs $LIBS"
    else
      if test "$linux_io_uring" = "yes" ; then
        feature_not_found "linux io_uring" "Install liburing devel"
 diff --git a/include/block/aio.h b/include/block/aio.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/block/aio.h
 +++ b/include/block/aio.h
 @@ -XXX,XX +XXX,XX @@
-+#!/usr/bin/env bash
+ #ifndef QEMU_AIO_H
-+#
+ #define QEMU_AIO_H
-+# Test encrypted write that crosses cluster boundary of two unallocated clusters
-+# Based on 188
++#ifdef CONFIG_LINUX_IO_URING
-+#
++#include <liburing.h>
-+# Copyright (C) 2019 Red Hat, Inc.
++#endif
-+#
+ #include "qemu/queue.h"
-+# This program is free software; you can redistribute it and/or modify
+ #include "qemu/event_notifier.h"
-+# it under the terms of the GNU General Public License as published by
+ #include "qemu/thread.h"
-+# the Free Software Foundation; either version 2 of the License, or
+@@ -XXX,XX +XXX,XX @@ struct BHListSlice {
-+# (at your option) any later version.
+     QSIMPLEQ_ENTRY(BHListSlice) next;
-+#
+ };
-+# This program is distributed in the hope that it will be useful,
-+# but WITHOUT ANY WARRANTY; without even the implied warranty of
++typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
-+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++
-+# GNU General Public License for more details.
+ struct AioContext {
-+#
+     GSource source;
-+# You should have received a copy of the GNU General Public License
-+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+@@ -XXX,XX +XXX,XX @@ struct AioContext {
-+#
+      * locking.
-+
+      */
-+# creator
+     struct LuringState *linux_io_uring;
-+owner=mlevitsk@redhat.com
++
-+
++    /* State for file descriptor monitoring using Linux io_uring */
-+seq=`basename $0`
++    struct io_uring fdmon_io_uring;
-+echo "QA output created by $seq"
++    AioHandlerSList submit_list;
-+
+ #endif
-+status=1    # failure is the default!
-+
+     /* TimerLists for calling timers - one per clock type.  Has its own
-+_cleanup()
+diff --git a/util/Makefile.objs b/util/Makefile.objs
-+{
+index XXXXXXX..XXXXXXX 100644
-+    _cleanup_test_img
+--- a/util/Makefile.objs
-+}
++++ b/util/Makefile.objs
-+trap "_cleanup; exit \$status" 0 1 2 3 15
+@@ -XXX,XX +XXX,XX @@ util-obj-$(call lnot,$(CONFIG_ATOMIC64)) += atomic64.o
-+
+ util-obj-$(CONFIG_POSIX) += aio-posix.o
-+# get standard environment, filters and checks
+ util-obj-$(CONFIG_POSIX) += fdmon-poll.o
-+. ./common.rc
+ util-obj-$(CONFIG_EPOLL_CREATE1) += fdmon-epoll.o
-+. ./common.filter
++util-obj-$(CONFIG_LINUX_IO_URING) += fdmon-io_uring.o
-+
+ util-obj-$(CONFIG_POSIX) += compatfd.o
-+_supported_fmt qcow2
+ util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
-+_supported_proto generic
+ util-obj-$(CONFIG_POSIX) += mmap-alloc.o
-+_supported_os Linux
+diff --git a/util/aio-posix.c b/util/aio-posix.c
-+
+index XXXXXXX..XXXXXXX 100644
-+
+--- a/util/aio-posix.c
-+size=1M
++++ b/util/aio-posix.c
-+
+@@ -XXX,XX +XXX,XX @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
-+SECRET="secret,id=sec0,data=astrochicken"
+         g_source_remove_poll(&ctx->source, &node->pfd);
-+QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
+     }
-+
-+
++    node->pfd.revents = 0;
-+_run_test()
++
-+{
++    /* If the fd monitor has already marked it deleted, leave it alone */
-+    echo "== reading the whole image =="
++    if (QLIST_IS_INSERTED(node, node_deleted)) {
-+    $QEMU_IO --object $SECRET -c "read -P 0 0 $size" --image-opts "$1" | _filter_qemu_io | _filter_testdir
++        return false;
-+
++    }
-+    echo
++
-+    echo "== write two 512 byte sectors on a cluster boundary =="
+     /* If a read is in progress, just mark the node as deleted */
-+    $QEMU_IO --object $SECRET -c "write -P 0xAA 0xFE00 0x400" --image-opts "$1" | _filter_qemu_io | _filter_testdir
+     if (qemu_lockcnt_count(&ctx->list_lock)) {
-+
+         QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
-+    echo
+-        node->pfd.revents = 0;
-+    echo "== verify that the rest of the image is not changed =="
+         return false;
-+    $QEMU_IO --object $SECRET -c "read -P 0x00 0x00000 0xFE00" --image-opts "$1" | _filter_qemu_io | _filter_testdir
+     }
-+    $QEMU_IO --object $SECRET -c "read -P 0xAA 0x0FE00 0x400" --image-opts "$1" | _filter_qemu_io | _filter_testdir
+     /* Otherwise, delete it for real.  We can't just mark it as
-+    $QEMU_IO --object $SECRET -c "read -P 0x00 0x10200 0xEFE00" --image-opts "$1" | _filter_qemu_io | _filter_testdir
+@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
-+
-+}
+         QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node);
-+
+     }
-+
+-    if (node) {
-+echo
+-        deleted = aio_remove_fd_handler(ctx, node);
-+echo "testing LUKS qcow2 encryption"
+-    }
-+echo
-+
+     /* No need to order poll_disable_cnt writes against other updates;
-+_make_test_img --object $SECRET -o "encrypt.format=luks,encrypt.key-secret=sec0,encrypt.iter-time=10,cluster_size=64K" $size
+      * the counter is only used to avoid wasting time and latency on
-+_run_test "driver=$IMGFMT,encrypt.key-secret=sec0,file.filename=$TEST_IMG"
+@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
-+_cleanup_test_img
+                atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
-+
-+echo
+     ctx->fdmon_ops->update(ctx, node, new_node);
-+echo "testing legacy AES qcow2 encryption"
++    if (node) {
-+echo
++        deleted = aio_remove_fd_handler(ctx, node);
-+
++    }
-+
+     qemu_lockcnt_unlock(&ctx->list_lock);
-+_make_test_img --object $SECRET -o "encrypt.format=aes,encrypt.key-secret=sec0,cluster_size=64K" $size
+     aio_notify(ctx);
-+_run_test "driver=$IMGFMT,encrypt.key-secret=sec0,file.filename=$TEST_IMG"
-+_cleanup_test_img
+@@ -XXX,XX +XXX,XX @@ void aio_context_setup(AioContext *ctx)
-+
+     ctx->fdmon_ops = &fdmon_poll_ops;
-+
+     ctx->epollfd = -1;
-+
-+# success, all done
++    /* Use the fastest fd monitoring implementation if available */
-+echo "*** done"
++    if (fdmon_io_uring_setup(ctx)) {
-+rm -f $seq.full
++        return;
-+status=0
++    }
-diff --git a/tests/qemu-iotests/263.out b/tests/qemu-iotests/263.out
++
      fdmon_epoll_setup(ctx);
  }
  void aio_context_destroy(AioContext *ctx)
  {
 +    fdmon_io_uring_destroy(ctx);
      fdmon_epoll_disable(ctx);
  }
 diff --git a/util/aio-posix.h b/util/aio-posix.h
 index XXXXXXX..XXXXXXX 100644
 --- a/util/aio-posix.h
 +++ b/util/aio-posix.h
@@ -XXX,XX +XXX,XX @@ struct AioHandler {
      IOHandler *io_poll_begin;
      IOHandler *io_poll_end;
      void *opaque;
 -    bool is_external;
      QLIST_ENTRY(AioHandler) node;
      QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
      QLIST_ENTRY(AioHandler) node_deleted;
 +#ifdef CONFIG_LINUX_IO_URING
 +    QSLIST_ENTRY(AioHandler) node_submitted;
 +    unsigned flags; /* see fdmon-io_uring.c */
 +#endif
 +    bool is_external;
  };
  /* Add a handler to a ready list */
@@ -XXX,XX +XXX,XX @@ static inline void fdmon_epoll_disable(AioContext *ctx)
  }
  #endif /* !CONFIG_EPOLL_CREATE1 */
 +#ifdef CONFIG_LINUX_IO_URING
 +bool fdmon_io_uring_setup(AioContext *ctx);
 +void fdmon_io_uring_destroy(AioContext *ctx);
 +#else
 +static inline bool fdmon_io_uring_setup(AioContext *ctx)
 +{
 +    return false;
 +}
 +
 +static inline void fdmon_io_uring_destroy(AioContext *ctx)
 +{
 +}
 +#endif /* !CONFIG_LINUX_IO_URING */
 +
  #endif /* AIO_POSIX_H */
 diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
-+++ b/tests/qemu-iotests/263.out
++++ b/util/fdmon-io_uring.c
 @@ -XXX,XX +XXX,XX @@
-+QA output created by 263
++/* SPDX-License-Identifier: GPL-2.0-or-later */
-+
++/*
-+testing LUKS qcow2 encryption
++ * Linux io_uring file descriptor monitoring
-+
++ *
-+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 encrypt.format=luks encrypt.key-secret=sec0 encrypt.iter-time=10
++ * The Linux io_uring API supports file descriptor monitoring with a few
-+== reading the whole image ==
++ * advantages over existing APIs like poll(2) and epoll(7):
-+read 1048576/1048576 bytes at offset 0
++ *
-+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++ * 1. Userspace polling of events is possible because the completion queue (cq
-+
++ *    ring) is shared between the kernel and userspace.  This allows
-+== write two 512 byte sectors on a cluster boundary ==
++ *    applications that rely on userspace polling to also monitor file
-+wrote 1024/1024 bytes at offset 65024
++ *    descriptors in the same userspace polling loop.
-+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++ *
-+
++ * 2. Submission and completion are batched and done together in a single system
-+== verify that the rest of the image is not changed ==
++ *    call.  This minimizes the number of system calls.
-+read 65024/65024 bytes at offset 0
++ *
-+63.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++ * 3. File descriptor monitoring is O(1) like epoll(7) so it scales better than
-+read 1024/1024 bytes at offset 65024
++ *    poll(2).
-+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++ *
-+read 982528/982528 bytes at offset 66048
++ * 4. Nanosecond timeouts are supported so it requires fewer syscalls than
-+959.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++ *    epoll(7).
-+
++ *
-+testing legacy AES qcow2 encryption
++ * This code only monitors file descriptors and does not do asynchronous disk
-+
++ * I/O.  Implementing disk I/O efficiently has other requirements and should
-+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 encrypt.format=aes encrypt.key-secret=sec0
++ * use a separate io_uring so it does not make sense to unify the code.
-+== reading the whole image ==
++ *
-+read 1048576/1048576 bytes at offset 0
++ * File descriptor monitoring is implemented using the following operations:
-+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++ *
-+
++ * 1. IORING_OP_POLL_ADD - adds a file descriptor to be monitored.
-+== write two 512 byte sectors on a cluster boundary ==
++ * 2. IORING_OP_POLL_REMOVE - removes a file descriptor being monitored.  When
-+wrote 1024/1024 bytes at offset 65024
++ *    the poll mask changes for a file descriptor it is first removed and then
-+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++ *    re-added with the new poll mask, so this operation is also used as part
-+
++ *    of modifying an existing monitored file descriptor.
-+== verify that the rest of the image is not changed ==
++ * 3. IORING_OP_TIMEOUT - added every time a blocking syscall is made to wait
-+read 65024/65024 bytes at offset 0
++ *    for events.  This operation self-cancels if another event completes
-+63.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++ *    before the timeout.
-+read 1024/1024 bytes at offset 65024
++ *
-+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++ * io_uring calls the submission queue the "sq ring" and the completion queue
-+read 982528/982528 bytes at offset 66048
++ * the "cq ring".  Ring entries are called "sqe" and "cqe", respectively.
-+959.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
++ *
-+*** done
++ * The code is structured so that sq/cq rings are only modified within
-diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
++ * fdmon_io_uring_wait().  Changes to AioHandlers are made by enqueuing them on
-index XXXXXXX..XXXXXXX 100644
++ * ctx->submit_list so that fdmon_io_uring_wait() can submit IORING_OP_POLL_ADD
---- a/tests/qemu-iotests/group
++ * and/or IORING_OP_POLL_REMOVE sqes for them.
-+++ b/tests/qemu-iotests/group
++ */
-@@ -XXX,XX +XXX,XX @@
++
-rw
++#include "qemu/osdep.h"
-rw quick
++#include <poll.h>
-rw quick migration
++#include "qemu/rcu_queue.h"
-+263 rw quick
++#include "aio-posix.h"
-rw auto quick
++
-rw quick
++enum {
 +    FDMON_IO_URING_ENTRIES  = 128, /* sq/cq ring size */
 +
 +    /* AioHandler::flags */
 +    FDMON_IO_URING_PENDING  = (1 << 0),
 +    FDMON_IO_URING_ADD      = (1 << 1),
 +    FDMON_IO_URING_REMOVE   = (1 << 2),
 +};
 +
 +static inline int poll_events_from_pfd(int pfd_events)
 +{
 +    return (pfd_events & G_IO_IN ? POLLIN : 0) |
 +           (pfd_events & G_IO_OUT ? POLLOUT : 0) |
 +           (pfd_events & G_IO_HUP ? POLLHUP : 0) |
 +           (pfd_events & G_IO_ERR ? POLLERR : 0);
 +}
 +
 +static inline int pfd_events_from_poll(int poll_events)
 +{
 +    return (poll_events & POLLIN ? G_IO_IN : 0) |
 +           (poll_events & POLLOUT ? G_IO_OUT : 0) |
 +           (poll_events & POLLHUP ? G_IO_HUP : 0) |
 +           (poll_events & POLLERR ? G_IO_ERR : 0);
 +}
 +
 +/*
 + * Returns an sqe for submitting a request.  Must only be called within
 + * fdmon_io_uring_wait().
 + */
 +static struct io_uring_sqe *get_sqe(AioContext *ctx)
 +{
 +    struct io_uring *ring = &ctx->fdmon_io_uring;
 +    struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
 +    int ret;
 +
 +    if (likely(sqe)) {
 +        return sqe;
 +    }
 +
 +    /* No free sqes left, submit pending sqes first */
 +    ret = io_uring_submit(ring);
 +    assert(ret > 1);
 +    sqe = io_uring_get_sqe(ring);
 +    assert(sqe);
 +    return sqe;
 +}
 +
 +/* Atomically enqueue an AioHandler for sq ring submission */
 +static void enqueue(AioHandlerSList *head, AioHandler *node, unsigned flags)
 +{
 +    unsigned old_flags;
 +
 +    old_flags = atomic_fetch_or(&node->flags, FDMON_IO_URING_PENDING | flags);
 +    if (!(old_flags & FDMON_IO_URING_PENDING)) {
 +        QSLIST_INSERT_HEAD_ATOMIC(head, node, node_submitted);
 +    }
 +}
 +
 +/* Dequeue an AioHandler for sq ring submission.  Called by fill_sq_ring(). */
 +static AioHandler *dequeue(AioHandlerSList *head, unsigned *flags)
 +{
 +    AioHandler *node = QSLIST_FIRST(head);
 +
 +    if (!node) {
 +        return NULL;
 +    }
 +
 +    /* Doesn't need to be atomic since fill_sq_ring() moves the list */
 +    QSLIST_REMOVE_HEAD(head, node_submitted);
 +
 +    /*
 +     * Don't clear FDMON_IO_URING_REMOVE.  It's sticky so it can serve two
 +     * purposes: telling fill_sq_ring() to submit IORING_OP_POLL_REMOVE and
 +     * telling process_cqe() to delete the AioHandler when its
 +     * IORING_OP_POLL_ADD completes.
 +     */
 +    *flags = atomic_fetch_and(&node->flags, ~(FDMON_IO_URING_PENDING |
 +                                              FDMON_IO_URING_ADD));
 +    return node;
 +}
 +
 +static void fdmon_io_uring_update(AioContext *ctx,
 +                                  AioHandler *old_node,
 +                                  AioHandler *new_node)
 +{
 +    if (new_node) {
 +        enqueue(&ctx->submit_list, new_node, FDMON_IO_URING_ADD);
 +    }
 +
 +    if (old_node) {
 +        /*
 +         * Deletion is tricky because IORING_OP_POLL_ADD and
 +         * IORING_OP_POLL_REMOVE are async.  We need to wait for the original
 +         * IORING_OP_POLL_ADD to complete before this handler can be freed
 +         * safely.
 +         *
 +         * It's possible that the file descriptor becomes ready and the
 +         * IORING_OP_POLL_ADD cqe is enqueued before IORING_OP_POLL_REMOVE is
 +         * submitted, too.
 +         *
 +         * Mark this handler deleted right now but don't place it on
 +         * ctx->deleted_aio_handlers yet.  Instead, manually fudge the list
 +         * entry to make QLIST_IS_INSERTED() think this handler has been
 +         * inserted and other code recognizes this AioHandler as deleted.
 +         *
 +         * Once the original IORING_OP_POLL_ADD completes we enqueue the
 +         * handler on the real ctx->deleted_aio_handlers list to be freed.
 +         */
 +        assert(!QLIST_IS_INSERTED(old_node, node_deleted));
 +        old_node->node_deleted.le_prev = &old_node->node_deleted.le_next;
 +
 +        enqueue(&ctx->submit_list, old_node, FDMON_IO_URING_REMOVE);
 +    }
 +}
 +
 +static void add_poll_add_sqe(AioContext *ctx, AioHandler *node)
 +{
 +    struct io_uring_sqe *sqe = get_sqe(ctx);
 +    int events = poll_events_from_pfd(node->pfd.events);
 +
 +    io_uring_prep_poll_add(sqe, node->pfd.fd, events);
 +    io_uring_sqe_set_data(sqe, node);
 +}
 +
 +static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node)
 +{
 +    struct io_uring_sqe *sqe = get_sqe(ctx);
 +
 +    io_uring_prep_poll_remove(sqe, node);
 +}
 +
 +/* Add a timeout that self-cancels when another cqe becomes ready */
 +static void add_timeout_sqe(AioContext *ctx, int64_t ns)
 +{
 +    struct io_uring_sqe *sqe;
 +    struct __kernel_timespec ts = {
 +        .tv_sec = ns / NANOSECONDS_PER_SECOND,
 +        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
 +    };
 +
 +    sqe = get_sqe(ctx);
 +    io_uring_prep_timeout(sqe, &ts, 1, 0);
 +}
 +
 +/* Add sqes from ctx->submit_list for submission */
 +static void fill_sq_ring(AioContext *ctx)
 +{
 +    AioHandlerSList submit_list;
 +    AioHandler *node;
 +    unsigned flags;
 +
 +    QSLIST_MOVE_ATOMIC(&submit_list, &ctx->submit_list);
 +
 +    while ((node = dequeue(&submit_list, &flags))) {
 +        /* Order matters, just in case both flags were set */
 +        if (flags & FDMON_IO_URING_ADD) {
 +            add_poll_add_sqe(ctx, node);
 +        }
 +        if (flags & FDMON_IO_URING_REMOVE) {
 +            add_poll_remove_sqe(ctx, node);
 +        }
 +    }
 +}
 +
 +/* Returns true if a handler became ready */
 +static bool process_cqe(AioContext *ctx,
 +                        AioHandlerList *ready_list,
 +                        struct io_uring_cqe *cqe)
 +{
 +    AioHandler *node = io_uring_cqe_get_data(cqe);
 +    unsigned flags;
 +
 +    /* poll_timeout and poll_remove have a zero user_data field */
 +    if (!node) {
 +        return false;
 +    }
 +
 +    /*
 +     * Deletion can only happen when IORING_OP_POLL_ADD completes.  If we race
 +     * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE
 +     * bit before IORING_OP_POLL_REMOVE is submitted.
 +     */
 +    flags = atomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE);
 +    if (flags & FDMON_IO_URING_REMOVE) {
 +        QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
 +        return false;
 +    }
 +
 +    aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res));
 +
 +    /* IORING_OP_POLL_ADD is one-shot so we must re-arm it */
 +    add_poll_add_sqe(ctx, node);
 +    return true;
 +}
 +
 +static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
 +{
 +    struct io_uring *ring = &ctx->fdmon_io_uring;
 +    struct io_uring_cqe *cqe;
 +    unsigned num_cqes = 0;
 +    unsigned num_ready = 0;
 +    unsigned head;
 +
 +    io_uring_for_each_cqe(ring, head, cqe) {
 +        if (process_cqe(ctx, ready_list, cqe)) {
 +            num_ready++;
 +        }
 +
 +        num_cqes++;
 +    }
 +
 +    io_uring_cq_advance(ring, num_cqes);
 +    return num_ready;
 +}
 +
 +static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
 +                               int64_t timeout)
 +{
 +    unsigned wait_nr = 1; /* block until at least one cqe is ready */
 +    int ret;
 +
 +    /* Fall back while external clients are disabled */
 +    if (atomic_read(&ctx->external_disable_cnt)) {
 +        return fdmon_poll_ops.wait(ctx, ready_list, timeout);
 +    }
 +
 +    if (timeout == 0) {
 +        wait_nr = 0; /* non-blocking */
 +    } else if (timeout > 0) {
 +        add_timeout_sqe(ctx, timeout);
 +    }
 +
 +    fill_sq_ring(ctx);
 +
 +    ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
 +    assert(ret >= 0);
 +
 +    return process_cq_ring(ctx, ready_list);
 +}
 +
 +static const FDMonOps fdmon_io_uring_ops = {
 +    .update = fdmon_io_uring_update,
 +    .wait = fdmon_io_uring_wait,
 +};
 +
 +bool fdmon_io_uring_setup(AioContext *ctx)
 +{
 +    int ret;
 +
 +    ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
 +    if (ret != 0) {
 +        return false;
 +    }
 +
 +    QSLIST_INIT(&ctx->submit_list);
 +    ctx->fdmon_ops = &fdmon_io_uring_ops;
 +    return true;
 +}
 +
 +void fdmon_io_uring_destroy(AioContext *ctx)
 +{
 +    if (ctx->fdmon_ops == &fdmon_io_uring_ops) {
 +        AioHandler *node;
 +
 +        io_uring_queue_exit(&ctx->fdmon_io_uring);
 +
 +        /* No need to submit these anymore, just free them. */
 +        while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) {
 +            QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted);
 +            QLIST_REMOVE(node, node);
 +            g_free(node);
 +        }
 +
 +        ctx->fdmon_ops = &fdmon_poll_ops;
 +    }
 +}
 --
-.21.0
+.24.1

-[Qemu-devel] [PULL 09/16] curl: Pass CURLSocket to curl_multi_do()
+[PULL 8/9] aio-posix: support userspace polling of fd monitoring
-curl_multi_do_locked() currently marks all sockets as ready.  That is
+Unlike ppoll(2) and epoll(7), Linux io_uring completions can be polled
-not only inefficient, but in fact unsafe (the loop is).  A follow-up
+from userspace.  Previously userspace polling was only allowed when all
-patch will change that, but to do so, curl_multi_do_locked() needs to
+AioHandlers had an ->io_poll() callback.  This prevented starvation of
-know exactly which socket is ready; and that is accomplished by this
+fds by userspace pollable handlers.
 patch here.
-Cc: qemu-stable@nongnu.org
+Add the FDMonOps->need_wait() callback that enables userspace polling
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+even when some AioHandlers lack ->io_poll().
-Message-id: 20190910124136.10565-5-mreitz@redhat.com
-Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
+For example, it's now possible to do userspace polling when a TCP/IP
-Reviewed-by: John Snow <jsnow@redhat.com>
+socket is monitored thanks to Linux io_uring.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 Link: https://lore.kernel.org/r/20200305170806.1313245-7-stefanha@redhat.com
 Message-Id: <20200305170806.1313245-7-stefanha@redhat.com>
 ---
- block/curl.c | 20 +++++++++++---------
+ include/block/aio.h   | 19 +++++++++++++++++++
-file changed, 11 insertions(+), 9 deletions(-)
+ util/aio-posix.c      | 11 ++++++++---
  util/fdmon-epoll.c    |  1 +
  util/fdmon-io_uring.c |  6 ++++++
  util/fdmon-poll.c     |  1 +
 files changed, 35 insertions(+), 3 deletions(-)
-diff --git a/block/curl.c b/block/curl.c
+diff --git a/include/block/aio.h b/include/block/aio.h
 index XXXXXXX..XXXXXXX 100644
---- a/block/curl.c
+--- a/include/block/aio.h
-+++ b/block/curl.c
++++ b/include/block/aio.h
-@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
+@@ -XXX,XX +XXX,XX @@ struct ThreadPool;
-     switch (action) {
+ struct LinuxAioState;
-         case CURL_POLL_IN:
+ struct LuringState;
-             aio_set_fd_handler(s->aio_context, fd, false,
--                               curl_multi_do, NULL, NULL, state);
++/* Is polling disabled? */
-+                               curl_multi_do, NULL, NULL, socket);
++bool aio_poll_disabled(AioContext *ctx);
-             break;
++
-         case CURL_POLL_OUT:
+ /* Callbacks for file descriptor monitoring implementations */
-             aio_set_fd_handler(s->aio_context, fd, false,
+ typedef struct {
--                               NULL, curl_multi_do, NULL, state);
+     /*
-+                               NULL, curl_multi_do, NULL, socket);
+@@ -XXX,XX +XXX,XX @@ typedef struct {
-             break;
+      * Returns: number of ready file descriptors.
-         case CURL_POLL_INOUT:
+      */
-             aio_set_fd_handler(s->aio_context, fd, false,
+     int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
--                               curl_multi_do, curl_multi_do, NULL, state);
++
-+                               curl_multi_do, curl_multi_do, NULL, socket);
++    /*
-             break;
++     * need_wait:
-         case CURL_POLL_REMOVE:
++     * @ctx: the AioContext
-             aio_set_fd_handler(s->aio_context, fd, false,
++     *
-@@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s)
++     * Tell aio_poll() when to stop userspace polling early because ->wait()
 +     * has fds ready.
 +     *
 +     * File descriptor monitoring implementations that cannot poll fd readiness
 +     * from userspace should use aio_poll_disabled() here.  This ensures that
 +     * file descriptors are not starved by handlers that frequently make
 +     * progress via userspace polling.
 +     *
 +     * Returns: true if ->wait() should be called, false otherwise.
 +     */
 +    bool (*need_wait)(AioContext *ctx);
  } FDMonOps;
  /*
 diff --git a/util/aio-posix.c b/util/aio-posix.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/aio-posix.c
 +++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@
  #include "trace.h"
  #include "aio-posix.h"
 +bool aio_poll_disabled(AioContext *ctx)
 +{
 +    return atomic_read(&ctx->poll_disable_cnt);
 +}
 +
  void aio_add_ready_handler(AioHandlerList *ready_list,
                             AioHandler *node,
                             int revents)
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
          elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
          max_ns = qemu_soonest_timeout(*timeout, max_ns);
          assert(!(max_ns && progress));
 -    } while (elapsed_time < max_ns && !atomic_read(&ctx->poll_disable_cnt));
 +    } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
      /* If time has passed with no successful polling, adjust *timeout to
       * keep the same ending time.
@@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
  {
      int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
 -    if (max_ns && !atomic_read(&ctx->poll_disable_cnt)) {
 +    if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
          poll_set_started(ctx, true);
          if (run_poll_handlers(ctx, max_ns, timeout)) {
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
      /* If polling is allowed, non-blocking aio_poll does not need the
       * system call---a single round of run_poll_handlers_once suffices.
       */
 -    if (timeout || atomic_read(&ctx->poll_disable_cnt)) {
 +    if (timeout || ctx->fdmon_ops->need_wait(ctx)) {
          ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
      }
 diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/fdmon-epoll.c
 +++ b/util/fdmon-epoll.c
@@ -XXX,XX +XXX,XX @@ out:
  static const FDMonOps fdmon_epoll_ops = {
      .update = fdmon_epoll_update,
      .wait = fdmon_epoll_wait,
 +    .need_wait = aio_poll_disabled,
  };
  static bool fdmon_epoll_try_enable(AioContext *ctx)
 diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/fdmon-io_uring.c
 +++ b/util/fdmon-io_uring.c
@@ -XXX,XX +XXX,XX @@ static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
      return process_cq_ring(ctx, ready_list);
  }
- /* Called with s->mutex held.  */
++static bool fdmon_io_uring_need_wait(AioContext *ctx)
--static void curl_multi_do_locked(CURLState *s)
++{
-+static void curl_multi_do_locked(CURLSocket *ready_socket)
++    return io_uring_cq_ready(&ctx->fdmon_io_uring);
- {
++}
-     CURLSocket *socket, *next_socket;
++
-+    CURLState *s = ready_socket->state;
+ static const FDMonOps fdmon_io_uring_ops = {
-     int running;
+     .update = fdmon_io_uring_update,
-     int r;
+     .wait = fdmon_io_uring_wait,
++    .need_wait = fdmon_io_uring_need_wait,
-@@ -XXX,XX +XXX,XX @@ static void curl_multi_do_locked(CURLState *s)
+ };
- static void curl_multi_do(void *arg)
+ bool fdmon_io_uring_setup(AioContext *ctx)
- {
+diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c
--    CURLState *s = (CURLState *)arg;
+index XXXXXXX..XXXXXXX 100644
-+    CURLSocket *socket = arg;
+--- a/util/fdmon-poll.c
-+    BDRVCURLState *s = socket->state->s;
++++ b/util/fdmon-poll.c
+@@ -XXX,XX +XXX,XX @@ static void fdmon_poll_update(AioContext *ctx,
--    qemu_mutex_lock(&s->s->mutex);
+ const FDMonOps fdmon_poll_ops = {
--    curl_multi_do_locked(s);
+     .update = fdmon_poll_update,
--    curl_multi_check_completion(s->s);
+     .wait = fdmon_poll_wait,
--    qemu_mutex_unlock(&s->s->mutex);
++    .need_wait = aio_poll_disabled,
-+    qemu_mutex_lock(&s->mutex);
+ };
 +    curl_multi_do_locked(socket);
 +    curl_multi_check_completion(s);
 +    qemu_mutex_unlock(&s->mutex);
  }
  static void curl_multi_timeout_do(void *arg)
 --
-.21.0
+.24.1

-[Qemu-devel] [PULL 11/16] curl: Handle success in multi_check_completion
+Deleted patch
-Background: As of cURL 7.59.0, it verifies that several functions are
-not called from within a callback.  Among these functions is
-curl_multi_add_handle().
-curl_read_cb() is a callback from cURL and not a coroutine.  Waking up
-acb->co will lead to entering it then and there, which means the current
-request will settle and the caller (if it runs in the same coroutine)
-may then issue the next request.  In such a case, we will enter
-curl_setup_preadv() effectively from within curl_read_cb().
-Calling curl_multi_add_handle() will then fail and the new request will
-not be processed.
-Fix this by not letting curl_read_cb() wake up acb->co.  Instead, leave
-the whole business of settling the AIOCB objects to
-curl_multi_check_completion() (which is called from our timer callback
-and our FD handler, so not from any cURL callbacks).
-Reported-by: Natalie Gavrielov <ngavrilo@redhat.com>
-Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1740193
-Cc: qemu-stable@nongnu.org
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Message-id: 20190910124136.10565-7-mreitz@redhat.com
-Reviewed-by: John Snow <jsnow@redhat.com>
-Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
- block/curl.c | 69 ++++++++++++++++++++++------------------------------
-file changed, 29 insertions(+), 40 deletions(-)
-diff --git a/block/curl.c b/block/curl.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/curl.c
-+++ b/block/curl.c
-@@ -XXX,XX +XXX,XX @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
- {
-     CURLState *s = ((CURLState*)opaque);
-     size_t realsize = size * nmemb;
--    int i;
-     trace_curl_read_cb(realsize);
-@@ -XXX,XX +XXX,XX @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
-     memcpy(s->orig_buf + s->buf_off, ptr, realsize);
-     s->buf_off += realsize;
--    for(i=0; i<CURL_NUM_ACB; i++) {
--        CURLAIOCB *acb = s->acb[i];
--
--        if (!acb)
--            continue;
--
--        if ((s->buf_off >= acb->end)) {
--            size_t request_length = acb->bytes;
--
--            qemu_iovec_from_buf(acb->qiov, 0, s->orig_buf + acb->start,
--                                acb->end - acb->start);
--
--            if (acb->end - acb->start < request_length) {
--                size_t offset = acb->end - acb->start;
--                qemu_iovec_memset(acb->qiov, offset, 0,
--                                  request_length - offset);
--            }
--
--            acb->ret = 0;
--            s->acb[i] = NULL;
--            qemu_mutex_unlock(&s->s->mutex);
--            aio_co_wake(acb->co);
--            qemu_mutex_lock(&s->s->mutex);
--        }
--    }
--
- read_end:
-     /* curl will error out if we do not return this value */
-     return size * nmemb;
-@@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s)
-             break;
-         if (msg->msg == CURLMSG_DONE) {
-+            int i;
-             CURLState *state = NULL;
-+            bool error = msg->data.result != CURLE_OK;
-+
-             curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE,
-                               (char **)&state);
--            /* ACBs for successful messages get completed in curl_read_cb */
--            if (msg->data.result != CURLE_OK) {
--                int i;
-+            if (error) {
-                 static int errcount = 100;
-                 /* Don't lose the original error message from curl, since
-@@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s)
-                         error_report("curl: further errors suppressed");
-                     }
-                 }
-+            }
--                for (i = 0; i < CURL_NUM_ACB; i++) {
--                    CURLAIOCB *acb = state->acb[i];
-+            for (i = 0; i < CURL_NUM_ACB; i++) {
-+                CURLAIOCB *acb = state->acb[i];
--                    if (acb == NULL) {
--                        continue;
--                    }
-+                if (acb == NULL) {
-+                    continue;
-+                }
-+
-+                if (!error) {
-+                    /* Assert that we have read all data */
-+                    assert(state->buf_off >= acb->end);
-+
-+                    qemu_iovec_from_buf(acb->qiov, 0,
-+                                        state->orig_buf + acb->start,
-+                                        acb->end - acb->start);
--                    acb->ret = -EIO;
--                    state->acb[i] = NULL;
--                    qemu_mutex_unlock(&s->mutex);
--                    aio_co_wake(acb->co);
--                    qemu_mutex_lock(&s->mutex);
-+                    if (acb->end - acb->start < acb->bytes) {
-+                        size_t offset = acb->end - acb->start;
-+                        qemu_iovec_memset(acb->qiov, offset, 0,
-+                                          acb->bytes - offset);
-+                    }
-                 }
-+
-+                acb->ret = error ? -EIO : 0;
-+                state->acb[i] = NULL;
-+                qemu_mutex_unlock(&s->mutex);
-+                aio_co_wake(acb->co);
-+                qemu_mutex_lock(&s->mutex);
-             }
-             curl_clean_state(state);
---
-.21.0

-[Qemu-devel] [PULL 13/16] blockjob: update nodes head while removing all bdrv
+[PULL 9/9] aio-posix: remove idle poll handlers to improve scalability
-From: Sergio Lopez <slp@redhat.com>
+When there are many poll handlers it's likely that some of them are idle
+most of the time.  Remove handlers that haven't had activity recently so
-block_job_remove_all_bdrv() iterates through job->nodes, calling
+that the polling loop scales better for guests with a large number of
-bdrv_root_unref_child() for each entry. The call to the latter may
+devices.
-reach child_job_[can_]set_aio_ctx(), which will also attempt to
-traverse job->nodes, potentially finding entries that were freed
+This feature only takes effect for the Linux io_uring fd monitoring
-on previous iterations.
+implementation because it is capable of combining fd monitoring with
+userspace polling.  The other implementations can't do that and risk
-To avoid this situation, update job->nodes head on each iteration to
+starving fds in favor of poll handlers, so don't try this optimization
-ensure that already freed entries are no longer linked to the list.
+when they are in use.
-RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1746631
+IOPS improves from 10k to 105k when the guest has 100
-Signed-off-by: Sergio Lopez <slp@redhat.com>
+virtio-blk-pci,num-queues=32 devices and 1 virtio-blk-pci,num-queues=1
-Cc: qemu-stable@nongnu.org
+device for rw=randread,iodepth=1,bs=4k,ioengine=libaio on NVMe.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Message-id: 20190911100316.32282-1-mreitz@redhat.com
+[Clarified aio_poll_handlers locking discipline explanation in comment
-Reviewed-by: Sergio Lopez <slp@redhat.com>
+after discussion with Paolo Bonzini <pbonzini@redhat.com>.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+--Stefan]
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 Link: https://lore.kernel.org/r/20200305170806.1313245-8-stefanha@redhat.com
 Message-Id: <20200305170806.1313245-8-stefanha@redhat.com>
 ---
- blockjob.c | 17 +++++++++++++----
+ include/block/aio.h |  8 ++++
-file changed, 13 insertions(+), 4 deletions(-)
+ util/aio-posix.c    | 93 +++++++++++++++++++++++++++++++++++++++++----
+ util/aio-posix.h    |  2 +
-diff --git a/blockjob.c b/blockjob.c
+ util/trace-events   |  2 +
-index XXXXXXX..XXXXXXX 100644
+files changed, 98 insertions(+), 7 deletions(-)
---- a/blockjob.c
-+++ b/blockjob.c
+diff --git a/include/block/aio.h b/include/block/aio.h
-@@ -XXX,XX +XXX,XX @@ static const BdrvChildRole child_job = {
+index XXXXXXX..XXXXXXX 100644
+--- a/include/block/aio.h
- void block_job_remove_all_bdrv(BlockJob *job)
++++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ struct AioContext {
      int64_t poll_grow;      /* polling time growth factor */
      int64_t poll_shrink;    /* polling time shrink factor */
 +    /*
 +     * List of handlers participating in userspace polling.  Protected by
 +     * ctx->list_lock.  Iterated and modified mostly by the event loop thread
 +     * from aio_poll() with ctx->list_lock incremented.  aio_set_fd_handler()
 +     * only touches the list to delete nodes if ctx->list_lock's count is zero.
 +     */
 +    AioHandlerList poll_aio_handlers;
 +
      /* Are we in polling mode or monitoring file descriptors? */
      bool poll_started;
 diff --git a/util/aio-posix.c b/util/aio-posix.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/aio-posix.c
 +++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@
  #include "trace.h"
  #include "aio-posix.h"
 +/* Stop userspace polling on a handler if it isn't active for some time */
 +#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)
 +
  bool aio_poll_disabled(AioContext *ctx)
  {
--    GSList *l;
+     return atomic_read(&ctx->poll_disable_cnt);
--    for (l = job->nodes; l; l = l->next) {
+@@ -XXX,XX +XXX,XX @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
       * deleted because deleted nodes are only cleaned up while
       * no one is walking the handlers list.
       */
 +    QLIST_SAFE_REMOVE(node, node_poll);
      QLIST_REMOVE(node, node);
      return true;
  }
@@ -XXX,XX +XXX,XX @@ static bool poll_set_started(AioContext *ctx, bool started)
      ctx->poll_started = started;
      qemu_lockcnt_inc(&ctx->list_lock);
 -    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
 +    QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
          IOHandler *fn;
          if (QLIST_IS_INSERTED(node, node_deleted)) {
@@ -XXX,XX +XXX,XX @@ static void aio_free_deleted_handlers(AioContext *ctx)
      while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
          QLIST_REMOVE(node, node);
          QLIST_REMOVE(node, node_deleted);
 +        QLIST_SAFE_REMOVE(node, node_poll);
          g_free(node);
      }
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
      revents = node->pfd.revents & node->pfd.events;
      node->pfd.revents = 0;
 +    /*
-+     * bdrv_root_unref_child() may reach child_job_[can_]set_aio_ctx(),
++     * Start polling AioHandlers when they become ready because activity is
-+     * which will also traverse job->nodes, so consume the list one by
++     * likely to continue.  Note that starvation is theoretically possible when
-+     * one to make sure that such a concurrent access does not attempt
++     * fdmon_supports_polling(), but only until the fd fires for the first
-+     * to process an already freed BdrvChild.
++     * time.
 +     */
-+    while (job->nodes) {
++    if (!QLIST_IS_INSERTED(node, node_deleted) &&
-+        GSList *l = job->nodes;
++        !QLIST_IS_INSERTED(node, node_poll) &&
-         BdrvChild *c = l->data;
++        node->io_poll) {
-+
++        trace_poll_add(ctx, node, node->pfd.fd, revents);
-+        job->nodes = l->next;
++        if (ctx->poll_started && node->io_poll_begin) {
-+
++            node->io_poll_begin(node->opaque);
-         bdrv_op_unblock_all(c->bs, job->blocker);
++        }
-         bdrv_root_unref_child(c);
++        QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll);
-+
++    }
-+        g_slist_free_1(l);
++
-     }
+     if (!QLIST_IS_INSERTED(node, node_deleted) &&
--    g_slist_free(job->nodes);
+         (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
--    job->nodes = NULL;
+         aio_node_check(ctx, node->is_external) &&
@@ -XXX,XX +XXX,XX @@ void aio_dispatch(AioContext *ctx)
      timerlistgroup_run_timers(&ctx->tlg);
  }
- bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs)
+-static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
 +static bool run_poll_handlers_once(AioContext *ctx,
 +                                   int64_t now,
 +                                   int64_t *timeout)
  {
      bool progress = false;
      AioHandler *node;
 +    AioHandler *tmp;
 -    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
 -        if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
 -            aio_node_check(ctx, node->is_external) &&
 +    QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
 +        if (aio_node_check(ctx, node->is_external) &&
              node->io_poll(node->opaque)) {
 +            node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
 +
              /*
               * Polling was successful, exit try_poll_mode immediately
               * to adjust the next polling time.
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
      return progress;
  }
 +static bool fdmon_supports_polling(AioContext *ctx)
 +{
 +    return ctx->fdmon_ops->need_wait != aio_poll_disabled;
 +}
 +
 +static bool remove_idle_poll_handlers(AioContext *ctx, int64_t now)
 +{
 +    AioHandler *node;
 +    AioHandler *tmp;
 +    bool progress = false;
 +
 +    /*
 +     * File descriptor monitoring implementations without userspace polling
 +     * support suffer from starvation when a subset of handlers is polled
 +     * because fds will not be processed in a timely fashion.  Don't remove
 +     * idle poll handlers.
 +     */
 +    if (!fdmon_supports_polling(ctx)) {
 +        return false;
 +    }
 +
 +    QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
 +        if (node->poll_idle_timeout == 0LL) {
 +            node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
 +        } else if (now >= node->poll_idle_timeout) {
 +            trace_poll_remove(ctx, node, node->pfd.fd);
 +            node->poll_idle_timeout = 0LL;
 +            QLIST_SAFE_REMOVE(node, node_poll);
 +            if (ctx->poll_started && node->io_poll_end) {
 +                node->io_poll_end(node->opaque);
 +
 +                /*
 +                 * Final poll in case ->io_poll_end() races with an event.
 +                 * Never mind about re-adding the handler in the rare case where
 +                 * this causes progress.
 +                 */
 +                progress = node->io_poll(node->opaque) || progress;
 +            }
 +        }
 +    }
 +
 +    return progress;
 +}
 +
  /* run_poll_handlers:
   * @ctx: the AioContext
   * @max_ns: maximum time to poll for, in nanoseconds
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
      start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
      do {
 -        progress = run_poll_handlers_once(ctx, timeout);
 +        progress = run_poll_handlers_once(ctx, start_time, timeout);
          elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
          max_ns = qemu_soonest_timeout(*timeout, max_ns);
          assert(!(max_ns && progress));
      } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
 +    if (remove_idle_poll_handlers(ctx, start_time + elapsed_time)) {
 +        *timeout = 0;
 +        progress = true;
 +    }
 +
      /* If time has passed with no successful polling, adjust *timeout to
       * keep the same ending time.
       */
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
   */
  static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
  {
 -    int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
 +    int64_t max_ns;
 +
 +    if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
 +        return false;
 +    }
 +    max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
      if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
          poll_set_started(ctx, true);
 diff --git a/util/aio-posix.h b/util/aio-posix.h
 index XXXXXXX..XXXXXXX 100644
 --- a/util/aio-posix.h
 +++ b/util/aio-posix.h
@@ -XXX,XX +XXX,XX @@ struct AioHandler {
      QLIST_ENTRY(AioHandler) node;
      QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
      QLIST_ENTRY(AioHandler) node_deleted;
 +    QLIST_ENTRY(AioHandler) node_poll;
  #ifdef CONFIG_LINUX_IO_URING
      QSLIST_ENTRY(AioHandler) node_submitted;
      unsigned flags; /* see fdmon-io_uring.c */
  #endif
 +    int64_t poll_idle_timeout; /* when to stop userspace polling */
      bool is_external;
  };
 diff --git a/util/trace-events b/util/trace-events
 index XXXXXXX..XXXXXXX 100644
 --- a/util/trace-events
 +++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ run_poll_handlers_begin(void *ctx, int64_t max_ns, int64_t timeout) "ctx %p max_
  run_poll_handlers_end(void *ctx, bool progress, int64_t timeout) "ctx %p progress %d new timeout %"PRId64
  poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
  poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
 +poll_add(void *ctx, void *node, int fd, unsigned revents) "ctx %p node %p fd %d revents 0x%x"
 +poll_remove(void *ctx, void *node, int fd) "ctx %p node %p fd %d"
  # async.c
  aio_co_schedule(void *ctx, void *co) "ctx %p co %p"
 --
-.21.0
+.24.1

The following changes since commit dd25f97c66a75d1508f1d4c6478ed2c95bec428f:

Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20190913' into staging (2019-09-16 10:15:15 +0100)

are available in the Git repository at:

https://github.com/XanClic/qemu.git tags/pull-block-2019-09-16

for you to fetch changes up to 1825cc0783ccf0ec5d9f0b225a99b340bdd4c68f:

qemu-iotests: Add test for bz #1745922 (2019-09-16 15:37:12 +0200)

----------------------------------------------------------------
Block patches:
- Fix for block jobs when used with I/O threads
- Fix for a corruption when using qcow2's LUKS encryption mode
- cURL fix
- check-block.sh cleanups (for make check)
- Refactoring

----------------------------------------------------------------
Max Reitz (7):
  curl: Keep pointer to the CURLState in CURLSocket
  curl: Keep *socket until the end of curl_sock_cb()
  curl: Check completion in curl_multi_do()
  curl: Pass CURLSocket to curl_multi_do()
  curl: Report only ready sockets
  curl: Handle success in multi_check_completion
  curl: Check curl_multi_add_handle()'s return code

Maxim Levitsky (3):
  block/qcow2: Fix corruption introduced by commit 8ac0f15f335
  block/qcow2: refactor encryption code
  qemu-iotests: Add test for bz #1745922

Nir Soffer (2):
  block: Use QEMU_IS_ALIGNED
  block: Remove unused masks

Sergio Lopez (1):
  blockjob: update nodes head while removing all bdrv

Thomas Huth (2):
  tests/qemu-iotests/check: Replace "tests" with "iotests" in final
    status text
  tests/Makefile: Do not print the name of the check-block.sh shell
    script

Vladimir Sementsov-Ogievskiy (1):
  tests/qemu-iotests: Fix qemu-io related output in 026.out.nocache

-- 
2.21.0

From: Nir Soffer <nirsof@gmail.com>

Replace instances of:

(n & (BDRV_SECTOR_SIZE - 1)) == 0

And:

(n & ~BDRV_SECTOR_MASK) == 0

With:

QEMU_IS_ALIGNED(n, BDRV_SECTOR_SIZE)

Which reveals the intent of the code better, and makes it easier to
locate the code checking alignment.

Signed-off-by: Nir Soffer <nsoffer@redhat.com>
Message-id: 20190827185913.27427-2-nsoffer@redhat.com
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/bochs.c         | 4 ++--
 block/cloop.c         | 4 ++--
 block/dmg.c           | 4 ++--
 block/io.c            | 8 ++++----
 block/qcow2-cluster.c | 4 ++--
 block/qcow2.c         | 4 ++--
 block/vvfat.c         | 8 ++++----
 qemu-img.c            | 2 +-
 8 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/block/bochs.c b/block/bochs.c
index XXXXXXX..XXXXXXX 100644
--- a/block/bochs.c
+++ b/block/bochs.c
@@ -XXX,XX +XXX,XX @@ bochs_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
     QEMUIOVector local_qiov;
     int ret;
 
-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
 
     qemu_iovec_init(&local_qiov, qiov->niov);
     qemu_co_mutex_lock(&s->lock);
diff --git a/block/cloop.c b/block/cloop.c
index XXXXXXX..XXXXXXX 100644
--- a/block/cloop.c
+++ b/block/cloop.c
@@ -XXX,XX +XXX,XX @@ cloop_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
     int nb_sectors = bytes >> BDRV_SECTOR_BITS;
     int ret, i;
 
-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
 
     qemu_co_mutex_lock(&s->lock);
 
diff --git a/block/dmg.c b/block/dmg.c
index XXXXXXX..XXXXXXX 100644
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -XXX,XX +XXX,XX @@ dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
     int nb_sectors = bytes >> BDRV_SECTOR_BITS;
     int ret, i;
 
-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
 
     qemu_co_mutex_lock(&s->lock);
 
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
     sector_num = offset >> BDRV_SECTOR_BITS;
     nb_sectors = bytes >> BDRV_SECTOR_BITS;
 
-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
     assert(bytes <= BDRV_REQUEST_MAX_BYTES);
     assert(drv->bdrv_co_readv);
 
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
     sector_num = offset >> BDRV_SECTOR_BITS;
     nb_sectors = bytes >> BDRV_SECTOR_BITS;
 
-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
     assert(bytes <= BDRV_REQUEST_MAX_BYTES);
 
     assert(drv->bdrv_co_writev);
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs,
 {
     if (bytes && bs->encrypted) {
         BDRVQcow2State *s = bs->opaque;
-        assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0);
-        assert((bytes & ~BDRV_SECTOR_MASK) == 0);
+        assert(QEMU_IS_ALIGNED(offset_in_cluster, BDRV_SECTOR_SIZE));
+        assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
         assert(s->crypto);
         if (qcow2_co_encrypt(bs, cluster_offset,
                              src_cluster_offset + offset_in_cluster,
diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_preadv_part(BlockDriverState *bs,
                     goto fail;
                 }
 
-                assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-                assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+                assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
+                assert(QEMU_IS_ALIGNED(cur_bytes, BDRV_SECTOR_SIZE));
                 if (qcow2_co_decrypt(bs, cluster_offset, offset,
                                      cluster_data, cur_bytes) < 0) {
                     ret = -EIO;
diff --git a/block/vvfat.c b/block/vvfat.c
index XXXXXXX..XXXXXXX 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -XXX,XX +XXX,XX @@ vvfat_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
     int nb_sectors = bytes >> BDRV_SECTOR_BITS;
     void *buf;
 
-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
 
     buf = g_try_malloc(bytes);
     if (bytes && buf == NULL) {
@@ -XXX,XX +XXX,XX @@ vvfat_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
     int nb_sectors = bytes >> BDRV_SECTOR_BITS;
     void *buf;
 
-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
 
     buf = g_try_malloc(bytes);
     if (bytes && buf == NULL) {
diff --git a/qemu-img.c b/qemu-img.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
             int64_t sval;
 
             sval = cvtnum(optarg);
-            if (sval < 0 || sval & (BDRV_SECTOR_SIZE - 1) ||
+            if (sval < 0 || !QEMU_IS_ALIGNED(sval, BDRV_SECTOR_SIZE) ||
                 sval / BDRV_SECTOR_SIZE > MAX_BUF_SECTORS) {
                 error_report("Invalid buffer size for sparse output specified. "
                     "Valid sizes are multiples of %llu up to %llu. Select "
-- 
2.21.0

From: Nir Soffer <nirsof@gmail.com>

Replace confusing usage:

~BDRV_SECTOR_MASK

With the clearer:

(BDRV_SECTOR_SIZE - 1)

Remove BDRV_SECTOR_MASK and the unused BDRV_BLOCK_OFFSET_MASK which was
its last user.

Signed-off-by: Nir Soffer <nsoffer@redhat.com>
Message-id: 20190827185913.27427-3-nsoffer@redhat.com
Reviewed-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 include/block/block.h | 2 --
 migration/block.c     | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ typedef struct HDGeometry {
 
 #define BDRV_SECTOR_BITS   9
 #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
-#define BDRV_SECTOR_MASK   ~(BDRV_SECTOR_SIZE - 1)
 
 #define BDRV_REQUEST_MAX_SECTORS MIN(SIZE_MAX >> BDRV_SECTOR_BITS, \
                                      INT_MAX >> BDRV_SECTOR_BITS)
@@ -XXX,XX +XXX,XX @@ typedef struct HDGeometry {
 #define BDRV_BLOCK_ALLOCATED    0x10
 #define BDRV_BLOCK_EOF          0x20
 #define BDRV_BLOCK_RECURSE      0x40
-#define BDRV_BLOCK_OFFSET_MASK  BDRV_SECTOR_MASK
 
 typedef QSIMPLEQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;
 
diff --git a/migration/block.c b/migration/block.c
index XXXXXXX..XXXXXXX 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -XXX,XX +XXX,XX @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
     do {
         addr = qemu_get_be64(f);
 
-        flags = addr & ~BDRV_SECTOR_MASK;
+        flags = addr & (BDRV_SECTOR_SIZE - 1);
         addr >>= BDRV_SECTOR_BITS;
 
         if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
-- 
2.21.0

From: Thomas Huth <thuth@redhat.com>

When running "make check -j8" or something similar, the iotests are
running in parallel with the other tests. So when they are printing
out "Passed all xx tests" or a similar status message at the end,
it might not be quite clear that this message belongs to the iotests,
since the output might be mixed with the other tests. Thus change the
word "tests" here to "iotests" instead to avoid confusion.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-id: 20190906113920.11271-1-thuth@redhat.com
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/check | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/check
+++ b/tests/qemu-iotests/check
@@ -XXX,XX +XXX,XX @@ END        { if (NR > 0) {
         if [ ! -z "$n_bad" -a $n_bad != 0 ]
         then
             echo "Failures:$bad"
-            echo "Failed $n_bad of $try tests"
+            echo "Failed $n_bad of $try iotests"
             echo "Failures:$bad" | fmt >>check.log
-            echo "Failed $n_bad of $try tests" >>check.log
+            echo "Failed $n_bad of $try iotests" >>check.log
         else
-            echo "Passed all $try tests"
-            echo "Passed all $try tests" >>check.log
+            echo "Passed all $try iotests"
+            echo "Passed all $try iotests" >>check.log
         fi
         needwrap=false
     fi
-- 
2.21.0

From: Thomas Huth <thuth@redhat.com>

The check script is already printing out which iotest is currently
running, so printing out the name of the check-block.sh shell script
looks superfluous here.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-id: 20190906113534.10907-1-thuth@redhat.com
Acked-by: John Snow <jsnow@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/Makefile.include | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/Makefile.include b/tests/Makefile.include
index XXXXXXX..XXXXXXX 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -XXX,XX +XXX,XX @@ QEMU_IOTESTS_HELPERS-$(call land,$(CONFIG_SOFTMMU),$(CONFIG_LINUX)) = tests/qemu
 check-tests/check-block.sh: tests/check-block.sh qemu-img$(EXESUF) \
 		qemu-io$(EXESUF) qemu-nbd$(EXESUF) $(QEMU_IOTESTS_HELPERS-y) \
 		$(patsubst %,%/all,$(filter %-softmmu,$(TARGET_DIRS)))
-	$<
+	@$<
 
 .PHONY: $(patsubst %, check-%, $(check-qapi-schema-y))
 $(patsubst %, check-%, $(check-qapi-schema-y)): check-%.json: $(SRC_PATH)/%.json
-- 
2.21.0

From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>

qemu-io now prefixes its errors and warnings with "qemu-io:".
36b9986b08787019e fixed a lot of iotests output but forgot about
026.out.nocache. Fix it too.

Fixes: 99e98d7c9fc1a1639fad ("qemu-io: Use error_[gs]et_progname()")
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20190816153015.447957-2-vsementsov@virtuozzo.com
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/026.out.nocache | 168 ++++++++++++++---------------
 1 file changed, 84 insertions(+), 84 deletions(-)

diff --git a/tests/qemu-iotests/026.out.nocache b/tests/qemu-iotests/026.out.nocache
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/026.out.nocache
+++ b/tests/qemu-iotests/026.out.nocache
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l1_update; errno: 5; imm: off; once: off; write 
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 
 1 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l1_update; errno: 5; imm: off; once: off; write -b
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 
 1 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l1_update; errno: 28; imm: off; once: off; write 
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 
 1 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l1_update; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 
 1 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: l2_update; errno: 5; imm: off; once: off; write 
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 wrote 131072/131072 bytes at offset 0
 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 
@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l2_update; errno: 5; imm: off; once: off; write -b
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 wrote 131072/131072 bytes at offset 0
 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: l2_update; errno: 28; imm: off; once: off; write 
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 wrote 131072/131072 bytes at offset 0
 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 
@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l2_update; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 wrote 131072/131072 bytes at offset 0
 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l2_alloc_write; errno: 5; imm: off; once: off; write 
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l2_alloc_write; errno: 5; imm: off; once: off; write -b
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 
 1 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l2_alloc_write; errno: 28; imm: off; once: off; write 
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l2_alloc_write; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 
 1 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: write_aio; errno: 5; imm: off; once: off; write 
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: write_aio; errno: 5; imm: off; once: off; write -b
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: write_aio; errno: 28; imm: off; once: off; write 
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: write_aio; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_load; errno: 5; imm: off; once: off; write 
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_load; errno: 5; imm: off; once: off; write -b
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_load; errno: 28; imm: off; once: off; write 
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_load; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_update_part; errno: 5; imm: off; once: off; write 
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_update_part; errno: 5; imm: off; once: off; write -b
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_update_part; errno: 28; imm: off; once: off; write 
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_update_part; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_alloc; errno: 5; imm: off; once: off; write 
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_alloc; errno: 5; imm: off; once: off; write -b
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_alloc; errno: 28; imm: off; once: off; write 
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_alloc; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_alloc_hookup; errno: 28; imm: off; once: off; write 
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 
 55 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_alloc_hookup; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 
 251 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_alloc_write; errno: 28; imm: off; once: off; write 
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_alloc_write; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_alloc_write_blocks; errno: 28; imm: off; once: off; write 
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 
 10 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_alloc_write_blocks; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 
 23 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_alloc_write_table; errno: 28; imm: off; once: off; write 
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 
 10 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_alloc_write_table; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 
 23 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_alloc_switch_table; errno: 28; imm: off; once: off; write 
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 
 10 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: refblock_alloc_switch_table; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 
 23 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l1_grow_write_table; errno: 5; imm: off; once: off
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l1_grow_write_table; errno: 28; imm: off; once: off
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l1_grow_activate_table; errno: 5; imm: off; once: off
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 
 96 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l1_grow_activate_table; errno: 28; imm: off; once: off
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 
 96 leaked clusters were found on the image.
-- 
2.21.0

A follow-up patch will make curl_multi_do() and curl_multi_read() take a
CURLSocket instead of the CURLState.  They still need the latter,
though, so add a pointer to it to the former.

Cc: qemu-stable@nongnu.org
Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 20190910124136.10565-2-mreitz@redhat.com
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/curl.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle,
 #define CURL_BLOCK_OPT_TIMEOUT_DEFAULT 5
 
 struct BDRVCURLState;
+struct CURLState;
 
 static bool libcurl_initialized;
 
@@ -XXX,XX +XXX,XX @@ typedef struct CURLAIOCB {
 
 typedef struct CURLSocket {
     int fd;
+    struct CURLState *state;
     QLIST_ENTRY(CURLSocket) next;
 } CURLSocket;
 
@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
     if (!socket) {
         socket = g_new0(CURLSocket, 1);
         socket->fd = fd;
+        socket->state = state;
         QLIST_INSERT_HEAD(&state->sockets, socket, next);
     }
     socket = NULL;
-- 
2.21.0

This does not really change anything, but it makes the code a bit easier
to follow once we use @socket as the opaque pointer for
aio_set_fd_handler().

Cc: qemu-stable@nongnu.org
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190910124136.10565-3-mreitz@redhat.com
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/curl.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
 
     QLIST_FOREACH(socket, &state->sockets, next) {
         if (socket->fd == fd) {
-            if (action == CURL_POLL_REMOVE) {
-                QLIST_REMOVE(socket, next);
-                g_free(socket);
-            }
             break;
         }
     }
@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
         socket->state = state;
         QLIST_INSERT_HEAD(&state->sockets, socket, next);
     }
-    socket = NULL;
 
     trace_curl_sock_cb(action, (int)fd);
     switch (action) {
@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
             break;
     }
 
+    if (action == CURL_POLL_REMOVE) {
+        QLIST_REMOVE(socket, next);
+        g_free(socket);
+    }
+
     return 0;
 }
 
-- 
2.21.0

While it is more likely that transfers complete after some file
descriptor has data ready to read, we probably should not rely on it.
Better be safe than sorry and call curl_multi_check_completion() in
curl_multi_do(), too, just like it is done in curl_multi_read().

With this change, curl_multi_do() and curl_multi_read() are actually the
same, so drop curl_multi_read() and use curl_multi_do() as the sole FD
handler.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190910124136.10565-4-mreitz@redhat.com
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/curl.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ typedef struct BDRVCURLState {
 
 static void curl_clean_state(CURLState *s);
 static void curl_multi_do(void *arg);
-static void curl_multi_read(void *arg);
 
 #ifdef NEED_CURL_TIMER_CALLBACK
 /* Called from curl_multi_do_locked, with s->mutex held.  */
@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
     switch (action) {
         case CURL_POLL_IN:
             aio_set_fd_handler(s->aio_context, fd, false,
-                               curl_multi_read, NULL, NULL, state);
+                               curl_multi_do, NULL, NULL, state);
             break;
         case CURL_POLL_OUT:
             aio_set_fd_handler(s->aio_context, fd, false,
@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
             break;
         case CURL_POLL_INOUT:
             aio_set_fd_handler(s->aio_context, fd, false,
-                               curl_multi_read, curl_multi_do, NULL, state);
+                               curl_multi_do, curl_multi_do, NULL, state);
             break;
         case CURL_POLL_REMOVE:
             aio_set_fd_handler(s->aio_context, fd, false,
@@ -XXX,XX +XXX,XX @@ static void curl_multi_do(void *arg)
 {
     CURLState *s = (CURLState *)arg;
 
-    qemu_mutex_lock(&s->s->mutex);
-    curl_multi_do_locked(s);
-    qemu_mutex_unlock(&s->s->mutex);
-}
-
-static void curl_multi_read(void *arg)
-{
-    CURLState *s = (CURLState *)arg;
-
     qemu_mutex_lock(&s->s->mutex);
     curl_multi_do_locked(s);
     curl_multi_check_completion(s->s);
-- 
2.21.0

curl_multi_do_locked() currently marks all sockets as ready.  That is
not only inefficient, but in fact unsafe (the loop is).  A follow-up
patch will change that, but to do so, curl_multi_do_locked() needs to
know exactly which socket is ready; and that is accomplished by this
patch here.

Cc: qemu-stable@nongnu.org
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190910124136.10565-5-mreitz@redhat.com
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/curl.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
     switch (action) {
         case CURL_POLL_IN:
             aio_set_fd_handler(s->aio_context, fd, false,
-                               curl_multi_do, NULL, NULL, state);
+                               curl_multi_do, NULL, NULL, socket);
             break;
         case CURL_POLL_OUT:
             aio_set_fd_handler(s->aio_context, fd, false,
-                               NULL, curl_multi_do, NULL, state);
+                               NULL, curl_multi_do, NULL, socket);
             break;
         case CURL_POLL_INOUT:
             aio_set_fd_handler(s->aio_context, fd, false,
-                               curl_multi_do, curl_multi_do, NULL, state);
+                               curl_multi_do, curl_multi_do, NULL, socket);
             break;
         case CURL_POLL_REMOVE:
             aio_set_fd_handler(s->aio_context, fd, false,
@@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s)
 }
 
 /* Called with s->mutex held.  */
-static void curl_multi_do_locked(CURLState *s)
+static void curl_multi_do_locked(CURLSocket *ready_socket)
 {
     CURLSocket *socket, *next_socket;
+    CURLState *s = ready_socket->state;
     int running;
     int r;
 
@@ -XXX,XX +XXX,XX @@ static void curl_multi_do_locked(CURLState *s)
 
 static void curl_multi_do(void *arg)
 {
-    CURLState *s = (CURLState *)arg;
+    CURLSocket *socket = arg;
+    BDRVCURLState *s = socket->state->s;
 
-    qemu_mutex_lock(&s->s->mutex);
-    curl_multi_do_locked(s);
-    curl_multi_check_completion(s->s);
-    qemu_mutex_unlock(&s->s->mutex);
+    qemu_mutex_lock(&s->mutex);
+    curl_multi_do_locked(socket);
+    curl_multi_check_completion(s);
+    qemu_mutex_unlock(&s->mutex);
 }
 
 static void curl_multi_timeout_do(void *arg)
-- 
2.21.0

Instead of reporting all sockets to cURL, only report the one that has
caused curl_multi_do_locked() to be called.  This lets us get rid of the
QLIST_FOREACH_SAFE() loop, which was actually wrong: SAFE foreaches are
only safe when the current element is removed in each iteration.  If it is
possible for the list to be concurrently modified, we cannot guarantee
that only the current element will be removed.  Therefore, we must not
use QLIST_FOREACH_SAFE() here.

Fixes: ff5ca1664af85b24a4180d595ea6873fd3deac57
Cc: qemu-stable@nongnu.org
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190910124136.10565-6-mreitz@redhat.com
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/curl.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s)
 }
 
 /* Called with s->mutex held.  */
-static void curl_multi_do_locked(CURLSocket *ready_socket)
+static void curl_multi_do_locked(CURLSocket *socket)
 {
-    CURLSocket *socket, *next_socket;
-    CURLState *s = ready_socket->state;
+    BDRVCURLState *s = socket->state->s;
     int running;
     int r;
 
-    if (!s->s->multi) {
+    if (!s->multi) {
         return;
     }
 
-    /* Need to use _SAFE because curl_multi_socket_action() may trigger
-     * curl_sock_cb() which might modify this list */
-    QLIST_FOREACH_SAFE(socket, &s->sockets, next, next_socket) {
-        do {
-            r = curl_multi_socket_action(s->s->multi, socket->fd, 0, &running);
-        } while (r == CURLM_CALL_MULTI_PERFORM);
-    }
+    do {
+        r = curl_multi_socket_action(s->multi, socket->fd, 0, &running);
+    } while (r == CURLM_CALL_MULTI_PERFORM);
 }
 
 static void curl_multi_do(void *arg)
-- 
2.21.0

Background: As of cURL 7.59.0, it verifies that several functions are
not called from within a callback.  Among these functions is
curl_multi_add_handle().

curl_read_cb() is a callback from cURL and not a coroutine.  Waking up
acb->co will lead to entering it then and there, which means the current
request will settle and the caller (if it runs in the same coroutine)
may then issue the next request.  In such a case, we will enter
curl_setup_preadv() effectively from within curl_read_cb().

Calling curl_multi_add_handle() will then fail and the new request will
not be processed.

Fix this by not letting curl_read_cb() wake up acb->co.  Instead, leave
the whole business of settling the AIOCB objects to
curl_multi_check_completion() (which is called from our timer callback
and our FD handler, so not from any cURL callbacks).

Reported-by: Natalie Gavrielov <ngavrilo@redhat.com>
Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1740193
Cc: qemu-stable@nongnu.org
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190910124136.10565-7-mreitz@redhat.com
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/curl.c | 69 ++++++++++++++++++++++------------------------------
 1 file changed, 29 insertions(+), 40 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
 {
     CURLState *s = ((CURLState*)opaque);
     size_t realsize = size * nmemb;
-    int i;
 
     trace_curl_read_cb(realsize);
 
@@ -XXX,XX +XXX,XX @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
     memcpy(s->orig_buf + s->buf_off, ptr, realsize);
     s->buf_off += realsize;
 
-    for(i=0; i<CURL_NUM_ACB; i++) {
-        CURLAIOCB *acb = s->acb[i];
-
-        if (!acb)
-            continue;
-
-        if ((s->buf_off >= acb->end)) {
-            size_t request_length = acb->bytes;
-
-            qemu_iovec_from_buf(acb->qiov, 0, s->orig_buf + acb->start,
-                                acb->end - acb->start);
-
-            if (acb->end - acb->start < request_length) {
-                size_t offset = acb->end - acb->start;
-                qemu_iovec_memset(acb->qiov, offset, 0,
-                                  request_length - offset);
-            }
-
-            acb->ret = 0;
-            s->acb[i] = NULL;
-            qemu_mutex_unlock(&s->s->mutex);
-            aio_co_wake(acb->co);
-            qemu_mutex_lock(&s->s->mutex);
-        }
-    }
-
 read_end:
     /* curl will error out if we do not return this value */
     return size * nmemb;
@@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s)
             break;
 
         if (msg->msg == CURLMSG_DONE) {
+            int i;
             CURLState *state = NULL;
+            bool error = msg->data.result != CURLE_OK;
+
             curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE,
                               (char **)&state);
 
-            /* ACBs for successful messages get completed in curl_read_cb */
-            if (msg->data.result != CURLE_OK) {
-                int i;
+            if (error) {
                 static int errcount = 100;
 
                 /* Don't lose the original error message from curl, since
@@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s)
                         error_report("curl: further errors suppressed");
                     }
                 }
+            }
 
-                for (i = 0; i < CURL_NUM_ACB; i++) {
-                    CURLAIOCB *acb = state->acb[i];
+            for (i = 0; i < CURL_NUM_ACB; i++) {
+                CURLAIOCB *acb = state->acb[i];
 
-                    if (acb == NULL) {
-                        continue;
-                    }
+                if (acb == NULL) {
+                    continue;
+                }
+
+                if (!error) {
+                    /* Assert that we have read all data */
+                    assert(state->buf_off >= acb->end);
+
+                    qemu_iovec_from_buf(acb->qiov, 0,
+                                        state->orig_buf + acb->start,
+                                        acb->end - acb->start);
 
-                    acb->ret = -EIO;
-                    state->acb[i] = NULL;
-                    qemu_mutex_unlock(&s->mutex);
-                    aio_co_wake(acb->co);
-                    qemu_mutex_lock(&s->mutex);
+                    if (acb->end - acb->start < acb->bytes) {
+                        size_t offset = acb->end - acb->start;
+                        qemu_iovec_memset(acb->qiov, offset, 0,
+                                          acb->bytes - offset);
+                    }
                 }
+
+                acb->ret = error ? -EIO : 0;
+                state->acb[i] = NULL;
+                qemu_mutex_unlock(&s->mutex);
+                aio_co_wake(acb->co);
+                qemu_mutex_lock(&s->mutex);
             }
 
             curl_clean_state(state);
-- 
2.21.0

If we had done that all along, debugging would have been much simpler.
(Also, I/O errors are better than hangs.)

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190910124136.10565-8-mreitz@redhat.com
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/curl.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static void curl_setup_preadv(BlockDriverState *bs, CURLAIOCB *acb)
     trace_curl_setup_preadv(acb->bytes, start, state->range);
     curl_easy_setopt(state->curl, CURLOPT_RANGE, state->range);
 
-    curl_multi_add_handle(s->multi, state->curl);
+    if (curl_multi_add_handle(s->multi, state->curl) != CURLM_OK) {
+        state->acb[0] = NULL;
+        acb->ret = -EIO;
+
+        curl_clean_state(state);
+        goto out;
+    }
 
     /* Tell curl it needs to kick things off */
     curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
-- 
2.21.0

From: Sergio Lopez <slp@redhat.com>

block_job_remove_all_bdrv() iterates through job->nodes, calling
bdrv_root_unref_child() for each entry. The call to the latter may
reach child_job_[can_]set_aio_ctx(), which will also attempt to
traverse job->nodes, potentially finding entries that were freed
on previous iterations.

To avoid this situation, update job->nodes head on each iteration to
ensure that already freed entries are no longer linked to the list.

RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1746631
Signed-off-by: Sergio Lopez <slp@redhat.com>
Cc: qemu-stable@nongnu.org
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190911100316.32282-1-mreitz@redhat.com
Reviewed-by: Sergio Lopez <slp@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 blockjob.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/blockjob.c b/blockjob.c
index XXXXXXX..XXXXXXX 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -XXX,XX +XXX,XX @@ static const BdrvChildRole child_job = {
 
 void block_job_remove_all_bdrv(BlockJob *job)
 {
-    GSList *l;
-    for (l = job->nodes; l; l = l->next) {
+    /*
+     * bdrv_root_unref_child() may reach child_job_[can_]set_aio_ctx(),
+     * which will also traverse job->nodes, so consume the list one by
+     * one to make sure that such a concurrent access does not attempt
+     * to process an already freed BdrvChild.
+     */
+    while (job->nodes) {
+        GSList *l = job->nodes;
         BdrvChild *c = l->data;
+
+        job->nodes = l->next;
+
         bdrv_op_unblock_all(c->bs, job->blocker);
         bdrv_root_unref_child(c);
+
+        g_slist_free_1(l);
     }
-    g_slist_free(job->nodes);
-    job->nodes = NULL;
 }
 
 bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs)
-- 
2.21.0

From: Maxim Levitsky <mlevitsk@redhat.com>

This fixes a subtle corruption introduced by LUKS threaded encryption
in commit 8ac0f15f335.

Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1745922

The corruption happens when we do a write that
   * writes to two or more unallocated clusters at once
   * doesn't fully cover the first sector
   * doesn't fully cover the last sector
   * uses luks encryption

In this case, when allocating the new clusters we COW both areas
prior to the write and after the write, and we encrypt them.

The above mentioned commit accidentally made it so we encrypt the
second COW area using the physical cluster offset of the first area.

The problem is that offset_in_cluster in do_perform_cow_encrypt
can be larger than the cluster size, thus cluster_offset
will no longer point to the start of the cluster at which encrypted
area starts.

Next patch in this series will refactor the code to avoid all these
assumptions.

In the bug report, this was triggered by rebasing a LUKS image onto a
new, zero-filled base, which results in a lot of such writes and causes
some files with zero areas to contain garbage there instead.
But as described above, it can happen elsewhere as well.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20190915203655.21638-2-mlevitsk@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs,
         assert(QEMU_IS_ALIGNED(offset_in_cluster, BDRV_SECTOR_SIZE));
         assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
         assert(s->crypto);
-        if (qcow2_co_encrypt(bs, cluster_offset,
-                             src_cluster_offset + offset_in_cluster,
-                             buffer, bytes) < 0) {
+        if (qcow2_co_encrypt(bs,
+                start_of_cluster(s, cluster_offset + offset_in_cluster),
+                src_cluster_offset + offset_in_cluster,
+                buffer, bytes) < 0) {
             return false;
         }
     }
-- 
2.21.0

From: Maxim Levitsky <mlevitsk@redhat.com>

* Change the qcow2_co_{encrypt|decrypt} to just receive full host and
  guest offsets and use this function directly instead of calling
  do_perform_cow_encrypt (which is removed by this patch).

* Adjust qcow2_co_encdec to take full host and guest offsets as well.

* Document the qcow2_co_{encrypt|decrypt} arguments
  to hopefully prevent the bug fixed in the previous commit from
  happening again.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Message-id: 20190915203655.21638-3-mlevitsk@redhat.com
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
[mreitz: Let perform_cow() return the error value returned by
         qcow2_co_encrypt(), as proposed by Vladimir]
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2.h         |  8 +++---
 block/qcow2-cluster.c | 41 +++++++++-------------------
 block/qcow2-threads.c | 63 +++++++++++++++++++++++++++++++++----------
 block/qcow2.c         |  5 ++--
 4 files changed, 69 insertions(+), 48 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -XXX,XX +XXX,XX @@ ssize_t coroutine_fn
 qcow2_co_decompress(BlockDriverState *bs, void *dest, size_t dest_size,
                     const void *src, size_t src_size);
 int coroutine_fn
-qcow2_co_encrypt(BlockDriverState *bs, uint64_t file_cluster_offset,
-                 uint64_t offset, void *buf, size_t len);
+qcow2_co_encrypt(BlockDriverState *bs, uint64_t host_offset,
+                 uint64_t guest_offset, void *buf, size_t len);
 int coroutine_fn
-qcow2_co_decrypt(BlockDriverState *bs, uint64_t file_cluster_offset,
-                 uint64_t offset, void *buf, size_t len);
+qcow2_co_decrypt(BlockDriverState *bs, uint64_t host_offset,
+                 uint64_t guest_offset, void *buf, size_t len);
 
 #endif
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn do_perform_cow_read(BlockDriverState *bs,
     return 0;
 }
 
-static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs,
-                                                uint64_t src_cluster_offset,
-                                                uint64_t cluster_offset,
-                                                unsigned offset_in_cluster,
-                                                uint8_t *buffer,
-                                                unsigned bytes)
-{
-    if (bytes && bs->encrypted) {
-        BDRVQcow2State *s = bs->opaque;
-        assert(QEMU_IS_ALIGNED(offset_in_cluster, BDRV_SECTOR_SIZE));
-        assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
-        assert(s->crypto);
-        if (qcow2_co_encrypt(bs,
-                start_of_cluster(s, cluster_offset + offset_in_cluster),
-                src_cluster_offset + offset_in_cluster,
-                buffer, bytes) < 0) {
-            return false;
-        }
-    }
-    return true;
-}
-
 static int coroutine_fn do_perform_cow_write(BlockDriverState *bs,
                                              uint64_t cluster_offset,
                                              unsigned offset_in_cluster,
@@ -XXX,XX +XXX,XX @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
 
     /* Encrypt the data if necessary before writing it */
     if (bs->encrypted) {
-        if (!do_perform_cow_encrypt(bs, m->offset, m->alloc_offset,
-                                    start->offset, start_buffer,
-                                    start->nb_bytes) ||
-            !do_perform_cow_encrypt(bs, m->offset, m->alloc_offset,
-                                    end->offset, end_buffer, end->nb_bytes)) {
-            ret = -EIO;
+        ret = qcow2_co_encrypt(bs,
+                               m->alloc_offset + start->offset,
+                               m->offset + start->offset,
+                               start_buffer, start->nb_bytes);
+        if (ret < 0) {
+            goto fail;
+        }
+
+        ret = qcow2_co_encrypt(bs,
+                               m->alloc_offset + end->offset,
+                               m->offset + end->offset,
+                               end_buffer, end->nb_bytes);
+        if (ret < 0) {
             goto fail;
         }
     }
diff --git a/block/qcow2-threads.c b/block/qcow2-threads.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-threads.c
+++ b/block/qcow2-threads.c
@@ -XXX,XX +XXX,XX @@ static int qcow2_encdec_pool_func(void *opaque)
 }
 
 static int coroutine_fn
-qcow2_co_encdec(BlockDriverState *bs, uint64_t file_cluster_offset,
-                  uint64_t offset, void *buf, size_t len, Qcow2EncDecFunc func)
+qcow2_co_encdec(BlockDriverState *bs, uint64_t host_offset,
+                uint64_t guest_offset, void *buf, size_t len,
+                Qcow2EncDecFunc func)
 {
     BDRVQcow2State *s = bs->opaque;
     Qcow2EncDecData arg = {
         .block = s->crypto,
-        .offset = s->crypt_physical_offset ?
-                      file_cluster_offset + offset_into_cluster(s, offset) :
-                      offset,
+        .offset = s->crypt_physical_offset ? host_offset : guest_offset,
         .buf = buf,
         .len = len,
         .func = func,
     };
 
-    return qcow2_co_process(bs, qcow2_encdec_pool_func, &arg);
+    assert(QEMU_IS_ALIGNED(guest_offset, BDRV_SECTOR_SIZE));
+    assert(QEMU_IS_ALIGNED(host_offset, BDRV_SECTOR_SIZE));
+    assert(QEMU_IS_ALIGNED(len, BDRV_SECTOR_SIZE));
+    assert(s->crypto);
+
+    return len == 0 ? 0 : qcow2_co_process(bs, qcow2_encdec_pool_func, &arg);
 }
 
+/*
+ * qcow2_co_encrypt()
+ *
+ * Encrypts one or more contiguous aligned sectors
+ *
+ * @host_offset - underlying storage offset of the first sector of the
+ * data to be encrypted
+ *
+ * @guest_offset - guest (virtual) offset of the first sector of the
+ * data to be encrypted
+ *
+ * @buf - buffer with the data to encrypt, that after encryption
+ *        will be written to the underlying storage device at
+ *        @host_offset
+ *
+ * @len - length of the buffer (must be a BDRV_SECTOR_SIZE multiple)
+ *
+ * Depending on the encryption method, @host_offset and/or @guest_offset
+ * may be used for generating the initialization vector for
+ * encryption.
+ *
+ * Note that while the whole range must be aligned on sectors, it
+ * does not have to be aligned on clusters and can also cross cluster
+ * boundaries
+ */
 int coroutine_fn
-qcow2_co_encrypt(BlockDriverState *bs, uint64_t file_cluster_offset,
-                 uint64_t offset, void *buf, size_t len)
+qcow2_co_encrypt(BlockDriverState *bs, uint64_t host_offset,
+                 uint64_t guest_offset, void *buf, size_t len)
 {
-    return qcow2_co_encdec(bs, file_cluster_offset, offset, buf, len,
-                             qcrypto_block_encrypt);
+    return qcow2_co_encdec(bs, host_offset, guest_offset, buf, len,
+                           qcrypto_block_encrypt);
 }
 
+/*
+ * qcow2_co_decrypt()
+ *
+ * Decrypts one or more contiguous aligned sectors
+ * Similar to qcow2_co_encrypt
+ */
 int coroutine_fn
-qcow2_co_decrypt(BlockDriverState *bs, uint64_t file_cluster_offset,
-                 uint64_t offset, void *buf, size_t len)
+qcow2_co_decrypt(BlockDriverState *bs, uint64_t host_offset,
+                 uint64_t guest_offset, void *buf, size_t len)
 {
-    return qcow2_co_encdec(bs, file_cluster_offset, offset, buf, len,
-                             qcrypto_block_decrypt);
+    return qcow2_co_encdec(bs, host_offset, guest_offset, buf, len,
+                           qcrypto_block_decrypt);
 }
diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_preadv_part(BlockDriverState *bs,
 
                 assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
                 assert(QEMU_IS_ALIGNED(cur_bytes, BDRV_SECTOR_SIZE));
-                if (qcow2_co_decrypt(bs, cluster_offset, offset,
+                if (qcow2_co_decrypt(bs, cluster_offset + offset_in_cluster,
+                                     offset,
                                      cluster_data, cur_bytes) < 0) {
                     ret = -EIO;
                     goto fail;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_pwritev_part(
             qemu_iovec_to_buf(qiov, qiov_offset + bytes_done,
                               cluster_data, cur_bytes);
 
-            if (qcow2_co_encrypt(bs, cluster_offset, offset,
+            if (qcow2_co_encrypt(bs, cluster_offset + offset_in_cluster, offset,
                                  cluster_data, cur_bytes) < 0) {
                 ret = -EIO;
                 goto out_unlocked;
-- 
2.21.0

From: Maxim Levitsky <mlevitsk@redhat.com>

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Tested-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20190915203655.21638-4-mlevitsk@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/263     | 91 ++++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/263.out | 40 +++++++++++++++++
 tests/qemu-iotests/group   |  1 +
 3 files changed, 132 insertions(+)
 create mode 100755 tests/qemu-iotests/263
 create mode 100644 tests/qemu-iotests/263.out

diff --git a/tests/qemu-iotests/263 b/tests/qemu-iotests/263
new file mode 100755
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/qemu-iotests/263
@@ -XXX,XX +XXX,XX @@
+#!/usr/bin/env bash
+#
+# Test encrypted write that crosses cluster boundary of two unallocated clusters
+# Based on 188
+#
+# Copyright (C) 2019 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=mlevitsk@redhat.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+status=1	# failure is the default!
+
+_cleanup()
+{
+	_cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt qcow2
+_supported_proto generic
+_supported_os Linux
+
+
+size=1M
+
+SECRET="secret,id=sec0,data=astrochicken"
+QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
+
+
+_run_test()
+{
+	echo "== reading the whole image =="
+	$QEMU_IO --object $SECRET -c "read -P 0 0 $size" --image-opts "$1" | _filter_qemu_io | _filter_testdir
+
+	echo
+	echo "== write two 512 byte sectors on a cluster boundary =="
+	$QEMU_IO --object $SECRET -c "write -P 0xAA 0xFE00 0x400" --image-opts "$1" | _filter_qemu_io | _filter_testdir
+
+	echo
+	echo "== verify that the rest of the image is not changed =="
+	$QEMU_IO --object $SECRET -c "read -P 0x00 0x00000 0xFE00" --image-opts "$1" | _filter_qemu_io | _filter_testdir
+	$QEMU_IO --object $SECRET -c "read -P 0xAA 0x0FE00 0x400" --image-opts "$1" | _filter_qemu_io | _filter_testdir
+	$QEMU_IO --object $SECRET -c "read -P 0x00 0x10200 0xEFE00" --image-opts "$1" | _filter_qemu_io | _filter_testdir
+
+}
+
+
+echo
+echo "testing LUKS qcow2 encryption"
+echo
+
+_make_test_img --object $SECRET -o "encrypt.format=luks,encrypt.key-secret=sec0,encrypt.iter-time=10,cluster_size=64K" $size
+_run_test "driver=$IMGFMT,encrypt.key-secret=sec0,file.filename=$TEST_IMG"
+_cleanup_test_img
+
+echo
+echo "testing legacy AES qcow2 encryption"
+echo
+
+
+_make_test_img --object $SECRET -o "encrypt.format=aes,encrypt.key-secret=sec0,cluster_size=64K" $size
+_run_test "driver=$IMGFMT,encrypt.key-secret=sec0,file.filename=$TEST_IMG"
+_cleanup_test_img
+
+
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/263.out b/tests/qemu-iotests/263.out
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/qemu-iotests/263.out
@@ -XXX,XX +XXX,XX @@
+QA output created by 263
+
+testing LUKS qcow2 encryption
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 encrypt.format=luks encrypt.key-secret=sec0 encrypt.iter-time=10
+== reading the whole image ==
+read 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== write two 512 byte sectors on a cluster boundary ==
+wrote 1024/1024 bytes at offset 65024
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== verify that the rest of the image is not changed ==
+read 65024/65024 bytes at offset 0
+63.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 65024
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 982528/982528 bytes at offset 66048
+959.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+testing legacy AES qcow2 encryption
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 encrypt.format=aes encrypt.key-secret=sec0
+== reading the whole image ==
+read 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== write two 512 byte sectors on a cluster boundary ==
+wrote 1024/1024 bytes at offset 65024
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== verify that the rest of the image is not changed ==
+read 65024/65024 bytes at offset 0
+63.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 65024
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 982528/982528 bytes at offset 66048
+959.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -XXX,XX +XXX,XX @@
 257 rw
 258 rw quick
 262 rw quick migration
+263 rw quick
 265 rw auto quick
 266 rw quick
-- 
2.21.0

The following changes since commit 67f17e23baca5dd545fe98b01169cc351a70fe35:

Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging (2020-03-06 17:15:36 +0000)

are available in the Git repository at:

https://github.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to d37d0e365afb6825a90d8356fc6adcc1f58f40f3:

aio-posix: remove idle poll handlers to improve scalability (2020-03-09 16:45:16 +0000)

----------------------------------------------------------------
Pull request

----------------------------------------------------------------

Stefan Hajnoczi (9):
  qemu/queue.h: clear linked list pointers on remove
  aio-posix: remove confusing QLIST_SAFE_REMOVE()
  aio-posix: completely stop polling when disabled
  aio-posix: move RCU_READ_LOCK() into run_poll_handlers()
  aio-posix: extract ppoll(2) and epoll(7) fd monitoring
  aio-posix: simplify FDMonOps->update() prototype
  aio-posix: add io_uring fd monitoring implementation
  aio-posix: support userspace polling of fd monitoring
  aio-posix: remove idle poll handlers to improve scalability

MAINTAINERS           |   2 +
 configure             |   5 +
 include/block/aio.h   |  71 ++++++-
 include/qemu/queue.h  |  19 +-
 util/Makefile.objs    |   3 +
 util/aio-posix.c      | 451 ++++++++++++++----------------------------
 util/aio-posix.h      |  81 ++++++++
 util/fdmon-epoll.c    | 155 +++++++++++++++
 util/fdmon-io_uring.c | 332 +++++++++++++++++++++++++++++++
 util/fdmon-poll.c     | 107 ++++++++++
 util/trace-events     |   2 +
 11 files changed, 915 insertions(+), 313 deletions(-)
 create mode 100644 util/aio-posix.h
 create mode 100644 util/fdmon-epoll.c
 create mode 100644 util/fdmon-io_uring.c
 create mode 100644 util/fdmon-poll.c

-- 
2.24.1

Do not leave stale linked list pointers around after removal.  It's
safer to set them to NULL so that use-after-removal results in an
immediate segfault.

The RCU queue removal macros are unchanged since nodes may still be
traversed after removal.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200224103406.1894923-2-stefanha@redhat.com
Message-Id: <20200224103406.1894923-2-stefanha@redhat.com>
---
 include/qemu/queue.h | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/include/qemu/queue.h b/include/qemu/queue.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/queue.h
+++ b/include/qemu/queue.h
@@ -XXX,XX +XXX,XX @@ struct {                                                                \
                 (elm)->field.le_next->field.le_prev =                   \
                     (elm)->field.le_prev;                               \
         *(elm)->field.le_prev = (elm)->field.le_next;                   \
+        (elm)->field.le_next = NULL;                                    \
+        (elm)->field.le_prev = NULL;                                    \
 } while (/*CONSTCOND*/0)
 
 /*
@@ -XXX,XX +XXX,XX @@ struct {                                                                \
 } while (/*CONSTCOND*/0)
 
 #define QSLIST_REMOVE_HEAD(head, field) do {                             \
-        (head)->slh_first = (head)->slh_first->field.sle_next;          \
+        typeof((head)->slh_first) elm = (head)->slh_first;               \
+        (head)->slh_first = elm->field.sle_next;                         \
+        elm->field.sle_next = NULL;                                      \
 } while (/*CONSTCOND*/0)
 
 #define QSLIST_REMOVE_AFTER(slistelm, field) do {                       \
-        (slistelm)->field.sle_next =                                    \
-            QSLIST_NEXT(QSLIST_NEXT((slistelm), field), field);         \
+        typeof(slistelm) next = (slistelm)->field.sle_next;             \
+        (slistelm)->field.sle_next = next->field.sle_next;              \
+        next->field.sle_next = NULL;                                    \
 } while (/*CONSTCOND*/0)
 
 #define QSLIST_REMOVE(head, elm, type, field) do {                      \
@@ -XXX,XX +XXX,XX @@ struct {                                                                \
         while (curelm->field.sle_next != (elm))                         \
             curelm = curelm->field.sle_next;                            \
         curelm->field.sle_next = curelm->field.sle_next->field.sle_next; \
+        (elm)->field.sle_next = NULL;                                   \
     }                                                                   \
 } while (/*CONSTCOND*/0)
 
@@ -XXX,XX +XXX,XX @@ struct {                                                                \
 } while (/*CONSTCOND*/0)
 
 #define QSIMPLEQ_REMOVE_HEAD(head, field) do {                          \
-    if (((head)->sqh_first = (head)->sqh_first->field.sqe_next) == NULL)\
+    typeof((head)->sqh_first) elm = (head)->sqh_first;                  \
+    if (((head)->sqh_first = elm->field.sqe_next) == NULL)              \
         (head)->sqh_last = &(head)->sqh_first;                          \
+    elm->field.sqe_next = NULL;                                         \
 } while (/*CONSTCOND*/0)
 
 #define QSIMPLEQ_SPLIT_AFTER(head, elm, field, removed) do {            \
@@ -XXX,XX +XXX,XX @@ struct {                                                                \
         if ((curelm->field.sqe_next =                                   \
             curelm->field.sqe_next->field.sqe_next) == NULL)            \
                 (head)->sqh_last = &(curelm)->field.sqe_next;           \
+        (elm)->field.sqe_next = NULL;                                   \
     }                                                                   \
 } while (/*CONSTCOND*/0)
 
@@ -XXX,XX +XXX,XX @@ union {                                                                 \
             (head)->tqh_circ.tql_prev = (elm)->field.tqe_circ.tql_prev; \
         (elm)->field.tqe_circ.tql_prev->tql_next = (elm)->field.tqe_next; \
         (elm)->field.tqe_circ.tql_prev = NULL;                          \
+        (elm)->field.tqe_circ.tql_next = NULL;                          \
+        (elm)->field.tqe_next = NULL;                                   \
 } while (/*CONSTCOND*/0)
 
 /* remove @left, @right and all elements in between from @head */
-- 
2.24.1

One iteration of polling is always performed even when polling is
disabled.  This is done because:
1. Userspace polling is cheaper than making a syscall.  We might get
   lucky.
2. We must poll once more after polling has stopped in case an event
   occurred while stopping polling.

However, there are downsides:
1. Polling becomes a bottleneck when the number of event sources is very
   high.  It's more efficient to monitor fds in that case.
2. A high-frequency polling event source can starve non-polling event
   sources because ppoll(2)/epoll(7) is never invoked.

This patch removes the forced polling iteration so that poll_ns=0 really
means no polling.

IOPS increases from 10k to 60k when the guest has 100
virtio-blk-pci,num-queues=32 devices and 1 virtio-blk-pci,num-queues=1
device, because the forced polling iteration over the large number of
event sources was slowing down the event loop.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-2-stefanha@redhat.com
Message-Id: <20200305170806.1313245-2-stefanha@redhat.com>
---
 util/aio-posix.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ void aio_set_event_notifier_poll(AioContext *ctx,
                     (IOHandler *)io_poll_end);
 }
 
-static void poll_set_started(AioContext *ctx, bool started)
+static bool poll_set_started(AioContext *ctx, bool started)
 {
     AioHandler *node;
+    bool progress = false;
 
     if (started == ctx->poll_started) {
-        return;
+        return false;
     }
 
     ctx->poll_started = started;
@@ -XXX,XX +XXX,XX @@ static void poll_set_started(AioContext *ctx, bool started)
         if (fn) {
             fn(node->opaque);
         }
+
+        /* Poll one last time in case ->io_poll_end() raced with the event */
+        if (!started) {
+            progress = node->io_poll(node->opaque) || progress;
+        }
     }
     qemu_lockcnt_dec(&ctx->list_lock);
+
+    return progress;
 }
 
 
@@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
         }
     }
 
-    poll_set_started(ctx, false);
+    if (poll_set_started(ctx, false)) {
+        *timeout = 0;
+        return true;
+    }
 
-    /* Even if we don't run busy polling, try polling once in case it can make
-     * progress and the caller will be able to avoid ppoll(2)/epoll_wait(2).
-     */
-    return run_poll_handlers_once(ctx, timeout);
+    return false;
 }
 
 bool aio_poll(AioContext *ctx, bool blocking)
-- 
2.24.1

Now that run_poll_handlers_once() is only called by run_poll_handlers()
we can improve the CPU time profile by moving the expensive
RCU_READ_LOCK() out of the polling loop.

This reduces run_poll_handlers() from 40% to 10% of CPU time in perf's
sampling profiler output.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-3-stefanha@redhat.com
Message-Id: <20200305170806.1313245-3-stefanha@redhat.com>
---
 util/aio-posix.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
     bool progress = false;
     AioHandler *node;
 
-    /*
-     * Optimization: ->io_poll() handlers often contain RCU read critical
-     * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
-     * -> rcu_read_lock() -> ... sequences with expensive memory
-     * synchronization primitives.  Make the entire polling loop an RCU
-     * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
-     * are cheap.
-     */
-    RCU_READ_LOCK_GUARD();
-
     QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
         if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
             aio_node_check(ctx, node->is_external) &&
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
 
     trace_run_poll_handlers_begin(ctx, max_ns, *timeout);
 
+    /*
+     * Optimization: ->io_poll() handlers often contain RCU read critical
+     * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
+     * -> rcu_read_lock() -> ... sequences with expensive memory
+     * synchronization primitives.  Make the entire polling loop an RCU
+     * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
+     * are cheap.
+     */
+    RCU_READ_LOCK_GUARD();
+
     start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     do {
         progress = run_poll_handlers_once(ctx, timeout);
-- 
2.24.1

The ppoll(2) and epoll(7) file descriptor monitoring implementations are
mixed with the core util/aio-posix.c code.  Before adding another
implementation for Linux io_uring, extract out the existing
ones so there is a clear interface and the core code is simpler.

The new interface is AioContext->fdmon_ops, a pointer to a FDMonOps
struct.  See the patch for details.

Semantic changes:
1. ppoll(2) now reflects events from pollfds[] back into AioHandlers
   while we're still on the clock for adaptive polling.  This was
   already happening for epoll(7), so if it's really an issue then we'll
   need to fix both in the future.
2. epoll(7)'s fallback to ppoll(2) while external events are disabled
   was broken when the number of fds exceeded the epoll(7) upgrade
   threshold.  I guess this code path simply wasn't tested and no one
   noticed the bug.  I didn't go out of my way to fix it but the correct
   code is simpler than preserving the bug.

I also took some liberties in removing the unnecessary
AioContext->epoll_available (just check AioContext->epollfd != -1
instead) and AioContext->epoll_enabled (it's implicit if our
AioContext->fdmon_ops callbacks are being invoked) fields.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-4-stefanha@redhat.com
Message-Id: <20200305170806.1313245-4-stefanha@redhat.com>
---
 MAINTAINERS         |   2 +
 include/block/aio.h |  36 +++++-
 util/Makefile.objs  |   2 +
 util/aio-posix.c    | 286 ++------------------------------------------
 util/aio-posix.h    |  61 ++++++++++
 util/fdmon-epoll.c  | 151 +++++++++++++++++++++++
 util/fdmon-poll.c   | 104 ++++++++++++++++
 7 files changed, 366 insertions(+), 276 deletions(-)
 create mode 100644 util/aio-posix.h
 create mode 100644 util/fdmon-epoll.c
 create mode 100644 util/fdmon-poll.c

diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ L: qemu-block@nongnu.org
 S: Supported
 F: util/async.c
 F: util/aio-*.c
+F: util/aio-*.h
+F: util/fdmon-*.c
 F: block/io.c
 F: migration/block*
 F: include/block/aio.h
diff --git a/include/block/aio.h b/include/block/aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ struct ThreadPool;
 struct LinuxAioState;
 struct LuringState;
 
+/* Callbacks for file descriptor monitoring implementations */
+typedef struct {
+    /*
+     * update:
+     * @ctx: the AioContext
+     * @node: the handler
+     * @is_new: is the file descriptor already being monitored?
+     *
+     * Add/remove/modify a monitored file descriptor.  There are three cases:
+     * 1. node->pfd.events == 0 means remove the file descriptor.
+     * 2. !is_new means modify an already monitored file descriptor.
+     * 3. is_new means add a new file descriptor.
+     *
+     * Called with ctx->list_lock acquired.
+     */
+    void (*update)(AioContext *ctx, AioHandler *node, bool is_new);
+
+    /*
+     * wait:
+     * @ctx: the AioContext
+     * @ready_list: list for handlers that become ready
+     * @timeout: maximum duration to wait, in nanoseconds
+     *
+     * Wait for file descriptors to become ready and place them on ready_list.
+     *
+     * Called with ctx->list_lock incremented but not locked.
+     *
+     * Returns: number of ready file descriptors.
+     */
+    int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
+} FDMonOps;
+
 /*
  * Each aio_bh_poll() call carves off a slice of the BH list, so that newly
  * scheduled BHs are not processed until the next aio_bh_poll() call.  All
@@ -XXX,XX +XXX,XX @@ struct AioContext {
 
     /* epoll(7) state used when built with CONFIG_EPOLL */
     int epollfd;
-    bool epoll_enabled;
-    bool epoll_available;
+
+    const FDMonOps *fdmon_ops;
 };
 
 /**
diff --git a/util/Makefile.objs b/util/Makefile.objs
index XXXXXXX..XXXXXXX 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -XXX,XX +XXX,XX @@ util-obj-y += aiocb.o async.o aio-wait.o thread-pool.o qemu-timer.o
 util-obj-y += main-loop.o
 util-obj-$(call lnot,$(CONFIG_ATOMIC64)) += atomic64.o
 util-obj-$(CONFIG_POSIX) += aio-posix.o
+util-obj-$(CONFIG_POSIX) += fdmon-poll.o
+util-obj-$(CONFIG_EPOLL_CREATE1) += fdmon-epoll.o
 util-obj-$(CONFIG_POSIX) += compatfd.o
 util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
 util-obj-$(CONFIG_POSIX) += mmap-alloc.o
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/sockets.h"
 #include "qemu/cutils.h"
 #include "trace.h"
-#ifdef CONFIG_EPOLL_CREATE1
-#include <sys/epoll.h>
-#endif
+#include "aio-posix.h"
 
-struct AioHandler
-{
-    GPollFD pfd;
-    IOHandler *io_read;
-    IOHandler *io_write;
-    AioPollFn *io_poll;
-    IOHandler *io_poll_begin;
-    IOHandler *io_poll_end;
-    void *opaque;
-    bool is_external;
-    QLIST_ENTRY(AioHandler) node;
-    QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
-    QLIST_ENTRY(AioHandler) node_deleted;
-};
-
-/* Add a handler to a ready list */
-static void add_ready_handler(AioHandlerList *ready_list,
-                              AioHandler *node,
-                              int revents)
+void aio_add_ready_handler(AioHandlerList *ready_list,
+                           AioHandler *node,
+                           int revents)
 {
     QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
     node->pfd.revents = revents;
     QLIST_INSERT_HEAD(ready_list, node, node_ready);
 }
 
-#ifdef CONFIG_EPOLL_CREATE1
-
-/* The fd number threshold to switch to epoll */
-#define EPOLL_ENABLE_THRESHOLD 64
-
-static void aio_epoll_disable(AioContext *ctx)
-{
-    ctx->epoll_enabled = false;
-    if (!ctx->epoll_available) {
-        return;
-    }
-    ctx->epoll_available = false;
-    close(ctx->epollfd);
-}
-
-static inline int epoll_events_from_pfd(int pfd_events)
-{
-    return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
-           (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
-           (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
-           (pfd_events & G_IO_ERR ? EPOLLERR : 0);
-}
-
-static bool aio_epoll_try_enable(AioContext *ctx)
-{
-    AioHandler *node;
-    struct epoll_event event;
-
-    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
-        int r;
-        if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) {
-            continue;
-        }
-        event.events = epoll_events_from_pfd(node->pfd.events);
-        event.data.ptr = node;
-        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
-        if (r) {
-            return false;
-        }
-    }
-    ctx->epoll_enabled = true;
-    return true;
-}
-
-static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
-{
-    struct epoll_event event;
-    int r;
-    int ctl;
-
-    if (!ctx->epoll_enabled) {
-        return;
-    }
-    if (!node->pfd.events) {
-        ctl = EPOLL_CTL_DEL;
-    } else {
-        event.data.ptr = node;
-        event.events = epoll_events_from_pfd(node->pfd.events);
-        ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
-    }
-
-    r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
-    if (r) {
-        aio_epoll_disable(ctx);
-    }
-}
-
-static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
-                     int64_t timeout)
-{
-    GPollFD pfd = {
-        .fd = ctx->epollfd,
-        .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
-    };
-    AioHandler *node;
-    int i, ret = 0;
-    struct epoll_event events[128];
-
-    if (timeout > 0) {
-        ret = qemu_poll_ns(&pfd, 1, timeout);
-        if (ret > 0) {
-            timeout = 0;
-        }
-    }
-    if (timeout <= 0 || ret > 0) {
-        ret = epoll_wait(ctx->epollfd, events,
-                         ARRAY_SIZE(events),
-                         timeout);
-        if (ret <= 0) {
-            goto out;
-        }
-        for (i = 0; i < ret; i++) {
-            int ev = events[i].events;
-            int revents = (ev & EPOLLIN ? G_IO_IN : 0) |
-                          (ev & EPOLLOUT ? G_IO_OUT : 0) |
-                          (ev & EPOLLHUP ? G_IO_HUP : 0) |
-                          (ev & EPOLLERR ? G_IO_ERR : 0);
-
-            node = events[i].data.ptr;
-            add_ready_handler(ready_list, node, revents);
-        }
-    }
-out:
-    return ret;
-}
-
-static bool aio_epoll_enabled(AioContext *ctx)
-{
-    /* Fall back to ppoll when external clients are disabled. */
-    return !aio_external_disabled(ctx) && ctx->epoll_enabled;
-}
-
-static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
-                                 unsigned npfd, int64_t timeout)
-{
-    if (!ctx->epoll_available) {
-        return false;
-    }
-    if (aio_epoll_enabled(ctx)) {
-        return true;
-    }
-    if (npfd >= EPOLL_ENABLE_THRESHOLD) {
-        if (aio_epoll_try_enable(ctx)) {
-            return true;
-        } else {
-            aio_epoll_disable(ctx);
-        }
-    }
-    return false;
-}
-
-#else
-
-static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
-{
-}
-
-static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
-                     int64_t timeout)
-{
-    assert(false);
-}
-
-static bool aio_epoll_enabled(AioContext *ctx)
-{
-    return false;
-}
-
-static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
-                          unsigned npfd, int64_t timeout)
-{
-    return false;
-}
-
-#endif
-
 static AioHandler *find_aio_handler(AioContext *ctx, int fd)
 {
     AioHandler *node;
@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
                atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
 
     if (new_node) {
-        aio_epoll_update(ctx, new_node, is_new);
+        ctx->fdmon_ops->update(ctx, new_node, is_new);
     } else if (node) {
         /* Unregister deleted fd_handler */
-        aio_epoll_update(ctx, node, false);
+        ctx->fdmon_ops->update(ctx, node, false);
     }
     qemu_lockcnt_unlock(&ctx->list_lock);
     aio_notify(ctx);
@@ -XXX,XX +XXX,XX @@ void aio_dispatch(AioContext *ctx)
     timerlistgroup_run_timers(&ctx->tlg);
 }
 
-/* These thread-local variables are used only in a small part of aio_poll
- * around the call to the poll() system call.  In particular they are not
- * used while aio_poll is performing callbacks, which makes it much easier
- * to think about reentrancy!
- *
- * Stack-allocated arrays would be perfect but they have size limitations;
- * heap allocation is expensive enough that we want to reuse arrays across
- * calls to aio_poll().  And because poll() has to be called without holding
- * any lock, the arrays cannot be stored in AioContext.  Thread-local data
- * has none of the disadvantages of these three options.
- */
-static __thread GPollFD *pollfds;
-static __thread AioHandler **nodes;
-static __thread unsigned npfd, nalloc;
-static __thread Notifier pollfds_cleanup_notifier;
-
-static void pollfds_cleanup(Notifier *n, void *unused)
-{
-    g_assert(npfd == 0);
-    g_free(pollfds);
-    g_free(nodes);
-    nalloc = 0;
-}
-
-static void add_pollfd(AioHandler *node)
-{
-    if (npfd == nalloc) {
-        if (nalloc == 0) {
-            pollfds_cleanup_notifier.notify = pollfds_cleanup;
-            qemu_thread_atexit_add(&pollfds_cleanup_notifier);
-            nalloc = 8;
-        } else {
-            g_assert(nalloc <= INT_MAX);
-            nalloc *= 2;
-        }
-        pollfds = g_renew(GPollFD, pollfds, nalloc);
-        nodes = g_renew(AioHandler *, nodes, nalloc);
-    }
-    nodes[npfd] = node;
-    pollfds[npfd] = (GPollFD) {
-        .fd = node->pfd.fd,
-        .events = node->pfd.events,
-    };
-    npfd++;
-}
-
 static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
 {
     bool progress = false;
@@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
 bool aio_poll(AioContext *ctx, bool blocking)
 {
     AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
-    AioHandler *node;
-    int i;
     int ret = 0;
     bool progress;
     int64_t timeout;
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
      * system call---a single round of run_poll_handlers_once suffices.
      */
     if (timeout || atomic_read(&ctx->poll_disable_cnt)) {
-        assert(npfd == 0);
-
-        /* fill pollfds */
-
-        if (!aio_epoll_enabled(ctx)) {
-            QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
-                if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
-                    && aio_node_check(ctx, node->is_external)) {
-                    add_pollfd(node);
-                }
-            }
-        }
-
-        /* wait until next event */
-        if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
-            npfd = 0; /* pollfds[] is not being used */
-            ret = aio_epoll(ctx, &ready_list, timeout);
-        } else  {
-            ret = qemu_poll_ns(pollfds, npfd, timeout);
-        }
+        ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
     }
 
     if (blocking) {
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         }
     }
 
-    /* if we have any readable fds, dispatch event */
-    if (ret > 0) {
-        for (i = 0; i < npfd; i++) {
-            int revents = pollfds[i].revents;
-
-            if (revents) {
-                add_ready_handler(&ready_list, nodes[i], revents);
-            }
-        }
-    }
-
-    npfd = 0;
-
     progress |= aio_bh_poll(ctx);
 
     if (ret > 0) {
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
 
 void aio_context_setup(AioContext *ctx)
 {
-#ifdef CONFIG_EPOLL_CREATE1
-    assert(!ctx->epollfd);
-    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
-    if (ctx->epollfd == -1) {
-        fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno));
-        ctx->epoll_available = false;
-    } else {
-        ctx->epoll_available = true;
-    }
-#endif
+    ctx->fdmon_ops = &fdmon_poll_ops;
+    ctx->epollfd = -1;
+
+    fdmon_epoll_setup(ctx);
 }
 
 void aio_context_destroy(AioContext *ctx)
 {
-#ifdef CONFIG_EPOLL_CREATE1
-    aio_epoll_disable(ctx);
-#endif
+    fdmon_epoll_disable(ctx);
 }
 
 void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
diff --git a/util/aio-posix.h b/util/aio-posix.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/util/aio-posix.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * AioContext POSIX event loop implementation internal APIs
+ *
+ * Copyright IBM, Corp. 2008
+ * Copyright Red Hat, Inc. 2020
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#ifndef AIO_POSIX_H
+#define AIO_POSIX_H
+
+#include "block/aio.h"
+
+struct AioHandler {
+    GPollFD pfd;
+    IOHandler *io_read;
+    IOHandler *io_write;
+    AioPollFn *io_poll;
+    IOHandler *io_poll_begin;
+    IOHandler *io_poll_end;
+    void *opaque;
+    bool is_external;
+    QLIST_ENTRY(AioHandler) node;
+    QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
+    QLIST_ENTRY(AioHandler) node_deleted;
+};
+
+/* Add a handler to a ready list */
+void aio_add_ready_handler(AioHandlerList *ready_list, AioHandler *node,
+                           int revents);
+
+extern const FDMonOps fdmon_poll_ops;
+
+#ifdef CONFIG_EPOLL_CREATE1
+bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd);
+void fdmon_epoll_setup(AioContext *ctx);
+void fdmon_epoll_disable(AioContext *ctx);
+#else
+static inline bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
+{
+    return false;
+}
+
+static inline void fdmon_epoll_setup(AioContext *ctx)
+{
+}
+
+static inline void fdmon_epoll_disable(AioContext *ctx)
+{
+}
+#endif /* !CONFIG_EPOLL_CREATE1 */
+
+#endif /* AIO_POSIX_H */
diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/util/fdmon-epoll.c
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * epoll(7) file descriptor monitoring
+ */
+
+#include "qemu/osdep.h"
+#include <sys/epoll.h>
+#include "qemu/rcu_queue.h"
+#include "aio-posix.h"
+
+/* The fd number threshold to switch to epoll */
+#define EPOLL_ENABLE_THRESHOLD 64
+
+void fdmon_epoll_disable(AioContext *ctx)
+{
+    if (ctx->epollfd >= 0) {
+        close(ctx->epollfd);
+        ctx->epollfd = -1;
+    }
+
+    /* Switch back */
+    ctx->fdmon_ops = &fdmon_poll_ops;
+}
+
+static inline int epoll_events_from_pfd(int pfd_events)
+{
+    return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
+           (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
+           (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
+           (pfd_events & G_IO_ERR ? EPOLLERR : 0);
+}
+
+static void fdmon_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
+{
+    struct epoll_event event;
+    int r;
+    int ctl;
+
+    if (!node->pfd.events) {
+        ctl = EPOLL_CTL_DEL;
+    } else {
+        event.data.ptr = node;
+        event.events = epoll_events_from_pfd(node->pfd.events);
+        ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
+    }
+
+    r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
+    if (r) {
+        fdmon_epoll_disable(ctx);
+    }
+}
+
+static int fdmon_epoll_wait(AioContext *ctx, AioHandlerList *ready_list,
+                            int64_t timeout)
+{
+    GPollFD pfd = {
+        .fd = ctx->epollfd,
+        .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
+    };
+    AioHandler *node;
+    int i, ret = 0;
+    struct epoll_event events[128];
+
+    /* Fall back while external clients are disabled */
+    if (atomic_read(&ctx->external_disable_cnt)) {
+        return fdmon_poll_ops.wait(ctx, ready_list, timeout);
+    }
+
+    if (timeout > 0) {
+        ret = qemu_poll_ns(&pfd, 1, timeout);
+        if (ret > 0) {
+            timeout = 0;
+        }
+    }
+    if (timeout <= 0 || ret > 0) {
+        ret = epoll_wait(ctx->epollfd, events,
+                         ARRAY_SIZE(events),
+                         timeout);
+        if (ret <= 0) {
+            goto out;
+        }
+        for (i = 0; i < ret; i++) {
+            int ev = events[i].events;
+            int revents = (ev & EPOLLIN ? G_IO_IN : 0) |
+                          (ev & EPOLLOUT ? G_IO_OUT : 0) |
+                          (ev & EPOLLHUP ? G_IO_HUP : 0) |
+                          (ev & EPOLLERR ? G_IO_ERR : 0);
+
+            node = events[i].data.ptr;
+            aio_add_ready_handler(ready_list, node, revents);
+        }
+    }
+out:
+    return ret;
+}
+
+static const FDMonOps fdmon_epoll_ops = {
+    .update = fdmon_epoll_update,
+    .wait = fdmon_epoll_wait,
+};
+
+static bool fdmon_epoll_try_enable(AioContext *ctx)
+{
+    AioHandler *node;
+    struct epoll_event event;
+
+    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+        int r;
+        if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) {
+            continue;
+        }
+        event.events = epoll_events_from_pfd(node->pfd.events);
+        event.data.ptr = node;
+        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
+        if (r) {
+            return false;
+        }
+    }
+
+    ctx->fdmon_ops = &fdmon_epoll_ops;
+    return true;
+}
+
+bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
+{
+    if (ctx->epollfd < 0) {
+        return false;
+    }
+
+    /* Do not upgrade while external clients are disabled */
+    if (atomic_read(&ctx->external_disable_cnt)) {
+        return false;
+    }
+
+    if (npfd >= EPOLL_ENABLE_THRESHOLD) {
+        if (fdmon_epoll_try_enable(ctx)) {
+            return true;
+        } else {
+            fdmon_epoll_disable(ctx);
+        }
+    }
+    return false;
+}
+
+/*
+ * Create the epoll(7) instance for @ctx and store its file descriptor in
+ * ctx->epollfd.
+ *
+ * On failure ctx->epollfd is left at -1 and an error is reported on stderr.
+ * fdmon_epoll_try_upgrade() refuses to upgrade while ctx->epollfd < 0, so the
+ * context keeps using the fdmon_ops it already has.
+ */
+void fdmon_epoll_setup(AioContext *ctx)
+{
+    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
+    if (ctx->epollfd == -1) {
+        /* Terminate the message so later stderr output starts on a new line */
+        fprintf(stderr, "Failed to create epoll instance: %s\n",
+                strerror(errno));
+    }
+}
diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/util/fdmon-poll.c
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * poll(2) file descriptor monitoring
+ *
+ * Uses ppoll(2) when available, g_poll() otherwise.
+ */
+
+#include "qemu/osdep.h"
+#include "aio-posix.h"
+#include "qemu/rcu_queue.h"
+
+/*
+ * These thread-local variables are used only in fdmon_poll_wait() around the
+ * call to the poll() system call.  In particular they are not used while
+ * aio_poll is performing callbacks, which makes it much easier to think about
+ * reentrancy!
+ *
+ * Stack-allocated arrays would be perfect but they have size limitations;
+ * heap allocation is expensive enough that we want to reuse arrays across
+ * calls to aio_poll().  And because poll() has to be called without holding
+ * any lock, the arrays cannot be stored in AioContext.  Thread-local data
+ * has none of the disadvantages of these three options.
+ */
+static __thread GPollFD *pollfds;
+static __thread AioHandler **nodes;
+static __thread unsigned npfd, nalloc;
+static __thread Notifier pollfds_cleanup_notifier;
+
+static void pollfds_cleanup(Notifier *n, void *unused)
+{
+    g_assert(npfd == 0);
+    g_free(pollfds);
+    g_free(nodes);
+    nalloc = 0;
+}
+
+static void add_pollfd(AioHandler *node)
+{
+    if (npfd == nalloc) {
+        if (nalloc == 0) {
+            pollfds_cleanup_notifier.notify = pollfds_cleanup;
+            qemu_thread_atexit_add(&pollfds_cleanup_notifier);
+            nalloc = 8;
+        } else {
+            g_assert(nalloc <= INT_MAX);
+            nalloc *= 2;
+        }
+        pollfds = g_renew(GPollFD, pollfds, nalloc);
+        nodes = g_renew(AioHandler *, nodes, nalloc);
+    }
+    nodes[npfd] = node;
+    pollfds[npfd] = (GPollFD) {
+        .fd = node->pfd.fd,
+        .events = node->pfd.events,
+    };
+    npfd++;
+}
+
+/*
+ * FDMonOps->wait() implementation based on qemu_poll_ns().
+ *
+ * Builds the thread-local pollfds[]/nodes[] arrays from the handlers in
+ * ctx->aio_handlers that are not marked deleted, have events to monitor and
+ * pass aio_node_check().  If fdmon_epoll_try_upgrade() succeeds, the wait is
+ * delegated to the newly installed ctx->fdmon_ops instead.
+ *
+ * Handlers whose pollfd reports revents are added to @ready_list.
+ *
+ * npfd is always 0 again when this function returns; the assertion on entry
+ * relies on that.
+ *
+ * Returns: the value returned by qemu_poll_ns(), or by the upgraded
+ * implementation's wait() callback.
+ */
+static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list,
+                            int64_t timeout)
+{
+    AioHandler *node;
+    int ret;
+
+    assert(npfd == 0);
+
+    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+        if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
+                && aio_node_check(ctx, node->is_external)) {
+            add_pollfd(node);
+        }
+    }
+
+    /* epoll(7) is faster above a certain number of fds */
+    if (fdmon_epoll_try_upgrade(ctx, npfd)) {
+        /*
+         * pollfds[] will not be used on this path.  Reset npfd before
+         * delegating, otherwise the next call into fdmon_poll_wait() (e.g.
+         * the epoll fallback while external clients are disabled) would trip
+         * the npfd == 0 assertion above.
+         */
+        npfd = 0;
+        return ctx->fdmon_ops->wait(ctx, ready_list, timeout);
+    }
+
+    ret = qemu_poll_ns(pollfds, npfd, timeout);
+    if (ret > 0) {
+        int i;
+
+        /* Reflect the events reported in pollfds[] back into the handlers */
+        for (i = 0; i < npfd; i++) {
+            int revents = pollfds[i].revents;
+
+            if (revents) {
+                aio_add_ready_handler(ready_list, nodes[i], revents);
+            }
+        }
+    }
+
+    npfd = 0;
+    return ret;
+}
+
+static void fdmon_poll_update(AioContext *ctx, AioHandler *node, bool is_new)
+{
+    /* Do nothing, AioHandler already contains the state we'll need */
+}
+
+const FDMonOps fdmon_poll_ops = {
+    .update = fdmon_poll_update,
+    .wait = fdmon_poll_wait,
+};
-- 
2.24.1

The AioHandler *node, bool is_new arguments are more complicated to
think about than simply being given AioHandler *old_node, AioHandler
*new_node.

Furthermore, the new Linux io_uring file descriptor monitoring mechanism
added by the next patch requires access to both the old and the new
nodes.  Make this change now in preparation.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-5-stefanha@redhat.com
Message-Id: <20200305170806.1313245-5-stefanha@redhat.com>
---
 include/block/aio.h | 13 ++++++-------
 util/aio-posix.c    |  7 +------
 util/fdmon-epoll.c  | 21 ++++++++++++---------
 util/fdmon-poll.c   |  4 +++-
 4 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/include/block/aio.h b/include/block/aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ typedef struct {
     /*
      * update:
      * @ctx: the AioContext
-     * @node: the handler
-     * @is_new: is the file descriptor already being monitored?
+     * @old_node: the existing handler or NULL if this file descriptor is being
+     *            monitored for the first time
+     * @new_node: the new handler or NULL if this file descriptor is being
+     *            removed
      *
-     * Add/remove/modify a monitored file descriptor.  There are three cases:
-     * 1. node->pfd.events == 0 means remove the file descriptor.
-     * 2. !is_new means modify an already monitored file descriptor.
-     * 3. is_new means add a new file descriptor.
+     * Add/remove/modify a monitored file descriptor.
      *
      * Called with ctx->list_lock acquired.
      */
-    void (*update)(AioContext *ctx, AioHandler *node, bool is_new);
+    void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node);
 
     /*
      * wait:
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
     atomic_set(&ctx->poll_disable_cnt,
                atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
 
-    if (new_node) {
-        ctx->fdmon_ops->update(ctx, new_node, is_new);
-    } else if (node) {
-        /* Unregister deleted fd_handler */
-        ctx->fdmon_ops->update(ctx, node, false);
-    }
+    ctx->fdmon_ops->update(ctx, node, new_node);
     qemu_lockcnt_unlock(&ctx->list_lock);
     aio_notify(ctx);
 
diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c
index XXXXXXX..XXXXXXX 100644
--- a/util/fdmon-epoll.c
+++ b/util/fdmon-epoll.c
@@ -XXX,XX +XXX,XX @@ static inline int epoll_events_from_pfd(int pfd_events)
            (pfd_events & G_IO_ERR ? EPOLLERR : 0);
 }
 
-static void fdmon_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
+static void fdmon_epoll_update(AioContext *ctx,
+                               AioHandler *old_node,
+                               AioHandler *new_node)
 {
-    struct epoll_event event;
+    struct epoll_event event = {
+        .data.ptr = new_node,
+        .events = new_node ? epoll_events_from_pfd(new_node->pfd.events) : 0,
+    };
     int r;
-    int ctl;
 
-    if (!node->pfd.events) {
-        ctl = EPOLL_CTL_DEL;
+    if (!new_node) {
+        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, old_node->pfd.fd, &event);
+    } else if (!old_node) {
+        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, new_node->pfd.fd, &event);
     } else {
-        event.data.ptr = node;
-        event.events = epoll_events_from_pfd(node->pfd.events);
-        ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
+        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, new_node->pfd.fd, &event);
     }
 
-    r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
     if (r) {
         fdmon_epoll_disable(ctx);
     }
diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c
index XXXXXXX..XXXXXXX 100644
--- a/util/fdmon-poll.c
+++ b/util/fdmon-poll.c
@@ -XXX,XX +XXX,XX @@ static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list,
     return ret;
 }
 
-static void fdmon_poll_update(AioContext *ctx, AioHandler *node, bool is_new)
+static void fdmon_poll_update(AioContext *ctx,
+                              AioHandler *old_node,
+                              AioHandler *new_node)
 {
     /* Do nothing, AioHandler already contains the state we'll need */
 }
-- 
2.24.1

The recent Linux io_uring API has several advantages over ppoll(2) and
epoll(7).  Details are given in the source code.

Add an io_uring implementation and make it the default on Linux.
Performance is the same as with epoll(7) but later patches add
optimizations that take advantage of io_uring.

It is necessary to change how aio_set_fd_handler() deals with deleting
AioHandlers since removing monitored file descriptors is asynchronous in
io_uring.  fdmon_io_uring_update() marks the AioHandler deleted and
aio_set_fd_handler() will let it handle deletion in that case.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-6-stefanha@redhat.com
Message-Id: <20200305170806.1313245-6-stefanha@redhat.com>
---
 configure             |   5 +
 include/block/aio.h   |   9 ++
 util/Makefile.objs    |   1 +
 util/aio-posix.c      |  20 ++-
 util/aio-posix.h      |  20 ++-
 util/fdmon-io_uring.c | 326 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 376 insertions(+), 5 deletions(-)
 create mode 100644 util/fdmon-io_uring.c

diff --git a/configure b/configure
index XXXXXXX..XXXXXXX 100755
--- a/configure
+++ b/configure
@@ -XXX,XX +XXX,XX @@ if test "$linux_io_uring" != "no" ; then
     linux_io_uring_cflags=$($pkg_config --cflags liburing)
     linux_io_uring_libs=$($pkg_config --libs liburing)
     linux_io_uring=yes
+
+    # io_uring is used in libqemuutil.a where per-file -libs variables are not
+    # seen by programs linking the archive.  It's not ideal, but just add the
+    # library dependency globally.
+    LIBS="$linux_io_uring_libs $LIBS"
   else
     if test "$linux_io_uring" = "yes" ; then
       feature_not_found "linux io_uring" "Install liburing devel"
diff --git a/include/block/aio.h b/include/block/aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@
 #ifndef QEMU_AIO_H
 #define QEMU_AIO_H
 
+#ifdef CONFIG_LINUX_IO_URING
+#include <liburing.h>
+#endif
 #include "qemu/queue.h"
 #include "qemu/event_notifier.h"
 #include "qemu/thread.h"
@@ -XXX,XX +XXX,XX @@ struct BHListSlice {
     QSIMPLEQ_ENTRY(BHListSlice) next;
 };
 
+typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
+
 struct AioContext {
     GSource source;
 
@@ -XXX,XX +XXX,XX @@ struct AioContext {
      * locking.
      */
     struct LuringState *linux_io_uring;
+
+    /* State for file descriptor monitoring using Linux io_uring */
+    struct io_uring fdmon_io_uring;
+    AioHandlerSList submit_list;
 #endif
 
     /* TimerLists for calling timers - one per clock type.  Has its own
diff --git a/util/Makefile.objs b/util/Makefile.objs
index XXXXXXX..XXXXXXX 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -XXX,XX +XXX,XX @@ util-obj-$(call lnot,$(CONFIG_ATOMIC64)) += atomic64.o
 util-obj-$(CONFIG_POSIX) += aio-posix.o
 util-obj-$(CONFIG_POSIX) += fdmon-poll.o
 util-obj-$(CONFIG_EPOLL_CREATE1) += fdmon-epoll.o
+util-obj-$(CONFIG_LINUX_IO_URING) += fdmon-io_uring.o
 util-obj-$(CONFIG_POSIX) += compatfd.o
 util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
 util-obj-$(CONFIG_POSIX) += mmap-alloc.o
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
         g_source_remove_poll(&ctx->source, &node->pfd);
     }
 
+    node->pfd.revents = 0;
+
+    /* If the fd monitor has already marked it deleted, leave it alone */
+    if (QLIST_IS_INSERTED(node, node_deleted)) {
+        return false;
+    }
+
     /* If a read is in progress, just mark the node as deleted */
     if (qemu_lockcnt_count(&ctx->list_lock)) {
         QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
-        node->pfd.revents = 0;
         return false;
     }
     /* Otherwise, delete it for real.  We can't just mark it as
@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
 
         QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node);
     }
-    if (node) {
-        deleted = aio_remove_fd_handler(ctx, node);
-    }
 
     /* No need to order poll_disable_cnt writes against other updates;
      * the counter is only used to avoid wasting time and latency on
@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
                atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
 
     ctx->fdmon_ops->update(ctx, node, new_node);
+    if (node) {
+        deleted = aio_remove_fd_handler(ctx, node);
+    }
     qemu_lockcnt_unlock(&ctx->list_lock);
     aio_notify(ctx);
 
@@ -XXX,XX +XXX,XX @@ void aio_context_setup(AioContext *ctx)
     ctx->fdmon_ops = &fdmon_poll_ops;
     ctx->epollfd = -1;
 
+    /* Use the fastest fd monitoring implementation if available */
+    if (fdmon_io_uring_setup(ctx)) {
+        return;
+    }
+
     fdmon_epoll_setup(ctx);
 }
 
 void aio_context_destroy(AioContext *ctx)
 {
+    fdmon_io_uring_destroy(ctx);
     fdmon_epoll_disable(ctx);
 }
 
diff --git a/util/aio-posix.h b/util/aio-posix.h
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.h
+++ b/util/aio-posix.h
@@ -XXX,XX +XXX,XX @@ struct AioHandler {
     IOHandler *io_poll_begin;
     IOHandler *io_poll_end;
     void *opaque;
-    bool is_external;
     QLIST_ENTRY(AioHandler) node;
     QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
     QLIST_ENTRY(AioHandler) node_deleted;
+#ifdef CONFIG_LINUX_IO_URING
+    QSLIST_ENTRY(AioHandler) node_submitted;
+    unsigned flags; /* see fdmon-io_uring.c */
+#endif
+    bool is_external;
 };
 
 /* Add a handler to a ready list */
@@ -XXX,XX +XXX,XX @@ static inline void fdmon_epoll_disable(AioContext *ctx)
 }
 #endif /* !CONFIG_EPOLL_CREATE1 */
 
+#ifdef CONFIG_LINUX_IO_URING
+/* Returns true if io_uring fd monitoring was set up for @ctx */
+bool fdmon_io_uring_setup(AioContext *ctx);
+void fdmon_io_uring_destroy(AioContext *ctx);
+#else
+/* Stub for builds without CONFIG_LINUX_IO_URING: setup always fails */
+static inline bool fdmon_io_uring_setup(AioContext *ctx)
+{
+    return false;
+}
+
+/* Stub for builds without CONFIG_LINUX_IO_URING: nothing to tear down */
+static inline void fdmon_io_uring_destroy(AioContext *ctx)
+{
+}
+#endif /* !CONFIG_LINUX_IO_URING */
+
 #endif /* AIO_POSIX_H */
diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/util/fdmon-io_uring.c
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Linux io_uring file descriptor monitoring
+ *
+ * The Linux io_uring API supports file descriptor monitoring with a few
+ * advantages over existing APIs like poll(2) and epoll(7):
+ *
+ * 1. Userspace polling of events is possible because the completion queue (cq
+ *    ring) is shared between the kernel and userspace.  This allows
+ *    applications that rely on userspace polling to also monitor file
+ *    descriptors in the same userspace polling loop.
+ *
+ * 2. Submission and completion is batched and done together in a single system
+ *    call.  This minimizes the number of system calls.
+ *
+ * 3. File descriptor monitoring is O(1) like epoll(7) so it scales better than
+ *    poll(2).
+ *
+ * 4. Nanosecond timeouts are supported so it requires fewer syscalls than
+ *    epoll(7).
+ *
+ * This code only monitors file descriptors and does not do asynchronous disk
+ * I/O.  Implementing disk I/O efficiently has other requirements and should
+ * use a separate io_uring so it does not make sense to unify the code.
+ *
+ * File descriptor monitoring is implemented using the following operations:
+ *
+ * 1. IORING_OP_POLL_ADD - adds a file descriptor to be monitored.
+ * 2. IORING_OP_POLL_REMOVE - removes a file descriptor being monitored.  When
+ *    the poll mask changes for a file descriptor it is first removed and then
+ *    re-added with the new poll mask, so this operation is also used as part
+ *    of modifying an existing monitored file descriptor.
+ * 3. IORING_OP_TIMEOUT - added every time a blocking syscall is made to wait
+ *    for events.  This operation self-cancels if another event completes
+ *    before the timeout.
+ *
+ * io_uring calls the submission queue the "sq ring" and the completion queue
+ * the "cq ring".  Ring entries are called "sqe" and "cqe", respectively.
+ *
+ * The code is structured so that sq/cq rings are only modified within
+ * fdmon_io_uring_wait().  Changes to AioHandlers are made by enqueuing them on
+ * ctx->submit_list so that fdmon_io_uring_wait() can submit IORING_OP_POLL_ADD
+ * and/or IORING_OP_POLL_REMOVE sqes for them.
+ */
+
+#include "qemu/osdep.h"
+#include <poll.h>
+#include "qemu/rcu_queue.h"
+#include "aio-posix.h"
+
+enum {
+    FDMON_IO_URING_ENTRIES  = 128, /* sq/cq ring size */
+
+    /* AioHandler::flags */
+    /*
+     * Set by enqueue() and cleared by dequeue().  Prevents a handler from
+     * being inserted on the submit list twice.
+     */
+    FDMON_IO_URING_PENDING  = (1 << 0),
+    /* fill_sq_ring() must submit IORING_OP_POLL_ADD for the handler */
+    FDMON_IO_URING_ADD      = (1 << 1),
+    /*
+     * fill_sq_ring() must submit IORING_OP_POLL_REMOVE for the handler.
+     * Not cleared by dequeue(); process_cqe() clears it and then queues the
+     * handler for deletion.
+     */
+    FDMON_IO_URING_REMOVE   = (1 << 2),
+};
+
+/*
+ * Translate G_IO_IN/OUT/HUP/ERR condition bits into the corresponding
+ * POLLIN/POLLOUT/POLLHUP/POLLERR bits.  Any other bits are dropped.
+ */
+static inline int poll_events_from_pfd(int pfd_events)
+{
+    return (pfd_events & G_IO_IN ? POLLIN : 0) |
+           (pfd_events & G_IO_OUT ? POLLOUT : 0) |
+           (pfd_events & G_IO_HUP ? POLLHUP : 0) |
+           (pfd_events & G_IO_ERR ? POLLERR : 0);
+}
+
+/*
+ * Inverse of poll_events_from_pfd(): translate POLLIN/POLLOUT/POLLHUP/POLLERR
+ * bits into the corresponding G_IO_* bits.  Any other bits are dropped.
+ */
+static inline int pfd_events_from_poll(int poll_events)
+{
+    return (poll_events & POLLIN ? G_IO_IN : 0) |
+           (poll_events & POLLOUT ? G_IO_OUT : 0) |
+           (poll_events & POLLHUP ? G_IO_HUP : 0) |
+           (poll_events & POLLERR ? G_IO_ERR : 0);
+}
+
+/*
+ * Returns an sqe for submitting a request.  Must only be called within
+ * fdmon_io_uring_wait().
+ *
+ * If the sq ring has no free entry, the pending sqes are submitted first to
+ * make room.  Never returns NULL: failure to obtain an sqe trips an assertion.
+ */
+static struct io_uring_sqe *get_sqe(AioContext *ctx)
+{
+    struct io_uring *ring = &ctx->fdmon_io_uring;
+    struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
+    int ret;
+
+    if (likely(sqe)) {
+        return sqe;
+    }
+
+    /* No free sqes left, submit pending sqes first */
+    ret = io_uring_submit(ring);
+    assert(ret > 1); /* any return value <= 1, including errors, aborts */
+    sqe = io_uring_get_sqe(ring);
+    assert(sqe);
+    return sqe;
+}
+
+/*
+ * Atomically enqueue an AioHandler for sq ring submission.
+ *
+ * @flags is OR-ed into node->flags together with FDMON_IO_URING_PENDING.  The
+ * node is inserted on @head only if it was not already pending, so enqueuing
+ * an already-pending node merely accumulates flags.
+ */
+static void enqueue(AioHandlerSList *head, AioHandler *node, unsigned flags)
+{
+    unsigned old_flags;
+
+    old_flags = atomic_fetch_or(&node->flags, FDMON_IO_URING_PENDING | flags);
+    if (!(old_flags & FDMON_IO_URING_PENDING)) {
+        QSLIST_INSERT_HEAD_ATOMIC(head, node, node_submitted);
+    }
+}
+
+/*
+ * Dequeue an AioHandler for sq ring submission.  Called by fill_sq_ring().
+ *
+ * Returns the first node on @head, or NULL if the list is empty (in which
+ * case *flags is not written).  On success *flags receives the node's flags
+ * as they were before FDMON_IO_URING_PENDING and FDMON_IO_URING_ADD were
+ * cleared.
+ */
+static AioHandler *dequeue(AioHandlerSList *head, unsigned *flags)
+{
+    AioHandler *node = QSLIST_FIRST(head);
+
+    if (!node) {
+        return NULL;
+    }
+
+    /* Doesn't need to be atomic since fill_sq_ring() moves the list */
+    QSLIST_REMOVE_HEAD(head, node_submitted);
+
+    /*
+     * Don't clear FDMON_IO_URING_REMOVE.  It's sticky so it can serve two
+     * purposes: telling fill_sq_ring() to submit IORING_OP_POLL_REMOVE and
+     * telling process_cqe() to delete the AioHandler when its
+     * IORING_OP_POLL_ADD completes.
+     */
+    *flags = atomic_fetch_and(&node->flags, ~(FDMON_IO_URING_PENDING |
+                                              FDMON_IO_URING_ADD));
+    return node;
+}
+
+/*
+ * FDMonOps->update() implementation.  Nothing is submitted to the ring here;
+ * the handlers are only queued on ctx->submit_list.
+ *
+ * @old_node: handler to stop monitoring, or NULL
+ * @new_node: handler to start monitoring, or NULL
+ */
+static void fdmon_io_uring_update(AioContext *ctx,
+                                  AioHandler *old_node,
+                                  AioHandler *new_node)
+{
+    if (new_node) {
+        enqueue(&ctx->submit_list, new_node, FDMON_IO_URING_ADD);
+    }
+
+    if (old_node) {
+        /*
+         * Deletion is tricky because IORING_OP_POLL_ADD and
+         * IORING_OP_POLL_REMOVE are async.  We need to wait for the original
+         * IORING_OP_POLL_ADD to complete before this handler can be freed
+         * safely.
+         *
+         * It's possible that the file descriptor becomes ready and the
+         * IORING_OP_POLL_ADD cqe is enqueued before IORING_OP_POLL_REMOVE is
+         * submitted, too.
+         *
+         * Mark this handler deleted right now but don't place it on
+         * ctx->deleted_aio_handlers yet.  Instead, manually fudge the list
+         * entry to make QLIST_IS_INSERTED() think this handler has been
+         * inserted and other code recognizes this AioHandler as deleted.
+         *
+         * Once the original IORING_OP_POLL_ADD completes we enqueue the
+         * handler on the real ctx->deleted_aio_handlers list to be freed.
+         */
+        assert(!QLIST_IS_INSERTED(old_node, node_deleted));
+        old_node->node_deleted.le_prev = &old_node->node_deleted.le_next;
+
+        enqueue(&ctx->submit_list, old_node, FDMON_IO_URING_REMOVE);
+    }
+}
+
+/*
+ * Prepare an IORING_OP_POLL_ADD sqe for @node's file descriptor.  The node
+ * pointer is stored as the sqe's user data so that process_cqe() can find
+ * the handler again when the request completes.
+ */
+static void add_poll_add_sqe(AioContext *ctx, AioHandler *node)
+{
+    struct io_uring_sqe *sqe = get_sqe(ctx);
+    int events = poll_events_from_pfd(node->pfd.events);
+
+    io_uring_prep_poll_add(sqe, node->pfd.fd, events);
+    io_uring_sqe_set_data(sqe, node);
+}
+
+/*
+ * Prepare an IORING_OP_POLL_REMOVE sqe for @node.  The node pointer is passed
+ * as the key of the request to remove; it matches the user data set by
+ * add_poll_add_sqe().
+ */
+static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node)
+{
+    struct io_uring_sqe *sqe = get_sqe(ctx);
+
+    io_uring_prep_poll_remove(sqe, node);
+}
+
+/*
+ * Add a timeout that self-cancels when another cqe becomes ready.
+ *
+ * @ns: timeout in nanoseconds
+ */
+static void add_timeout_sqe(AioContext *ctx, int64_t ns)
+{
+    struct io_uring_sqe *sqe;
+    struct __kernel_timespec ts = {
+        .tv_sec = ns / NANOSECONDS_PER_SECOND,
+        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
+    };
+
+    sqe = get_sqe(ctx);
+    /*
+     * NOTE(review): the sqe is given the address of the local ts but is only
+     * submitted later by fdmon_io_uring_wait() -- confirm the timespec does
+     * not need to outlive this function.
+     */
+    io_uring_prep_timeout(sqe, &ts, 1, 0);
+}
+
+/*
+ * Add sqes from ctx->submit_list for submission.
+ *
+ * The whole list is first moved atomically onto a local list, which is why
+ * dequeue() can get away with non-atomic list operations.
+ */
+static void fill_sq_ring(AioContext *ctx)
+{
+    AioHandlerSList submit_list;
+    AioHandler *node;
+    unsigned flags;
+
+    QSLIST_MOVE_ATOMIC(&submit_list, &ctx->submit_list);
+
+    while ((node = dequeue(&submit_list, &flags))) {
+        /* Order matters, just in case both flags were set */
+        if (flags & FDMON_IO_URING_ADD) {
+            add_poll_add_sqe(ctx, node);
+        }
+        if (flags & FDMON_IO_URING_REMOVE) {
+            add_poll_remove_sqe(ctx, node);
+        }
+    }
+}
+
+/*
+ * Handle one completion.
+ *
+ * Completions without user data are ignored.  A handler flagged
+ * FDMON_IO_URING_REMOVE is put on ctx->deleted_aio_handlers instead of being
+ * reported.  Otherwise the handler is added to @ready_list with the events
+ * taken from cqe->res and its poll request is re-armed.
+ *
+ * Returns true if a handler became ready.
+ */
+static bool process_cqe(AioContext *ctx,
+                        AioHandlerList *ready_list,
+                        struct io_uring_cqe *cqe)
+{
+    AioHandler *node = io_uring_cqe_get_data(cqe);
+    unsigned flags;
+
+    /* poll_timeout and poll_remove have a zero user_data field */
+    if (!node) {
+        return false;
+    }
+
+    /*
+     * Deletion can only happen when IORING_OP_POLL_ADD completes.  If we race
+     * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE
+     * bit before IORING_OP_POLL_REMOVE is submitted.
+     */
+    flags = atomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE);
+    if (flags & FDMON_IO_URING_REMOVE) {
+        QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
+        return false;
+    }
+
+    aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res));
+
+    /* IORING_OP_POLL_ADD is one-shot so we must re-arm it */
+    add_poll_add_sqe(ctx, node);
+    return true;
+}
+
+/*
+ * Process every cqe currently in the cq ring, then mark them all consumed.
+ *
+ * Returns the number of handlers that were added to @ready_list.
+ */
+static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
+{
+    struct io_uring *ring = &ctx->fdmon_io_uring;
+    struct io_uring_cqe *cqe;
+    unsigned num_cqes = 0;
+    unsigned num_ready = 0;
+    unsigned head;
+
+    io_uring_for_each_cqe(ring, head, cqe) {
+        if (process_cqe(ctx, ready_list, cqe)) {
+            num_ready++;
+        }
+
+        num_cqes++;
+    }
+
+    /* Release all cqes seen above in one go */
+    io_uring_cq_advance(ring, num_cqes);
+    return num_ready;
+}
+
+/*
+ * FDMonOps->wait() implementation.
+ *
+ * @ready_list: handlers that became ready are added here
+ * @timeout: 0 to return without blocking, > 0 for a timeout in nanoseconds,
+ *           negative to block until at least one cqe is ready
+ *
+ * Returns: number of ready handlers.
+ */
+static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
+                               int64_t timeout)
+{
+    unsigned wait_nr = 1; /* block until at least one cqe is ready */
+    int ret;
+
+    /* Fall back while external clients are disabled */
+    if (atomic_read(&ctx->external_disable_cnt)) {
+        return fdmon_poll_ops.wait(ctx, ready_list, timeout);
+    }
+
+    if (timeout == 0) {
+        wait_nr = 0; /* non-blocking */
+    } else if (timeout > 0) {
+        add_timeout_sqe(ctx, timeout);
+    }
+
+    fill_sq_ring(ctx);
+
+    /*
+     * A signal can interrupt the blocking wait.  That is not a failure, so
+     * retry instead of tripping the assertion below.
+     */
+    do {
+        ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
+    } while (ret == -EINTR);
+
+    assert(ret >= 0);
+
+    return process_cq_ring(ctx, ready_list);
+}
+
+static const FDMonOps fdmon_io_uring_ops = {
+    .update = fdmon_io_uring_update,
+    .wait = fdmon_io_uring_wait,
+};
+
+bool fdmon_io_uring_setup(AioContext *ctx)
+{
+    int ret;
+
+    /* On failure return false and leave ctx->fdmon_ops untouched */
+    ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
+    if (ret != 0) {
+        return false;
+    }
+
+    /* Ring is ready: start with an empty submit list and switch over */
+    QSLIST_INIT(&ctx->submit_list);
+    ctx->fdmon_ops = &fdmon_io_uring_ops;
+    return true;
+}
+
+/*
+ * Tear down the io_uring fd monitor if it is the one in use and switch
+ * ctx->fdmon_ops back to fdmon_poll_ops.  Handlers still waiting on
+ * ctx->submit_list are unlinked and freed.  Does nothing when another fd
+ * monitoring implementation is active.
+ */
+void fdmon_io_uring_destroy(AioContext *ctx)
+{
+    if (ctx->fdmon_ops == &fdmon_io_uring_ops) {
+        AioHandler *node;
+
+        io_uring_queue_exit(&ctx->fdmon_io_uring);
+
+        /* No need to submit these anymore, just free them. */
+        while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) {
+            QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted);
+            QLIST_REMOVE(node, node);
+            g_free(node);
+        }
+
+        ctx->fdmon_ops = &fdmon_poll_ops;
+    }
+}
-- 
2.24.1

Unlike ppoll(2) and epoll(7), Linux io_uring completions can be polled
from userspace.  Previously userspace polling was only allowed when all
AioHandlers had an ->io_poll() callback.  This prevented starvation of
fds by userspace pollable handlers.

Add the FDMonOps->need_wait() callback that enables userspace polling
even when some AioHandlers lack ->io_poll().

For example, it's now possible to do userspace polling when a TCP/IP
socket is monitored thanks to Linux io_uring.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-7-stefanha@redhat.com
Message-Id: <20200305170806.1313245-7-stefanha@redhat.com>
---
 include/block/aio.h   | 19 +++++++++++++++++++
 util/aio-posix.c      | 11 ++++++++---
 util/fdmon-epoll.c    |  1 +
 util/fdmon-io_uring.c |  6 ++++++
 util/fdmon-poll.c     |  1 +
 5 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/include/block/aio.h b/include/block/aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ struct ThreadPool;
 struct LinuxAioState;
 struct LuringState;
 
+/* Is polling disabled? */
+bool aio_poll_disabled(AioContext *ctx);
+
 /* Callbacks for file descriptor monitoring implementations */
 typedef struct {
     /*
@@ -XXX,XX +XXX,XX @@ typedef struct {
      * Returns: number of ready file descriptors.
      */
     int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
+
+    /*
+     * need_wait:
+     * @ctx: the AioContext
+     *
+     * Tell aio_poll() when to stop userspace polling early because ->wait()
+     * has fds ready.
+     *
+     * File descriptor monitoring implementations that cannot poll fd readiness
+     * from userspace should use aio_poll_disabled() here.  This ensures that
+     * file descriptors are not starved by handlers that frequently make
+     * progress via userspace polling.
+     *
+     * Returns: true if ->wait() should be called, false otherwise.
+     */
+    bool (*need_wait)(AioContext *ctx);
 } FDMonOps;
 
 /*
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@
 #include "trace.h"
 #include "aio-posix.h"
 
+bool aio_poll_disabled(AioContext *ctx)
+{
+    return atomic_read(&ctx->poll_disable_cnt);
+}
+
 void aio_add_ready_handler(AioHandlerList *ready_list,
                            AioHandler *node,
                            int revents)
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
         elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
         max_ns = qemu_soonest_timeout(*timeout, max_ns);
         assert(!(max_ns && progress));
-    } while (elapsed_time < max_ns && !atomic_read(&ctx->poll_disable_cnt));
+    } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
 
     /* If time has passed with no successful polling, adjust *timeout to
      * keep the same ending time.
@@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
 {
     int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
 
-    if (max_ns && !atomic_read(&ctx->poll_disable_cnt)) {
+    if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
         poll_set_started(ctx, true);
 
         if (run_poll_handlers(ctx, max_ns, timeout)) {
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
     /* If polling is allowed, non-blocking aio_poll does not need the
      * system call---a single round of run_poll_handlers_once suffices.
      */
-    if (timeout || atomic_read(&ctx->poll_disable_cnt)) {
+    if (timeout || ctx->fdmon_ops->need_wait(ctx)) {
         ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
     }
 
diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c
index XXXXXXX..XXXXXXX 100644
--- a/util/fdmon-epoll.c
+++ b/util/fdmon-epoll.c
@@ -XXX,XX +XXX,XX @@ out:
 static const FDMonOps fdmon_epoll_ops = {
     .update = fdmon_epoll_update,
     .wait = fdmon_epoll_wait,
+    .need_wait = aio_poll_disabled,
 };
 
 static bool fdmon_epoll_try_enable(AioContext *ctx)
diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
index XXXXXXX..XXXXXXX 100644
--- a/util/fdmon-io_uring.c
+++ b/util/fdmon-io_uring.c
@@ -XXX,XX +XXX,XX @@ static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
     return process_cq_ring(ctx, ready_list);
 }
 
+/*
+ * FDMonOps->need_wait() implementation: ->wait() should be called as soon as
+ * the cq ring holds completions.
+ */
+static bool fdmon_io_uring_need_wait(AioContext *ctx)
+{
+    return io_uring_cq_ready(&ctx->fdmon_io_uring);
+}
+
 static const FDMonOps fdmon_io_uring_ops = {
     .update = fdmon_io_uring_update,
     .wait = fdmon_io_uring_wait,
+    .need_wait = fdmon_io_uring_need_wait,
 };
 
 bool fdmon_io_uring_setup(AioContext *ctx)
diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c
index XXXXXXX..XXXXXXX 100644
--- a/util/fdmon-poll.c
+++ b/util/fdmon-poll.c
@@ -XXX,XX +XXX,XX @@ static void fdmon_poll_update(AioContext *ctx,
 const FDMonOps fdmon_poll_ops = {
     .update = fdmon_poll_update,
     .wait = fdmon_poll_wait,
+    .need_wait = aio_poll_disabled,
 };
-- 
2.24.1

When there are many poll handlers it's likely that some of them are idle
most of the time.  Remove handlers that haven't had activity recently so
that the polling loop scales better for guests with a large number of
devices.

This feature only takes effect for the Linux io_uring fd monitoring
implementation because it is capable of combining fd monitoring with
userspace polling.  The other implementations can't do that and risk
starving fds in favor of poll handlers, so don't try this optimization
when they are in use.

IOPS improves from 10k to 105k when the guest has 100
virtio-blk-pci,num-queues=32 devices and 1 virtio-blk-pci,num-queues=1
device for rw=randread,iodepth=1,bs=4k,ioengine=libaio on NVMe.

[Clarified aio_poll_handlers locking discipline explanation in comment
after discussion with Paolo Bonzini <pbonzini@redhat.com>.
--Stefan]

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-8-stefanha@redhat.com
Message-Id: <20200305170806.1313245-8-stefanha@redhat.com>
---
 include/block/aio.h |  8 ++++
 util/aio-posix.c    | 93 +++++++++++++++++++++++++++++++++++++++++----
 util/aio-posix.h    |  2 +
 util/trace-events   |  2 +
 4 files changed, 98 insertions(+), 7 deletions(-)

diff --git a/include/block/aio.h b/include/block/aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ struct AioContext {
     int64_t poll_grow;      /* polling time growth factor */
     int64_t poll_shrink;    /* polling time shrink factor */
 
+    /*
+     * List of handlers participating in userspace polling.  Protected by
+     * ctx->list_lock.  Iterated and modified mostly by the event loop thread
+     * from aio_poll() with ctx->list_lock incremented.  aio_set_fd_handler()
+     * only touches the list to delete nodes if ctx->list_lock's count is zero.
+     */
+    AioHandlerList poll_aio_handlers;
+
     /* Are we in polling mode or monitoring file descriptors? */
     bool poll_started;
 
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@
 #include "trace.h"
 #include "aio-posix.h"
 
+/* Stop userspace polling on a handler if it isn't active for some time */
+#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)
+
 bool aio_poll_disabled(AioContext *ctx)
 {
     return atomic_read(&ctx->poll_disable_cnt);
@@ -XXX,XX +XXX,XX @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
      * deleted because deleted nodes are only cleaned up while
      * no one is walking the handlers list.
      */
+    QLIST_SAFE_REMOVE(node, node_poll);
     QLIST_REMOVE(node, node);
     return true;
 }
@@ -XXX,XX +XXX,XX @@ static bool poll_set_started(AioContext *ctx, bool started)
     ctx->poll_started = started;
 
     qemu_lockcnt_inc(&ctx->list_lock);
-    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+    QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
         IOHandler *fn;
 
         if (QLIST_IS_INSERTED(node, node_deleted)) {
@@ -XXX,XX +XXX,XX @@ static void aio_free_deleted_handlers(AioContext *ctx)
     while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
         QLIST_REMOVE(node, node);
         QLIST_REMOVE(node, node_deleted);
+        QLIST_SAFE_REMOVE(node, node_poll);
         g_free(node);
     }
 
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
     revents = node->pfd.revents & node->pfd.events;
     node->pfd.revents = 0;
 
+    /*
+     * Start polling AioHandlers when they become ready because activity is
+     * likely to continue.  Note that starvation is theoretically possible when
+     * fdmon_supports_polling(), but only until the fd fires for the first
+     * time.
+     */
+    if (!QLIST_IS_INSERTED(node, node_deleted) &&
+        !QLIST_IS_INSERTED(node, node_poll) &&
+        node->io_poll) {
+        trace_poll_add(ctx, node, node->pfd.fd, revents);
+        if (ctx->poll_started && node->io_poll_begin) {
+            node->io_poll_begin(node->opaque);
+        }
+        QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll);
+    }
+
     if (!QLIST_IS_INSERTED(node, node_deleted) &&
         (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
         aio_node_check(ctx, node->is_external) &&
@@ -XXX,XX +XXX,XX @@ void aio_dispatch(AioContext *ctx)
     timerlistgroup_run_timers(&ctx->tlg);
 }
 
-static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
+static bool run_poll_handlers_once(AioContext *ctx,
+                                   int64_t now,
+                                   int64_t *timeout)
 {
     bool progress = false;
     AioHandler *node;
+    AioHandler *tmp;
 
-    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
-        if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
-            aio_node_check(ctx, node->is_external) &&
+    QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
+        if (aio_node_check(ctx, node->is_external) &&
             node->io_poll(node->opaque)) {
+            node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
+
             /*
              * Polling was successful, exit try_poll_mode immediately
              * to adjust the next polling time.
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
     return progress;
 }
 
+/*
+ * Returns true if the fd monitoring implementation in use provides its own
+ * ->need_wait() rather than the aio_poll_disabled() fallback.
+ */
+static bool fdmon_supports_polling(AioContext *ctx)
+{
+    return ctx->fdmon_ops->need_wait != aio_poll_disabled;
+}
+
+/*
+ * Drop handlers whose idle timeout has expired from ctx->poll_aio_handlers.
+ * A handler's timeout is armed the first time it is seen here and pushed back
+ * by run_poll_handlers_once() whenever its ->io_poll() makes progress.
+ *
+ * @now: current time in nanoseconds, compared against poll_idle_timeout
+ *
+ * Returns true if the final poll of a removed handler made progress.
+ */
+static bool remove_idle_poll_handlers(AioContext *ctx, int64_t now)
+{
+    AioHandler *node;
+    AioHandler *tmp;
+    bool progress = false;
+
+    /*
+     * File descriptor monitoring implementations without userspace polling
+     * support suffer from starvation when a subset of handlers is polled
+     * because fds will not be processed in a timely fashion.  Don't remove
+     * idle poll handlers.
+     */
+    if (!fdmon_supports_polling(ctx)) {
+        return false;
+    }
+
+    QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
+        if (node->poll_idle_timeout == 0LL) {
+            /* Zero means "not armed yet": start the idle countdown */
+            node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
+        } else if (now >= node->poll_idle_timeout) {
+            trace_poll_remove(ctx, node, node->pfd.fd);
+            node->poll_idle_timeout = 0LL;
+            QLIST_SAFE_REMOVE(node, node_poll);
+            if (ctx->poll_started && node->io_poll_end) {
+                node->io_poll_end(node->opaque);
+
+                /*
+                 * Final poll in case ->io_poll_end() races with an event.
+                 * Never mind about re-adding the handler in the rare case
+                 * where this causes progress.
+                 */
+                progress = node->io_poll(node->opaque) || progress;
+            }
+        }
+    }
+
+    return progress;
+}
+
 /* run_poll_handlers:
  * @ctx: the AioContext
  * @max_ns: maximum time to poll for, in nanoseconds
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
 
     start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     do {
-        progress = run_poll_handlers_once(ctx, timeout);
+        progress = run_poll_handlers_once(ctx, start_time, timeout);
         elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
         max_ns = qemu_soonest_timeout(*timeout, max_ns);
         assert(!(max_ns && progress));
     } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
 
+    if (remove_idle_poll_handlers(ctx, start_time + elapsed_time)) {
+        *timeout = 0;
+        progress = true;
+    }
+
     /* If time has passed with no successful polling, adjust *timeout to
      * keep the same ending time.
      */
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
  */
 static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
 {
-    int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
+    int64_t max_ns;
+
+    if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
+        return false;
+    }
 
+    max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
     if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
         poll_set_started(ctx, true);
 
diff --git a/util/aio-posix.h b/util/aio-posix.h
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.h
+++ b/util/aio-posix.h
@@ -XXX,XX +XXX,XX @@ struct AioHandler {
     QLIST_ENTRY(AioHandler) node;
     QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
     QLIST_ENTRY(AioHandler) node_deleted;
+    QLIST_ENTRY(AioHandler) node_poll;
 #ifdef CONFIG_LINUX_IO_URING
     QSLIST_ENTRY(AioHandler) node_submitted;
     unsigned flags; /* see fdmon-io_uring.c */
 #endif
+    int64_t poll_idle_timeout; /* when to stop userspace polling */
     bool is_external;
 };
 
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ run_poll_handlers_begin(void *ctx, int64_t max_ns, int64_t timeout) "ctx %p max_
 run_poll_handlers_end(void *ctx, bool progress, int64_t timeout) "ctx %p progress %d new timeout %"PRId64
 poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
 poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
+poll_add(void *ctx, void *node, int fd, unsigned revents) "ctx %p node %p fd %d revents 0x%x"
+poll_remove(void *ctx, void *node, int fd) "ctx %p node %p fd %d"
 
 # async.c
 aio_co_schedule(void *ctx, void *co) "ctx %p co %p"
-- 
2.24.1