1 | The following changes since commit 0b6206b9c6825619cd721085fe082d7a0abc9af4: | 1 | The following changes since commit 56f9e46b841c7be478ca038d8d4085d776ab4b0d: |
---|---|---|---|
2 | 2 | ||
3 | Merge remote-tracking branch 'remotes/rth-gitlab/tags/pull-tcg-20210914-4' into staging (2021-09-15 13:27:49 +0100) | 3 | Merge remote-tracking branch 'remotes/armbru/tags/pull-qapi-2017-02-20' into staging (2017-02-20 17:42:47 +0000) |
4 | 4 | ||
5 | are available in the Git repository at: | 5 | are available in the git repository at: |
6 | 6 | ||
7 | https://github.com/XanClic/qemu.git tags/pull-block-2021-09-15 | 7 | git://github.com/stefanha/qemu.git tags/block-pull-request |
8 | 8 | ||
9 | for you to fetch changes up to 1899bf47375ad40555dcdff12ba49b4b8b82df38: | 9 | for you to fetch changes up to a7b91d35bab97a2d3e779d0c64c9b837b52a6cf7: |
10 | 10 | ||
11 | qemu-img: Add -F shorthand to convert (2021-09-15 18:42:38 +0200) | 11 | coroutine-lock: make CoRwlock thread-safe and fair (2017-02-21 11:39:40 +0000) |
12 | 12 | ||
13 | ---------------------------------------------------------------- | 13 | ---------------------------------------------------------------- |
14 | Block patches: | 14 | Pull request |
15 | - Block-status cache for data regions | 15 | |
16 | - qcow2 optimization (when using subclusters) | 16 | v2: |
17 | - iotests delinting, and let 297 (lint checker) cover named iotests | 17 | * Rebased to resolve scsi conflicts |
18 | - qcow2 check improvements | ||
19 | - Added -F (target backing file format) option to qemu-img convert | ||
20 | - Mirror job fix | ||
21 | - Fix for when a migration is initiated while a backup job runs | ||
22 | - Fix for uncached qemu-img convert to a volume with 4k sectors (for an | ||
23 | unaligned image) | ||
24 | - Minor gluster driver fix | ||
25 | 18 | ||
26 | ---------------------------------------------------------------- | 19 | ---------------------------------------------------------------- |
27 | Eric Blake (1): | ||
28 | qemu-img: Add -F shorthand to convert | ||
29 | 20 | ||
30 | Hanna Reitz (15): | 21 | Paolo Bonzini (24): |
31 | gluster: Align block-status tail | 22 | block: move AioContext, QEMUTimer, main-loop to libqemuutil |
32 | block: Drop BDS comment regarding bdrv_append() | 23 | aio: introduce aio_co_schedule and aio_co_wake |
33 | block: block-status cache for data regions | 24 | block-backend: allow blk_prw from coroutine context |
34 | block: Clarify that @bytes is no limit on *pnum | 25 | test-thread-pool: use generic AioContext infrastructure |
35 | block/file-posix: Do not force-cap *pnum | 26 | io: add methods to set I/O handlers on AioContext |
36 | block/gluster: Do not force-cap *pnum | 27 | io: make qio_channel_yield aware of AioContexts |
37 | block/iscsi: Do not force-cap *pnum | 28 | nbd: convert to use qio_channel_yield |
38 | iotests: Fix unspecified-encoding pylint warnings | 29 | coroutine-lock: reschedule coroutine on the AioContext it was running |
39 | iotests: Fix use-{list,dict}-literal warnings | 30 | on |
40 | iotests/297: Drop 169 and 199 from the skip list | 31 | blkdebug: reschedule coroutine on the AioContext it is running on |
41 | migrate-bitmaps-postcopy-test: Fix pylint warnings | 32 | qed: introduce qed_aio_start_io and qed_aio_next_io_cb |
42 | migrate-bitmaps-test: Fix pylint warnings | 33 | aio: push aio_context_acquire/release down to dispatching |
43 | mirror-top-perms: Fix AbnormalShutdown path | 34 | block: explicitly acquire aiocontext in timers that need it |
44 | iotests/297: Cover tests/ | 35 | block: explicitly acquire aiocontext in callbacks that need it |
45 | qemu-img: Allow target be aligned to sector size | 36 | block: explicitly acquire aiocontext in bottom halves that need it |
37 | block: explicitly acquire aiocontext in aio callbacks that need it | ||
38 | aio-posix: partially inline aio_dispatch into aio_poll | ||
39 | async: remove unnecessary inc/dec pairs | ||
40 | block: document fields protected by AioContext lock | ||
41 | coroutine-lock: make CoMutex thread-safe | ||
42 | coroutine-lock: add limited spinning to CoMutex | ||
43 | test-aio-multithread: add performance comparison with thread-based | ||
44 | mutexes | ||
45 | coroutine-lock: place CoMutex before CoQueue in header | ||
46 | coroutine-lock: add mutex argument to CoQueue APIs | ||
47 | coroutine-lock: make CoRwlock thread-safe and fair | ||
46 | 48 | ||
47 | Stefano Garzarella (1): | 49 | Makefile.objs | 4 - |
48 | block/mirror: fix NULL pointer dereference in | 50 | stubs/Makefile.objs | 1 + |
49 | mirror_wait_on_conflicts() | 51 | tests/Makefile.include | 19 +- |
50 | 52 | util/Makefile.objs | 6 +- | |
51 | Vladimir Sementsov-Ogievskiy (15): | 53 | block/nbd-client.h | 2 +- |
52 | tests: add migrate-during-backup | 54 | block/qed.h | 3 + |
53 | block: bdrv_inactivate_recurse(): check for permissions and fix crash | 55 | include/block/aio.h | 38 ++- |
54 | simplebench: add img_bench_templater.py | 56 | include/block/block_int.h | 64 +++-- |
55 | qcow2: refactor handle_dependencies() loop body | 57 | include/io/channel.h | 72 +++++- |
56 | qcow2: handle_dependencies(): relax conflict detection | 58 | include/qemu/coroutine.h | 84 ++++--- |
57 | qcow2-refcount: improve style of check_refcounts_l2() | 59 | include/qemu/coroutine_int.h | 11 +- |
58 | qcow2: compressed read: simplify cluster descriptor passing | 60 | include/sysemu/block-backend.h | 14 +- |
59 | qcow2: introduce qcow2_parse_compressed_l2_entry() helper | 61 | tests/iothread.h | 25 ++ |
60 | qcow2-refcount: introduce fix_l2_entry_by_zero() | 62 | block/backup.c | 2 +- |
61 | qcow2-refcount: fix_l2_entry_by_zero(): also zero L2 entry bitmap | 63 | block/blkdebug.c | 9 +- |
62 | qcow2-refcount: check_refcounts_l2(): check l2_bitmap | 64 | block/blkreplay.c | 2 +- |
63 | qcow2-refcount: check_refcounts_l2(): check reserved bits | 65 | block/block-backend.c | 13 +- |
64 | qcow2-refcount: improve style of check_refcounts_l1() | 66 | block/curl.c | 44 +++- |
65 | qcow2-refcount: check_refcounts_l1(): check reserved bits | 67 | block/gluster.c | 9 +- |
66 | qcow2-refcount: check_refblocks(): add separate message for reserved | 68 | block/io.c | 42 +--- |
67 | 69 | block/iscsi.c | 15 +- | |
68 | docs/tools/qemu-img.rst | 4 +- | 70 | block/linux-aio.c | 10 +- |
69 | block/qcow2.h | 7 +- | 71 | block/mirror.c | 12 +- |
70 | include/block/block_int.h | 61 +++- | 72 | block/nbd-client.c | 119 +++++---- |
71 | block.c | 88 +++++ | 73 | block/nfs.c | 9 +- |
72 | block/file-posix.c | 7 +- | 74 | block/qcow2-cluster.c | 4 +- |
73 | block/gluster.c | 23 +- | 75 | block/qed-cluster.c | 2 + |
74 | block/io.c | 68 +++- | 76 | block/qed-table.c | 12 +- |
75 | block/iscsi.c | 3 - | 77 | block/qed.c | 58 +++-- |
76 | block/mirror.c | 25 +- | 78 | block/sheepdog.c | 31 +-- |
77 | block/qcow2-cluster.c | 78 +++-- | 79 | block/ssh.c | 29 +-- |
78 | block/qcow2-refcount.c | 326 ++++++++++++------ | 80 | block/throttle-groups.c | 4 +- |
79 | block/qcow2.c | 13 +- | 81 | block/win32-aio.c | 9 +- |
80 | qemu-img.c | 18 +- | 82 | dma-helpers.c | 2 + |
81 | qemu-img-cmds.hx | 2 +- | 83 | hw/9pfs/9p.c | 2 +- |
82 | scripts/simplebench/img_bench_templater.py | 95 +++++ | 84 | hw/block/virtio-blk.c | 19 +- |
83 | scripts/simplebench/table_templater.py | 62 ++++ | 85 | hw/scsi/scsi-bus.c | 2 + |
84 | tests/qemu-iotests/122 | 2 +- | 86 | hw/scsi/scsi-disk.c | 15 ++ |
85 | tests/qemu-iotests/271 | 5 +- | 87 | hw/scsi/scsi-generic.c | 20 +- |
86 | tests/qemu-iotests/271.out | 4 +- | 88 | hw/scsi/virtio-scsi.c | 7 + |
87 | tests/qemu-iotests/297 | 9 +- | 89 | io/channel-command.c | 13 + |
88 | tests/qemu-iotests/iotests.py | 12 +- | 90 | io/channel-file.c | 11 + |
89 | .../tests/migrate-bitmaps-postcopy-test | 13 +- | 91 | io/channel-socket.c | 16 +- |
90 | tests/qemu-iotests/tests/migrate-bitmaps-test | 43 ++- | 92 | io/channel-tls.c | 12 + |
91 | .../qemu-iotests/tests/migrate-during-backup | 97 ++++++ | 93 | io/channel-watch.c | 6 + |
92 | .../tests/migrate-during-backup.out | 5 + | 94 | io/channel.c | 97 ++++++-- |
93 | tests/qemu-iotests/tests/mirror-top-perms | 2 +- | 95 | nbd/client.c | 2 +- |
94 | 26 files changed, 855 insertions(+), 217 deletions(-) | 96 | nbd/common.c | 9 +- |
95 | create mode 100755 scripts/simplebench/img_bench_templater.py | 97 | nbd/server.c | 94 +++----- |
96 | create mode 100644 scripts/simplebench/table_templater.py | 98 | stubs/linux-aio.c | 32 +++ |
97 | create mode 100755 tests/qemu-iotests/tests/migrate-during-backup | 99 | stubs/set-fd-handler.c | 11 - |
98 | create mode 100644 tests/qemu-iotests/tests/migrate-during-backup.out | 100 | tests/iothread.c | 91 +++++++ |
101 | tests/test-aio-multithread.c | 463 ++++++++++++++++++++++++++++++++++++ | ||
102 | tests/test-thread-pool.c | 12 +- | ||
103 | aio-posix.c => util/aio-posix.c | 62 ++--- | ||
104 | aio-win32.c => util/aio-win32.c | 30 +-- | ||
105 | util/aiocb.c | 55 +++++ | ||
106 | async.c => util/async.c | 84 ++++++- | ||
107 | iohandler.c => util/iohandler.c | 0 | ||
108 | main-loop.c => util/main-loop.c | 0 | ||
109 | util/qemu-coroutine-lock.c | 254 ++++++++++++++++++-- | ||
110 | util/qemu-coroutine-sleep.c | 2 +- | ||
111 | util/qemu-coroutine.c | 8 + | ||
112 | qemu-timer.c => util/qemu-timer.c | 0 | ||
113 | thread-pool.c => util/thread-pool.c | 8 +- | ||
114 | trace-events | 11 - | ||
115 | util/trace-events | 17 +- | ||
116 | 67 files changed, 1712 insertions(+), 533 deletions(-) | ||
117 | create mode 100644 tests/iothread.h | ||
118 | create mode 100644 stubs/linux-aio.c | ||
119 | create mode 100644 tests/iothread.c | ||
120 | create mode 100644 tests/test-aio-multithread.c | ||
121 | rename aio-posix.c => util/aio-posix.c (94%) | ||
122 | rename aio-win32.c => util/aio-win32.c (95%) | ||
123 | create mode 100644 util/aiocb.c | ||
124 | rename async.c => util/async.c (82%) | ||
125 | rename iohandler.c => util/iohandler.c (100%) | ||
126 | rename main-loop.c => util/main-loop.c (100%) | ||
127 | rename qemu-timer.c => util/qemu-timer.c (100%) | ||
128 | rename thread-pool.c => util/thread-pool.c (97%) | ||
99 | 129 | ||
100 | -- | 130 | -- |
101 | 2.31.1 | 131 | 2.9.3 |
102 | 132 | ||
103 | 133 | diff view generated by jsdifflib |
1 | From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Add simple grammar-parsing template benchmark. New tool consume test | 3 | AioContext is fairly self contained, the only dependency is QEMUTimer but |
4 | template written in bash with some special grammar injections and | 4 | that in turn doesn't need anything else. So move them out of block-obj-y |
5 | produces multiple tests, run them and finally print a performance | 5 | to avoid introducing a dependency from io/ to block-obj-y. |
6 | comparison table of different tests produced from one template. | 6 | |
7 | 7 | main-loop and its dependency iohandler also need to be moved, because | |
8 | Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 8 | later in this series io/ will call iohandler_get_aio_context. |
9 | Message-Id: <20210824101517.59802-2-vsementsov@virtuozzo.com> | 9 | |
10 | Reviewed-by: Hanna Reitz <hreitz@redhat.com> | 10 | [Changed copyright "the QEMU team" to "other QEMU contributors" as |
11 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | 11 | suggested by Daniel Berrange and agreed by Paolo. |
12 | --Stefan] | ||
13 | |||
14 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | ||
15 | Reviewed-by: Fam Zheng <famz@redhat.com> | ||
16 | Message-id: 20170213135235.12274-2-pbonzini@redhat.com | ||
17 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
12 | --- | 18 | --- |
13 | scripts/simplebench/img_bench_templater.py | 95 ++++++++++++++++++++++ | 19 | Makefile.objs | 4 --- |
14 | scripts/simplebench/table_templater.py | 62 ++++++++++++++ | 20 | stubs/Makefile.objs | 1 + |
15 | 2 files changed, 157 insertions(+) | 21 | tests/Makefile.include | 11 ++++---- |
16 | create mode 100755 scripts/simplebench/img_bench_templater.py | 22 | util/Makefile.objs | 6 +++- |
17 | create mode 100644 scripts/simplebench/table_templater.py | 23 | block/io.c | 29 ------------------- |
18 | 24 | stubs/linux-aio.c | 32 +++++++++++++++++++++ | |
19 | diff --git a/scripts/simplebench/img_bench_templater.py b/scripts/simplebench/img_bench_templater.py | 25 | stubs/set-fd-handler.c | 11 -------- |
20 | new file mode 100755 | 26 | aio-posix.c => util/aio-posix.c | 2 +- |
21 | index XXXXXXX..XXXXXXX | 27 | aio-win32.c => util/aio-win32.c | 0 |
22 | --- /dev/null | 28 | util/aiocb.c | 55 +++++++++++++++++++++++++++++++++++++ |
23 | +++ b/scripts/simplebench/img_bench_templater.py | 29 | async.c => util/async.c | 3 +- |
24 | @@ -XXX,XX +XXX,XX @@ | 30 | iohandler.c => util/iohandler.c | 0 |
25 | +#!/usr/bin/env python3 | 31 | main-loop.c => util/main-loop.c | 0 |
26 | +# | 32 | qemu-timer.c => util/qemu-timer.c | 0 |
27 | +# Process img-bench test templates | 33 | thread-pool.c => util/thread-pool.c | 2 +- |
28 | +# | 34 | trace-events | 11 -------- |
29 | +# Copyright (c) 2021 Virtuozzo International GmbH. | 35 | util/trace-events | 11 ++++++++ |
30 | +# | 36 | 17 files changed, 114 insertions(+), 64 deletions(-) |
31 | +# This program is free software; you can redistribute it and/or modify | 37 | create mode 100644 stubs/linux-aio.c |
32 | +# it under the terms of the GNU General Public License as published by | 38 | rename aio-posix.c => util/aio-posix.c (99%) |
33 | +# the Free Software Foundation; either version 2 of the License, or | 39 | rename aio-win32.c => util/aio-win32.c (100%) |
34 | +# (at your option) any later version. | 40 | create mode 100644 util/aiocb.c |
35 | +# | 41 | rename async.c => util/async.c (99%) |
36 | +# This program is distributed in the hope that it will be useful, | 42 | rename iohandler.c => util/iohandler.c (100%) |
37 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | 43 | rename main-loop.c => util/main-loop.c (100%) |
38 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 44 | rename qemu-timer.c => util/qemu-timer.c (100%) |
39 | +# GNU General Public License for more details. | 45 | rename thread-pool.c => util/thread-pool.c (99%) |
40 | +# | 46 | |
41 | +# You should have received a copy of the GNU General Public License | 47 | diff --git a/Makefile.objs b/Makefile.objs |
42 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | 48 | index XXXXXXX..XXXXXXX 100644 |
43 | +# | 49 | --- a/Makefile.objs |
44 | + | 50 | +++ b/Makefile.objs |
45 | + | 51 | @@ -XXX,XX +XXX,XX @@ chardev-obj-y = chardev/ |
46 | +import sys | 52 | ####################################################################### |
47 | +import subprocess | 53 | # block-obj-y is code used by both qemu system emulation and qemu-img |
48 | +import re | 54 | |
49 | +import json | 55 | -block-obj-y = async.o thread-pool.o |
50 | + | 56 | block-obj-y += nbd/ |
51 | +import simplebench | 57 | block-obj-y += block.o blockjob.o |
52 | +from results_to_text import results_to_text | 58 | -block-obj-y += main-loop.o iohandler.o qemu-timer.o |
53 | +from table_templater import Templater | 59 | -block-obj-$(CONFIG_POSIX) += aio-posix.o |
54 | + | 60 | -block-obj-$(CONFIG_WIN32) += aio-win32.o |
55 | + | 61 | block-obj-y += block/ |
56 | +def bench_func(env, case): | 62 | block-obj-y += qemu-io-cmds.o |
57 | + test = templater.gen(env['data'], case['data']) | 63 | block-obj-$(CONFIG_REPLICATION) += replication.o |
58 | + | 64 | diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs |
59 | + p = subprocess.run(test, shell=True, stdout=subprocess.PIPE, | 65 | index XXXXXXX..XXXXXXX 100644 |
60 | + stderr=subprocess.STDOUT, universal_newlines=True) | 66 | --- a/stubs/Makefile.objs |
61 | + | 67 | +++ b/stubs/Makefile.objs |
62 | + if p.returncode == 0: | 68 | @@ -XXX,XX +XXX,XX @@ stub-obj-y += get-vm-name.o |
63 | + try: | 69 | stub-obj-y += iothread.o |
64 | + m = re.search(r'Run completed in (\d+.\d+) seconds.', p.stdout) | 70 | stub-obj-y += iothread-lock.o |
65 | + return {'seconds': float(m.group(1))} | 71 | stub-obj-y += is-daemonized.o |
66 | + except Exception: | 72 | +stub-obj-$(CONFIG_LINUX_AIO) += linux-aio.o |
67 | + return {'error': f'failed to parse qemu-img output: {p.stdout}'} | 73 | stub-obj-y += machine-init-done.o |
68 | + else: | 74 | stub-obj-y += migr-blocker.o |
69 | + return {'error': f'qemu-img failed: {p.returncode}: {p.stdout}'} | 75 | stub-obj-y += monitor.o |
70 | + | 76 | diff --git a/tests/Makefile.include b/tests/Makefile.include |
71 | + | 77 | index XXXXXXX..XXXXXXX 100644 |
72 | +if __name__ == '__main__': | 78 | --- a/tests/Makefile.include |
73 | + if len(sys.argv) > 1: | 79 | +++ b/tests/Makefile.include |
74 | + print(""" | 80 | @@ -XXX,XX +XXX,XX @@ check-unit-y += tests/test-visitor-serialization$(EXESUF) |
75 | +Usage: img_bench_templater.py < path/to/test-template.sh | 81 | check-unit-y += tests/test-iov$(EXESUF) |
76 | + | 82 | gcov-files-test-iov-y = util/iov.c |
77 | +This script generates performance tests from a test template (example below), | 83 | check-unit-y += tests/test-aio$(EXESUF) |
78 | +runs them, and displays the results in a table. The template is read from | 84 | +gcov-files-test-aio-y = util/async.c util/qemu-timer.o |
79 | +stdin. It must be written in bash and end with a `qemu-img bench` invocation | 85 | +gcov-files-test-aio-$(CONFIG_WIN32) += util/aio-win32.c |
80 | +(whose result is parsed to get the test instance’s result). | 86 | +gcov-files-test-aio-$(CONFIG_POSIX) += util/aio-posix.c |
81 | + | 87 | check-unit-y += tests/test-throttle$(EXESUF) |
82 | +Use the following syntax in the template to create the various different test | 88 | gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c |
83 | +instances: | 89 | gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c |
84 | + | 90 | @@ -XXX,XX +XXX,XX @@ tests/check-qjson$(EXESUF): tests/check-qjson.o $(test-util-obj-y) |
85 | + column templating: {var1|var2|...} - test will use different values in | 91 | tests/check-qom-interface$(EXESUF): tests/check-qom-interface.o $(test-qom-obj-y) |
86 | + different columns. You may use several {} constructions in the test, in this | 92 | tests/check-qom-proplist$(EXESUF): tests/check-qom-proplist.o $(test-qom-obj-y) |
87 | + case product of all choice-sets will be used. | 93 | |
88 | + | 94 | -tests/test-char$(EXESUF): tests/test-char.o qemu-timer.o \ |
89 | + row templating: [var1|var2|...] - similar thing to define rows (test-cases) | 95 | - $(test-util-obj-y) $(qtest-obj-y) $(test-block-obj-y) $(chardev-obj-y) |
90 | + | 96 | +tests/test-char$(EXESUF): tests/test-char.o $(test-util-obj-y) $(qtest-obj-y) $(test-io-obj-y) $(chardev-obj-y) |
91 | +Test template example: | 97 | tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y) |
92 | + | 98 | tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y) |
93 | +Assume you want to compare two qemu-img binaries, called qemu-img-old and | 99 | tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y) |
94 | +qemu-img-new in your build directory in two test-cases with 4K writes and 64K | 100 | @@ -XXX,XX +XXX,XX @@ tests/test-vmstate$(EXESUF): tests/test-vmstate.o \ |
95 | +writes. The template may look like this: | 101 | migration/vmstate.o migration/qemu-file.o \ |
96 | + | 102 | migration/qemu-file-channel.o migration/qjson.o \ |
97 | +qemu_img=/path/to/qemu/build/qemu-img-{old|new} | 103 | $(test-io-obj-y) |
98 | +$qemu_img create -f qcow2 /ssd/x.qcow2 1G | 104 | -tests/test-timed-average$(EXESUF): tests/test-timed-average.o qemu-timer.o \ |
99 | +$qemu_img bench -c 100 -d 8 [-s 4K|-s 64K] -w -t none -n /ssd/x.qcow2 | 105 | - $(test-util-obj-y) |
100 | + | 106 | +tests/test-timed-average$(EXESUF): tests/test-timed-average.o $(test-util-obj-y) |
101 | +When passing this to stdin of img_bench_templater.py, the resulting comparison | 107 | tests/test-base64$(EXESUF): tests/test-base64.o \ |
102 | +table will contain two columns (for two binaries) and two rows (for two | 108 | libqemuutil.a libqemustub.a |
103 | +test-cases). | 109 | tests/ptimer-test$(EXESUF): tests/ptimer-test.o tests/ptimer-test-stubs.o hw/core/ptimer.o libqemustub.a |
104 | + | 110 | @@ -XXX,XX +XXX,XX @@ tests/usb-hcd-ehci-test$(EXESUF): tests/usb-hcd-ehci-test.o $(libqos-usb-obj-y) |
105 | +In addition to displaying the results, script also stores results in JSON | 111 | tests/usb-hcd-xhci-test$(EXESUF): tests/usb-hcd-xhci-test.o $(libqos-usb-obj-y) |
106 | +format into results.json file in current directory. | 112 | tests/pc-cpu-test$(EXESUF): tests/pc-cpu-test.o |
107 | +""") | 113 | tests/postcopy-test$(EXESUF): tests/postcopy-test.o |
108 | + sys.exit() | 114 | -tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-timer.o \ |
109 | + | 115 | +tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o $(test-util-obj-y) \ |
110 | + templater = Templater(sys.stdin.read()) | 116 | $(qtest-obj-y) $(test-io-obj-y) $(libqos-virtio-obj-y) $(libqos-pc-obj-y) \ |
111 | + | 117 | $(chardev-obj-y) |
112 | + envs = [{'id': ' / '.join(x), 'data': x} for x in templater.columns] | 118 | tests/qemu-iotests/socket_scm_helper$(EXESUF): tests/qemu-iotests/socket_scm_helper.o |
113 | + cases = [{'id': ' / '.join(x), 'data': x} for x in templater.rows] | 119 | diff --git a/util/Makefile.objs b/util/Makefile.objs |
114 | + | 120 | index XXXXXXX..XXXXXXX 100644 |
115 | + result = simplebench.bench(bench_func, envs, cases, count=5, | 121 | --- a/util/Makefile.objs |
116 | + initial_run=False) | 122 | +++ b/util/Makefile.objs |
117 | + print(results_to_text(result)) | 123 | @@ -XXX,XX +XXX,XX @@ |
118 | + with open('results.json', 'w') as f: | 124 | util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o |
119 | + json.dump(result, f, indent=4) | 125 | util-obj-y += bufferiszero.o |
120 | diff --git a/scripts/simplebench/table_templater.py b/scripts/simplebench/table_templater.py | 126 | util-obj-y += lockcnt.o |
127 | +util-obj-y += aiocb.o async.o thread-pool.o qemu-timer.o | ||
128 | +util-obj-y += main-loop.o iohandler.o | ||
129 | +util-obj-$(CONFIG_POSIX) += aio-posix.o | ||
130 | util-obj-$(CONFIG_POSIX) += compatfd.o | ||
131 | util-obj-$(CONFIG_POSIX) += event_notifier-posix.o | ||
132 | util-obj-$(CONFIG_POSIX) += mmap-alloc.o | ||
133 | util-obj-$(CONFIG_POSIX) += oslib-posix.o | ||
134 | util-obj-$(CONFIG_POSIX) += qemu-openpty.o | ||
135 | util-obj-$(CONFIG_POSIX) += qemu-thread-posix.o | ||
136 | -util-obj-$(CONFIG_WIN32) += event_notifier-win32.o | ||
137 | util-obj-$(CONFIG_POSIX) += memfd.o | ||
138 | +util-obj-$(CONFIG_WIN32) += aio-win32.o | ||
139 | +util-obj-$(CONFIG_WIN32) += event_notifier-win32.o | ||
140 | util-obj-$(CONFIG_WIN32) += oslib-win32.o | ||
141 | util-obj-$(CONFIG_WIN32) += qemu-thread-win32.o | ||
142 | util-obj-y += envlist.o path.o module.o | ||
143 | diff --git a/block/io.c b/block/io.c | ||
144 | index XXXXXXX..XXXXXXX 100644 | ||
145 | --- a/block/io.c | ||
146 | +++ b/block/io.c | ||
147 | @@ -XXX,XX +XXX,XX @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, | ||
148 | return &acb->common; | ||
149 | } | ||
150 | |||
151 | -void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, | ||
152 | - BlockCompletionFunc *cb, void *opaque) | ||
153 | -{ | ||
154 | - BlockAIOCB *acb; | ||
155 | - | ||
156 | - acb = g_malloc(aiocb_info->aiocb_size); | ||
157 | - acb->aiocb_info = aiocb_info; | ||
158 | - acb->bs = bs; | ||
159 | - acb->cb = cb; | ||
160 | - acb->opaque = opaque; | ||
161 | - acb->refcnt = 1; | ||
162 | - return acb; | ||
163 | -} | ||
164 | - | ||
165 | -void qemu_aio_ref(void *p) | ||
166 | -{ | ||
167 | - BlockAIOCB *acb = p; | ||
168 | - acb->refcnt++; | ||
169 | -} | ||
170 | - | ||
171 | -void qemu_aio_unref(void *p) | ||
172 | -{ | ||
173 | - BlockAIOCB *acb = p; | ||
174 | - assert(acb->refcnt > 0); | ||
175 | - if (--acb->refcnt == 0) { | ||
176 | - g_free(acb); | ||
177 | - } | ||
178 | -} | ||
179 | - | ||
180 | /**************************************************************/ | ||
181 | /* Coroutine block device emulation */ | ||
182 | |||
183 | diff --git a/stubs/linux-aio.c b/stubs/linux-aio.c | ||
121 | new file mode 100644 | 184 | new file mode 100644 |
122 | index XXXXXXX..XXXXXXX | 185 | index XXXXXXX..XXXXXXX |
123 | --- /dev/null | 186 | --- /dev/null |
124 | +++ b/scripts/simplebench/table_templater.py | 187 | +++ b/stubs/linux-aio.c |
125 | @@ -XXX,XX +XXX,XX @@ | 188 | @@ -XXX,XX +XXX,XX @@ |
126 | +# Parser for test templates | 189 | +/* |
127 | +# | 190 | + * Linux native AIO support. |
128 | +# Copyright (c) 2021 Virtuozzo International GmbH. | 191 | + * |
129 | +# | 192 | + * Copyright (C) 2009 IBM, Corp. |
130 | +# This program is free software; you can redistribute it and/or modify | 193 | + * Copyright (C) 2009 Red Hat, Inc. |
131 | +# it under the terms of the GNU General Public License as published by | 194 | + * |
132 | +# the Free Software Foundation; either version 2 of the License, or | 195 | + * This work is licensed under the terms of the GNU GPL, version 2 or later. |
133 | +# (at your option) any later version. | 196 | + * See the COPYING file in the top-level directory. |
134 | +# | 197 | + */ |
135 | +# This program is distributed in the hope that it will be useful, | 198 | +#include "qemu/osdep.h" |
136 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | 199 | +#include "block/aio.h" |
137 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 200 | +#include "block/raw-aio.h" |
138 | +# GNU General Public License for more details. | 201 | + |
139 | +# | 202 | +void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context) |
140 | +# You should have received a copy of the GNU General Public License | 203 | +{ |
141 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | 204 | + abort(); |
142 | +# | 205 | +} |
143 | + | 206 | + |
144 | +import itertools | 207 | +void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context) |
145 | +from lark import Lark | 208 | +{ |
146 | + | 209 | + abort(); |
147 | +grammar = """ | 210 | +} |
148 | +start: ( text | column_switch | row_switch )+ | 211 | + |
149 | + | 212 | +LinuxAioState *laio_init(void) |
150 | +column_switch: "{" text ["|" text]+ "}" | 213 | +{ |
151 | +row_switch: "[" text ["|" text]+ "]" | 214 | + abort(); |
152 | +text: /[^|{}\[\]]+/ | 215 | +} |
153 | +""" | 216 | + |
154 | + | 217 | +void laio_cleanup(LinuxAioState *s) |
155 | +parser = Lark(grammar) | 218 | +{ |
156 | + | 219 | + abort(); |
157 | +class Templater: | 220 | +} |
158 | + def __init__(self, template): | 221 | diff --git a/stubs/set-fd-handler.c b/stubs/set-fd-handler.c |
159 | + self.tree = parser.parse(template) | 222 | index XXXXXXX..XXXXXXX 100644 |
160 | + | 223 | --- a/stubs/set-fd-handler.c |
161 | + c_switches = [] | 224 | +++ b/stubs/set-fd-handler.c |
162 | + r_switches = [] | 225 | @@ -XXX,XX +XXX,XX @@ void qemu_set_fd_handler(int fd, |
163 | + for x in self.tree.children: | 226 | { |
164 | + if x.data == 'column_switch': | 227 | abort(); |
165 | + c_switches.append([el.children[0].value for el in x.children]) | 228 | } |
166 | + elif x.data == 'row_switch': | 229 | - |
167 | + r_switches.append([el.children[0].value for el in x.children]) | 230 | -void aio_set_fd_handler(AioContext *ctx, |
168 | + | 231 | - int fd, |
169 | + self.columns = list(itertools.product(*c_switches)) | 232 | - bool is_external, |
170 | + self.rows = list(itertools.product(*r_switches)) | 233 | - IOHandler *io_read, |
171 | + | 234 | - IOHandler *io_write, |
172 | + def gen(self, column, row): | 235 | - AioPollFn *io_poll, |
173 | + i = 0 | 236 | - void *opaque) |
174 | + j = 0 | 237 | -{ |
175 | + result = [] | 238 | - abort(); |
176 | + | 239 | -} |
177 | + for x in self.tree.children: | 240 | diff --git a/aio-posix.c b/util/aio-posix.c |
178 | + if x.data == 'text': | 241 | similarity index 99% |
179 | + result.append(x.children[0].value) | 242 | rename from aio-posix.c |
180 | + elif x.data == 'column_switch': | 243 | rename to util/aio-posix.c |
181 | + result.append(column[i]) | 244 | index XXXXXXX..XXXXXXX 100644 |
182 | + i += 1 | 245 | --- a/aio-posix.c |
183 | + elif x.data == 'row_switch': | 246 | +++ b/util/aio-posix.c |
184 | + result.append(row[j]) | 247 | @@ -XXX,XX +XXX,XX @@ |
185 | + j += 1 | 248 | #include "qemu/rcu_queue.h" |
186 | + | 249 | #include "qemu/sockets.h" |
187 | + return ''.join(result) | 250 | #include "qemu/cutils.h" |
251 | -#include "trace-root.h" | ||
252 | +#include "trace.h" | ||
253 | #ifdef CONFIG_EPOLL_CREATE1 | ||
254 | #include <sys/epoll.h> | ||
255 | #endif | ||
256 | diff --git a/aio-win32.c b/util/aio-win32.c | ||
257 | similarity index 100% | ||
258 | rename from aio-win32.c | ||
259 | rename to util/aio-win32.c | ||
260 | diff --git a/util/aiocb.c b/util/aiocb.c | ||
261 | new file mode 100644 | ||
262 | index XXXXXXX..XXXXXXX | ||
263 | --- /dev/null | ||
264 | +++ b/util/aiocb.c | ||
265 | @@ -XXX,XX +XXX,XX @@ | ||
266 | +/* | ||
267 | + * BlockAIOCB allocation | ||
268 | + * | ||
269 | + * Copyright (c) 2003-2017 Fabrice Bellard and other QEMU contributors | ||
270 | + * | ||
271 | + * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
272 | + * of this software and associated documentation files (the "Software"), to deal | ||
273 | + * in the Software without restriction, including without limitation the rights | ||
274 | + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
275 | + * copies of the Software, and to permit persons to whom the Software is | ||
276 | + * furnished to do so, subject to the following conditions: | ||
277 | + * | ||
278 | + * The above copyright notice and this permission notice shall be included in | ||
279 | + * all copies or substantial portions of the Software. | ||
280 | + * | ||
281 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
282 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
283 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
284 | + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
285 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
286 | + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
287 | + * THE SOFTWARE. | ||
288 | + */ | ||
289 | + | ||
290 | +#include "qemu/osdep.h" | ||
291 | +#include "block/aio.h" | ||
292 | + | ||
293 | +void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, | ||
294 | + BlockCompletionFunc *cb, void *opaque) | ||
295 | +{ | ||
296 | + BlockAIOCB *acb; | ||
297 | + | ||
298 | + acb = g_malloc(aiocb_info->aiocb_size); | ||
299 | + acb->aiocb_info = aiocb_info; | ||
300 | + acb->bs = bs; | ||
301 | + acb->cb = cb; | ||
302 | + acb->opaque = opaque; | ||
303 | + acb->refcnt = 1; | ||
304 | + return acb; | ||
305 | +} | ||
306 | + | ||
307 | +void qemu_aio_ref(void *p) | ||
308 | +{ | ||
309 | + BlockAIOCB *acb = p; | ||
310 | + acb->refcnt++; | ||
311 | +} | ||
312 | + | ||
313 | +void qemu_aio_unref(void *p) | ||
314 | +{ | ||
315 | + BlockAIOCB *acb = p; | ||
316 | + assert(acb->refcnt > 0); | ||
317 | + if (--acb->refcnt == 0) { | ||
318 | + g_free(acb); | ||
319 | + } | ||
320 | +} | ||
321 | diff --git a/async.c b/util/async.c | ||
322 | similarity index 99% | ||
323 | rename from async.c | ||
324 | rename to util/async.c | ||
325 | index XXXXXXX..XXXXXXX 100644 | ||
326 | --- a/async.c | ||
327 | +++ b/util/async.c | ||
328 | @@ -XXX,XX +XXX,XX @@ | ||
329 | /* | ||
330 | - * QEMU System Emulator | ||
331 | + * Data plane event loop | ||
332 | * | ||
333 | * Copyright (c) 2003-2008 Fabrice Bellard | ||
334 | + * Copyright (c) 2009-2017 QEMU contributors | ||
335 | * | ||
336 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
337 | * of this software and associated documentation files (the "Software"), to deal | ||
338 | diff --git a/iohandler.c b/util/iohandler.c | ||
339 | similarity index 100% | ||
340 | rename from iohandler.c | ||
341 | rename to util/iohandler.c | ||
342 | diff --git a/main-loop.c b/util/main-loop.c | ||
343 | similarity index 100% | ||
344 | rename from main-loop.c | ||
345 | rename to util/main-loop.c | ||
346 | diff --git a/qemu-timer.c b/util/qemu-timer.c | ||
347 | similarity index 100% | ||
348 | rename from qemu-timer.c | ||
349 | rename to util/qemu-timer.c | ||
350 | diff --git a/thread-pool.c b/util/thread-pool.c | ||
351 | similarity index 99% | ||
352 | rename from thread-pool.c | ||
353 | rename to util/thread-pool.c | ||
354 | index XXXXXXX..XXXXXXX 100644 | ||
355 | --- a/thread-pool.c | ||
356 | +++ b/util/thread-pool.c | ||
357 | @@ -XXX,XX +XXX,XX @@ | ||
358 | #include "qemu/queue.h" | ||
359 | #include "qemu/thread.h" | ||
360 | #include "qemu/coroutine.h" | ||
361 | -#include "trace-root.h" | ||
362 | +#include "trace.h" | ||
363 | #include "block/thread-pool.h" | ||
364 | #include "qemu/main-loop.h" | ||
365 | |||
366 | diff --git a/trace-events b/trace-events | ||
367 | index XXXXXXX..XXXXXXX 100644 | ||
368 | --- a/trace-events | ||
369 | +++ b/trace-events | ||
370 | @@ -XXX,XX +XXX,XX @@ | ||
371 | # | ||
372 | # The <format-string> should be a sprintf()-compatible format string. | ||
373 | |||
374 | -# aio-posix.c | ||
375 | -run_poll_handlers_begin(void *ctx, int64_t max_ns) "ctx %p max_ns %"PRId64 | ||
376 | -run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d" | ||
377 | -poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64 | ||
378 | -poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64 | ||
379 | - | ||
380 | -# thread-pool.c | ||
381 | -thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p" | ||
382 | -thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d" | ||
383 | -thread_pool_cancel(void *req, void *opaque) "req %p opaque %p" | ||
384 | - | ||
385 | # ioport.c | ||
386 | cpu_in(unsigned int addr, char size, unsigned int val) "addr %#x(%c) value %u" | ||
387 | cpu_out(unsigned int addr, char size, unsigned int val) "addr %#x(%c) value %u" | ||
388 | diff --git a/util/trace-events b/util/trace-events | ||
389 | index XXXXXXX..XXXXXXX 100644 | ||
390 | --- a/util/trace-events | ||
391 | +++ b/util/trace-events | ||
392 | @@ -XXX,XX +XXX,XX @@ | ||
393 | # See docs/tracing.txt for syntax documentation. | ||
394 | |||
395 | +# util/aio-posix.c | ||
396 | +run_poll_handlers_begin(void *ctx, int64_t max_ns) "ctx %p max_ns %"PRId64 | ||
397 | +run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d" | ||
398 | +poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64 | ||
399 | +poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64 | ||
400 | + | ||
401 | +# util/thread-pool.c | ||
402 | +thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p" | ||
403 | +thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d" | ||
404 | +thread_pool_cancel(void *req, void *opaque) "req %p opaque %p" | ||
405 | + | ||
406 | # util/buffer.c | ||
407 | buffer_resize(const char *buf, size_t olen, size_t len) "%s: old %zd, new %zd" | ||
408 | buffer_move_empty(const char *buf, size_t len, const char *from) "%s: %zd bytes from %s" | ||
188 | -- | 409 | -- |
189 | 2.31.1 | 410 | 2.9.3 |
190 | 411 | ||
191 | 412 | diff view generated by jsdifflib |
1 | From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Add a simple test which tries to run migration during backup. | 3 | aio_co_wake provides the infrastructure to start a coroutine on a "home" |
4 | bdrv_inactivate_all() should fail. But due to bug (see next commit with | 4 | AioContext. It will be used by CoMutex and CoQueue, so that coroutines |
5 | fix) it doesn't, nodes are inactivated and continued backup crashes | 5 | don't jump from one context to another when they go to sleep on a |
6 | on assertion "assert(!(bs->open_flags & BDRV_O_INACTIVE));" in | 6 | mutex or waitqueue. However, it can also be used as a more efficient |
7 | bdrv_co_write_req_prepare(). | 7 | alternative to one-shot bottom halves, and saves the effort of tracking |
8 | 8 | which AioContext a coroutine is running on. | |
9 | Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 9 | |
10 | Message-Id: <20210911120027.8063-2-vsementsov@virtuozzo.com> | 10 | aio_co_schedule is the part of aio_co_wake that starts a coroutine |
11 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | 11 | on a remove AioContext, but it is also useful to implement e.g. |
12 | bdrv_set_aio_context callbacks. | ||
13 | |||
14 | The implementation of aio_co_schedule is based on a lock-free | ||
15 | multiple-producer, single-consumer queue. The multiple producers use | ||
16 | cmpxchg to add to a LIFO stack. The consumer (a per-AioContext bottom | ||
17 | half) grabs all items added so far, inverts the list to make it FIFO, | ||
18 | and goes through it one item at a time until it's empty. The data | ||
19 | structure was inspired by OSv, which uses it in the very code we'll | ||
20 | "port" to QEMU for the thread-safe CoMutex. | ||
21 | |||
22 | Most of the new code is really tests. | ||
23 | |||
24 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | ||
25 | Reviewed-by: Fam Zheng <famz@redhat.com> | ||
26 | Message-id: 20170213135235.12274-3-pbonzini@redhat.com | ||
27 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
12 | --- | 28 | --- |
13 | .../qemu-iotests/tests/migrate-during-backup | 97 +++++++++++++++++++ | 29 | tests/Makefile.include | 8 +- |
14 | .../tests/migrate-during-backup.out | 5 + | 30 | include/block/aio.h | 32 +++++++ |
15 | 2 files changed, 102 insertions(+) | 31 | include/qemu/coroutine_int.h | 11 ++- |
16 | create mode 100755 tests/qemu-iotests/tests/migrate-during-backup | 32 | tests/iothread.h | 25 +++++ |
17 | create mode 100644 tests/qemu-iotests/tests/migrate-during-backup.out | 33 | tests/iothread.c | 91 ++++++++++++++++++ |
18 | 34 | tests/test-aio-multithread.c | 213 +++++++++++++++++++++++++++++++++++++++++++ | |
19 | diff --git a/tests/qemu-iotests/tests/migrate-during-backup b/tests/qemu-iotests/tests/migrate-during-backup | 35 | util/async.c | 65 +++++++++++++ |
20 | new file mode 100755 | 36 | util/qemu-coroutine.c | 8 ++ |
21 | index XXXXXXX..XXXXXXX | 37 | util/trace-events | 4 + |
22 | --- /dev/null | 38 | 9 files changed, 453 insertions(+), 4 deletions(-) |
23 | +++ b/tests/qemu-iotests/tests/migrate-during-backup | 39 | create mode 100644 tests/iothread.h |
24 | @@ -XXX,XX +XXX,XX @@ | 40 | create mode 100644 tests/iothread.c |
25 | +#!/usr/bin/env python3 | 41 | create mode 100644 tests/test-aio-multithread.c |
26 | +# group: migration disabled | 42 | |
27 | +# | 43 | diff --git a/tests/Makefile.include b/tests/Makefile.include |
28 | +# Copyright (c) 2021 Virtuozzo International GmbH | 44 | index XXXXXXX..XXXXXXX 100644 |
29 | +# | 45 | --- a/tests/Makefile.include |
30 | +# This program is free software; you can redistribute it and/or modify | 46 | +++ b/tests/Makefile.include |
31 | +# it under the terms of the GNU General Public License as published by | 47 | @@ -XXX,XX +XXX,XX @@ check-unit-y += tests/test-aio$(EXESUF) |
32 | +# the Free Software Foundation; either version 2 of the License, or | 48 | gcov-files-test-aio-y = util/async.c util/qemu-timer.o |
33 | +# (at your option) any later version. | 49 | gcov-files-test-aio-$(CONFIG_WIN32) += util/aio-win32.c |
34 | +# | 50 | gcov-files-test-aio-$(CONFIG_POSIX) += util/aio-posix.c |
35 | +# This program is distributed in the hope that it will be useful, | 51 | +check-unit-y += tests/test-aio-multithread$(EXESUF) |
36 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | 52 | +gcov-files-test-aio-multithread-y = $(gcov-files-test-aio-y) |
37 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 53 | +gcov-files-test-aio-multithread-y += util/qemu-coroutine.c tests/iothread.c |
38 | +# GNU General Public License for more details. | 54 | check-unit-y += tests/test-throttle$(EXESUF) |
39 | +# | 55 | -gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c |
40 | +# You should have received a copy of the GNU General Public License | 56 | -gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c |
41 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | 57 | check-unit-y += tests/test-thread-pool$(EXESUF) |
42 | +# | 58 | gcov-files-test-thread-pool-y = thread-pool.c |
43 | + | 59 | gcov-files-test-hbitmap-y = util/hbitmap.c |
44 | +import os | 60 | @@ -XXX,XX +XXX,XX @@ test-qapi-obj-y = tests/test-qapi-visit.o tests/test-qapi-types.o \ |
45 | +import iotests | 61 | $(test-qom-obj-y) |
46 | +from iotests import qemu_img_create, qemu_io | 62 | test-crypto-obj-y = $(crypto-obj-y) $(test-qom-obj-y) |
47 | + | 63 | test-io-obj-y = $(io-obj-y) $(test-crypto-obj-y) |
48 | + | 64 | -test-block-obj-y = $(block-obj-y) $(test-io-obj-y) |
49 | +disk_a = os.path.join(iotests.test_dir, 'disk_a') | 65 | +test-block-obj-y = $(block-obj-y) $(test-io-obj-y) tests/iothread.o |
50 | +disk_b = os.path.join(iotests.test_dir, 'disk_b') | 66 | |
51 | +size = '1M' | 67 | tests/check-qint$(EXESUF): tests/check-qint.o $(test-util-obj-y) |
52 | +mig_file = os.path.join(iotests.test_dir, 'mig_file') | 68 | tests/check-qstring$(EXESUF): tests/check-qstring.o $(test-util-obj-y) |
53 | +mig_cmd = 'exec: cat > ' + mig_file | 69 | @@ -XXX,XX +XXX,XX @@ tests/check-qom-proplist$(EXESUF): tests/check-qom-proplist.o $(test-qom-obj-y) |
54 | + | 70 | tests/test-char$(EXESUF): tests/test-char.o $(test-util-obj-y) $(qtest-obj-y) $(test-io-obj-y) $(chardev-obj-y) |
55 | + | 71 | tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y) |
56 | +class TestMigrateDuringBackup(iotests.QMPTestCase): | 72 | tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y) |
57 | + def tearDown(self): | 73 | +tests/test-aio-multithread$(EXESUF): tests/test-aio-multithread.o $(test-block-obj-y) |
58 | + self.vm.shutdown() | 74 | tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y) |
59 | + os.remove(disk_a) | 75 | tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y) |
60 | + os.remove(disk_b) | 76 | tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y) |
61 | + os.remove(mig_file) | 77 | diff --git a/include/block/aio.h b/include/block/aio.h |
62 | + | 78 | index XXXXXXX..XXXXXXX 100644 |
63 | + def setUp(self): | 79 | --- a/include/block/aio.h |
64 | + qemu_img_create('-f', iotests.imgfmt, disk_a, size) | 80 | +++ b/include/block/aio.h |
65 | + qemu_img_create('-f', iotests.imgfmt, disk_b, size) | 81 | @@ -XXX,XX +XXX,XX @@ typedef void QEMUBHFunc(void *opaque); |
66 | + qemu_io('-c', f'write 0 {size}', disk_a) | 82 | typedef bool AioPollFn(void *opaque); |
67 | + | 83 | typedef void IOHandler(void *opaque); |
68 | + self.vm = iotests.VM().add_drive(disk_a) | 84 | |
69 | + self.vm.launch() | 85 | +struct Coroutine; |
70 | + result = self.vm.qmp('blockdev-add', { | 86 | struct ThreadPool; |
71 | + 'node-name': 'target', | 87 | struct LinuxAioState; |
72 | + 'driver': iotests.imgfmt, | 88 | |
73 | + 'file': { | 89 | @@ -XXX,XX +XXX,XX @@ struct AioContext { |
74 | + 'driver': 'file', | 90 | bool notified; |
75 | + 'filename': disk_b | 91 | EventNotifier notifier; |
76 | + } | 92 | |
77 | + }) | 93 | + QSLIST_HEAD(, Coroutine) scheduled_coroutines; |
78 | + self.assert_qmp(result, 'return', {}) | 94 | + QEMUBH *co_schedule_bh; |
79 | + | 95 | + |
80 | + def test_migrate(self): | 96 | /* Thread pool for performing work and receiving completion callbacks. |
81 | + result = self.vm.qmp('blockdev-backup', device='drive0', | 97 | * Has its own locking. |
82 | + target='target', sync='full', | 98 | */ |
83 | + speed=1, x_perf={ | 99 | @@ -XXX,XX +XXX,XX @@ static inline bool aio_node_check(AioContext *ctx, bool is_external) |
84 | + 'max-workers': 1, | 100 | } |
85 | + 'max-chunk': 64 * 1024 | 101 | |
86 | + }) | 102 | /** |
87 | + self.assert_qmp(result, 'return', {}) | 103 | + * aio_co_schedule: |
88 | + | 104 | + * @ctx: the aio context |
89 | + result = self.vm.qmp('job-pause', id='drive0') | 105 | + * @co: the coroutine |
90 | + self.assert_qmp(result, 'return', {}) | 106 | + * |
91 | + | 107 | + * Start a coroutine on a remote AioContext. |
92 | + result = self.vm.qmp('migrate-set-capabilities', | 108 | + * |
93 | + capabilities=[{'capability': 'events', | 109 | + * The coroutine must not be entered by anyone else while aio_co_schedule() |
94 | + 'state': True}]) | 110 | + * is active. In addition the coroutine must have yielded unless ctx |
95 | + self.assert_qmp(result, 'return', {}) | 111 | + * is the context in which the coroutine is running (i.e. the value of |
96 | + result = self.vm.qmp('migrate', uri=mig_cmd) | 112 | + * qemu_get_current_aio_context() from the coroutine itself). |
97 | + self.assert_qmp(result, 'return', {}) | 113 | + */ |
98 | + | 114 | +void aio_co_schedule(AioContext *ctx, struct Coroutine *co); |
99 | + e = self.vm.events_wait((('MIGRATION', | 115 | + |
100 | + {'data': {'status': 'completed'}}), | 116 | +/** |
101 | + ('MIGRATION', | 117 | + * aio_co_wake: |
102 | + {'data': {'status': 'failed'}}))) | 118 | + * @co: the coroutine |
103 | + | 119 | + * |
104 | + # Don't assert that e is 'failed' now: this way we'll miss | 120 | + * Restart a coroutine on the AioContext where it was running last, thus |
105 | + # possible crash when backup continues :) | 121 | + * preventing coroutines from jumping from one context to another when they |
106 | + | 122 | + * go to sleep. |
107 | + result = self.vm.qmp('block-job-set-speed', device='drive0', | 123 | + * |
108 | + speed=0) | 124 | + * aio_co_wake may be executed either in coroutine or non-coroutine |
109 | + self.assert_qmp(result, 'return', {}) | 125 | + * context. The coroutine must not be entered by anyone else while |
110 | + result = self.vm.qmp('job-resume', id='drive0') | 126 | + * aio_co_wake() is active. |
111 | + self.assert_qmp(result, 'return', {}) | 127 | + */ |
112 | + | 128 | +void aio_co_wake(struct Coroutine *co); |
113 | + # For future: if something changes so that both migration | 129 | + |
114 | + # and backup pass, let's not miss that moment, as it may | 130 | +/** |
115 | + # be a bug as well as improvement. | 131 | * Return the AioContext whose event loop runs in the current thread. |
116 | + self.assert_qmp(e, 'data/status', 'failed') | 132 | * |
117 | + | 133 | * If called from an IOThread this will be the IOThread's AioContext. If |
118 | + | 134 | diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h |
119 | +if __name__ == '__main__': | 135 | index XXXXXXX..XXXXXXX 100644 |
120 | + iotests.main(supported_fmts=['qcow2'], | 136 | --- a/include/qemu/coroutine_int.h |
121 | + supported_protocols=['file']) | 137 | +++ b/include/qemu/coroutine_int.h |
122 | diff --git a/tests/qemu-iotests/tests/migrate-during-backup.out b/tests/qemu-iotests/tests/migrate-during-backup.out | 138 | @@ -XXX,XX +XXX,XX @@ struct Coroutine { |
139 | CoroutineEntry *entry; | ||
140 | void *entry_arg; | ||
141 | Coroutine *caller; | ||
142 | + | ||
143 | + /* Only used when the coroutine has terminated. */ | ||
144 | QSLIST_ENTRY(Coroutine) pool_next; | ||
145 | + | ||
146 | size_t locks_held; | ||
147 | |||
148 | - /* Coroutines that should be woken up when we yield or terminate */ | ||
149 | + /* Coroutines that should be woken up when we yield or terminate. | ||
150 | + * Only used when the coroutine is running. | ||
151 | + */ | ||
152 | QSIMPLEQ_HEAD(, Coroutine) co_queue_wakeup; | ||
153 | + | ||
154 | + /* Only used when the coroutine has yielded. */ | ||
155 | + AioContext *ctx; | ||
156 | QSIMPLEQ_ENTRY(Coroutine) co_queue_next; | ||
157 | + QSLIST_ENTRY(Coroutine) co_scheduled_next; | ||
158 | }; | ||
159 | |||
160 | Coroutine *qemu_coroutine_new(void); | ||
161 | diff --git a/tests/iothread.h b/tests/iothread.h | ||
123 | new file mode 100644 | 162 | new file mode 100644 |
124 | index XXXXXXX..XXXXXXX | 163 | index XXXXXXX..XXXXXXX |
125 | --- /dev/null | 164 | --- /dev/null |
126 | +++ b/tests/qemu-iotests/tests/migrate-during-backup.out | 165 | +++ b/tests/iothread.h |
127 | @@ -XXX,XX +XXX,XX @@ | 166 | @@ -XXX,XX +XXX,XX @@ |
128 | +. | 167 | +/* |
129 | +---------------------------------------------------------------------- | 168 | + * Event loop thread implementation for unit tests |
130 | +Ran 1 tests | 169 | + * |
131 | + | 170 | + * Copyright Red Hat Inc., 2013, 2016 |
132 | +OK | 171 | + * |
172 | + * Authors: | ||
173 | + * Stefan Hajnoczi <stefanha@redhat.com> | ||
174 | + * Paolo Bonzini <pbonzini@redhat.com> | ||
175 | + * | ||
176 | + * This work is licensed under the terms of the GNU GPL, version 2 or later. | ||
177 | + * See the COPYING file in the top-level directory. | ||
178 | + */ | ||
179 | +#ifndef TEST_IOTHREAD_H | ||
180 | +#define TEST_IOTHREAD_H | ||
181 | + | ||
182 | +#include "block/aio.h" | ||
183 | +#include "qemu/thread.h" | ||
184 | + | ||
185 | +typedef struct IOThread IOThread; | ||
186 | + | ||
187 | +IOThread *iothread_new(void); | ||
188 | +void iothread_join(IOThread *iothread); | ||
189 | +AioContext *iothread_get_aio_context(IOThread *iothread); | ||
190 | + | ||
191 | +#endif | ||
192 | diff --git a/tests/iothread.c b/tests/iothread.c | ||
193 | new file mode 100644 | ||
194 | index XXXXXXX..XXXXXXX | ||
195 | --- /dev/null | ||
196 | +++ b/tests/iothread.c | ||
197 | @@ -XXX,XX +XXX,XX @@ | ||
198 | +/* | ||
199 | + * Event loop thread implementation for unit tests | ||
200 | + * | ||
201 | + * Copyright Red Hat Inc., 2013, 2016 | ||
202 | + * | ||
203 | + * Authors: | ||
204 | + * Stefan Hajnoczi <stefanha@redhat.com> | ||
205 | + * Paolo Bonzini <pbonzini@redhat.com> | ||
206 | + * | ||
207 | + * This work is licensed under the terms of the GNU GPL, version 2 or later. | ||
208 | + * See the COPYING file in the top-level directory. | ||
209 | + * | ||
210 | + */ | ||
211 | + | ||
212 | +#include "qemu/osdep.h" | ||
213 | +#include "qapi/error.h" | ||
214 | +#include "block/aio.h" | ||
215 | +#include "qemu/main-loop.h" | ||
216 | +#include "qemu/rcu.h" | ||
217 | +#include "iothread.h" | ||
218 | + | ||
219 | +struct IOThread { | ||
220 | + AioContext *ctx; | ||
221 | + | ||
222 | + QemuThread thread; | ||
223 | + QemuMutex init_done_lock; | ||
224 | + QemuCond init_done_cond; /* is thread initialization done? */ | ||
225 | + bool stopping; | ||
226 | +}; | ||
227 | + | ||
228 | +static __thread IOThread *my_iothread; | ||
229 | + | ||
230 | +AioContext *qemu_get_current_aio_context(void) | ||
231 | +{ | ||
232 | + return my_iothread ? my_iothread->ctx : qemu_get_aio_context(); | ||
233 | +} | ||
234 | + | ||
235 | +static void *iothread_run(void *opaque) | ||
236 | +{ | ||
237 | + IOThread *iothread = opaque; | ||
238 | + | ||
239 | + rcu_register_thread(); | ||
240 | + | ||
241 | + my_iothread = iothread; | ||
242 | + qemu_mutex_lock(&iothread->init_done_lock); | ||
243 | + iothread->ctx = aio_context_new(&error_abort); | ||
244 | + qemu_cond_signal(&iothread->init_done_cond); | ||
245 | + qemu_mutex_unlock(&iothread->init_done_lock); | ||
246 | + | ||
247 | + while (!atomic_read(&iothread->stopping)) { | ||
248 | + aio_poll(iothread->ctx, true); | ||
249 | + } | ||
250 | + | ||
251 | + rcu_unregister_thread(); | ||
252 | + return NULL; | ||
253 | +} | ||
254 | + | ||
255 | +void iothread_join(IOThread *iothread) | ||
256 | +{ | ||
257 | + iothread->stopping = true; | ||
258 | + aio_notify(iothread->ctx); | ||
259 | + qemu_thread_join(&iothread->thread); | ||
260 | + qemu_cond_destroy(&iothread->init_done_cond); | ||
261 | + qemu_mutex_destroy(&iothread->init_done_lock); | ||
262 | + aio_context_unref(iothread->ctx); | ||
263 | + g_free(iothread); | ||
264 | +} | ||
265 | + | ||
266 | +IOThread *iothread_new(void) | ||
267 | +{ | ||
268 | + IOThread *iothread = g_new0(IOThread, 1); | ||
269 | + | ||
270 | + qemu_mutex_init(&iothread->init_done_lock); | ||
271 | + qemu_cond_init(&iothread->init_done_cond); | ||
272 | + qemu_thread_create(&iothread->thread, NULL, iothread_run, | ||
273 | + iothread, QEMU_THREAD_JOINABLE); | ||
274 | + | ||
275 | + /* Wait for initialization to complete */ | ||
276 | + qemu_mutex_lock(&iothread->init_done_lock); | ||
277 | + while (iothread->ctx == NULL) { | ||
278 | + qemu_cond_wait(&iothread->init_done_cond, | ||
279 | + &iothread->init_done_lock); | ||
280 | + } | ||
281 | + qemu_mutex_unlock(&iothread->init_done_lock); | ||
282 | + return iothread; | ||
283 | +} | ||
284 | + | ||
285 | +AioContext *iothread_get_aio_context(IOThread *iothread) | ||
286 | +{ | ||
287 | + return iothread->ctx; | ||
288 | +} | ||
289 | diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c | ||
290 | new file mode 100644 | ||
291 | index XXXXXXX..XXXXXXX | ||
292 | --- /dev/null | ||
293 | +++ b/tests/test-aio-multithread.c | ||
294 | @@ -XXX,XX +XXX,XX @@ | ||
295 | +/* | ||
296 | + * AioContext multithreading tests | ||
297 | + * | ||
298 | + * Copyright Red Hat, Inc. 2016 | ||
299 | + * | ||
300 | + * Authors: | ||
301 | + * Paolo Bonzini <pbonzini@redhat.com> | ||
302 | + * | ||
303 | + * This work is licensed under the terms of the GNU LGPL, version 2 or later. | ||
304 | + * See the COPYING.LIB file in the top-level directory. | ||
305 | + */ | ||
306 | + | ||
307 | +#include "qemu/osdep.h" | ||
308 | +#include <glib.h> | ||
309 | +#include "block/aio.h" | ||
310 | +#include "qapi/error.h" | ||
311 | +#include "qemu/coroutine.h" | ||
312 | +#include "qemu/thread.h" | ||
313 | +#include "qemu/error-report.h" | ||
314 | +#include "iothread.h" | ||
315 | + | ||
316 | +/* AioContext management */ | ||
317 | + | ||
318 | +#define NUM_CONTEXTS 5 | ||
319 | + | ||
320 | +static IOThread *threads[NUM_CONTEXTS]; | ||
321 | +static AioContext *ctx[NUM_CONTEXTS]; | ||
322 | +static __thread int id = -1; | ||
323 | + | ||
324 | +static QemuEvent done_event; | ||
325 | + | ||
326 | +/* Run a function synchronously on a remote iothread. */ | ||
327 | + | ||
328 | +typedef struct CtxRunData { | ||
329 | + QEMUBHFunc *cb; | ||
330 | + void *arg; | ||
331 | +} CtxRunData; | ||
332 | + | ||
333 | +static void ctx_run_bh_cb(void *opaque) | ||
334 | +{ | ||
335 | + CtxRunData *data = opaque; | ||
336 | + | ||
337 | + data->cb(data->arg); | ||
338 | + qemu_event_set(&done_event); | ||
339 | +} | ||
340 | + | ||
341 | +static void ctx_run(int i, QEMUBHFunc *cb, void *opaque) | ||
342 | +{ | ||
343 | + CtxRunData data = { | ||
344 | + .cb = cb, | ||
345 | + .arg = opaque | ||
346 | + }; | ||
347 | + | ||
348 | + qemu_event_reset(&done_event); | ||
349 | + aio_bh_schedule_oneshot(ctx[i], ctx_run_bh_cb, &data); | ||
350 | + qemu_event_wait(&done_event); | ||
351 | +} | ||
352 | + | ||
353 | +/* Starting the iothreads. */ | ||
354 | + | ||
355 | +static void set_id_cb(void *opaque) | ||
356 | +{ | ||
357 | + int *i = opaque; | ||
358 | + | ||
359 | + id = *i; | ||
360 | +} | ||
361 | + | ||
362 | +static void create_aio_contexts(void) | ||
363 | +{ | ||
364 | + int i; | ||
365 | + | ||
366 | + for (i = 0; i < NUM_CONTEXTS; i++) { | ||
367 | + threads[i] = iothread_new(); | ||
368 | + ctx[i] = iothread_get_aio_context(threads[i]); | ||
369 | + } | ||
370 | + | ||
371 | + qemu_event_init(&done_event, false); | ||
372 | + for (i = 0; i < NUM_CONTEXTS; i++) { | ||
373 | + ctx_run(i, set_id_cb, &i); | ||
374 | + } | ||
375 | +} | ||
376 | + | ||
377 | +/* Stopping the iothreads. */ | ||
378 | + | ||
379 | +static void join_aio_contexts(void) | ||
380 | +{ | ||
381 | + int i; | ||
382 | + | ||
383 | + for (i = 0; i < NUM_CONTEXTS; i++) { | ||
384 | + aio_context_ref(ctx[i]); | ||
385 | + } | ||
386 | + for (i = 0; i < NUM_CONTEXTS; i++) { | ||
387 | + iothread_join(threads[i]); | ||
388 | + } | ||
389 | + for (i = 0; i < NUM_CONTEXTS; i++) { | ||
390 | + aio_context_unref(ctx[i]); | ||
391 | + } | ||
392 | + qemu_event_destroy(&done_event); | ||
393 | +} | ||
394 | + | ||
395 | +/* Basic test for the stuff above. */ | ||
396 | + | ||
397 | +static void test_lifecycle(void) | ||
398 | +{ | ||
399 | + create_aio_contexts(); | ||
400 | + join_aio_contexts(); | ||
401 | +} | ||
402 | + | ||
403 | +/* aio_co_schedule test. */ | ||
404 | + | ||
405 | +static Coroutine *to_schedule[NUM_CONTEXTS]; | ||
406 | + | ||
407 | +static bool now_stopping; | ||
408 | + | ||
409 | +static int count_retry; | ||
410 | +static int count_here; | ||
411 | +static int count_other; | ||
412 | + | ||
413 | +static bool schedule_next(int n) | ||
414 | +{ | ||
415 | + Coroutine *co; | ||
416 | + | ||
417 | + co = atomic_xchg(&to_schedule[n], NULL); | ||
418 | + if (!co) { | ||
419 | + atomic_inc(&count_retry); | ||
420 | + return false; | ||
421 | + } | ||
422 | + | ||
423 | + if (n == id) { | ||
424 | + atomic_inc(&count_here); | ||
425 | + } else { | ||
426 | + atomic_inc(&count_other); | ||
427 | + } | ||
428 | + | ||
429 | + aio_co_schedule(ctx[n], co); | ||
430 | + return true; | ||
431 | +} | ||
432 | + | ||
433 | +static void finish_cb(void *opaque) | ||
434 | +{ | ||
435 | + schedule_next(id); | ||
436 | +} | ||
437 | + | ||
438 | +static coroutine_fn void test_multi_co_schedule_entry(void *opaque) | ||
439 | +{ | ||
440 | + g_assert(to_schedule[id] == NULL); | ||
441 | + atomic_mb_set(&to_schedule[id], qemu_coroutine_self()); | ||
442 | + | ||
443 | + while (!atomic_mb_read(&now_stopping)) { | ||
444 | + int n; | ||
445 | + | ||
446 | + n = g_test_rand_int_range(0, NUM_CONTEXTS); | ||
447 | + schedule_next(n); | ||
448 | + qemu_coroutine_yield(); | ||
449 | + | ||
450 | + g_assert(to_schedule[id] == NULL); | ||
451 | + atomic_mb_set(&to_schedule[id], qemu_coroutine_self()); | ||
452 | + } | ||
453 | +} | ||
454 | + | ||
455 | + | ||
456 | +static void test_multi_co_schedule(int seconds) | ||
457 | +{ | ||
458 | + int i; | ||
459 | + | ||
460 | + count_here = count_other = count_retry = 0; | ||
461 | + now_stopping = false; | ||
462 | + | ||
463 | + create_aio_contexts(); | ||
464 | + for (i = 0; i < NUM_CONTEXTS; i++) { | ||
465 | + Coroutine *co1 = qemu_coroutine_create(test_multi_co_schedule_entry, NULL); | ||
466 | + aio_co_schedule(ctx[i], co1); | ||
467 | + } | ||
468 | + | ||
469 | + g_usleep(seconds * 1000000); | ||
470 | + | ||
471 | + atomic_mb_set(&now_stopping, true); | ||
472 | + for (i = 0; i < NUM_CONTEXTS; i++) { | ||
473 | + ctx_run(i, finish_cb, NULL); | ||
474 | + to_schedule[i] = NULL; | ||
475 | + } | ||
476 | + | ||
477 | + join_aio_contexts(); | ||
478 | + g_test_message("scheduled %d, queued %d, retry %d, total %d\n", | ||
479 | + count_other, count_here, count_retry, | ||
480 | + count_here + count_other + count_retry); | ||
481 | +} | ||
482 | + | ||
483 | +static void test_multi_co_schedule_1(void) | ||
484 | +{ | ||
485 | + test_multi_co_schedule(1); | ||
486 | +} | ||
487 | + | ||
488 | +static void test_multi_co_schedule_10(void) | ||
489 | +{ | ||
490 | + test_multi_co_schedule(10); | ||
491 | +} | ||
492 | + | ||
493 | +/* End of tests. */ | ||
494 | + | ||
495 | +int main(int argc, char **argv) | ||
496 | +{ | ||
497 | + init_clocks(); | ||
498 | + | ||
499 | + g_test_init(&argc, &argv, NULL); | ||
500 | + g_test_add_func("/aio/multi/lifecycle", test_lifecycle); | ||
501 | + if (g_test_quick()) { | ||
502 | + g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1); | ||
503 | + } else { | ||
504 | + g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10); | ||
505 | + } | ||
506 | + return g_test_run(); | ||
507 | +} | ||
508 | diff --git a/util/async.c b/util/async.c | ||
509 | index XXXXXXX..XXXXXXX 100644 | ||
510 | --- a/util/async.c | ||
511 | +++ b/util/async.c | ||
512 | @@ -XXX,XX +XXX,XX @@ | ||
513 | #include "qemu/main-loop.h" | ||
514 | #include "qemu/atomic.h" | ||
515 | #include "block/raw-aio.h" | ||
516 | +#include "qemu/coroutine_int.h" | ||
517 | +#include "trace.h" | ||
518 | |||
519 | /***********************************************************/ | ||
520 | /* bottom halves (can be seen as timers which expire ASAP) */ | ||
521 | @@ -XXX,XX +XXX,XX @@ aio_ctx_finalize(GSource *source) | ||
522 | } | ||
523 | #endif | ||
524 | |||
525 | + assert(QSLIST_EMPTY(&ctx->scheduled_coroutines)); | ||
526 | + qemu_bh_delete(ctx->co_schedule_bh); | ||
527 | + | ||
528 | qemu_lockcnt_lock(&ctx->list_lock); | ||
529 | assert(!qemu_lockcnt_count(&ctx->list_lock)); | ||
530 | while (ctx->first_bh) { | ||
531 | @@ -XXX,XX +XXX,XX @@ static bool event_notifier_poll(void *opaque) | ||
532 | return atomic_read(&ctx->notified); | ||
533 | } | ||
534 | |||
535 | +static void co_schedule_bh_cb(void *opaque) | ||
536 | +{ | ||
537 | + AioContext *ctx = opaque; | ||
538 | + QSLIST_HEAD(, Coroutine) straight, reversed; | ||
539 | + | ||
540 | + QSLIST_MOVE_ATOMIC(&reversed, &ctx->scheduled_coroutines); | ||
541 | + QSLIST_INIT(&straight); | ||
542 | + | ||
543 | + while (!QSLIST_EMPTY(&reversed)) { | ||
544 | + Coroutine *co = QSLIST_FIRST(&reversed); | ||
545 | + QSLIST_REMOVE_HEAD(&reversed, co_scheduled_next); | ||
546 | + QSLIST_INSERT_HEAD(&straight, co, co_scheduled_next); | ||
547 | + } | ||
548 | + | ||
549 | + while (!QSLIST_EMPTY(&straight)) { | ||
550 | + Coroutine *co = QSLIST_FIRST(&straight); | ||
551 | + QSLIST_REMOVE_HEAD(&straight, co_scheduled_next); | ||
552 | + trace_aio_co_schedule_bh_cb(ctx, co); | ||
553 | + qemu_coroutine_enter(co); | ||
554 | + } | ||
555 | +} | ||
556 | + | ||
557 | AioContext *aio_context_new(Error **errp) | ||
558 | { | ||
559 | int ret; | ||
560 | @@ -XXX,XX +XXX,XX @@ AioContext *aio_context_new(Error **errp) | ||
561 | } | ||
562 | g_source_set_can_recurse(&ctx->source, true); | ||
563 | qemu_lockcnt_init(&ctx->list_lock); | ||
564 | + | ||
565 | + ctx->co_schedule_bh = aio_bh_new(ctx, co_schedule_bh_cb, ctx); | ||
566 | + QSLIST_INIT(&ctx->scheduled_coroutines); | ||
567 | + | ||
568 | aio_set_event_notifier(ctx, &ctx->notifier, | ||
569 | false, | ||
570 | (EventNotifierHandler *) | ||
571 | @@ -XXX,XX +XXX,XX @@ fail: | ||
572 | return NULL; | ||
573 | } | ||
574 | |||
575 | +void aio_co_schedule(AioContext *ctx, Coroutine *co) | ||
576 | +{ | ||
577 | + trace_aio_co_schedule(ctx, co); | ||
578 | + QSLIST_INSERT_HEAD_ATOMIC(&ctx->scheduled_coroutines, | ||
579 | + co, co_scheduled_next); | ||
580 | + qemu_bh_schedule(ctx->co_schedule_bh); | ||
581 | +} | ||
582 | + | ||
583 | +void aio_co_wake(struct Coroutine *co) | ||
584 | +{ | ||
585 | + AioContext *ctx; | ||
586 | + | ||
587 | + /* Read coroutine before co->ctx. Matches smp_wmb in | ||
588 | + * qemu_coroutine_enter. | ||
589 | + */ | ||
590 | + smp_read_barrier_depends(); | ||
591 | + ctx = atomic_read(&co->ctx); | ||
592 | + | ||
593 | + if (ctx != qemu_get_current_aio_context()) { | ||
594 | + aio_co_schedule(ctx, co); | ||
595 | + return; | ||
596 | + } | ||
597 | + | ||
598 | + if (qemu_in_coroutine()) { | ||
599 | + Coroutine *self = qemu_coroutine_self(); | ||
600 | + assert(self != co); | ||
601 | + QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, co, co_queue_next); | ||
602 | + } else { | ||
603 | + aio_context_acquire(ctx); | ||
604 | + qemu_coroutine_enter(co); | ||
605 | + aio_context_release(ctx); | ||
606 | + } | ||
607 | +} | ||
608 | + | ||
609 | void aio_context_ref(AioContext *ctx) | ||
610 | { | ||
611 | g_source_ref(&ctx->source); | ||
612 | diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c | ||
613 | index XXXXXXX..XXXXXXX 100644 | ||
614 | --- a/util/qemu-coroutine.c | ||
615 | +++ b/util/qemu-coroutine.c | ||
616 | @@ -XXX,XX +XXX,XX @@ | ||
617 | #include "qemu/atomic.h" | ||
618 | #include "qemu/coroutine.h" | ||
619 | #include "qemu/coroutine_int.h" | ||
620 | +#include "block/aio.h" | ||
621 | |||
622 | enum { | ||
623 | POOL_BATCH_SIZE = 64, | ||
624 | @@ -XXX,XX +XXX,XX @@ void qemu_coroutine_enter(Coroutine *co) | ||
625 | } | ||
626 | |||
627 | co->caller = self; | ||
628 | + co->ctx = qemu_get_current_aio_context(); | ||
629 | + | ||
630 | + /* Store co->ctx before anything that stores co. Matches | ||
631 | + * barrier in aio_co_wake. | ||
632 | + */ | ||
633 | + smp_wmb(); | ||
634 | + | ||
635 | ret = qemu_coroutine_switch(self, co, COROUTINE_ENTER); | ||
636 | |||
637 | qemu_co_queue_run_restart(co); | ||
638 | diff --git a/util/trace-events b/util/trace-events | ||
639 | index XXXXXXX..XXXXXXX 100644 | ||
640 | --- a/util/trace-events | ||
641 | +++ b/util/trace-events | ||
642 | @@ -XXX,XX +XXX,XX @@ run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d" | ||
643 | poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64 | ||
644 | poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64 | ||
645 | |||
646 | +# util/async.c | ||
647 | +aio_co_schedule(void *ctx, void *co) "ctx %p co %p" | ||
648 | +aio_co_schedule_bh_cb(void *ctx, void *co) "ctx %p co %p" | ||
649 | + | ||
650 | # util/thread-pool.c | ||
651 | thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p" | ||
652 | thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d" | ||
133 | -- | 653 | -- |
134 | 2.31.1 | 654 | 2.9.3 |
135 | 655 | ||
136 | 656 | diff view generated by jsdifflib |
1 | From: Eric Blake <eblake@redhat.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Although we have long supported 'qemu-img convert -o | 3 | qcow2_create2 calls this. Do not run a nested event loop, as that |
4 | backing_file=foo,backing_fmt=bar', the fact that we have a shortcut -B | 4 | breaks when aio_co_wake tries to queue the coroutine on the co_queue_wakeup |
5 | for backing_file but none for backing_fmt has made it more likely that | 5 | list of the currently running one. |
6 | users accidentally run into: | ||
7 | 6 | ||
8 | qemu-img: warning: Deprecated use of backing file without explicit backing format | 7 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
8 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | ||
9 | Reviewed-by: Fam Zheng <famz@redhat.com> | ||
10 | Message-id: 20170213135235.12274-4-pbonzini@redhat.com | ||
11 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
12 | --- | ||
13 | block/block-backend.c | 12 ++++++++---- | ||
14 | 1 file changed, 8 insertions(+), 4 deletions(-) | ||
9 | 15 | ||
10 | when using -B instead of -o. For similarity with other qemu-img | 16 | diff --git a/block/block-backend.c b/block/block-backend.c |
11 | commands, such as create and compare, add '-F $fmt' as the shorthand | ||
12 | for '-o backing_fmt=$fmt'. Update iotest 122 for coverage of both | ||
13 | spellings. | ||
14 | |||
15 | Signed-off-by: Eric Blake <eblake@redhat.com> | ||
16 | Message-Id: <20210913131735.1948339-1-eblake@redhat.com> | ||
17 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | ||
18 | Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com> | ||
19 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | ||
20 | --- | ||
21 | docs/tools/qemu-img.rst | 4 ++-- | ||
22 | qemu-img.c | 10 +++++++--- | ||
23 | qemu-img-cmds.hx | 2 +- | ||
24 | tests/qemu-iotests/122 | 2 +- | ||
25 | 4 files changed, 11 insertions(+), 7 deletions(-) | ||
26 | |||
27 | diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst | ||
28 | index XXXXXXX..XXXXXXX 100644 | 17 | index XXXXXXX..XXXXXXX 100644 |
29 | --- a/docs/tools/qemu-img.rst | 18 | --- a/block/block-backend.c |
30 | +++ b/docs/tools/qemu-img.rst | 19 | +++ b/block/block-backend.c |
31 | @@ -XXX,XX +XXX,XX @@ Command description: | 20 | @@ -XXX,XX +XXX,XX @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf, |
32 | 4 | 21 | { |
33 | Error on reading data | 22 | QEMUIOVector qiov; |
34 | 23 | struct iovec iov; | |
35 | -.. option:: convert [--object OBJECTDEF] [--image-opts] [--target-image-opts] [--target-is-zero] [--bitmaps [--skip-broken-bitmaps]] [-U] [-C] [-c] [-p] [-q] [-n] [-f FMT] [-t CACHE] [-T SRC_CACHE] [-O OUTPUT_FMT] [-B BACKING_FILE] [-o OPTIONS] [-l SNAPSHOT_PARAM] [-S SPARSE_SIZE] [-r RATE_LIMIT] [-m NUM_COROUTINES] [-W] FILENAME [FILENAME2 [...]] OUTPUT_FILENAME | 24 | - Coroutine *co; |
36 | +.. option:: convert [--object OBJECTDEF] [--image-opts] [--target-image-opts] [--target-is-zero] [--bitmaps [--skip-broken-bitmaps]] [-U] [-C] [-c] [-p] [-q] [-n] [-f FMT] [-t CACHE] [-T SRC_CACHE] [-O OUTPUT_FMT] [-B BACKING_FILE [-F backing_fmt]] [-o OPTIONS] [-l SNAPSHOT_PARAM] [-S SPARSE_SIZE] [-r RATE_LIMIT] [-m NUM_COROUTINES] [-W] FILENAME [FILENAME2 [...]] OUTPUT_FILENAME | 25 | BlkRwCo rwco; |
37 | 26 | ||
38 | Convert the disk image *FILENAME* or a snapshot *SNAPSHOT_PARAM* | 27 | iov = (struct iovec) { |
39 | to disk image *OUTPUT_FILENAME* using format *OUTPUT_FMT*. It can | 28 | @@ -XXX,XX +XXX,XX @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf, |
40 | @@ -XXX,XX +XXX,XX @@ Command description: | 29 | .ret = NOT_DONE, |
41 | You can use the *BACKING_FILE* option to force the output image to be | 30 | }; |
42 | created as a copy on write image of the specified base image; the | 31 | |
43 | *BACKING_FILE* should have the same content as the input's base image, | 32 | - co = qemu_coroutine_create(co_entry, &rwco); |
44 | - however the path, image format, etc may differ. | 33 | - qemu_coroutine_enter(co); |
45 | + however the path, image format (as given by *BACKING_FMT*), etc may differ. | 34 | - BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE); |
46 | 35 | + if (qemu_in_coroutine()) { | |
47 | If a relative path name is given, the backing file is looked up relative to | 36 | + /* Fast-path if already in coroutine context */ |
48 | the directory containing *OUTPUT_FILENAME*. | 37 | + co_entry(&rwco); |
49 | diff --git a/qemu-img.c b/qemu-img.c | 38 | + } else { |
50 | index XXXXXXX..XXXXXXX 100644 | 39 | + Coroutine *co = qemu_coroutine_create(co_entry, &rwco); |
51 | --- a/qemu-img.c | 40 | + qemu_coroutine_enter(co); |
52 | +++ b/qemu-img.c | 41 | + BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE); |
53 | @@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv) | 42 | + } |
54 | int c, bs_i, flags, src_flags = BDRV_O_NO_SHARE; | 43 | |
55 | const char *fmt = NULL, *out_fmt = NULL, *cache = "unsafe", | 44 | return rwco.ret; |
56 | *src_cache = BDRV_DEFAULT_CACHE, *out_baseimg = NULL, | 45 | } |
57 | - *out_filename, *out_baseimg_param, *snapshot_name = NULL; | ||
58 | + *out_filename, *out_baseimg_param, *snapshot_name = NULL, | ||
59 | + *backing_fmt = NULL; | ||
60 | BlockDriver *drv = NULL, *proto_drv = NULL; | ||
61 | BlockDriverInfo bdi; | ||
62 | BlockDriverState *out_bs; | ||
63 | @@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv) | ||
64 | {"skip-broken-bitmaps", no_argument, 0, OPTION_SKIP_BROKEN}, | ||
65 | {0, 0, 0, 0} | ||
66 | }; | ||
67 | - c = getopt_long(argc, argv, ":hf:O:B:Cco:l:S:pt:T:qnm:WUr:", | ||
68 | + c = getopt_long(argc, argv, ":hf:O:B:CcF:o:l:S:pt:T:qnm:WUr:", | ||
69 | long_options, NULL); | ||
70 | if (c == -1) { | ||
71 | break; | ||
72 | @@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv) | ||
73 | case 'c': | ||
74 | s.compressed = true; | ||
75 | break; | ||
76 | + case 'F': | ||
77 | + backing_fmt = optarg; | ||
78 | + break; | ||
79 | case 'o': | ||
80 | if (accumulate_options(&options, optarg) < 0) { | ||
81 | goto fail_getopt; | ||
82 | @@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv) | ||
83 | |||
84 | qemu_opt_set_number(opts, BLOCK_OPT_SIZE, | ||
85 | s.total_sectors * BDRV_SECTOR_SIZE, &error_abort); | ||
86 | - ret = add_old_style_options(out_fmt, opts, out_baseimg, NULL); | ||
87 | + ret = add_old_style_options(out_fmt, opts, out_baseimg, backing_fmt); | ||
88 | if (ret < 0) { | ||
89 | goto out; | ||
90 | } | ||
91 | diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx | ||
92 | index XXXXXXX..XXXXXXX 100644 | ||
93 | --- a/qemu-img-cmds.hx | ||
94 | +++ b/qemu-img-cmds.hx | ||
95 | @@ -XXX,XX +XXX,XX @@ SRST | ||
96 | ERST | ||
97 | |||
98 | DEF("convert", img_convert, | ||
99 | - "convert [--object objectdef] [--image-opts] [--target-image-opts] [--target-is-zero] [--bitmaps] [-U] [-C] [-c] [-p] [-q] [-n] [-f fmt] [-t cache] [-T src_cache] [-O output_fmt] [-B backing_file] [-o options] [-l snapshot_param] [-S sparse_size] [-r rate_limit] [-m num_coroutines] [-W] [--salvage] filename [filename2 [...]] output_filename") | ||
100 | + "convert [--object objectdef] [--image-opts] [--target-image-opts] [--target-is-zero] [--bitmaps] [-U] [-C] [-c] [-p] [-q] [-n] [-f fmt] [-t cache] [-T src_cache] [-O output_fmt] [-B backing_file [-F backing_fmt]] [-o options] [-l snapshot_param] [-S sparse_size] [-r rate_limit] [-m num_coroutines] [-W] [--salvage] filename [filename2 [...]] output_filename") | ||
101 | SRST | ||
102 | .. option:: convert [--object OBJECTDEF] [--image-opts] [--target-image-opts] [--target-is-zero] [--bitmaps] [-U] [-C] [-c] [-p] [-q] [-n] [-f FMT] [-t CACHE] [-T SRC_CACHE] [-O OUTPUT_FMT] [-B BACKING_FILE] [-o OPTIONS] [-l SNAPSHOT_PARAM] [-S SPARSE_SIZE] [-r RATE_LIMIT] [-m NUM_COROUTINES] [-W] [--salvage] FILENAME [FILENAME2 [...]] OUTPUT_FILENAME | ||
103 | ERST | ||
104 | diff --git a/tests/qemu-iotests/122 b/tests/qemu-iotests/122 | ||
105 | index XXXXXXX..XXXXXXX 100755 | ||
106 | --- a/tests/qemu-iotests/122 | ||
107 | +++ b/tests/qemu-iotests/122 | ||
108 | @@ -XXX,XX +XXX,XX @@ echo | ||
109 | _make_test_img -b "$TEST_IMG".base -F $IMGFMT | ||
110 | |||
111 | $QEMU_IO -c "write -P 0 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir | ||
112 | -$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base -o backing_fmt=$IMGFMT \ | ||
113 | +$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base -F $IMGFMT \ | ||
114 | "$TEST_IMG" "$TEST_IMG".orig | ||
115 | $QEMU_IO -c "read -P 0 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir | ||
116 | $QEMU_IMG convert -O $IMGFMT -c -B "$TEST_IMG".base -o backing_fmt=$IMGFMT \ | ||
117 | -- | 46 | -- |
118 | 2.31.1 | 47 | 2.9.3 |
119 | 48 | ||
120 | 49 | diff view generated by jsdifflib |
1 | From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Split checking for reserved bits out of aligned offset check. | 3 | Once the thread pool starts using aio_co_wake, it will also need |
4 | qemu_get_current_aio_context(). Make test-thread-pool create | ||
5 | an AioContext with qemu_init_main_loop, so that stubs/iothread.c | ||
6 | and tests/iothread.c can provide the rest. | ||
4 | 7 | ||
5 | Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 8 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
6 | Reviewed-by: Eric Blake <eblake@redhat.com> | 9 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
7 | Tested-by: Kirill Tkhai <ktkhai@virtuozzo.com> | 10 | Reviewed-by: Fam Zheng <famz@redhat.com> |
8 | Reviewed-by: Hanna Reitz <hreitz@redhat.com> | 11 | Message-id: 20170213135235.12274-5-pbonzini@redhat.com |
9 | Message-Id: <20210914122454.141075-11-vsementsov@virtuozzo.com> | 12 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
10 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | ||
11 | --- | 13 | --- |
12 | block/qcow2.h | 1 + | 14 | tests/test-thread-pool.c | 12 +++--------- |
13 | block/qcow2-refcount.c | 10 +++++++++- | 15 | 1 file changed, 3 insertions(+), 9 deletions(-) |
14 | 2 files changed, 10 insertions(+), 1 deletion(-) | ||
15 | 16 | ||
16 | diff --git a/block/qcow2.h b/block/qcow2.h | 17 | diff --git a/tests/test-thread-pool.c b/tests/test-thread-pool.c |
17 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
18 | --- a/block/qcow2.h | 19 | --- a/tests/test-thread-pool.c |
19 | +++ b/block/qcow2.h | 20 | +++ b/tests/test-thread-pool.c |
20 | @@ -XXX,XX +XXX,XX @@ typedef enum QCow2MetadataOverlap { | 21 | @@ -XXX,XX +XXX,XX @@ |
21 | #define L2E_STD_RESERVED_MASK 0x3f000000000001feULL | 22 | #include "qapi/error.h" |
22 | 23 | #include "qemu/timer.h" | |
23 | #define REFT_OFFSET_MASK 0xfffffffffffffe00ULL | 24 | #include "qemu/error-report.h" |
24 | +#define REFT_RESERVED_MASK 0x1ffULL | 25 | +#include "qemu/main-loop.h" |
25 | 26 | ||
26 | #define INV_OFFSET (-1ULL) | 27 | static AioContext *ctx; |
27 | 28 | static ThreadPool *pool; | |
28 | diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c | 29 | @@ -XXX,XX +XXX,XX @@ static void test_cancel_async(void) |
29 | index XXXXXXX..XXXXXXX 100644 | 30 | int main(int argc, char **argv) |
30 | --- a/block/qcow2-refcount.c | 31 | { |
31 | +++ b/block/qcow2-refcount.c | 32 | int ret; |
32 | @@ -XXX,XX +XXX,XX @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, | 33 | - Error *local_error = NULL; |
33 | 34 | ||
34 | for(i = 0; i < s->refcount_table_size; i++) { | 35 | - init_clocks(); |
35 | uint64_t offset, cluster; | 36 | - |
36 | - offset = s->refcount_table[i]; | 37 | - ctx = aio_context_new(&local_error); |
37 | + offset = s->refcount_table[i] & REFT_OFFSET_MASK; | 38 | - if (!ctx) { |
38 | cluster = offset >> s->cluster_bits; | 39 | - error_reportf_err(local_error, "Failed to create AIO Context: "); |
39 | 40 | - exit(1); | |
40 | + if (s->refcount_table[i] & REFT_RESERVED_MASK) { | 41 | - } |
41 | + fprintf(stderr, "ERROR refcount table entry %" PRId64 " has " | 42 | + qemu_init_main_loop(&error_abort); |
42 | + "reserved bits set\n", i); | 43 | + ctx = qemu_get_current_aio_context(); |
43 | + res->corruptions++; | 44 | pool = aio_get_thread_pool(ctx); |
44 | + *rebuild = true; | 45 | |
45 | + continue; | 46 | g_test_init(&argc, &argv, NULL); |
46 | + } | 47 | @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) |
47 | + | 48 | |
48 | /* Refcount blocks are cluster aligned */ | 49 | ret = g_test_run(); |
49 | if (offset_into_cluster(s, offset)) { | 50 | |
50 | fprintf(stderr, "ERROR refcount block %" PRId64 " is not " | 51 | - aio_context_unref(ctx); |
52 | return ret; | ||
53 | } | ||
51 | -- | 54 | -- |
52 | 2.31.1 | 55 | 2.9.3 |
53 | 56 | ||
54 | 57 | diff view generated by jsdifflib |
1 | From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | We'll reuse the function to fix wrong L2 entry bitmap. Support it now. | 3 | This is in preparation for making qio_channel_yield work on |
4 | 4 | AioContexts other than the main one. | |
5 | Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 5 | |
6 | Reviewed-by: Eric Blake <eblake@redhat.com> | 6 | Reviewed-by: Daniel P. Berrange <berrange@redhat.com> |
7 | Reviewed-by: Hanna Reitz <hreitz@redhat.com> | 7 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
8 | Message-Id: <20210914122454.141075-6-vsementsov@virtuozzo.com> | 8 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
9 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | 9 | Reviewed-by: Fam Zheng <famz@redhat.com> |
10 | Message-id: 20170213135235.12274-6-pbonzini@redhat.com | ||
11 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
10 | --- | 12 | --- |
11 | block/qcow2-refcount.c | 18 +++++++++++++++--- | 13 | include/io/channel.h | 25 +++++++++++++++++++++++++ |
12 | 1 file changed, 15 insertions(+), 3 deletions(-) | 14 | io/channel-command.c | 13 +++++++++++++ |
13 | 15 | io/channel-file.c | 11 +++++++++++ | |
14 | diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c | 16 | io/channel-socket.c | 16 +++++++++++----- |
15 | index XXXXXXX..XXXXXXX 100644 | 17 | io/channel-tls.c | 12 ++++++++++++ |
16 | --- a/block/qcow2-refcount.c | 18 | io/channel-watch.c | 6 ++++++ |
17 | +++ b/block/qcow2-refcount.c | 19 | io/channel.c | 11 +++++++++++ |
18 | @@ -XXX,XX +XXX,XX @@ enum { | 20 | 7 files changed, 89 insertions(+), 5 deletions(-) |
21 | |||
22 | diff --git a/include/io/channel.h b/include/io/channel.h | ||
23 | index XXXXXXX..XXXXXXX 100644 | ||
24 | --- a/include/io/channel.h | ||
25 | +++ b/include/io/channel.h | ||
26 | @@ -XXX,XX +XXX,XX @@ | ||
27 | |||
28 | #include "qemu-common.h" | ||
29 | #include "qom/object.h" | ||
30 | +#include "block/aio.h" | ||
31 | |||
32 | #define TYPE_QIO_CHANNEL "qio-channel" | ||
33 | #define QIO_CHANNEL(obj) \ | ||
34 | @@ -XXX,XX +XXX,XX @@ struct QIOChannelClass { | ||
35 | off_t offset, | ||
36 | int whence, | ||
37 | Error **errp); | ||
38 | + void (*io_set_aio_fd_handler)(QIOChannel *ioc, | ||
39 | + AioContext *ctx, | ||
40 | + IOHandler *io_read, | ||
41 | + IOHandler *io_write, | ||
42 | + void *opaque); | ||
19 | }; | 43 | }; |
20 | 44 | ||
21 | /* | 45 | /* General I/O handling functions */ |
22 | - * Fix L2 entry by making it QCOW2_CLUSTER_ZERO_PLAIN. | 46 | @@ -XXX,XX +XXX,XX @@ void qio_channel_yield(QIOChannel *ioc, |
23 | + * Fix L2 entry by making it QCOW2_CLUSTER_ZERO_PLAIN (or making all its present | 47 | void qio_channel_wait(QIOChannel *ioc, |
24 | + * subclusters QCOW2_SUBCLUSTER_ZERO_PLAIN). | 48 | GIOCondition condition); |
25 | * | 49 | |
26 | * This function decrements res->corruptions on success, so the caller is | 50 | +/** |
27 | * responsible to increment res->corruptions prior to the call. | 51 | + * qio_channel_set_aio_fd_handler: |
28 | @@ -XXX,XX +XXX,XX @@ static int fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res, | 52 | + * @ioc: the channel object |
29 | int idx = l2_index * (l2_entry_size(s) / sizeof(uint64_t)); | 53 | + * @ctx: the AioContext to set the handlers on |
30 | uint64_t l2e_offset = l2_offset + (uint64_t)l2_index * l2_entry_size(s); | 54 | + * @io_read: the read handler |
31 | int ign = active ? QCOW2_OL_ACTIVE_L2 : QCOW2_OL_INACTIVE_L2; | 55 | + * @io_write: the write handler |
32 | - uint64_t l2_entry = has_subclusters(s) ? 0 : QCOW_OFLAG_ZERO; | 56 | + * @opaque: the opaque value passed to the handler |
33 | 57 | + * | |
34 | - set_l2_entry(s, l2_table, l2_index, l2_entry); | 58 | + * This is used internally by qio_channel_yield(). It can |
35 | + if (has_subclusters(s)) { | 59 | + * be used by channel implementations to forward the handlers |
36 | + uint64_t l2_bitmap = get_l2_bitmap(s, l2_table, l2_index); | 60 | + * to another channel (e.g. from #QIOChannelTLS to the |
37 | + | 61 | + * underlying socket). |
38 | + /* Allocated subclusters become zero */ | 62 | + */ |
39 | + l2_bitmap |= l2_bitmap << 32; | 63 | +void qio_channel_set_aio_fd_handler(QIOChannel *ioc, |
40 | + l2_bitmap &= QCOW_L2_BITMAP_ALL_ZEROES; | 64 | + AioContext *ctx, |
41 | + | 65 | + IOHandler *io_read, |
42 | + set_l2_bitmap(s, l2_table, l2_index, l2_bitmap); | 66 | + IOHandler *io_write, |
43 | + set_l2_entry(s, l2_table, l2_index, 0); | 67 | + void *opaque); |
44 | + } else { | 68 | + |
45 | + set_l2_entry(s, l2_table, l2_index, QCOW_OFLAG_ZERO); | 69 | #endif /* QIO_CHANNEL_H */ |
46 | + } | 70 | diff --git a/io/channel-command.c b/io/channel-command.c |
47 | + | 71 | index XXXXXXX..XXXXXXX 100644 |
48 | ret = qcow2_pre_write_overlap_check(bs, ign, l2e_offset, l2_entry_size(s), | 72 | --- a/io/channel-command.c |
49 | false); | 73 | +++ b/io/channel-command.c |
50 | if (metadata_overlap) { | 74 | @@ -XXX,XX +XXX,XX @@ static int qio_channel_command_close(QIOChannel *ioc, |
75 | } | ||
76 | |||
77 | |||
78 | +static void qio_channel_command_set_aio_fd_handler(QIOChannel *ioc, | ||
79 | + AioContext *ctx, | ||
80 | + IOHandler *io_read, | ||
81 | + IOHandler *io_write, | ||
82 | + void *opaque) | ||
83 | +{ | ||
84 | + QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc); | ||
85 | + aio_set_fd_handler(ctx, cioc->readfd, false, io_read, NULL, NULL, opaque); | ||
86 | + aio_set_fd_handler(ctx, cioc->writefd, false, NULL, io_write, NULL, opaque); | ||
87 | +} | ||
88 | + | ||
89 | + | ||
90 | static GSource *qio_channel_command_create_watch(QIOChannel *ioc, | ||
91 | GIOCondition condition) | ||
92 | { | ||
93 | @@ -XXX,XX +XXX,XX @@ static void qio_channel_command_class_init(ObjectClass *klass, | ||
94 | ioc_klass->io_set_blocking = qio_channel_command_set_blocking; | ||
95 | ioc_klass->io_close = qio_channel_command_close; | ||
96 | ioc_klass->io_create_watch = qio_channel_command_create_watch; | ||
97 | + ioc_klass->io_set_aio_fd_handler = qio_channel_command_set_aio_fd_handler; | ||
98 | } | ||
99 | |||
100 | static const TypeInfo qio_channel_command_info = { | ||
101 | diff --git a/io/channel-file.c b/io/channel-file.c | ||
102 | index XXXXXXX..XXXXXXX 100644 | ||
103 | --- a/io/channel-file.c | ||
104 | +++ b/io/channel-file.c | ||
105 | @@ -XXX,XX +XXX,XX @@ static int qio_channel_file_close(QIOChannel *ioc, | ||
106 | } | ||
107 | |||
108 | |||
109 | +static void qio_channel_file_set_aio_fd_handler(QIOChannel *ioc, | ||
110 | + AioContext *ctx, | ||
111 | + IOHandler *io_read, | ||
112 | + IOHandler *io_write, | ||
113 | + void *opaque) | ||
114 | +{ | ||
115 | + QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc); | ||
116 | + aio_set_fd_handler(ctx, fioc->fd, false, io_read, io_write, NULL, opaque); | ||
117 | +} | ||
118 | + | ||
119 | static GSource *qio_channel_file_create_watch(QIOChannel *ioc, | ||
120 | GIOCondition condition) | ||
121 | { | ||
122 | @@ -XXX,XX +XXX,XX @@ static void qio_channel_file_class_init(ObjectClass *klass, | ||
123 | ioc_klass->io_seek = qio_channel_file_seek; | ||
124 | ioc_klass->io_close = qio_channel_file_close; | ||
125 | ioc_klass->io_create_watch = qio_channel_file_create_watch; | ||
126 | + ioc_klass->io_set_aio_fd_handler = qio_channel_file_set_aio_fd_handler; | ||
127 | } | ||
128 | |||
129 | static const TypeInfo qio_channel_file_info = { | ||
130 | diff --git a/io/channel-socket.c b/io/channel-socket.c | ||
131 | index XXXXXXX..XXXXXXX 100644 | ||
132 | --- a/io/channel-socket.c | ||
133 | +++ b/io/channel-socket.c | ||
134 | @@ -XXX,XX +XXX,XX @@ qio_channel_socket_set_blocking(QIOChannel *ioc, | ||
135 | qemu_set_block(sioc->fd); | ||
136 | } else { | ||
137 | qemu_set_nonblock(sioc->fd); | ||
138 | -#ifdef WIN32 | ||
139 | - WSAEventSelect(sioc->fd, ioc->event, | ||
140 | - FD_READ | FD_ACCEPT | FD_CLOSE | | ||
141 | - FD_CONNECT | FD_WRITE | FD_OOB); | ||
142 | -#endif | ||
143 | } | ||
144 | return 0; | ||
145 | } | ||
146 | @@ -XXX,XX +XXX,XX @@ qio_channel_socket_shutdown(QIOChannel *ioc, | ||
147 | return 0; | ||
148 | } | ||
149 | |||
150 | +static void qio_channel_socket_set_aio_fd_handler(QIOChannel *ioc, | ||
151 | + AioContext *ctx, | ||
152 | + IOHandler *io_read, | ||
153 | + IOHandler *io_write, | ||
154 | + void *opaque) | ||
155 | +{ | ||
156 | + QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); | ||
157 | + aio_set_fd_handler(ctx, sioc->fd, false, io_read, io_write, NULL, opaque); | ||
158 | +} | ||
159 | + | ||
160 | static GSource *qio_channel_socket_create_watch(QIOChannel *ioc, | ||
161 | GIOCondition condition) | ||
162 | { | ||
163 | @@ -XXX,XX +XXX,XX @@ static void qio_channel_socket_class_init(ObjectClass *klass, | ||
164 | ioc_klass->io_set_cork = qio_channel_socket_set_cork; | ||
165 | ioc_klass->io_set_delay = qio_channel_socket_set_delay; | ||
166 | ioc_klass->io_create_watch = qio_channel_socket_create_watch; | ||
167 | + ioc_klass->io_set_aio_fd_handler = qio_channel_socket_set_aio_fd_handler; | ||
168 | } | ||
169 | |||
170 | static const TypeInfo qio_channel_socket_info = { | ||
171 | diff --git a/io/channel-tls.c b/io/channel-tls.c | ||
172 | index XXXXXXX..XXXXXXX 100644 | ||
173 | --- a/io/channel-tls.c | ||
174 | +++ b/io/channel-tls.c | ||
175 | @@ -XXX,XX +XXX,XX @@ static int qio_channel_tls_close(QIOChannel *ioc, | ||
176 | return qio_channel_close(tioc->master, errp); | ||
177 | } | ||
178 | |||
179 | +static void qio_channel_tls_set_aio_fd_handler(QIOChannel *ioc, | ||
180 | + AioContext *ctx, | ||
181 | + IOHandler *io_read, | ||
182 | + IOHandler *io_write, | ||
183 | + void *opaque) | ||
184 | +{ | ||
185 | + QIOChannelTLS *tioc = QIO_CHANNEL_TLS(ioc); | ||
186 | + | ||
187 | + qio_channel_set_aio_fd_handler(tioc->master, ctx, io_read, io_write, opaque); | ||
188 | +} | ||
189 | + | ||
190 | static GSource *qio_channel_tls_create_watch(QIOChannel *ioc, | ||
191 | GIOCondition condition) | ||
192 | { | ||
193 | @@ -XXX,XX +XXX,XX @@ static void qio_channel_tls_class_init(ObjectClass *klass, | ||
194 | ioc_klass->io_close = qio_channel_tls_close; | ||
195 | ioc_klass->io_shutdown = qio_channel_tls_shutdown; | ||
196 | ioc_klass->io_create_watch = qio_channel_tls_create_watch; | ||
197 | + ioc_klass->io_set_aio_fd_handler = qio_channel_tls_set_aio_fd_handler; | ||
198 | } | ||
199 | |||
200 | static const TypeInfo qio_channel_tls_info = { | ||
201 | diff --git a/io/channel-watch.c b/io/channel-watch.c | ||
202 | index XXXXXXX..XXXXXXX 100644 | ||
203 | --- a/io/channel-watch.c | ||
204 | +++ b/io/channel-watch.c | ||
205 | @@ -XXX,XX +XXX,XX @@ GSource *qio_channel_create_socket_watch(QIOChannel *ioc, | ||
206 | GSource *source; | ||
207 | QIOChannelSocketSource *ssource; | ||
208 | |||
209 | +#ifdef WIN32 | ||
210 | + WSAEventSelect(socket, ioc->event, | ||
211 | + FD_READ | FD_ACCEPT | FD_CLOSE | | ||
212 | + FD_CONNECT | FD_WRITE | FD_OOB); | ||
213 | +#endif | ||
214 | + | ||
215 | source = g_source_new(&qio_channel_socket_source_funcs, | ||
216 | sizeof(QIOChannelSocketSource)); | ||
217 | ssource = (QIOChannelSocketSource *)source; | ||
218 | diff --git a/io/channel.c b/io/channel.c | ||
219 | index XXXXXXX..XXXXXXX 100644 | ||
220 | --- a/io/channel.c | ||
221 | +++ b/io/channel.c | ||
222 | @@ -XXX,XX +XXX,XX @@ GSource *qio_channel_create_watch(QIOChannel *ioc, | ||
223 | } | ||
224 | |||
225 | |||
226 | +void qio_channel_set_aio_fd_handler(QIOChannel *ioc, | ||
227 | + AioContext *ctx, | ||
228 | + IOHandler *io_read, | ||
229 | + IOHandler *io_write, | ||
230 | + void *opaque) | ||
231 | +{ | ||
232 | + QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc); | ||
233 | + | ||
234 | + klass->io_set_aio_fd_handler(ioc, ctx, io_read, io_write, opaque); | ||
235 | +} | ||
236 | + | ||
237 | guint qio_channel_add_watch(QIOChannel *ioc, | ||
238 | GIOCondition condition, | ||
239 | QIOChannelFunc func, | ||
51 | -- | 240 | -- |
52 | 2.31.1 | 241 | 2.9.3 |
53 | 242 | ||
54 | 243 | diff view generated by jsdifflib |
1 | From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Add helper to parse compressed l2_entry and use it everywhere instead | 3 | Support separate coroutines for reading and writing, and place the |
4 | of open-coding. | 4 | read/write handlers on the AioContext that the QIOChannel is registered |
5 | 5 | with. | |
6 | Note, that in most places we move to precise coffset/csize instead of | 6 | |
7 | sector-aligned. Still it should work good enough for updating | 7 | Reviewed-by: Daniel P. Berrange <berrange@redhat.com> |
8 | refcounts. | 8 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
9 | 9 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | |
10 | Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 10 | Reviewed-by: Fam Zheng <famz@redhat.com> |
11 | Reviewed-by: Eric Blake <eblake@redhat.com> | 11 | Message-id: 20170213135235.12274-7-pbonzini@redhat.com |
12 | Reviewed-by: Hanna Reitz <hreitz@redhat.com> | 12 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
13 | Message-Id: <20210914122454.141075-4-vsementsov@virtuozzo.com> | ||
14 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | ||
15 | --- | 13 | --- |
16 | block/qcow2.h | 3 ++- | 14 | include/io/channel.h | 47 ++++++++++++++++++++++++++-- |
17 | block/qcow2-cluster.c | 15 +++++++++++++++ | 15 | io/channel.c | 86 +++++++++++++++++++++++++++++++++++++++------------- |
18 | block/qcow2-refcount.c | 36 +++++++++++++++++------------------- | 16 | 2 files changed, 109 insertions(+), 24 deletions(-) |
19 | block/qcow2.c | 9 ++------- | 17 | |
20 | 4 files changed, 36 insertions(+), 27 deletions(-) | 18 | diff --git a/include/io/channel.h b/include/io/channel.h |
21 | |||
22 | diff --git a/block/qcow2.h b/block/qcow2.h | ||
23 | index XXXXXXX..XXXXXXX 100644 | 19 | index XXXXXXX..XXXXXXX 100644 |
24 | --- a/block/qcow2.h | 20 | --- a/include/io/channel.h |
25 | +++ b/block/qcow2.h | 21 | +++ b/include/io/channel.h |
26 | @@ -XXX,XX +XXX,XX @@ | 22 | @@ -XXX,XX +XXX,XX @@ |
27 | 23 | ||
28 | /* Defined in the qcow2 spec (compressed cluster descriptor) */ | 24 | #include "qemu-common.h" |
29 | #define QCOW2_COMPRESSED_SECTOR_SIZE 512U | 25 | #include "qom/object.h" |
30 | -#define QCOW2_COMPRESSED_SECTOR_MASK (~(QCOW2_COMPRESSED_SECTOR_SIZE - 1ULL)) | 26 | +#include "qemu/coroutine.h" |
31 | 27 | #include "block/aio.h" | |
32 | /* Must be at least 2 to cover COW */ | 28 | |
33 | #define MIN_L2_CACHE_SIZE 2 /* cache entries */ | 29 | #define TYPE_QIO_CHANNEL "qio-channel" |
34 | @@ -XXX,XX +XXX,XX @@ int qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, | 30 | @@ -XXX,XX +XXX,XX @@ struct QIOChannel { |
35 | uint64_t offset, | 31 | Object parent; |
36 | int compressed_size, | 32 | unsigned int features; /* bitmask of QIOChannelFeatures */ |
37 | uint64_t *host_offset); | 33 | char *name; |
38 | +void qcow2_parse_compressed_l2_entry(BlockDriverState *bs, uint64_t l2_entry, | 34 | + AioContext *ctx; |
39 | + uint64_t *coffset, int *csize); | 35 | + Coroutine *read_coroutine; |
40 | 36 | + Coroutine *write_coroutine; | |
41 | int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m); | 37 | #ifdef _WIN32 |
42 | void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m); | 38 | HANDLE event; /* For use with GSource on Win32 */ |
43 | diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c | 39 | #endif |
40 | @@ -XXX,XX +XXX,XX @@ guint qio_channel_add_watch(QIOChannel *ioc, | ||
41 | |||
42 | |||
43 | /** | ||
44 | + * qio_channel_attach_aio_context: | ||
45 | + * @ioc: the channel object | ||
46 | + * @ctx: the #AioContext to set the handlers on | ||
47 | + * | ||
48 | + * Request that qio_channel_yield() sets I/O handlers on | ||
49 | + * the given #AioContext. If @ctx is %NULL, qio_channel_yield() | ||
50 | + * uses QEMU's main thread event loop. | ||
51 | + * | ||
52 | + * You can move a #QIOChannel from one #AioContext to another even if | ||
53 | + * I/O handlers are set for a coroutine. However, #QIOChannel provides | ||
54 | + * no synchronization between the calls to qio_channel_yield() and | ||
55 | + * qio_channel_attach_aio_context(). | ||
56 | + * | ||
57 | + * Therefore you should first call qio_channel_detach_aio_context() | ||
58 | + * to ensure that the coroutine is not entered concurrently. Then, | ||
59 | + * while the coroutine has yielded, call qio_channel_attach_aio_context(), | ||
60 | + * and then aio_co_schedule() to place the coroutine on the new | ||
61 | + * #AioContext. The calls to qio_channel_detach_aio_context() | ||
62 | + * and qio_channel_attach_aio_context() should be protected with | ||
63 | + * aio_context_acquire() and aio_context_release(). | ||
64 | + */ | ||
65 | +void qio_channel_attach_aio_context(QIOChannel *ioc, | ||
66 | + AioContext *ctx); | ||
67 | + | ||
68 | +/** | ||
69 | + * qio_channel_detach_aio_context: | ||
70 | + * @ioc: the channel object | ||
71 | + * | ||
72 | + * Disable any I/O handlers set by qio_channel_yield(). With the | ||
73 | + * help of aio_co_schedule(), this allows moving a coroutine that was | ||
74 | + * paused by qio_channel_yield() to another context. | ||
75 | + */ | ||
76 | +void qio_channel_detach_aio_context(QIOChannel *ioc); | ||
77 | + | ||
78 | +/** | ||
79 | * qio_channel_yield: | ||
80 | * @ioc: the channel object | ||
81 | * @condition: the I/O condition to wait for | ||
82 | * | ||
83 | - * Yields execution from the current coroutine until | ||
84 | - * the condition indicated by @condition becomes | ||
85 | - * available. | ||
86 | + * Yields execution from the current coroutine until the condition | ||
87 | + * indicated by @condition becomes available. @condition must | ||
88 | + * be either %G_IO_IN or %G_IO_OUT; it cannot contain both. In | ||
89 | + * addition, no two coroutines can be waiting on the same condition | ||
90 | + * and channel at the same time. | ||
91 | * | ||
92 | * This must only be called from coroutine context | ||
93 | */ | ||
94 | diff --git a/io/channel.c b/io/channel.c | ||
44 | index XXXXXXX..XXXXXXX 100644 | 95 | index XXXXXXX..XXXXXXX 100644 |
45 | --- a/block/qcow2-cluster.c | 96 | --- a/io/channel.c |
46 | +++ b/block/qcow2-cluster.c | 97 | +++ b/io/channel.c |
47 | @@ -XXX,XX +XXX,XX @@ fail: | 98 | @@ -XXX,XX +XXX,XX @@ |
48 | g_free(l1_table); | 99 | #include "qemu/osdep.h" |
49 | return ret; | 100 | #include "io/channel.h" |
101 | #include "qapi/error.h" | ||
102 | -#include "qemu/coroutine.h" | ||
103 | +#include "qemu/main-loop.h" | ||
104 | |||
105 | bool qio_channel_has_feature(QIOChannel *ioc, | ||
106 | QIOChannelFeature feature) | ||
107 | @@ -XXX,XX +XXX,XX @@ off_t qio_channel_io_seek(QIOChannel *ioc, | ||
50 | } | 108 | } |
51 | + | 109 | |
52 | +void qcow2_parse_compressed_l2_entry(BlockDriverState *bs, uint64_t l2_entry, | 110 | |
53 | + uint64_t *coffset, int *csize) | 111 | -typedef struct QIOChannelYieldData QIOChannelYieldData; |
54 | +{ | 112 | -struct QIOChannelYieldData { |
55 | + BDRVQcow2State *s = bs->opaque; | 113 | - QIOChannel *ioc; |
56 | + int nb_csectors; | 114 | - Coroutine *co; |
57 | + | 115 | -}; |
58 | + assert(qcow2_get_cluster_type(bs, l2_entry) == QCOW2_CLUSTER_COMPRESSED); | 116 | +static void qio_channel_set_aio_fd_handlers(QIOChannel *ioc); |
59 | + | 117 | |
60 | + *coffset = l2_entry & s->cluster_offset_mask; | 118 | +static void qio_channel_restart_read(void *opaque) |
61 | + | 119 | +{ |
62 | + nb_csectors = ((l2_entry >> s->csize_shift) & s->csize_mask) + 1; | 120 | + QIOChannel *ioc = opaque; |
63 | + *csize = nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE - | 121 | + Coroutine *co = ioc->read_coroutine; |
64 | + (*coffset & (QCOW2_COMPRESSED_SECTOR_SIZE - 1)); | 122 | + |
65 | +} | 123 | + ioc->read_coroutine = NULL; |
66 | diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c | 124 | + qio_channel_set_aio_fd_handlers(ioc); |
67 | index XXXXXXX..XXXXXXX 100644 | 125 | + aio_co_wake(co); |
68 | --- a/block/qcow2-refcount.c | 126 | +} |
69 | +++ b/block/qcow2-refcount.c | 127 | |
70 | @@ -XXX,XX +XXX,XX @@ void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry, | 128 | -static gboolean qio_channel_yield_enter(QIOChannel *ioc, |
71 | switch (ctype) { | 129 | - GIOCondition condition, |
72 | case QCOW2_CLUSTER_COMPRESSED: | 130 | - gpointer opaque) |
73 | { | 131 | +static void qio_channel_restart_write(void *opaque) |
74 | - int64_t offset = (l2_entry & s->cluster_offset_mask) | ||
75 | - & QCOW2_COMPRESSED_SECTOR_MASK; | ||
76 | - int size = QCOW2_COMPRESSED_SECTOR_SIZE * | ||
77 | - (((l2_entry >> s->csize_shift) & s->csize_mask) + 1); | ||
78 | - qcow2_free_clusters(bs, offset, size, type); | ||
79 | + uint64_t coffset; | ||
80 | + int csize; | ||
81 | + | ||
82 | + qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize); | ||
83 | + qcow2_free_clusters(bs, coffset, csize, type); | ||
84 | } | ||
85 | break; | ||
86 | case QCOW2_CLUSTER_NORMAL: | ||
87 | @@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, | ||
88 | bool l1_allocated = false; | ||
89 | int64_t old_entry, old_l2_offset; | ||
90 | unsigned slice, slice_size2, n_slices; | ||
91 | - int i, j, l1_modified = 0, nb_csectors; | ||
92 | + int i, j, l1_modified = 0; | ||
93 | int ret; | ||
94 | |||
95 | assert(addend >= -1 && addend <= 1); | ||
96 | @@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, | ||
97 | |||
98 | switch (qcow2_get_cluster_type(bs, entry)) { | ||
99 | case QCOW2_CLUSTER_COMPRESSED: | ||
100 | - nb_csectors = ((entry >> s->csize_shift) & | ||
101 | - s->csize_mask) + 1; | ||
102 | if (addend != 0) { | ||
103 | - uint64_t coffset = (entry & s->cluster_offset_mask) | ||
104 | - & QCOW2_COMPRESSED_SECTOR_MASK; | ||
105 | + uint64_t coffset; | ||
106 | + int csize; | ||
107 | + | ||
108 | + qcow2_parse_compressed_l2_entry(bs, entry, | ||
109 | + &coffset, &csize); | ||
110 | ret = update_refcount( | ||
111 | - bs, coffset, | ||
112 | - nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE, | ||
113 | + bs, coffset, csize, | ||
114 | abs(addend), addend < 0, | ||
115 | QCOW2_DISCARD_SNAPSHOT); | ||
116 | if (ret < 0) { | ||
117 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | ||
118 | BDRVQcow2State *s = bs->opaque; | ||
119 | uint64_t l2_entry; | ||
120 | uint64_t next_contiguous_offset = 0; | ||
121 | - int i, nb_csectors, ret; | ||
122 | + int i, ret; | ||
123 | size_t l2_size_bytes = s->l2_size * l2_entry_size(s); | ||
124 | g_autofree uint64_t *l2_table = g_malloc(l2_size_bytes); | ||
125 | |||
126 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | ||
127 | |||
128 | /* Do the actual checks */ | ||
129 | for (i = 0; i < s->l2_size; i++) { | ||
130 | + uint64_t coffset; | ||
131 | + int csize; | ||
132 | l2_entry = get_l2_entry(s, l2_table, i); | ||
133 | |||
134 | switch (qcow2_get_cluster_type(bs, l2_entry)) { | ||
135 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | ||
136 | } | ||
137 | |||
138 | /* Mark cluster as used */ | ||
139 | - nb_csectors = ((l2_entry >> s->csize_shift) & | ||
140 | - s->csize_mask) + 1; | ||
141 | - l2_entry &= s->cluster_offset_mask; | ||
142 | + qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize); | ||
143 | ret = qcow2_inc_refcounts_imrt( | ||
144 | - bs, res, refcount_table, refcount_table_size, | ||
145 | - l2_entry & QCOW2_COMPRESSED_SECTOR_MASK, | ||
146 | - nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE); | ||
147 | + bs, res, refcount_table, refcount_table_size, coffset, csize); | ||
148 | if (ret < 0) { | ||
149 | return ret; | ||
150 | } | ||
151 | diff --git a/block/qcow2.c b/block/qcow2.c | ||
152 | index XXXXXXX..XXXXXXX 100644 | ||
153 | --- a/block/qcow2.c | ||
154 | +++ b/block/qcow2.c | ||
155 | @@ -XXX,XX +XXX,XX @@ qcow2_co_preadv_compressed(BlockDriverState *bs, | ||
156 | size_t qiov_offset) | ||
157 | { | 132 | { |
158 | BDRVQcow2State *s = bs->opaque; | 133 | - QIOChannelYieldData *data = opaque; |
159 | - int ret = 0, csize, nb_csectors; | 134 | - qemu_coroutine_enter(data->co); |
160 | + int ret = 0, csize; | 135 | - return FALSE; |
161 | uint64_t coffset; | 136 | + QIOChannel *ioc = opaque; |
162 | uint8_t *buf, *out_buf; | 137 | + Coroutine *co = ioc->write_coroutine; |
163 | int offset_in_cluster = offset_into_cluster(s, offset); | 138 | + |
164 | 139 | + ioc->write_coroutine = NULL; | |
165 | - assert(qcow2_get_cluster_type(bs, l2_entry) == QCOW2_CLUSTER_COMPRESSED); | 140 | + qio_channel_set_aio_fd_handlers(ioc); |
141 | + aio_co_wake(co); | ||
142 | } | ||
143 | |||
144 | +static void qio_channel_set_aio_fd_handlers(QIOChannel *ioc) | ||
145 | +{ | ||
146 | + IOHandler *rd_handler = NULL, *wr_handler = NULL; | ||
147 | + AioContext *ctx; | ||
148 | + | ||
149 | + if (ioc->read_coroutine) { | ||
150 | + rd_handler = qio_channel_restart_read; | ||
151 | + } | ||
152 | + if (ioc->write_coroutine) { | ||
153 | + wr_handler = qio_channel_restart_write; | ||
154 | + } | ||
155 | + | ||
156 | + ctx = ioc->ctx ? ioc->ctx : iohandler_get_aio_context(); | ||
157 | + qio_channel_set_aio_fd_handler(ioc, ctx, rd_handler, wr_handler, ioc); | ||
158 | +} | ||
159 | + | ||
160 | +void qio_channel_attach_aio_context(QIOChannel *ioc, | ||
161 | + AioContext *ctx) | ||
162 | +{ | ||
163 | + AioContext *old_ctx; | ||
164 | + if (ioc->ctx == ctx) { | ||
165 | + return; | ||
166 | + } | ||
167 | + | ||
168 | + old_ctx = ioc->ctx ? ioc->ctx : iohandler_get_aio_context(); | ||
169 | + qio_channel_set_aio_fd_handler(ioc, old_ctx, NULL, NULL, NULL); | ||
170 | + ioc->ctx = ctx; | ||
171 | + qio_channel_set_aio_fd_handlers(ioc); | ||
172 | +} | ||
173 | + | ||
174 | +void qio_channel_detach_aio_context(QIOChannel *ioc) | ||
175 | +{ | ||
176 | + ioc->read_coroutine = NULL; | ||
177 | + ioc->write_coroutine = NULL; | ||
178 | + qio_channel_set_aio_fd_handlers(ioc); | ||
179 | + ioc->ctx = NULL; | ||
180 | +} | ||
181 | |||
182 | void coroutine_fn qio_channel_yield(QIOChannel *ioc, | ||
183 | GIOCondition condition) | ||
184 | { | ||
185 | - QIOChannelYieldData data; | ||
166 | - | 186 | - |
167 | - coffset = l2_entry & s->cluster_offset_mask; | 187 | assert(qemu_in_coroutine()); |
168 | - nb_csectors = ((l2_entry >> s->csize_shift) & s->csize_mask) + 1; | 188 | - data.ioc = ioc; |
169 | - csize = nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE - | 189 | - data.co = qemu_coroutine_self(); |
170 | - (coffset & ~QCOW2_COMPRESSED_SECTOR_MASK); | 190 | - qio_channel_add_watch(ioc, |
171 | + qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize); | 191 | - condition, |
172 | 192 | - qio_channel_yield_enter, | |
173 | buf = g_try_malloc(csize); | 193 | - &data, |
174 | if (!buf) { | 194 | - NULL); |
195 | + if (condition == G_IO_IN) { | ||
196 | + assert(!ioc->read_coroutine); | ||
197 | + ioc->read_coroutine = qemu_coroutine_self(); | ||
198 | + } else if (condition == G_IO_OUT) { | ||
199 | + assert(!ioc->write_coroutine); | ||
200 | + ioc->write_coroutine = qemu_coroutine_self(); | ||
201 | + } else { | ||
202 | + abort(); | ||
203 | + } | ||
204 | + qio_channel_set_aio_fd_handlers(ioc); | ||
205 | qemu_coroutine_yield(); | ||
206 | } | ||
207 | |||
175 | -- | 208 | -- |
176 | 2.31.1 | 209 | 2.9.3 |
177 | 210 | ||
178 | 211 | diff view generated by jsdifflib |
1 | From: Max Reitz <mreitz@redhat.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | gluster's block-status implementation is basically a copy of that in | 3 | In the client, read the reply headers from a coroutine, switching the |
4 | block/file-posix.c, there is only one thing missing, and that is | 4 | read side between the "read header" coroutine and the I/O coroutine that |
5 | aligning trailing data extents to the request alignment (as added by | 5 | reads the body of the reply. |
6 | commit 9c3db310ff0). | ||
7 | 6 | ||
8 | Note that 9c3db310ff0 mentions that "there seems to be no other block | 7 | In the server, if the server can read more requests it will create a new |
9 | driver that sets request_alignment and [...]", but while block/gluster.c | 8 | "read request" coroutine as soon as a request has been read. Otherwise, |
10 | does indeed not set request_alignment, block/io.c's | 9 | the new coroutine is created in nbd_request_put. |
11 | bdrv_refresh_limits() will still default to an alignment of 512 because | ||
12 | block/gluster.c does not provide a byte-aligned read function. | ||
13 | Therefore, unaligned tails can conceivably occur, and so we should apply | ||
14 | the change from 9c3db310ff0 to gluster's block-status implementation. | ||
15 | 10 | ||
16 | Reported-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 11 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
17 | Signed-off-by: Max Reitz <mreitz@redhat.com> | 12 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
18 | Message-Id: <20210805143603.59503-1-mreitz@redhat.com> | 13 | Reviewed-by: Fam Zheng <famz@redhat.com> |
19 | Reviewed-by: Eric Blake <eblake@redhat.com> | 14 | Reviewed-by: Daniel P. Berrange <berrange@redhat.com> |
20 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | 15 | Message-id: 20170213135235.12274-8-pbonzini@redhat.com |
16 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
21 | --- | 17 | --- |
22 | block/gluster.c | 16 ++++++++++++++++ | 18 | block/nbd-client.h | 2 +- |
23 | 1 file changed, 16 insertions(+) | 19 | block/nbd-client.c | 117 ++++++++++++++++++++++++----------------------------- |
20 | nbd/client.c | 2 +- | ||
21 | nbd/common.c | 9 +---- | ||
22 | nbd/server.c | 94 +++++++++++++----------------------------- | ||
23 | 5 files changed, 83 insertions(+), 141 deletions(-) | ||
24 | 24 | ||
25 | diff --git a/block/gluster.c b/block/gluster.c | 25 | diff --git a/block/nbd-client.h b/block/nbd-client.h |
26 | index XXXXXXX..XXXXXXX 100644 | 26 | index XXXXXXX..XXXXXXX 100644 |
27 | --- a/block/gluster.c | 27 | --- a/block/nbd-client.h |
28 | +++ b/block/gluster.c | 28 | +++ b/block/nbd-client.h |
29 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs, | 29 | @@ -XXX,XX +XXX,XX @@ typedef struct NBDClientSession { |
30 | off_t data = 0, hole = 0; | 30 | |
31 | int ret = -EINVAL; | 31 | CoMutex send_mutex; |
32 | 32 | CoQueue free_sema; | |
33 | + assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment)); | 33 | - Coroutine *send_coroutine; |
34 | + Coroutine *read_reply_co; | ||
35 | int in_flight; | ||
36 | |||
37 | Coroutine *recv_coroutine[MAX_NBD_REQUESTS]; | ||
38 | diff --git a/block/nbd-client.c b/block/nbd-client.c | ||
39 | index XXXXXXX..XXXXXXX 100644 | ||
40 | --- a/block/nbd-client.c | ||
41 | +++ b/block/nbd-client.c | ||
42 | @@ -XXX,XX +XXX,XX @@ | ||
43 | #define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs)) | ||
44 | #define INDEX_TO_HANDLE(bs, index) ((index) ^ ((uint64_t)(intptr_t)bs)) | ||
45 | |||
46 | -static void nbd_recv_coroutines_enter_all(NBDClientSession *s) | ||
47 | +static void nbd_recv_coroutines_enter_all(BlockDriverState *bs) | ||
48 | { | ||
49 | + NBDClientSession *s = nbd_get_client_session(bs); | ||
50 | int i; | ||
51 | |||
52 | for (i = 0; i < MAX_NBD_REQUESTS; i++) { | ||
53 | @@ -XXX,XX +XXX,XX @@ static void nbd_recv_coroutines_enter_all(NBDClientSession *s) | ||
54 | qemu_coroutine_enter(s->recv_coroutine[i]); | ||
55 | } | ||
56 | } | ||
57 | + BDRV_POLL_WHILE(bs, s->read_reply_co); | ||
58 | } | ||
59 | |||
60 | static void nbd_teardown_connection(BlockDriverState *bs) | ||
61 | @@ -XXX,XX +XXX,XX @@ static void nbd_teardown_connection(BlockDriverState *bs) | ||
62 | qio_channel_shutdown(client->ioc, | ||
63 | QIO_CHANNEL_SHUTDOWN_BOTH, | ||
64 | NULL); | ||
65 | - nbd_recv_coroutines_enter_all(client); | ||
66 | + nbd_recv_coroutines_enter_all(bs); | ||
67 | |||
68 | nbd_client_detach_aio_context(bs); | ||
69 | object_unref(OBJECT(client->sioc)); | ||
70 | @@ -XXX,XX +XXX,XX @@ static void nbd_teardown_connection(BlockDriverState *bs) | ||
71 | client->ioc = NULL; | ||
72 | } | ||
73 | |||
74 | -static void nbd_reply_ready(void *opaque) | ||
75 | +static coroutine_fn void nbd_read_reply_entry(void *opaque) | ||
76 | { | ||
77 | - BlockDriverState *bs = opaque; | ||
78 | - NBDClientSession *s = nbd_get_client_session(bs); | ||
79 | + NBDClientSession *s = opaque; | ||
80 | uint64_t i; | ||
81 | int ret; | ||
82 | |||
83 | - if (!s->ioc) { /* Already closed */ | ||
84 | - return; | ||
85 | - } | ||
86 | - | ||
87 | - if (s->reply.handle == 0) { | ||
88 | - /* No reply already in flight. Fetch a header. It is possible | ||
89 | - * that another thread has done the same thing in parallel, so | ||
90 | - * the socket is not readable anymore. | ||
91 | - */ | ||
92 | + for (;;) { | ||
93 | + assert(s->reply.handle == 0); | ||
94 | ret = nbd_receive_reply(s->ioc, &s->reply); | ||
95 | - if (ret == -EAGAIN) { | ||
96 | - return; | ||
97 | - } | ||
98 | if (ret < 0) { | ||
99 | - s->reply.handle = 0; | ||
100 | - goto fail; | ||
101 | + break; | ||
102 | } | ||
103 | - } | ||
104 | |||
105 | - /* There's no need for a mutex on the receive side, because the | ||
106 | - * handler acts as a synchronization point and ensures that only | ||
107 | - * one coroutine is called until the reply finishes. */ | ||
108 | - i = HANDLE_TO_INDEX(s, s->reply.handle); | ||
109 | - if (i >= MAX_NBD_REQUESTS) { | ||
110 | - goto fail; | ||
111 | - } | ||
112 | + /* There's no need for a mutex on the receive side, because the | ||
113 | + * handler acts as a synchronization point and ensures that only | ||
114 | + * one coroutine is called until the reply finishes. | ||
115 | + */ | ||
116 | + i = HANDLE_TO_INDEX(s, s->reply.handle); | ||
117 | + if (i >= MAX_NBD_REQUESTS || !s->recv_coroutine[i]) { | ||
118 | + break; | ||
119 | + } | ||
120 | |||
121 | - if (s->recv_coroutine[i]) { | ||
122 | - qemu_coroutine_enter(s->recv_coroutine[i]); | ||
123 | - return; | ||
124 | + /* We're woken up by the recv_coroutine itself. Note that there | ||
125 | + * is no race between yielding and reentering read_reply_co. This | ||
126 | + * is because: | ||
127 | + * | ||
128 | + * - if recv_coroutine[i] runs on the same AioContext, it is only | ||
129 | + * entered after we yield | ||
130 | + * | ||
131 | + * - if recv_coroutine[i] runs on a different AioContext, reentering | ||
132 | + * read_reply_co happens through a bottom half, which can only | ||
133 | + * run after we yield. | ||
134 | + */ | ||
135 | + aio_co_wake(s->recv_coroutine[i]); | ||
136 | + qemu_coroutine_yield(); | ||
137 | } | ||
138 | - | ||
139 | -fail: | ||
140 | - nbd_teardown_connection(bs); | ||
141 | -} | ||
142 | - | ||
143 | -static void nbd_restart_write(void *opaque) | ||
144 | -{ | ||
145 | - BlockDriverState *bs = opaque; | ||
146 | - | ||
147 | - qemu_coroutine_enter(nbd_get_client_session(bs)->send_coroutine); | ||
148 | + s->read_reply_co = NULL; | ||
149 | } | ||
150 | |||
151 | static int nbd_co_send_request(BlockDriverState *bs, | ||
152 | @@ -XXX,XX +XXX,XX @@ static int nbd_co_send_request(BlockDriverState *bs, | ||
153 | QEMUIOVector *qiov) | ||
154 | { | ||
155 | NBDClientSession *s = nbd_get_client_session(bs); | ||
156 | - AioContext *aio_context; | ||
157 | int rc, ret, i; | ||
158 | |||
159 | qemu_co_mutex_lock(&s->send_mutex); | ||
160 | @@ -XXX,XX +XXX,XX @@ static int nbd_co_send_request(BlockDriverState *bs, | ||
161 | return -EPIPE; | ||
162 | } | ||
163 | |||
164 | - s->send_coroutine = qemu_coroutine_self(); | ||
165 | - aio_context = bdrv_get_aio_context(bs); | ||
166 | - | ||
167 | - aio_set_fd_handler(aio_context, s->sioc->fd, false, | ||
168 | - nbd_reply_ready, nbd_restart_write, NULL, bs); | ||
169 | if (qiov) { | ||
170 | qio_channel_set_cork(s->ioc, true); | ||
171 | rc = nbd_send_request(s->ioc, request); | ||
172 | @@ -XXX,XX +XXX,XX @@ static int nbd_co_send_request(BlockDriverState *bs, | ||
173 | } else { | ||
174 | rc = nbd_send_request(s->ioc, request); | ||
175 | } | ||
176 | - aio_set_fd_handler(aio_context, s->sioc->fd, false, | ||
177 | - nbd_reply_ready, NULL, NULL, bs); | ||
178 | - s->send_coroutine = NULL; | ||
179 | qemu_co_mutex_unlock(&s->send_mutex); | ||
180 | return rc; | ||
181 | } | ||
182 | @@ -XXX,XX +XXX,XX @@ static void nbd_co_receive_reply(NBDClientSession *s, | ||
183 | { | ||
184 | int ret; | ||
185 | |||
186 | - /* Wait until we're woken up by the read handler. TODO: perhaps | ||
187 | - * peek at the next reply and avoid yielding if it's ours? */ | ||
188 | + /* Wait until we're woken up by nbd_read_reply_entry. */ | ||
189 | qemu_coroutine_yield(); | ||
190 | *reply = s->reply; | ||
191 | if (reply->handle != request->handle || | ||
192 | @@ -XXX,XX +XXX,XX @@ static void nbd_coroutine_start(NBDClientSession *s, | ||
193 | /* s->recv_coroutine[i] is set as soon as we get the send_lock. */ | ||
194 | } | ||
195 | |||
196 | -static void nbd_coroutine_end(NBDClientSession *s, | ||
197 | +static void nbd_coroutine_end(BlockDriverState *bs, | ||
198 | NBDRequest *request) | ||
199 | { | ||
200 | + NBDClientSession *s = nbd_get_client_session(bs); | ||
201 | int i = HANDLE_TO_INDEX(s, request->handle); | ||
34 | + | 202 | + |
35 | if (!s->fd) { | 203 | s->recv_coroutine[i] = NULL; |
204 | - if (s->in_flight-- == MAX_NBD_REQUESTS) { | ||
205 | - qemu_co_queue_next(&s->free_sema); | ||
206 | + s->in_flight--; | ||
207 | + qemu_co_queue_next(&s->free_sema); | ||
208 | + | ||
209 | + /* Kick the read_reply_co to get the next reply. */ | ||
210 | + if (s->read_reply_co) { | ||
211 | + aio_co_wake(s->read_reply_co); | ||
212 | } | ||
213 | } | ||
214 | |||
215 | @@ -XXX,XX +XXX,XX @@ int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, | ||
216 | } else { | ||
217 | nbd_co_receive_reply(client, &request, &reply, qiov); | ||
218 | } | ||
219 | - nbd_coroutine_end(client, &request); | ||
220 | + nbd_coroutine_end(bs, &request); | ||
221 | return -reply.error; | ||
222 | } | ||
223 | |||
224 | @@ -XXX,XX +XXX,XX @@ int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, | ||
225 | } else { | ||
226 | nbd_co_receive_reply(client, &request, &reply, NULL); | ||
227 | } | ||
228 | - nbd_coroutine_end(client, &request); | ||
229 | + nbd_coroutine_end(bs, &request); | ||
230 | return -reply.error; | ||
231 | } | ||
232 | |||
233 | @@ -XXX,XX +XXX,XX @@ int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, | ||
234 | } else { | ||
235 | nbd_co_receive_reply(client, &request, &reply, NULL); | ||
236 | } | ||
237 | - nbd_coroutine_end(client, &request); | ||
238 | + nbd_coroutine_end(bs, &request); | ||
239 | return -reply.error; | ||
240 | } | ||
241 | |||
242 | @@ -XXX,XX +XXX,XX @@ int nbd_client_co_flush(BlockDriverState *bs) | ||
243 | } else { | ||
244 | nbd_co_receive_reply(client, &request, &reply, NULL); | ||
245 | } | ||
246 | - nbd_coroutine_end(client, &request); | ||
247 | + nbd_coroutine_end(bs, &request); | ||
248 | return -reply.error; | ||
249 | } | ||
250 | |||
251 | @@ -XXX,XX +XXX,XX @@ int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count) | ||
252 | } else { | ||
253 | nbd_co_receive_reply(client, &request, &reply, NULL); | ||
254 | } | ||
255 | - nbd_coroutine_end(client, &request); | ||
256 | + nbd_coroutine_end(bs, &request); | ||
257 | return -reply.error; | ||
258 | |||
259 | } | ||
260 | |||
261 | void nbd_client_detach_aio_context(BlockDriverState *bs) | ||
262 | { | ||
263 | - aio_set_fd_handler(bdrv_get_aio_context(bs), | ||
264 | - nbd_get_client_session(bs)->sioc->fd, | ||
265 | - false, NULL, NULL, NULL, NULL); | ||
266 | + NBDClientSession *client = nbd_get_client_session(bs); | ||
267 | + qio_channel_detach_aio_context(QIO_CHANNEL(client->sioc)); | ||
268 | } | ||
269 | |||
270 | void nbd_client_attach_aio_context(BlockDriverState *bs, | ||
271 | AioContext *new_context) | ||
272 | { | ||
273 | - aio_set_fd_handler(new_context, nbd_get_client_session(bs)->sioc->fd, | ||
274 | - false, nbd_reply_ready, NULL, NULL, bs); | ||
275 | + NBDClientSession *client = nbd_get_client_session(bs); | ||
276 | + qio_channel_attach_aio_context(QIO_CHANNEL(client->sioc), new_context); | ||
277 | + aio_co_schedule(new_context, client->read_reply_co); | ||
278 | } | ||
279 | |||
280 | void nbd_client_close(BlockDriverState *bs) | ||
281 | @@ -XXX,XX +XXX,XX @@ int nbd_client_init(BlockDriverState *bs, | ||
282 | /* Now that we're connected, set the socket to be non-blocking and | ||
283 | * kick the reply mechanism. */ | ||
284 | qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL); | ||
285 | - | ||
286 | + client->read_reply_co = qemu_coroutine_create(nbd_read_reply_entry, client); | ||
287 | nbd_client_attach_aio_context(bs, bdrv_get_aio_context(bs)); | ||
288 | |||
289 | logout("Established connection with NBD server\n"); | ||
290 | diff --git a/nbd/client.c b/nbd/client.c | ||
291 | index XXXXXXX..XXXXXXX 100644 | ||
292 | --- a/nbd/client.c | ||
293 | +++ b/nbd/client.c | ||
294 | @@ -XXX,XX +XXX,XX @@ ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) | ||
295 | ssize_t ret; | ||
296 | |||
297 | ret = read_sync(ioc, buf, sizeof(buf)); | ||
298 | - if (ret < 0) { | ||
299 | + if (ret <= 0) { | ||
36 | return ret; | 300 | return ret; |
37 | } | 301 | } |
38 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs, | 302 | |
39 | /* On a data extent, compute bytes to the end of the extent, | 303 | diff --git a/nbd/common.c b/nbd/common.c |
40 | * possibly including a partial sector at EOF. */ | 304 | index XXXXXXX..XXXXXXX 100644 |
41 | *pnum = MIN(bytes, hole - offset); | 305 | --- a/nbd/common.c |
306 | +++ b/nbd/common.c | ||
307 | @@ -XXX,XX +XXX,XX @@ ssize_t nbd_wr_syncv(QIOChannel *ioc, | ||
308 | } | ||
309 | if (len == QIO_CHANNEL_ERR_BLOCK) { | ||
310 | if (qemu_in_coroutine()) { | ||
311 | - /* XXX figure out if we can create a variant on | ||
312 | - * qio_channel_yield() that works with AIO contexts | ||
313 | - * and consider using that in this branch */ | ||
314 | - qemu_coroutine_yield(); | ||
315 | - } else if (done) { | ||
316 | - /* XXX this is needed by nbd_reply_ready. */ | ||
317 | - qio_channel_wait(ioc, | ||
318 | - do_read ? G_IO_IN : G_IO_OUT); | ||
319 | + qio_channel_yield(ioc, do_read ? G_IO_IN : G_IO_OUT); | ||
320 | } else { | ||
321 | return -EAGAIN; | ||
322 | } | ||
323 | diff --git a/nbd/server.c b/nbd/server.c | ||
324 | index XXXXXXX..XXXXXXX 100644 | ||
325 | --- a/nbd/server.c | ||
326 | +++ b/nbd/server.c | ||
327 | @@ -XXX,XX +XXX,XX @@ struct NBDClient { | ||
328 | CoMutex send_lock; | ||
329 | Coroutine *send_coroutine; | ||
330 | |||
331 | - bool can_read; | ||
332 | - | ||
333 | QTAILQ_ENTRY(NBDClient) next; | ||
334 | int nb_requests; | ||
335 | bool closing; | ||
336 | @@ -XXX,XX +XXX,XX @@ struct NBDClient { | ||
337 | |||
338 | /* That's all folks */ | ||
339 | |||
340 | -static void nbd_set_handlers(NBDClient *client); | ||
341 | -static void nbd_unset_handlers(NBDClient *client); | ||
342 | -static void nbd_update_can_read(NBDClient *client); | ||
343 | +static void nbd_client_receive_next_request(NBDClient *client); | ||
344 | |||
345 | static gboolean nbd_negotiate_continue(QIOChannel *ioc, | ||
346 | GIOCondition condition, | ||
347 | @@ -XXX,XX +XXX,XX @@ void nbd_client_put(NBDClient *client) | ||
348 | */ | ||
349 | assert(client->closing); | ||
350 | |||
351 | - nbd_unset_handlers(client); | ||
352 | + qio_channel_detach_aio_context(client->ioc); | ||
353 | object_unref(OBJECT(client->sioc)); | ||
354 | object_unref(OBJECT(client->ioc)); | ||
355 | if (client->tlscreds) { | ||
356 | @@ -XXX,XX +XXX,XX @@ static NBDRequestData *nbd_request_get(NBDClient *client) | ||
357 | |||
358 | assert(client->nb_requests <= MAX_NBD_REQUESTS - 1); | ||
359 | client->nb_requests++; | ||
360 | - nbd_update_can_read(client); | ||
361 | |||
362 | req = g_new0(NBDRequestData, 1); | ||
363 | nbd_client_get(client); | ||
364 | @@ -XXX,XX +XXX,XX @@ static void nbd_request_put(NBDRequestData *req) | ||
365 | g_free(req); | ||
366 | |||
367 | client->nb_requests--; | ||
368 | - nbd_update_can_read(client); | ||
369 | + nbd_client_receive_next_request(client); | ||
42 | + | 370 | + |
43 | + /* | 371 | nbd_client_put(client); |
44 | + * We are not allowed to return partial sectors, though, so | 372 | } |
45 | + * round up if necessary. | 373 | |
46 | + */ | 374 | @@ -XXX,XX +XXX,XX @@ static void blk_aio_attached(AioContext *ctx, void *opaque) |
47 | + if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) { | 375 | exp->ctx = ctx; |
48 | + int64_t file_length = qemu_gluster_getlength(bs); | 376 | |
49 | + if (file_length > 0) { | 377 | QTAILQ_FOREACH(client, &exp->clients, next) { |
50 | + /* Ignore errors, this is just a safeguard */ | 378 | - nbd_set_handlers(client); |
51 | + assert(hole == file_length); | 379 | + qio_channel_attach_aio_context(client->ioc, ctx); |
52 | + } | 380 | + if (client->recv_coroutine) { |
53 | + *pnum = ROUND_UP(*pnum, bs->bl.request_alignment); | 381 | + aio_co_schedule(ctx, client->recv_coroutine); |
54 | + } | 382 | + } |
383 | + if (client->send_coroutine) { | ||
384 | + aio_co_schedule(ctx, client->send_coroutine); | ||
385 | + } | ||
386 | } | ||
387 | } | ||
388 | |||
389 | @@ -XXX,XX +XXX,XX @@ static void blk_aio_detach(void *opaque) | ||
390 | TRACE("Export %s: Detaching clients from AIO context %p\n", exp->name, exp->ctx); | ||
391 | |||
392 | QTAILQ_FOREACH(client, &exp->clients, next) { | ||
393 | - nbd_unset_handlers(client); | ||
394 | + qio_channel_detach_aio_context(client->ioc); | ||
395 | } | ||
396 | |||
397 | exp->ctx = NULL; | ||
398 | @@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply, | ||
399 | g_assert(qemu_in_coroutine()); | ||
400 | qemu_co_mutex_lock(&client->send_lock); | ||
401 | client->send_coroutine = qemu_coroutine_self(); | ||
402 | - nbd_set_handlers(client); | ||
403 | |||
404 | if (!len) { | ||
405 | rc = nbd_send_reply(client->ioc, reply); | ||
406 | @@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply, | ||
407 | } | ||
408 | |||
409 | client->send_coroutine = NULL; | ||
410 | - nbd_set_handlers(client); | ||
411 | qemu_co_mutex_unlock(&client->send_lock); | ||
412 | return rc; | ||
413 | } | ||
414 | @@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_receive_request(NBDRequestData *req, | ||
415 | ssize_t rc; | ||
416 | |||
417 | g_assert(qemu_in_coroutine()); | ||
418 | - client->recv_coroutine = qemu_coroutine_self(); | ||
419 | - nbd_update_can_read(client); | ||
420 | - | ||
421 | + assert(client->recv_coroutine == qemu_coroutine_self()); | ||
422 | rc = nbd_receive_request(client->ioc, request); | ||
423 | if (rc < 0) { | ||
424 | if (rc != -EAGAIN) { | ||
425 | @@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_receive_request(NBDRequestData *req, | ||
426 | |||
427 | out: | ||
428 | client->recv_coroutine = NULL; | ||
429 | - nbd_update_can_read(client); | ||
430 | + nbd_client_receive_next_request(client); | ||
431 | |||
432 | return rc; | ||
433 | } | ||
434 | |||
435 | -static void nbd_trip(void *opaque) | ||
436 | +/* Owns a reference to the NBDClient passed as opaque. */ | ||
437 | +static coroutine_fn void nbd_trip(void *opaque) | ||
438 | { | ||
439 | NBDClient *client = opaque; | ||
440 | NBDExport *exp = client->exp; | ||
441 | NBDRequestData *req; | ||
442 | - NBDRequest request; | ||
443 | + NBDRequest request = { 0 }; /* GCC thinks it can be used uninitialized */ | ||
444 | NBDReply reply; | ||
445 | ssize_t ret; | ||
446 | int flags; | ||
447 | |||
448 | TRACE("Reading request."); | ||
449 | if (client->closing) { | ||
450 | + nbd_client_put(client); | ||
451 | return; | ||
452 | } | ||
453 | |||
454 | @@ -XXX,XX +XXX,XX @@ static void nbd_trip(void *opaque) | ||
455 | |||
456 | done: | ||
457 | nbd_request_put(req); | ||
458 | + nbd_client_put(client); | ||
459 | return; | ||
460 | |||
461 | out: | ||
462 | nbd_request_put(req); | ||
463 | client_close(client); | ||
464 | + nbd_client_put(client); | ||
465 | } | ||
466 | |||
467 | -static void nbd_read(void *opaque) | ||
468 | +static void nbd_client_receive_next_request(NBDClient *client) | ||
469 | { | ||
470 | - NBDClient *client = opaque; | ||
471 | - | ||
472 | - if (client->recv_coroutine) { | ||
473 | - qemu_coroutine_enter(client->recv_coroutine); | ||
474 | - } else { | ||
475 | - qemu_coroutine_enter(qemu_coroutine_create(nbd_trip, client)); | ||
476 | - } | ||
477 | -} | ||
478 | - | ||
479 | -static void nbd_restart_write(void *opaque) | ||
480 | -{ | ||
481 | - NBDClient *client = opaque; | ||
482 | - | ||
483 | - qemu_coroutine_enter(client->send_coroutine); | ||
484 | -} | ||
485 | - | ||
486 | -static void nbd_set_handlers(NBDClient *client) | ||
487 | -{ | ||
488 | - if (client->exp && client->exp->ctx) { | ||
489 | - aio_set_fd_handler(client->exp->ctx, client->sioc->fd, true, | ||
490 | - client->can_read ? nbd_read : NULL, | ||
491 | - client->send_coroutine ? nbd_restart_write : NULL, | ||
492 | - NULL, client); | ||
493 | - } | ||
494 | -} | ||
495 | - | ||
496 | -static void nbd_unset_handlers(NBDClient *client) | ||
497 | -{ | ||
498 | - if (client->exp && client->exp->ctx) { | ||
499 | - aio_set_fd_handler(client->exp->ctx, client->sioc->fd, true, NULL, | ||
500 | - NULL, NULL, NULL); | ||
501 | - } | ||
502 | -} | ||
503 | - | ||
504 | -static void nbd_update_can_read(NBDClient *client) | ||
505 | -{ | ||
506 | - bool can_read = client->recv_coroutine || | ||
507 | - client->nb_requests < MAX_NBD_REQUESTS; | ||
508 | - | ||
509 | - if (can_read != client->can_read) { | ||
510 | - client->can_read = can_read; | ||
511 | - nbd_set_handlers(client); | ||
512 | - | ||
513 | - /* There is no need to invoke aio_notify(), since aio_set_fd_handler() | ||
514 | - * in nbd_set_handlers() will have taken care of that */ | ||
515 | + if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS) { | ||
516 | + nbd_client_get(client); | ||
517 | + client->recv_coroutine = qemu_coroutine_create(nbd_trip, client); | ||
518 | + aio_co_schedule(client->exp->ctx, client->recv_coroutine); | ||
519 | } | ||
520 | } | ||
521 | |||
522 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn void nbd_co_client_start(void *opaque) | ||
523 | goto out; | ||
524 | } | ||
525 | qemu_co_mutex_init(&client->send_lock); | ||
526 | - nbd_set_handlers(client); | ||
527 | |||
528 | if (exp) { | ||
529 | QTAILQ_INSERT_TAIL(&exp->clients, client, next); | ||
530 | } | ||
55 | + | 531 | + |
56 | ret = BDRV_BLOCK_DATA; | 532 | + nbd_client_receive_next_request(client); |
57 | } else { | 533 | + |
58 | /* On a hole, compute bytes to the beginning of the next extent. */ | 534 | out: |
535 | g_free(data); | ||
536 | } | ||
537 | @@ -XXX,XX +XXX,XX @@ void nbd_client_new(NBDExport *exp, | ||
538 | object_ref(OBJECT(client->sioc)); | ||
539 | client->ioc = QIO_CHANNEL(sioc); | ||
540 | object_ref(OBJECT(client->ioc)); | ||
541 | - client->can_read = true; | ||
542 | client->close = close_fn; | ||
543 | |||
544 | data->client = client; | ||
59 | -- | 545 | -- |
60 | 2.31.1 | 546 | 2.9.3 |
61 | 547 | ||
62 | 548 | diff view generated by jsdifflib |
1 | From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 3 | As a small step towards the introduction of multiqueue, we want |
4 | Reviewed-by: Eric Blake <eblake@redhat.com> | 4 | coroutines to remain on the same AioContext that started them, |
5 | Tested-by: Kirill Tkhai <ktkhai@virtuozzo.com> | 5 | unless they are moved explicitly with e.g. aio_co_schedule. This patch |
6 | Reviewed-by: Hanna Reitz <hreitz@redhat.com> | 6 | avoids that coroutines switch AioContext when they use a CoMutex. |
7 | Message-Id: <20210914122454.141075-8-vsementsov@virtuozzo.com> | 7 | For now it does not make much of a difference, because the CoMutex |
8 | [hreitz: Separated `type` declaration from statements] | 8 | is not thread-safe and the AioContext itself is used to protect the |
9 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | 9 | CoMutex from concurrent access. However, this is going to change. |
10 | |||
11 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
12 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | ||
13 | Reviewed-by: Fam Zheng <famz@redhat.com> | ||
14 | Reviewed-by: Daniel P. Berrange <berrange@redhat.com> | ||
15 | Message-id: 20170213135235.12274-9-pbonzini@redhat.com | ||
16 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
10 | --- | 17 | --- |
11 | block/qcow2.h | 1 + | 18 | util/qemu-coroutine-lock.c | 5 ++--- |
12 | block/qcow2-refcount.c | 14 +++++++++++++- | 19 | util/trace-events | 1 - |
13 | 2 files changed, 14 insertions(+), 1 deletion(-) | 20 | 2 files changed, 2 insertions(+), 4 deletions(-) |
14 | 21 | ||
15 | diff --git a/block/qcow2.h b/block/qcow2.h | 22 | diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c |
16 | index XXXXXXX..XXXXXXX 100644 | 23 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/block/qcow2.h | 24 | --- a/util/qemu-coroutine-lock.c |
18 | +++ b/block/qcow2.h | 25 | +++ b/util/qemu-coroutine-lock.c |
19 | @@ -XXX,XX +XXX,XX @@ typedef enum QCow2MetadataOverlap { | 26 | @@ -XXX,XX +XXX,XX @@ |
20 | 27 | #include "qemu/coroutine.h" | |
21 | #define L1E_OFFSET_MASK 0x00fffffffffffe00ULL | 28 | #include "qemu/coroutine_int.h" |
22 | #define L2E_OFFSET_MASK 0x00fffffffffffe00ULL | 29 | #include "qemu/queue.h" |
23 | +#define L2E_STD_RESERVED_MASK 0x3f000000000001feULL | 30 | +#include "block/aio.h" |
24 | 31 | #include "trace.h" | |
25 | #define REFT_OFFSET_MASK 0xfffffffffffffe00ULL | 32 | |
26 | 33 | void qemu_co_queue_init(CoQueue *queue) | |
27 | diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c | 34 | @@ -XXX,XX +XXX,XX @@ void qemu_co_queue_run_restart(Coroutine *co) |
35 | |||
36 | static bool qemu_co_queue_do_restart(CoQueue *queue, bool single) | ||
37 | { | ||
38 | - Coroutine *self = qemu_coroutine_self(); | ||
39 | Coroutine *next; | ||
40 | |||
41 | if (QSIMPLEQ_EMPTY(&queue->entries)) { | ||
42 | @@ -XXX,XX +XXX,XX @@ static bool qemu_co_queue_do_restart(CoQueue *queue, bool single) | ||
43 | |||
44 | while ((next = QSIMPLEQ_FIRST(&queue->entries)) != NULL) { | ||
45 | QSIMPLEQ_REMOVE_HEAD(&queue->entries, co_queue_next); | ||
46 | - QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, next, co_queue_next); | ||
47 | - trace_qemu_co_queue_next(next); | ||
48 | + aio_co_wake(next); | ||
49 | if (single) { | ||
50 | break; | ||
51 | } | ||
52 | diff --git a/util/trace-events b/util/trace-events | ||
28 | index XXXXXXX..XXXXXXX 100644 | 53 | index XXXXXXX..XXXXXXX 100644 |
29 | --- a/block/qcow2-refcount.c | 54 | --- a/util/trace-events |
30 | +++ b/block/qcow2-refcount.c | 55 | +++ b/util/trace-events |
31 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | 56 | @@ -XXX,XX +XXX,XX @@ qemu_coroutine_terminate(void *co) "self %p" |
32 | for (i = 0; i < s->l2_size; i++) { | 57 | |
33 | uint64_t coffset; | 58 | # util/qemu-coroutine-lock.c |
34 | int csize; | 59 | qemu_co_queue_run_restart(void *co) "co %p" |
35 | + QCow2ClusterType type; | 60 | -qemu_co_queue_next(void *nxt) "next %p" |
36 | + | 61 | qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p" |
37 | l2_entry = get_l2_entry(s, l2_table, i); | 62 | qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p" |
38 | l2_bitmap = get_l2_bitmap(s, l2_table, i); | 63 | qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p" |
39 | + type = qcow2_get_cluster_type(bs, l2_entry); | ||
40 | + | ||
41 | + if (type != QCOW2_CLUSTER_COMPRESSED) { | ||
42 | + /* Check reserved bits of Standard Cluster Descriptor */ | ||
43 | + if (l2_entry & L2E_STD_RESERVED_MASK) { | ||
44 | + fprintf(stderr, "ERROR found l2 entry with reserved bits set: " | ||
45 | + "%" PRIx64 "\n", l2_entry); | ||
46 | + res->corruptions++; | ||
47 | + } | ||
48 | + } | ||
49 | |||
50 | - switch (qcow2_get_cluster_type(bs, l2_entry)) { | ||
51 | + switch (type) { | ||
52 | case QCOW2_CLUSTER_COMPRESSED: | ||
53 | /* Compressed clusters don't have QCOW_OFLAG_COPIED */ | ||
54 | if (l2_entry & QCOW_OFLAG_COPIED) { | ||
55 | -- | 64 | -- |
56 | 2.31.1 | 65 | 2.9.3 |
57 | 66 | ||
58 | 67 | diff view generated by jsdifflib |
1 | From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Let's pass the whole L2 entry and not bother with | 3 | Keep the coroutine on the same AioContext. Without this change, |
4 | L2E_COMPRESSED_OFFSET_SIZE_MASK. | 4 | there would be a race between yielding the coroutine and reentering it. |
5 | While the race cannot happen now, because the code only runs from a single | ||
6 | AioContext, this will change with multiqueue support in the block layer. | ||
5 | 7 | ||
6 | It also helps further refactoring that adds generic | 8 | While doing the change, replace custom bottom half with aio_co_schedule. |
7 | qcow2_parse_compressed_l2_entry() helper. | ||
8 | 9 | ||
9 | Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 10 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
10 | Reviewed-by: Eric Blake <eblake@redhat.com> | 11 | Reviewed-by: Fam Zheng <famz@redhat.com> |
11 | Reviewed-by: Alberto Garcia <berto@igalia.com> | 12 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
12 | Reviewed-by: Hanna Reitz <hreitz@redhat.com> | 13 | Reviewed-by: Daniel P. Berrange <berrange@redhat.com> |
13 | Message-Id: <20210914122454.141075-3-vsementsov@virtuozzo.com> | 14 | Message-id: 20170213135235.12274-10-pbonzini@redhat.com |
14 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | 15 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
15 | --- | 16 | --- |
16 | block/qcow2.h | 1 - | 17 | block/blkdebug.c | 9 +-------- |
17 | block/qcow2-cluster.c | 5 ++--- | 18 | 1 file changed, 1 insertion(+), 8 deletions(-) |
18 | block/qcow2.c | 12 +++++++----- | ||
19 | 3 files changed, 9 insertions(+), 9 deletions(-) | ||
20 | 19 | ||
21 | diff --git a/block/qcow2.h b/block/qcow2.h | 20 | diff --git a/block/blkdebug.c b/block/blkdebug.c |
22 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
23 | --- a/block/qcow2.h | 22 | --- a/block/blkdebug.c |
24 | +++ b/block/qcow2.h | 23 | +++ b/block/blkdebug.c |
25 | @@ -XXX,XX +XXX,XX @@ typedef enum QCow2MetadataOverlap { | 24 | @@ -XXX,XX +XXX,XX @@ out: |
26 | 25 | return ret; | |
27 | #define L1E_OFFSET_MASK 0x00fffffffffffe00ULL | 26 | } |
28 | #define L2E_OFFSET_MASK 0x00fffffffffffe00ULL | 27 | |
29 | -#define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL | 28 | -static void error_callback_bh(void *opaque) |
30 | 29 | -{ | |
31 | #define REFT_OFFSET_MASK 0xfffffffffffffe00ULL | 30 | - Coroutine *co = opaque; |
32 | 31 | - qemu_coroutine_enter(co); | |
33 | diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c | 32 | -} |
34 | index XXXXXXX..XXXXXXX 100644 | 33 | - |
35 | --- a/block/qcow2-cluster.c | 34 | static int inject_error(BlockDriverState *bs, BlkdebugRule *rule) |
36 | +++ b/block/qcow2-cluster.c | 35 | { |
37 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn do_perform_cow_write(BlockDriverState *bs, | 36 | BDRVBlkdebugState *s = bs->opaque; |
38 | * offset needs to be aligned to a cluster boundary. | 37 | @@ -XXX,XX +XXX,XX @@ static int inject_error(BlockDriverState *bs, BlkdebugRule *rule) |
39 | * | 38 | } |
40 | * If the cluster is unallocated then *host_offset will be 0. | 39 | |
41 | - * If the cluster is compressed then *host_offset will contain the | 40 | if (!immediately) { |
42 | - * complete compressed cluster descriptor. | 41 | - aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), error_callback_bh, |
43 | + * If the cluster is compressed then *host_offset will contain the l2 entry. | 42 | - qemu_coroutine_self()); |
44 | * | 43 | + aio_co_schedule(qemu_get_current_aio_context(), qemu_coroutine_self()); |
45 | * On entry, *bytes is the maximum number of contiguous bytes starting at | 44 | qemu_coroutine_yield(); |
46 | * offset that we are interested in. | 45 | } |
47 | @@ -XXX,XX +XXX,XX @@ int qcow2_get_host_offset(BlockDriverState *bs, uint64_t offset, | ||
48 | ret = -EIO; | ||
49 | goto fail; | ||
50 | } | ||
51 | - *host_offset = l2_entry & L2E_COMPRESSED_OFFSET_SIZE_MASK; | ||
52 | + *host_offset = l2_entry; | ||
53 | break; | ||
54 | case QCOW2_SUBCLUSTER_ZERO_PLAIN: | ||
55 | case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN: | ||
56 | diff --git a/block/qcow2.c b/block/qcow2.c | ||
57 | index XXXXXXX..XXXXXXX 100644 | ||
58 | --- a/block/qcow2.c | ||
59 | +++ b/block/qcow2.c | ||
60 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
61 | |||
62 | static int coroutine_fn | ||
63 | qcow2_co_preadv_compressed(BlockDriverState *bs, | ||
64 | - uint64_t cluster_descriptor, | ||
65 | + uint64_t l2_entry, | ||
66 | uint64_t offset, | ||
67 | uint64_t bytes, | ||
68 | QEMUIOVector *qiov, | ||
69 | @@ -XXX,XX +XXX,XX @@ typedef struct Qcow2AioTask { | ||
70 | |||
71 | BlockDriverState *bs; | ||
72 | QCow2SubclusterType subcluster_type; /* only for read */ | ||
73 | - uint64_t host_offset; /* or full descriptor in compressed clusters */ | ||
74 | + uint64_t host_offset; /* or l2_entry for compressed read */ | ||
75 | uint64_t offset; | ||
76 | uint64_t bytes; | ||
77 | QEMUIOVector *qiov; | ||
78 | @@ -XXX,XX +XXX,XX @@ qcow2_co_pwritev_compressed_part(BlockDriverState *bs, | ||
79 | |||
80 | static int coroutine_fn | ||
81 | qcow2_co_preadv_compressed(BlockDriverState *bs, | ||
82 | - uint64_t cluster_descriptor, | ||
83 | + uint64_t l2_entry, | ||
84 | uint64_t offset, | ||
85 | uint64_t bytes, | ||
86 | QEMUIOVector *qiov, | ||
87 | @@ -XXX,XX +XXX,XX @@ qcow2_co_preadv_compressed(BlockDriverState *bs, | ||
88 | uint8_t *buf, *out_buf; | ||
89 | int offset_in_cluster = offset_into_cluster(s, offset); | ||
90 | |||
91 | - coffset = cluster_descriptor & s->cluster_offset_mask; | ||
92 | - nb_csectors = ((cluster_descriptor >> s->csize_shift) & s->csize_mask) + 1; | ||
93 | + assert(qcow2_get_cluster_type(bs, l2_entry) == QCOW2_CLUSTER_COMPRESSED); | ||
94 | + | ||
95 | + coffset = l2_entry & s->cluster_offset_mask; | ||
96 | + nb_csectors = ((l2_entry >> s->csize_shift) & s->csize_mask) + 1; | ||
97 | csize = nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE - | ||
98 | (coffset & ~QCOW2_COMPRESSED_SECTOR_MASK); | ||
99 | 46 | ||
100 | -- | 47 | -- |
101 | 2.31.1 | 48 | 2.9.3 |
102 | 49 | ||
103 | 50 | diff view generated by jsdifflib |
1 | From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Split fix_l2_entry_by_zero() out of check_refcounts_l2() to be | 3 | qed_aio_start_io and qed_aio_next_io will not have to acquire/release |
4 | reused in further patch. | 4 | the AioContext, while qed_aio_next_io_cb will. Split the functionality |
5 | and gain a little type-safety in the process. | ||
5 | 6 | ||
6 | Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 7 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
7 | Reviewed-by: Eric Blake <eblake@redhat.com> | 8 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
8 | Reviewed-by: Hanna Reitz <hreitz@redhat.com> | 9 | Reviewed-by: Fam Zheng <famz@redhat.com> |
9 | Message-Id: <20210914122454.141075-5-vsementsov@virtuozzo.com> | 10 | Reviewed-by: Daniel P. Berrange <berrange@redhat.com> |
10 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | 11 | Message-id: 20170213135235.12274-11-pbonzini@redhat.com |
12 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
11 | --- | 13 | --- |
12 | block/qcow2-refcount.c | 87 +++++++++++++++++++++++++++++------------- | 14 | block/qed.c | 39 +++++++++++++++++++++++++-------------- |
13 | 1 file changed, 60 insertions(+), 27 deletions(-) | 15 | 1 file changed, 25 insertions(+), 14 deletions(-) |
14 | 16 | ||
15 | diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c | 17 | diff --git a/block/qed.c b/block/qed.c |
16 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/block/qcow2-refcount.c | 19 | --- a/block/qed.c |
18 | +++ b/block/qcow2-refcount.c | 20 | +++ b/block/qed.c |
19 | @@ -XXX,XX +XXX,XX @@ enum { | 21 | @@ -XXX,XX +XXX,XX @@ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s) |
20 | CHECK_FRAG_INFO = 0x2, /* update BlockFragInfo counters */ | 22 | return l2_table; |
21 | }; | 23 | } |
22 | 24 | ||
23 | +/* | 25 | -static void qed_aio_next_io(void *opaque, int ret); |
24 | + * Fix L2 entry by making it QCOW2_CLUSTER_ZERO_PLAIN. | 26 | +static void qed_aio_next_io(QEDAIOCB *acb, int ret); |
25 | + * | 27 | + |
26 | + * This function decrements res->corruptions on success, so the caller is | 28 | +static void qed_aio_start_io(QEDAIOCB *acb) |
27 | + * responsible to increment res->corruptions prior to the call. | ||
28 | + * | ||
29 | + * On failure in-memory @l2_table may be modified. | ||
30 | + */ | ||
31 | +static int fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res, | ||
32 | + uint64_t l2_offset, | ||
33 | + uint64_t *l2_table, int l2_index, bool active, | ||
34 | + bool *metadata_overlap) | ||
35 | +{ | 29 | +{ |
36 | + BDRVQcow2State *s = bs->opaque; | 30 | + qed_aio_next_io(acb, 0); |
37 | + int ret; | ||
38 | + int idx = l2_index * (l2_entry_size(s) / sizeof(uint64_t)); | ||
39 | + uint64_t l2e_offset = l2_offset + (uint64_t)l2_index * l2_entry_size(s); | ||
40 | + int ign = active ? QCOW2_OL_ACTIVE_L2 : QCOW2_OL_INACTIVE_L2; | ||
41 | + uint64_t l2_entry = has_subclusters(s) ? 0 : QCOW_OFLAG_ZERO; | ||
42 | + | ||
43 | + set_l2_entry(s, l2_table, l2_index, l2_entry); | ||
44 | + ret = qcow2_pre_write_overlap_check(bs, ign, l2e_offset, l2_entry_size(s), | ||
45 | + false); | ||
46 | + if (metadata_overlap) { | ||
47 | + *metadata_overlap = ret < 0; | ||
48 | + } | ||
49 | + if (ret < 0) { | ||
50 | + fprintf(stderr, "ERROR: Overlap check failed\n"); | ||
51 | + goto fail; | ||
52 | + } | ||
53 | + | ||
54 | + ret = bdrv_pwrite_sync(bs->file, l2e_offset, &l2_table[idx], | ||
55 | + l2_entry_size(s)); | ||
56 | + if (ret < 0) { | ||
57 | + fprintf(stderr, "ERROR: Failed to overwrite L2 " | ||
58 | + "table entry: %s\n", strerror(-ret)); | ||
59 | + goto fail; | ||
60 | + } | ||
61 | + | ||
62 | + res->corruptions--; | ||
63 | + res->corruptions_fixed++; | ||
64 | + return 0; | ||
65 | + | ||
66 | +fail: | ||
67 | + res->check_errors++; | ||
68 | + return ret; | ||
69 | +} | 31 | +} |
70 | + | 32 | + |
71 | /* | 33 | +static void qed_aio_next_io_cb(void *opaque, int ret) |
72 | * Increases the refcount in the given refcount table for the all clusters | 34 | +{ |
73 | * referenced in the L2 table. While doing so, performs some checks on L2 | 35 | + QEDAIOCB *acb = opaque; |
74 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | ||
75 | int i, ret; | ||
76 | size_t l2_size_bytes = s->l2_size * l2_entry_size(s); | ||
77 | g_autofree uint64_t *l2_table = g_malloc(l2_size_bytes); | ||
78 | + bool metadata_overlap; | ||
79 | |||
80 | /* Read L2 table from disk */ | ||
81 | ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size_bytes); | ||
82 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | ||
83 | fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", | ||
84 | offset); | ||
85 | if (fix & BDRV_FIX_ERRORS) { | ||
86 | - int idx = i * (l2_entry_size(s) / sizeof(uint64_t)); | ||
87 | - uint64_t l2e_offset = | ||
88 | - l2_offset + (uint64_t)i * l2_entry_size(s); | ||
89 | - int ign = active ? QCOW2_OL_ACTIVE_L2 : | ||
90 | - QCOW2_OL_INACTIVE_L2; | ||
91 | - | ||
92 | - l2_entry = has_subclusters(s) ? 0 : QCOW_OFLAG_ZERO; | ||
93 | - set_l2_entry(s, l2_table, i, l2_entry); | ||
94 | - ret = qcow2_pre_write_overlap_check(bs, ign, | ||
95 | - l2e_offset, l2_entry_size(s), false); | ||
96 | - if (ret < 0) { | ||
97 | - fprintf(stderr, "ERROR: Overlap check failed\n"); | ||
98 | - res->check_errors++; | ||
99 | + ret = fix_l2_entry_by_zero(bs, res, l2_offset, | ||
100 | + l2_table, i, active, | ||
101 | + &metadata_overlap); | ||
102 | + if (metadata_overlap) { | ||
103 | /* | ||
104 | * Something is seriously wrong, so abort checking | ||
105 | * this L2 table. | ||
106 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | ||
107 | return ret; | ||
108 | } | ||
109 | |||
110 | - ret = bdrv_pwrite_sync(bs->file, l2e_offset, | ||
111 | - &l2_table[idx], | ||
112 | - l2_entry_size(s)); | ||
113 | - if (ret < 0) { | ||
114 | - fprintf(stderr, "ERROR: Failed to overwrite L2 " | ||
115 | - "table entry: %s\n", strerror(-ret)); | ||
116 | - res->check_errors++; | ||
117 | - /* | ||
118 | - * Do not abort, continue checking the rest of this | ||
119 | - * L2 table's entries. | ||
120 | - */ | ||
121 | - } else { | ||
122 | - res->corruptions--; | ||
123 | - res->corruptions_fixed++; | ||
124 | + if (ret == 0) { | ||
125 | /* | ||
126 | * Skip marking the cluster as used | ||
127 | * (it is unused now). | ||
128 | */ | ||
129 | continue; | ||
130 | } | ||
131 | + | 36 | + |
132 | + /* | 37 | + qed_aio_next_io(acb, ret); |
133 | + * Failed to fix. | 38 | +} |
134 | + * Do not abort, continue checking the rest of this | 39 | |
135 | + * L2 table's entries. | 40 | static void qed_plug_allocating_write_reqs(BDRVQEDState *s) |
136 | + */ | 41 | { |
137 | } | 42 | @@ -XXX,XX +XXX,XX @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState *s) |
138 | } else { | 43 | |
139 | fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is " | 44 | acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); |
45 | if (acb) { | ||
46 | - qed_aio_next_io(acb, 0); | ||
47 | + qed_aio_start_io(acb); | ||
48 | } | ||
49 | } | ||
50 | |||
51 | @@ -XXX,XX +XXX,XX @@ static void qed_aio_complete(QEDAIOCB *acb, int ret) | ||
52 | QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next); | ||
53 | acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); | ||
54 | if (acb) { | ||
55 | - qed_aio_next_io(acb, 0); | ||
56 | + qed_aio_start_io(acb); | ||
57 | } else if (s->header.features & QED_F_NEED_CHECK) { | ||
58 | qed_start_need_check_timer(s); | ||
59 | } | ||
60 | @@ -XXX,XX +XXX,XX @@ static void qed_commit_l2_update(void *opaque, int ret) | ||
61 | acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); | ||
62 | assert(acb->request.l2_table != NULL); | ||
63 | |||
64 | - qed_aio_next_io(opaque, ret); | ||
65 | + qed_aio_next_io(acb, ret); | ||
66 | } | ||
67 | |||
68 | /** | ||
69 | @@ -XXX,XX +XXX,XX @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset) | ||
70 | if (need_alloc) { | ||
71 | /* Write out the whole new L2 table */ | ||
72 | qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true, | ||
73 | - qed_aio_write_l1_update, acb); | ||
74 | + qed_aio_write_l1_update, acb); | ||
75 | } else { | ||
76 | /* Write out only the updated part of the L2 table */ | ||
77 | qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false, | ||
78 | - qed_aio_next_io, acb); | ||
79 | + qed_aio_next_io_cb, acb); | ||
80 | } | ||
81 | return; | ||
82 | |||
83 | @@ -XXX,XX +XXX,XX @@ static void qed_aio_write_main(void *opaque, int ret) | ||
84 | } | ||
85 | |||
86 | if (acb->find_cluster_ret == QED_CLUSTER_FOUND) { | ||
87 | - next_fn = qed_aio_next_io; | ||
88 | + next_fn = qed_aio_next_io_cb; | ||
89 | } else { | ||
90 | if (s->bs->backing) { | ||
91 | next_fn = qed_aio_write_flush_before_l2_update; | ||
92 | @@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) | ||
93 | if (acb->flags & QED_AIOCB_ZERO) { | ||
94 | /* Skip ahead if the clusters are already zero */ | ||
95 | if (acb->find_cluster_ret == QED_CLUSTER_ZERO) { | ||
96 | - qed_aio_next_io(acb, 0); | ||
97 | + qed_aio_start_io(acb); | ||
98 | return; | ||
99 | } | ||
100 | |||
101 | @@ -XXX,XX +XXX,XX @@ static void qed_aio_read_data(void *opaque, int ret, | ||
102 | /* Handle zero cluster and backing file reads */ | ||
103 | if (ret == QED_CLUSTER_ZERO) { | ||
104 | qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size); | ||
105 | - qed_aio_next_io(acb, 0); | ||
106 | + qed_aio_start_io(acb); | ||
107 | return; | ||
108 | } else if (ret != QED_CLUSTER_FOUND) { | ||
109 | qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov, | ||
110 | - &acb->backing_qiov, qed_aio_next_io, acb); | ||
111 | + &acb->backing_qiov, qed_aio_next_io_cb, acb); | ||
112 | return; | ||
113 | } | ||
114 | |||
115 | BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); | ||
116 | bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE, | ||
117 | &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, | ||
118 | - qed_aio_next_io, acb); | ||
119 | + qed_aio_next_io_cb, acb); | ||
120 | return; | ||
121 | |||
122 | err: | ||
123 | @@ -XXX,XX +XXX,XX @@ err: | ||
124 | /** | ||
125 | * Begin next I/O or complete the request | ||
126 | */ | ||
127 | -static void qed_aio_next_io(void *opaque, int ret) | ||
128 | +static void qed_aio_next_io(QEDAIOCB *acb, int ret) | ||
129 | { | ||
130 | - QEDAIOCB *acb = opaque; | ||
131 | BDRVQEDState *s = acb_to_s(acb); | ||
132 | QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ? | ||
133 | qed_aio_write_data : qed_aio_read_data; | ||
134 | @@ -XXX,XX +XXX,XX @@ static BlockAIOCB *qed_aio_setup(BlockDriverState *bs, | ||
135 | qemu_iovec_init(&acb->cur_qiov, qiov->niov); | ||
136 | |||
137 | /* Start request */ | ||
138 | - qed_aio_next_io(acb, 0); | ||
139 | + qed_aio_start_io(acb); | ||
140 | return &acb->common; | ||
141 | } | ||
142 | |||
140 | -- | 143 | -- |
141 | 2.31.1 | 144 | 2.9.3 |
142 | 145 | ||
143 | 146 | diff view generated by jsdifflib |
1 | We cannot write to images opened with O_DIRECT unless we allow them to | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | be resized so they are aligned to the sector size: Since 9c60a5d1978, | ||
3 | bdrv_node_refresh_perm() ensures that for nodes whose length is not | ||
4 | aligned to the request alignment and where someone has taken a WRITE | ||
5 | permission, the RESIZE permission is taken, too). | ||
6 | 2 | ||
7 | Let qemu-img convert pass the BDRV_O_RESIZE flag (which causes | 3 | The AioContext data structures are now protected by list_lock and/or |
8 | blk_new_open() to take the RESIZE permission) when using cache=none for | 4 | they are walked with FOREACH_RCU primitives. There is no need anymore |
9 | the target, so that when writing to it, it can be aligned to the target | 5 | to acquire the AioContext for the entire duration of aio_dispatch. |
10 | sector size. | 6 | Instead, just acquire it before and after invoking the callbacks. |
7 | The next step is then to push it further down. | ||
11 | 8 | ||
12 | Without this patch, an error is returned: | 9 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
10 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | ||
11 | Reviewed-by: Fam Zheng <famz@redhat.com> | ||
12 | Reviewed-by: Daniel P. Berrange <berrange@redhat.com> | ||
13 | Message-id: 20170213135235.12274-12-pbonzini@redhat.com | ||
14 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
15 | --- | ||
16 | util/aio-posix.c | 25 +++++++++++-------------- | ||
17 | util/aio-win32.c | 15 +++++++-------- | ||
18 | util/async.c | 2 ++ | ||
19 | 3 files changed, 20 insertions(+), 22 deletions(-) | ||
13 | 20 | ||
14 | $ qemu-img convert -f raw -O raw -t none foo.img /mnt/tmp/foo.img | 21 | diff --git a/util/aio-posix.c b/util/aio-posix.c |
15 | qemu-img: Could not open '/mnt/tmp/foo.img': Cannot get 'write' | ||
16 | permission without 'resize': Image size is not a multiple of request | ||
17 | alignment | ||
18 | |||
19 | Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1994266 | ||
20 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | ||
21 | Message-Id: <20210819101200.64235-1-hreitz@redhat.com> | ||
22 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | ||
23 | --- | ||
24 | qemu-img.c | 8 ++++++++ | ||
25 | 1 file changed, 8 insertions(+) | ||
26 | |||
27 | diff --git a/qemu-img.c b/qemu-img.c | ||
28 | index XXXXXXX..XXXXXXX 100644 | 22 | index XXXXXXX..XXXXXXX 100644 |
29 | --- a/qemu-img.c | 23 | --- a/util/aio-posix.c |
30 | +++ b/qemu-img.c | 24 | +++ b/util/aio-posix.c |
31 | @@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv) | 25 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx) |
32 | goto out; | 26 | (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) && |
27 | aio_node_check(ctx, node->is_external) && | ||
28 | node->io_read) { | ||
29 | + aio_context_acquire(ctx); | ||
30 | node->io_read(node->opaque); | ||
31 | + aio_context_release(ctx); | ||
32 | |||
33 | /* aio_notify() does not count as progress */ | ||
34 | if (node->opaque != &ctx->notifier) { | ||
35 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx) | ||
36 | (revents & (G_IO_OUT | G_IO_ERR)) && | ||
37 | aio_node_check(ctx, node->is_external) && | ||
38 | node->io_write) { | ||
39 | + aio_context_acquire(ctx); | ||
40 | node->io_write(node->opaque); | ||
41 | + aio_context_release(ctx); | ||
42 | progress = true; | ||
43 | } | ||
44 | |||
45 | @@ -XXX,XX +XXX,XX @@ bool aio_dispatch(AioContext *ctx, bool dispatch_fds) | ||
33 | } | 46 | } |
34 | 47 | ||
35 | + if (flags & BDRV_O_NOCACHE) { | 48 | /* Run our timers */ |
36 | + /* | 49 | + aio_context_acquire(ctx); |
37 | + * If we open the target with O_DIRECT, it may be necessary to | 50 | progress |= timerlistgroup_run_timers(&ctx->tlg); |
38 | + * extend its size to align to the physical sector size. | 51 | + aio_context_release(ctx); |
39 | + */ | 52 | |
40 | + flags |= BDRV_O_RESIZE; | 53 | return progress; |
41 | + } | 54 | } |
55 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
56 | int64_t timeout; | ||
57 | int64_t start = 0; | ||
58 | |||
59 | - aio_context_acquire(ctx); | ||
60 | - progress = false; | ||
61 | - | ||
62 | /* aio_notify can avoid the expensive event_notifier_set if | ||
63 | * everything (file descriptors, bottom halves, timers) will | ||
64 | * be re-evaluated before the next blocking poll(). This is | ||
65 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
66 | start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); | ||
67 | } | ||
68 | |||
69 | - if (try_poll_mode(ctx, blocking)) { | ||
70 | - progress = true; | ||
71 | - } else { | ||
72 | + aio_context_acquire(ctx); | ||
73 | + progress = try_poll_mode(ctx, blocking); | ||
74 | + aio_context_release(ctx); | ||
42 | + | 75 | + |
43 | if (skip_create) { | 76 | + if (!progress) { |
44 | s.target = img_open(tgt_image_opts, out_filename, out_fmt, | 77 | assert(npfd == 0); |
45 | flags, writethrough, s.quiet, false); | 78 | |
79 | /* fill pollfds */ | ||
80 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
81 | timeout = blocking ? aio_compute_timeout(ctx) : 0; | ||
82 | |||
83 | /* wait until next event */ | ||
84 | - if (timeout) { | ||
85 | - aio_context_release(ctx); | ||
86 | - } | ||
87 | if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) { | ||
88 | AioHandler epoll_handler; | ||
89 | |||
90 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
91 | } else { | ||
92 | ret = qemu_poll_ns(pollfds, npfd, timeout); | ||
93 | } | ||
94 | - if (timeout) { | ||
95 | - aio_context_acquire(ctx); | ||
96 | - } | ||
97 | } | ||
98 | |||
99 | if (blocking) { | ||
100 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
101 | progress = true; | ||
102 | } | ||
103 | |||
104 | - aio_context_release(ctx); | ||
105 | - | ||
106 | return progress; | ||
107 | } | ||
108 | |||
109 | diff --git a/util/aio-win32.c b/util/aio-win32.c | ||
110 | index XXXXXXX..XXXXXXX 100644 | ||
111 | --- a/util/aio-win32.c | ||
112 | +++ b/util/aio-win32.c | ||
113 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event) | ||
114 | (revents || event_notifier_get_handle(node->e) == event) && | ||
115 | node->io_notify) { | ||
116 | node->pfd.revents = 0; | ||
117 | + aio_context_acquire(ctx); | ||
118 | node->io_notify(node->e); | ||
119 | + aio_context_release(ctx); | ||
120 | |||
121 | /* aio_notify() does not count as progress */ | ||
122 | if (node->e != &ctx->notifier) { | ||
123 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event) | ||
124 | (node->io_read || node->io_write)) { | ||
125 | node->pfd.revents = 0; | ||
126 | if ((revents & G_IO_IN) && node->io_read) { | ||
127 | + aio_context_acquire(ctx); | ||
128 | node->io_read(node->opaque); | ||
129 | + aio_context_release(ctx); | ||
130 | progress = true; | ||
131 | } | ||
132 | if ((revents & G_IO_OUT) && node->io_write) { | ||
133 | + aio_context_acquire(ctx); | ||
134 | node->io_write(node->opaque); | ||
135 | + aio_context_release(ctx); | ||
136 | progress = true; | ||
137 | } | ||
138 | |||
139 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
140 | int count; | ||
141 | int timeout; | ||
142 | |||
143 | - aio_context_acquire(ctx); | ||
144 | progress = false; | ||
145 | |||
146 | /* aio_notify can avoid the expensive event_notifier_set if | ||
147 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
148 | |||
149 | timeout = blocking && !have_select_revents | ||
150 | ? qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)) : 0; | ||
151 | - if (timeout) { | ||
152 | - aio_context_release(ctx); | ||
153 | - } | ||
154 | ret = WaitForMultipleObjects(count, events, FALSE, timeout); | ||
155 | if (blocking) { | ||
156 | assert(first); | ||
157 | atomic_sub(&ctx->notify_me, 2); | ||
158 | } | ||
159 | - if (timeout) { | ||
160 | - aio_context_acquire(ctx); | ||
161 | - } | ||
162 | |||
163 | if (first) { | ||
164 | aio_notify_accept(ctx); | ||
165 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
166 | progress |= aio_dispatch_handlers(ctx, event); | ||
167 | } while (count > 0); | ||
168 | |||
169 | + aio_context_acquire(ctx); | ||
170 | progress |= timerlistgroup_run_timers(&ctx->tlg); | ||
171 | - | ||
172 | aio_context_release(ctx); | ||
173 | return progress; | ||
174 | } | ||
175 | diff --git a/util/async.c b/util/async.c | ||
176 | index XXXXXXX..XXXXXXX 100644 | ||
177 | --- a/util/async.c | ||
178 | +++ b/util/async.c | ||
179 | @@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx) | ||
180 | ret = 1; | ||
181 | } | ||
182 | bh->idle = 0; | ||
183 | + aio_context_acquire(ctx); | ||
184 | aio_bh_call(bh); | ||
185 | + aio_context_release(ctx); | ||
186 | } | ||
187 | if (bh->deleted) { | ||
188 | deleted = true; | ||
46 | -- | 189 | -- |
47 | 2.31.1 | 190 | 2.9.3 |
48 | 191 | ||
49 | 192 | diff view generated by jsdifflib |
1 | bdrv_co_block_status() does it for us, we do not need to do it here. | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | The advantage of not capping *pnum is that bdrv_co_block_status() can | 3 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
4 | cache larger data regions than requested by its caller. | 4 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
5 | 5 | Reviewed-by: Fam Zheng <famz@redhat.com> | |
6 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | 6 | Reviewed-by: Daniel P. Berrange <berrange@redhat.com> |
7 | Reviewed-by: Eric Blake <eblake@redhat.com> | 7 | Message-id: 20170213135235.12274-13-pbonzini@redhat.com |
8 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 8 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
9 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | ||
10 | Message-Id: <20210812084148.14458-7-hreitz@redhat.com> | ||
11 | --- | 9 | --- |
12 | block/iscsi.c | 3 --- | 10 | block/qed.h | 3 +++ |
13 | 1 file changed, 3 deletions(-) | 11 | block/curl.c | 2 ++ |
14 | 12 | block/io.c | 5 +++++ | |
13 | block/iscsi.c | 8 ++++++-- | ||
14 | block/null.c | 4 ++++ | ||
15 | block/qed.c | 12 ++++++++++++ | ||
16 | block/throttle-groups.c | 2 ++ | ||
17 | util/aio-posix.c | 2 -- | ||
18 | util/aio-win32.c | 2 -- | ||
19 | util/qemu-coroutine-sleep.c | 2 +- | ||
20 | 10 files changed, 35 insertions(+), 7 deletions(-) | ||
21 | |||
22 | diff --git a/block/qed.h b/block/qed.h | ||
23 | index XXXXXXX..XXXXXXX 100644 | ||
24 | --- a/block/qed.h | ||
25 | +++ b/block/qed.h | ||
26 | @@ -XXX,XX +XXX,XX @@ enum { | ||
27 | */ | ||
28 | typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len); | ||
29 | |||
30 | +void qed_acquire(BDRVQEDState *s); | ||
31 | +void qed_release(BDRVQEDState *s); | ||
32 | + | ||
33 | /** | ||
34 | * Generic callback for chaining async callbacks | ||
35 | */ | ||
36 | diff --git a/block/curl.c b/block/curl.c | ||
37 | index XXXXXXX..XXXXXXX 100644 | ||
38 | --- a/block/curl.c | ||
39 | +++ b/block/curl.c | ||
40 | @@ -XXX,XX +XXX,XX @@ static void curl_multi_timeout_do(void *arg) | ||
41 | return; | ||
42 | } | ||
43 | |||
44 | + aio_context_acquire(s->aio_context); | ||
45 | curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running); | ||
46 | |||
47 | curl_multi_check_completion(s); | ||
48 | + aio_context_release(s->aio_context); | ||
49 | #else | ||
50 | abort(); | ||
51 | #endif | ||
52 | diff --git a/block/io.c b/block/io.c | ||
53 | index XXXXXXX..XXXXXXX 100644 | ||
54 | --- a/block/io.c | ||
55 | +++ b/block/io.c | ||
56 | @@ -XXX,XX +XXX,XX @@ void bdrv_aio_cancel(BlockAIOCB *acb) | ||
57 | if (acb->aiocb_info->get_aio_context) { | ||
58 | aio_poll(acb->aiocb_info->get_aio_context(acb), true); | ||
59 | } else if (acb->bs) { | ||
60 | + /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so | ||
61 | + * assert that we're not using an I/O thread. Thread-safe | ||
62 | + * code should use bdrv_aio_cancel_async exclusively. | ||
63 | + */ | ||
64 | + assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context()); | ||
65 | aio_poll(bdrv_get_aio_context(acb->bs), true); | ||
66 | } else { | ||
67 | abort(); | ||
15 | diff --git a/block/iscsi.c b/block/iscsi.c | 68 | diff --git a/block/iscsi.c b/block/iscsi.c |
16 | index XXXXXXX..XXXXXXX 100644 | 69 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/block/iscsi.c | 70 | --- a/block/iscsi.c |
18 | +++ b/block/iscsi.c | 71 | +++ b/block/iscsi.c |
19 | @@ -XXX,XX +XXX,XX @@ retry: | 72 | @@ -XXX,XX +XXX,XX @@ static void iscsi_retry_timer_expired(void *opaque) |
20 | iscsi_allocmap_set_allocated(iscsilun, offset, *pnum); | 73 | struct IscsiTask *iTask = opaque; |
21 | } | 74 | iTask->complete = 1; |
22 | 75 | if (iTask->co) { | |
23 | - if (*pnum > bytes) { | 76 | - qemu_coroutine_enter(iTask->co); |
24 | - *pnum = bytes; | 77 | + aio_co_wake(iTask->co); |
25 | - } | 78 | } |
26 | out_unlock: | 79 | } |
27 | qemu_mutex_unlock(&iscsilun->mutex); | 80 | |
28 | g_free(iTask.err_str); | 81 | @@ -XXX,XX +XXX,XX @@ static void iscsi_nop_timed_event(void *opaque) |
82 | { | ||
83 | IscsiLun *iscsilun = opaque; | ||
84 | |||
85 | + aio_context_acquire(iscsilun->aio_context); | ||
86 | if (iscsi_get_nops_in_flight(iscsilun->iscsi) >= MAX_NOP_FAILURES) { | ||
87 | error_report("iSCSI: NOP timeout. Reconnecting..."); | ||
88 | iscsilun->request_timed_out = true; | ||
89 | } else if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) { | ||
90 | error_report("iSCSI: failed to sent NOP-Out. Disabling NOP messages."); | ||
91 | - return; | ||
92 | + goto out; | ||
93 | } | ||
94 | |||
95 | timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); | ||
96 | iscsi_set_events(iscsilun); | ||
97 | + | ||
98 | +out: | ||
99 | + aio_context_release(iscsilun->aio_context); | ||
100 | } | ||
101 | |||
102 | static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp) | ||
103 | diff --git a/block/null.c b/block/null.c | ||
104 | index XXXXXXX..XXXXXXX 100644 | ||
105 | --- a/block/null.c | ||
106 | +++ b/block/null.c | ||
107 | @@ -XXX,XX +XXX,XX @@ static void null_bh_cb(void *opaque) | ||
108 | static void null_timer_cb(void *opaque) | ||
109 | { | ||
110 | NullAIOCB *acb = opaque; | ||
111 | + AioContext *ctx = bdrv_get_aio_context(acb->common.bs); | ||
112 | + | ||
113 | + aio_context_acquire(ctx); | ||
114 | acb->common.cb(acb->common.opaque, 0); | ||
115 | + aio_context_release(ctx); | ||
116 | timer_deinit(&acb->timer); | ||
117 | qemu_aio_unref(acb); | ||
118 | } | ||
119 | diff --git a/block/qed.c b/block/qed.c | ||
120 | index XXXXXXX..XXXXXXX 100644 | ||
121 | --- a/block/qed.c | ||
122 | +++ b/block/qed.c | ||
123 | @@ -XXX,XX +XXX,XX @@ static void qed_need_check_timer_cb(void *opaque) | ||
124 | |||
125 | trace_qed_need_check_timer_cb(s); | ||
126 | |||
127 | + qed_acquire(s); | ||
128 | qed_plug_allocating_write_reqs(s); | ||
129 | |||
130 | /* Ensure writes are on disk before clearing flag */ | ||
131 | bdrv_aio_flush(s->bs->file->bs, qed_clear_need_check, s); | ||
132 | + qed_release(s); | ||
133 | +} | ||
134 | + | ||
135 | +void qed_acquire(BDRVQEDState *s) | ||
136 | +{ | ||
137 | + aio_context_acquire(bdrv_get_aio_context(s->bs)); | ||
138 | +} | ||
139 | + | ||
140 | +void qed_release(BDRVQEDState *s) | ||
141 | +{ | ||
142 | + aio_context_release(bdrv_get_aio_context(s->bs)); | ||
143 | } | ||
144 | |||
145 | static void qed_start_need_check_timer(BDRVQEDState *s) | ||
146 | diff --git a/block/throttle-groups.c b/block/throttle-groups.c | ||
147 | index XXXXXXX..XXXXXXX 100644 | ||
148 | --- a/block/throttle-groups.c | ||
149 | +++ b/block/throttle-groups.c | ||
150 | @@ -XXX,XX +XXX,XX @@ static void timer_cb(BlockBackend *blk, bool is_write) | ||
151 | qemu_mutex_unlock(&tg->lock); | ||
152 | |||
153 | /* Run the request that was waiting for this timer */ | ||
154 | + aio_context_acquire(blk_get_aio_context(blk)); | ||
155 | empty_queue = !qemu_co_enter_next(&blkp->throttled_reqs[is_write]); | ||
156 | + aio_context_release(blk_get_aio_context(blk)); | ||
157 | |||
158 | /* If the request queue was empty then we have to take care of | ||
159 | * scheduling the next one */ | ||
160 | diff --git a/util/aio-posix.c b/util/aio-posix.c | ||
161 | index XXXXXXX..XXXXXXX 100644 | ||
162 | --- a/util/aio-posix.c | ||
163 | +++ b/util/aio-posix.c | ||
164 | @@ -XXX,XX +XXX,XX @@ bool aio_dispatch(AioContext *ctx, bool dispatch_fds) | ||
165 | } | ||
166 | |||
167 | /* Run our timers */ | ||
168 | - aio_context_acquire(ctx); | ||
169 | progress |= timerlistgroup_run_timers(&ctx->tlg); | ||
170 | - aio_context_release(ctx); | ||
171 | |||
172 | return progress; | ||
173 | } | ||
174 | diff --git a/util/aio-win32.c b/util/aio-win32.c | ||
175 | index XXXXXXX..XXXXXXX 100644 | ||
176 | --- a/util/aio-win32.c | ||
177 | +++ b/util/aio-win32.c | ||
178 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
179 | progress |= aio_dispatch_handlers(ctx, event); | ||
180 | } while (count > 0); | ||
181 | |||
182 | - aio_context_acquire(ctx); | ||
183 | progress |= timerlistgroup_run_timers(&ctx->tlg); | ||
184 | - aio_context_release(ctx); | ||
185 | return progress; | ||
186 | } | ||
187 | |||
188 | diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c | ||
189 | index XXXXXXX..XXXXXXX 100644 | ||
190 | --- a/util/qemu-coroutine-sleep.c | ||
191 | +++ b/util/qemu-coroutine-sleep.c | ||
192 | @@ -XXX,XX +XXX,XX @@ static void co_sleep_cb(void *opaque) | ||
193 | { | ||
194 | CoSleepCB *sleep_cb = opaque; | ||
195 | |||
196 | - qemu_coroutine_enter(sleep_cb->co); | ||
197 | + aio_co_wake(sleep_cb->co); | ||
198 | } | ||
199 | |||
200 | void coroutine_fn co_aio_sleep_ns(AioContext *ctx, QEMUClockType type, | ||
29 | -- | 201 | -- |
30 | 2.31.1 | 202 | 2.9.3 |
31 | 203 | ||
32 | 204 | diff view generated by jsdifflib |
1 | From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 3 | This covers both file descriptor callbacks and polling callbacks, |
4 | Reviewed-by: Eric Blake <eblake@redhat.com> | 4 | since they execute related code. |
5 | Tested-by: Kirill Tkhai <ktkhai@virtuozzo.com> | 5 | |
6 | Reviewed-by: Hanna Reitz <hreitz@redhat.com> | 6 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
7 | Message-Id: <20210914122454.141075-10-vsementsov@virtuozzo.com> | 7 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
8 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | 8 | Reviewed-by: Fam Zheng <famz@redhat.com> |
9 | Reviewed-by: Daniel P. Berrange <berrange@redhat.com> | ||
10 | Message-id: 20170213135235.12274-14-pbonzini@redhat.com | ||
11 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
9 | --- | 12 | --- |
10 | block/qcow2.h | 1 + | 13 | block/curl.c | 16 +++++++++++++--- |
11 | block/qcow2-refcount.c | 6 ++++++ | 14 | block/iscsi.c | 4 ++++ |
12 | 2 files changed, 7 insertions(+) | 15 | block/linux-aio.c | 4 ++++ |
16 | block/nfs.c | 6 ++++++ | ||
17 | block/sheepdog.c | 29 +++++++++++++++-------------- | ||
18 | block/ssh.c | 29 +++++++++-------------------- | ||
19 | block/win32-aio.c | 10 ++++++---- | ||
20 | hw/block/virtio-blk.c | 5 ++++- | ||
21 | hw/scsi/virtio-scsi.c | 7 +++++++ | ||
22 | util/aio-posix.c | 7 ------- | ||
23 | util/aio-win32.c | 6 ------ | ||
24 | 11 files changed, 68 insertions(+), 55 deletions(-) | ||
13 | 25 | ||
14 | diff --git a/block/qcow2.h b/block/qcow2.h | 26 | diff --git a/block/curl.c b/block/curl.c |
15 | index XXXXXXX..XXXXXXX 100644 | 27 | index XXXXXXX..XXXXXXX 100644 |
16 | --- a/block/qcow2.h | 28 | --- a/block/curl.c |
17 | +++ b/block/qcow2.h | 29 | +++ b/block/curl.c |
18 | @@ -XXX,XX +XXX,XX @@ typedef enum QCow2MetadataOverlap { | 30 | @@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s) |
19 | (QCOW2_OL_CACHED | QCOW2_OL_INACTIVE_L2) | 31 | } |
20 | 32 | } | |
21 | #define L1E_OFFSET_MASK 0x00fffffffffffe00ULL | 33 | |
22 | +#define L1E_RESERVED_MASK 0x7f000000000001ffULL | 34 | -static void curl_multi_do(void *arg) |
23 | #define L2E_OFFSET_MASK 0x00fffffffffffe00ULL | 35 | +static void curl_multi_do_locked(CURLState *s) |
24 | #define L2E_STD_RESERVED_MASK 0x3f000000000001feULL | 36 | { |
25 | 37 | - CURLState *s = (CURLState *)arg; | |
26 | diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c | 38 | CURLSocket *socket, *next_socket; |
27 | index XXXXXXX..XXXXXXX 100644 | 39 | int running; |
28 | --- a/block/qcow2-refcount.c | 40 | int r; |
29 | +++ b/block/qcow2-refcount.c | 41 | @@ -XXX,XX +XXX,XX @@ static void curl_multi_do(void *arg) |
30 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l1(BlockDriverState *bs, | 42 | } |
31 | continue; | 43 | } |
44 | |||
45 | +static void curl_multi_do(void *arg) | ||
46 | +{ | ||
47 | + CURLState *s = (CURLState *)arg; | ||
48 | + | ||
49 | + aio_context_acquire(s->s->aio_context); | ||
50 | + curl_multi_do_locked(s); | ||
51 | + aio_context_release(s->s->aio_context); | ||
52 | +} | ||
53 | + | ||
54 | static void curl_multi_read(void *arg) | ||
55 | { | ||
56 | CURLState *s = (CURLState *)arg; | ||
57 | |||
58 | - curl_multi_do(arg); | ||
59 | + aio_context_acquire(s->s->aio_context); | ||
60 | + curl_multi_do_locked(s); | ||
61 | curl_multi_check_completion(s->s); | ||
62 | + aio_context_release(s->s->aio_context); | ||
63 | } | ||
64 | |||
65 | static void curl_multi_timeout_do(void *arg) | ||
66 | diff --git a/block/iscsi.c b/block/iscsi.c | ||
67 | index XXXXXXX..XXXXXXX 100644 | ||
68 | --- a/block/iscsi.c | ||
69 | +++ b/block/iscsi.c | ||
70 | @@ -XXX,XX +XXX,XX @@ iscsi_process_read(void *arg) | ||
71 | IscsiLun *iscsilun = arg; | ||
72 | struct iscsi_context *iscsi = iscsilun->iscsi; | ||
73 | |||
74 | + aio_context_acquire(iscsilun->aio_context); | ||
75 | iscsi_service(iscsi, POLLIN); | ||
76 | iscsi_set_events(iscsilun); | ||
77 | + aio_context_release(iscsilun->aio_context); | ||
78 | } | ||
79 | |||
80 | static void | ||
81 | @@ -XXX,XX +XXX,XX @@ iscsi_process_write(void *arg) | ||
82 | IscsiLun *iscsilun = arg; | ||
83 | struct iscsi_context *iscsi = iscsilun->iscsi; | ||
84 | |||
85 | + aio_context_acquire(iscsilun->aio_context); | ||
86 | iscsi_service(iscsi, POLLOUT); | ||
87 | iscsi_set_events(iscsilun); | ||
88 | + aio_context_release(iscsilun->aio_context); | ||
89 | } | ||
90 | |||
91 | static int64_t sector_lun2qemu(int64_t sector, IscsiLun *iscsilun) | ||
92 | diff --git a/block/linux-aio.c b/block/linux-aio.c | ||
93 | index XXXXXXX..XXXXXXX 100644 | ||
94 | --- a/block/linux-aio.c | ||
95 | +++ b/block/linux-aio.c | ||
96 | @@ -XXX,XX +XXX,XX @@ static void qemu_laio_completion_cb(EventNotifier *e) | ||
97 | LinuxAioState *s = container_of(e, LinuxAioState, e); | ||
98 | |||
99 | if (event_notifier_test_and_clear(&s->e)) { | ||
100 | + aio_context_acquire(s->aio_context); | ||
101 | qemu_laio_process_completions_and_submit(s); | ||
102 | + aio_context_release(s->aio_context); | ||
103 | } | ||
104 | } | ||
105 | |||
106 | @@ -XXX,XX +XXX,XX @@ static bool qemu_laio_poll_cb(void *opaque) | ||
107 | return false; | ||
108 | } | ||
109 | |||
110 | + aio_context_acquire(s->aio_context); | ||
111 | qemu_laio_process_completions_and_submit(s); | ||
112 | + aio_context_release(s->aio_context); | ||
113 | return true; | ||
114 | } | ||
115 | |||
116 | diff --git a/block/nfs.c b/block/nfs.c | ||
117 | index XXXXXXX..XXXXXXX 100644 | ||
118 | --- a/block/nfs.c | ||
119 | +++ b/block/nfs.c | ||
120 | @@ -XXX,XX +XXX,XX @@ static void nfs_set_events(NFSClient *client) | ||
121 | static void nfs_process_read(void *arg) | ||
122 | { | ||
123 | NFSClient *client = arg; | ||
124 | + | ||
125 | + aio_context_acquire(client->aio_context); | ||
126 | nfs_service(client->context, POLLIN); | ||
127 | nfs_set_events(client); | ||
128 | + aio_context_release(client->aio_context); | ||
129 | } | ||
130 | |||
131 | static void nfs_process_write(void *arg) | ||
132 | { | ||
133 | NFSClient *client = arg; | ||
134 | + | ||
135 | + aio_context_acquire(client->aio_context); | ||
136 | nfs_service(client->context, POLLOUT); | ||
137 | nfs_set_events(client); | ||
138 | + aio_context_release(client->aio_context); | ||
139 | } | ||
140 | |||
141 | static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task) | ||
142 | diff --git a/block/sheepdog.c b/block/sheepdog.c | ||
143 | index XXXXXXX..XXXXXXX 100644 | ||
144 | --- a/block/sheepdog.c | ||
145 | +++ b/block/sheepdog.c | ||
146 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data, | ||
147 | return ret; | ||
148 | } | ||
149 | |||
150 | -static void restart_co_req(void *opaque) | ||
151 | -{ | ||
152 | - Coroutine *co = opaque; | ||
153 | - | ||
154 | - qemu_coroutine_enter(co); | ||
155 | -} | ||
156 | - | ||
157 | typedef struct SheepdogReqCo { | ||
158 | int sockfd; | ||
159 | BlockDriverState *bs; | ||
160 | @@ -XXX,XX +XXX,XX @@ typedef struct SheepdogReqCo { | ||
161 | unsigned int *rlen; | ||
162 | int ret; | ||
163 | bool finished; | ||
164 | + Coroutine *co; | ||
165 | } SheepdogReqCo; | ||
166 | |||
167 | +static void restart_co_req(void *opaque) | ||
168 | +{ | ||
169 | + SheepdogReqCo *srco = opaque; | ||
170 | + | ||
171 | + aio_co_wake(srco->co); | ||
172 | +} | ||
173 | + | ||
174 | static coroutine_fn void do_co_req(void *opaque) | ||
175 | { | ||
176 | int ret; | ||
177 | - Coroutine *co; | ||
178 | SheepdogReqCo *srco = opaque; | ||
179 | int sockfd = srco->sockfd; | ||
180 | SheepdogReq *hdr = srco->hdr; | ||
181 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn void do_co_req(void *opaque) | ||
182 | unsigned int *wlen = srco->wlen; | ||
183 | unsigned int *rlen = srco->rlen; | ||
184 | |||
185 | - co = qemu_coroutine_self(); | ||
186 | + srco->co = qemu_coroutine_self(); | ||
187 | aio_set_fd_handler(srco->aio_context, sockfd, false, | ||
188 | - NULL, restart_co_req, NULL, co); | ||
189 | + NULL, restart_co_req, NULL, srco); | ||
190 | |||
191 | ret = send_co_req(sockfd, hdr, data, wlen); | ||
192 | if (ret < 0) { | ||
193 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn void do_co_req(void *opaque) | ||
194 | } | ||
195 | |||
196 | aio_set_fd_handler(srco->aio_context, sockfd, false, | ||
197 | - restart_co_req, NULL, NULL, co); | ||
198 | + restart_co_req, NULL, NULL, srco); | ||
199 | |||
200 | ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr)); | ||
201 | if (ret != sizeof(*hdr)) { | ||
202 | @@ -XXX,XX +XXX,XX @@ out: | ||
203 | aio_set_fd_handler(srco->aio_context, sockfd, false, | ||
204 | NULL, NULL, NULL, NULL); | ||
205 | |||
206 | + srco->co = NULL; | ||
207 | srco->ret = ret; | ||
208 | srco->finished = true; | ||
209 | if (srco->bs) { | ||
210 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn aio_read_response(void *opaque) | ||
211 | * We've finished all requests which belong to the AIOCB, so | ||
212 | * we can switch back to sd_co_readv/writev now. | ||
213 | */ | ||
214 | - qemu_coroutine_enter(acb->coroutine); | ||
215 | + aio_co_wake(acb->coroutine); | ||
216 | } | ||
217 | |||
218 | return; | ||
219 | @@ -XXX,XX +XXX,XX @@ static void co_read_response(void *opaque) | ||
220 | s->co_recv = qemu_coroutine_create(aio_read_response, opaque); | ||
221 | } | ||
222 | |||
223 | - qemu_coroutine_enter(s->co_recv); | ||
224 | + aio_co_wake(s->co_recv); | ||
225 | } | ||
226 | |||
227 | static void co_write_request(void *opaque) | ||
228 | { | ||
229 | BDRVSheepdogState *s = opaque; | ||
230 | |||
231 | - qemu_coroutine_enter(s->co_send); | ||
232 | + aio_co_wake(s->co_send); | ||
233 | } | ||
234 | |||
235 | /* | ||
236 | diff --git a/block/ssh.c b/block/ssh.c | ||
237 | index XXXXXXX..XXXXXXX 100644 | ||
238 | --- a/block/ssh.c | ||
239 | +++ b/block/ssh.c | ||
240 | @@ -XXX,XX +XXX,XX @@ static void restart_coroutine(void *opaque) | ||
241 | |||
242 | DPRINTF("co=%p", co); | ||
243 | |||
244 | - qemu_coroutine_enter(co); | ||
245 | + aio_co_wake(co); | ||
246 | } | ||
247 | |||
248 | -static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs) | ||
249 | +/* A non-blocking call returned EAGAIN, so yield, ensuring the | ||
250 | + * handlers are set up so that we'll be rescheduled when there is an | ||
251 | + * interesting event on the socket. | ||
252 | + */ | ||
253 | +static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs) | ||
254 | { | ||
255 | int r; | ||
256 | IOHandler *rd_handler = NULL, *wr_handler = NULL; | ||
257 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs) | ||
258 | |||
259 | aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, | ||
260 | false, rd_handler, wr_handler, NULL, co); | ||
261 | -} | ||
262 | - | ||
263 | -static coroutine_fn void clear_fd_handler(BDRVSSHState *s, | ||
264 | - BlockDriverState *bs) | ||
265 | -{ | ||
266 | - DPRINTF("s->sock=%d", s->sock); | ||
267 | - aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, | ||
268 | - false, NULL, NULL, NULL, NULL); | ||
269 | -} | ||
270 | - | ||
271 | -/* A non-blocking call returned EAGAIN, so yield, ensuring the | ||
272 | - * handlers are set up so that we'll be rescheduled when there is an | ||
273 | - * interesting event on the socket. | ||
274 | - */ | ||
275 | -static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs) | ||
276 | -{ | ||
277 | - set_fd_handler(s, bs); | ||
278 | qemu_coroutine_yield(); | ||
279 | - clear_fd_handler(s, bs); | ||
280 | + DPRINTF("s->sock=%d - back", s->sock); | ||
281 | + aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, false, | ||
282 | + NULL, NULL, NULL, NULL); | ||
283 | } | ||
284 | |||
285 | /* SFTP has a function `libssh2_sftp_seek64' which seeks to a position | ||
286 | diff --git a/block/win32-aio.c b/block/win32-aio.c | ||
287 | index XXXXXXX..XXXXXXX 100644 | ||
288 | --- a/block/win32-aio.c | ||
289 | +++ b/block/win32-aio.c | ||
290 | @@ -XXX,XX +XXX,XX @@ struct QEMUWin32AIOState { | ||
291 | HANDLE hIOCP; | ||
292 | EventNotifier e; | ||
293 | int count; | ||
294 | - bool is_aio_context_attached; | ||
295 | + AioContext *aio_ctx; | ||
296 | }; | ||
297 | |||
298 | typedef struct QEMUWin32AIOCB { | ||
299 | @@ -XXX,XX +XXX,XX @@ static void win32_aio_process_completion(QEMUWin32AIOState *s, | ||
300 | } | ||
301 | |||
302 | |||
303 | + aio_context_acquire(s->aio_ctx); | ||
304 | waiocb->common.cb(waiocb->common.opaque, ret); | ||
305 | + aio_context_release(s->aio_ctx); | ||
306 | qemu_aio_unref(waiocb); | ||
307 | } | ||
308 | |||
309 | @@ -XXX,XX +XXX,XX @@ void win32_aio_detach_aio_context(QEMUWin32AIOState *aio, | ||
310 | AioContext *old_context) | ||
311 | { | ||
312 | aio_set_event_notifier(old_context, &aio->e, false, NULL, NULL); | ||
313 | - aio->is_aio_context_attached = false; | ||
314 | + aio->aio_ctx = NULL; | ||
315 | } | ||
316 | |||
317 | void win32_aio_attach_aio_context(QEMUWin32AIOState *aio, | ||
318 | AioContext *new_context) | ||
319 | { | ||
320 | - aio->is_aio_context_attached = true; | ||
321 | + aio->aio_ctx = new_context; | ||
322 | aio_set_event_notifier(new_context, &aio->e, false, | ||
323 | win32_aio_completion_cb, NULL); | ||
324 | } | ||
325 | @@ -XXX,XX +XXX,XX @@ out_free_state: | ||
326 | |||
327 | void win32_aio_cleanup(QEMUWin32AIOState *aio) | ||
328 | { | ||
329 | - assert(!aio->is_aio_context_attached); | ||
330 | + assert(!aio->aio_ctx); | ||
331 | CloseHandle(aio->hIOCP); | ||
332 | event_notifier_cleanup(&aio->e); | ||
333 | g_free(aio); | ||
334 | diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c | ||
335 | index XXXXXXX..XXXXXXX 100644 | ||
336 | --- a/hw/block/virtio-blk.c | ||
337 | +++ b/hw/block/virtio-blk.c | ||
338 | @@ -XXX,XX +XXX,XX @@ static void virtio_blk_ioctl_complete(void *opaque, int status) | ||
339 | { | ||
340 | VirtIOBlockIoctlReq *ioctl_req = opaque; | ||
341 | VirtIOBlockReq *req = ioctl_req->req; | ||
342 | - VirtIODevice *vdev = VIRTIO_DEVICE(req->dev); | ||
343 | + VirtIOBlock *s = req->dev; | ||
344 | + VirtIODevice *vdev = VIRTIO_DEVICE(s); | ||
345 | struct virtio_scsi_inhdr *scsi; | ||
346 | struct sg_io_hdr *hdr; | ||
347 | |||
348 | @@ -XXX,XX +XXX,XX @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq) | ||
349 | MultiReqBuffer mrb = {}; | ||
350 | bool progress = false; | ||
351 | |||
352 | + aio_context_acquire(blk_get_aio_context(s->blk)); | ||
353 | blk_io_plug(s->blk); | ||
354 | |||
355 | do { | ||
356 | @@ -XXX,XX +XXX,XX @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq) | ||
357 | } | ||
358 | |||
359 | blk_io_unplug(s->blk); | ||
360 | + aio_context_release(blk_get_aio_context(s->blk)); | ||
361 | return progress; | ||
362 | } | ||
363 | |||
364 | diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c | ||
365 | index XXXXXXX..XXXXXXX 100644 | ||
366 | --- a/hw/scsi/virtio-scsi.c | ||
367 | +++ b/hw/scsi/virtio-scsi.c | ||
368 | @@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq) | ||
369 | VirtIOSCSIReq *req; | ||
370 | bool progress = false; | ||
371 | |||
372 | + virtio_scsi_acquire(s); | ||
373 | while ((req = virtio_scsi_pop_req(s, vq))) { | ||
374 | progress = true; | ||
375 | virtio_scsi_handle_ctrl_req(s, req); | ||
376 | } | ||
377 | + virtio_scsi_release(s); | ||
378 | return progress; | ||
379 | } | ||
380 | |||
381 | @@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) | ||
382 | |||
383 | QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs); | ||
384 | |||
385 | + virtio_scsi_acquire(s); | ||
386 | do { | ||
387 | virtio_queue_set_notification(vq, 0); | ||
388 | |||
389 | @@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) | ||
390 | QTAILQ_FOREACH_SAFE(req, &reqs, next, next) { | ||
391 | virtio_scsi_handle_cmd_req_submit(s, req); | ||
392 | } | ||
393 | + virtio_scsi_release(s); | ||
394 | return progress; | ||
395 | } | ||
396 | |||
397 | @@ -XXX,XX +XXX,XX @@ out: | ||
398 | |||
399 | bool virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq) | ||
400 | { | ||
401 | + virtio_scsi_acquire(s); | ||
402 | if (s->events_dropped) { | ||
403 | virtio_scsi_push_event(s, NULL, VIRTIO_SCSI_T_NO_EVENT, 0); | ||
404 | + virtio_scsi_release(s); | ||
405 | return true; | ||
406 | } | ||
407 | + virtio_scsi_release(s); | ||
408 | return false; | ||
409 | } | ||
410 | |||
411 | diff --git a/util/aio-posix.c b/util/aio-posix.c | ||
412 | index XXXXXXX..XXXXXXX 100644 | ||
413 | --- a/util/aio-posix.c | ||
414 | +++ b/util/aio-posix.c | ||
415 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx) | ||
416 | (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) && | ||
417 | aio_node_check(ctx, node->is_external) && | ||
418 | node->io_read) { | ||
419 | - aio_context_acquire(ctx); | ||
420 | node->io_read(node->opaque); | ||
421 | - aio_context_release(ctx); | ||
422 | |||
423 | /* aio_notify() does not count as progress */ | ||
424 | if (node->opaque != &ctx->notifier) { | ||
425 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx) | ||
426 | (revents & (G_IO_OUT | G_IO_ERR)) && | ||
427 | aio_node_check(ctx, node->is_external) && | ||
428 | node->io_write) { | ||
429 | - aio_context_acquire(ctx); | ||
430 | node->io_write(node->opaque); | ||
431 | - aio_context_release(ctx); | ||
432 | progress = true; | ||
32 | } | 433 | } |
33 | 434 | ||
34 | + if (l1_table[i] & L1E_RESERVED_MASK) { | 435 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) |
35 | + fprintf(stderr, "ERROR found L1 entry with reserved bits set: " | 436 | start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); |
36 | + "%" PRIx64 "\n", l1_table[i]); | 437 | } |
37 | + res->corruptions++; | 438 | |
38 | + } | 439 | - aio_context_acquire(ctx); |
39 | + | 440 | progress = try_poll_mode(ctx, blocking); |
40 | l2_offset = l1_table[i] & L1E_OFFSET_MASK; | 441 | - aio_context_release(ctx); |
41 | 442 | - | |
42 | /* Mark L2 table as used */ | 443 | if (!progress) { |
444 | assert(npfd == 0); | ||
445 | |||
446 | diff --git a/util/aio-win32.c b/util/aio-win32.c | ||
447 | index XXXXXXX..XXXXXXX 100644 | ||
448 | --- a/util/aio-win32.c | ||
449 | +++ b/util/aio-win32.c | ||
450 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event) | ||
451 | (revents || event_notifier_get_handle(node->e) == event) && | ||
452 | node->io_notify) { | ||
453 | node->pfd.revents = 0; | ||
454 | - aio_context_acquire(ctx); | ||
455 | node->io_notify(node->e); | ||
456 | - aio_context_release(ctx); | ||
457 | |||
458 | /* aio_notify() does not count as progress */ | ||
459 | if (node->e != &ctx->notifier) { | ||
460 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event) | ||
461 | (node->io_read || node->io_write)) { | ||
462 | node->pfd.revents = 0; | ||
463 | if ((revents & G_IO_IN) && node->io_read) { | ||
464 | - aio_context_acquire(ctx); | ||
465 | node->io_read(node->opaque); | ||
466 | - aio_context_release(ctx); | ||
467 | progress = true; | ||
468 | } | ||
469 | if ((revents & G_IO_OUT) && node->io_write) { | ||
470 | - aio_context_acquire(ctx); | ||
471 | node->io_write(node->opaque); | ||
472 | - aio_context_release(ctx); | ||
473 | progress = true; | ||
474 | } | ||
475 | |||
43 | -- | 476 | -- |
44 | 2.31.1 | 477 | 2.9.3 |
45 | 478 | ||
46 | 479 | diff view generated by jsdifflib |
1 | bdrv_co_block_status() does it for us, we do not need to do it here. | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | The advantage of not capping *pnum is that bdrv_co_block_status() can | 3 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
4 | cache larger data regions than requested by its caller. | 4 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
5 | Reviewed-by: Fam Zheng <famz@redhat.com> | ||
6 | Reviewed-by: Daniel P. Berrange <berrange@redhat.com> | ||
7 | Message-id: 20170213135235.12274-15-pbonzini@redhat.com | ||
8 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
9 | --- | ||
10 | block/archipelago.c | 3 +++ | ||
11 | block/blkreplay.c | 2 +- | ||
12 | block/block-backend.c | 6 ++++++ | ||
13 | block/curl.c | 26 ++++++++++++++++++-------- | ||
14 | block/gluster.c | 9 +-------- | ||
15 | block/io.c | 6 +++++- | ||
16 | block/iscsi.c | 6 +++++- | ||
17 | block/linux-aio.c | 15 +++++++++------ | ||
18 | block/nfs.c | 3 ++- | ||
19 | block/null.c | 4 ++++ | ||
20 | block/qed.c | 3 +++ | ||
21 | block/rbd.c | 4 ++++ | ||
22 | dma-helpers.c | 2 ++ | ||
23 | hw/block/virtio-blk.c | 2 ++ | ||
24 | hw/scsi/scsi-bus.c | 2 ++ | ||
25 | util/async.c | 4 ++-- | ||
26 | util/thread-pool.c | 2 ++ | ||
27 | 17 files changed, 71 insertions(+), 28 deletions(-) | ||
5 | 28 | ||
6 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | 29 | diff --git a/block/archipelago.c b/block/archipelago.c |
7 | Reviewed-by: Eric Blake <eblake@redhat.com> | 30 | index XXXXXXX..XXXXXXX 100644 |
8 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 31 | --- a/block/archipelago.c |
9 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | 32 | +++ b/block/archipelago.c |
10 | Message-Id: <20210812084148.14458-6-hreitz@redhat.com> | 33 | @@ -XXX,XX +XXX,XX @@ static void qemu_archipelago_complete_aio(void *opaque) |
11 | --- | 34 | { |
12 | block/gluster.c | 7 ++++--- | 35 | AIORequestData *reqdata = (AIORequestData *) opaque; |
13 | 1 file changed, 4 insertions(+), 3 deletions(-) | 36 | ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb; |
14 | 37 | + AioContext *ctx = bdrv_get_aio_context(aio_cb->common.bs); | |
38 | |||
39 | + aio_context_acquire(ctx); | ||
40 | aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret); | ||
41 | + aio_context_release(ctx); | ||
42 | aio_cb->status = 0; | ||
43 | |||
44 | qemu_aio_unref(aio_cb); | ||
45 | diff --git a/block/blkreplay.c b/block/blkreplay.c | ||
46 | index XXXXXXX..XXXXXXX 100755 | ||
47 | --- a/block/blkreplay.c | ||
48 | +++ b/block/blkreplay.c | ||
49 | @@ -XXX,XX +XXX,XX @@ static int64_t blkreplay_getlength(BlockDriverState *bs) | ||
50 | static void blkreplay_bh_cb(void *opaque) | ||
51 | { | ||
52 | Request *req = opaque; | ||
53 | - qemu_coroutine_enter(req->co); | ||
54 | + aio_co_wake(req->co); | ||
55 | qemu_bh_delete(req->bh); | ||
56 | g_free(req); | ||
57 | } | ||
58 | diff --git a/block/block-backend.c b/block/block-backend.c | ||
59 | index XXXXXXX..XXXXXXX 100644 | ||
60 | --- a/block/block-backend.c | ||
61 | +++ b/block/block-backend.c | ||
62 | @@ -XXX,XX +XXX,XX @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags) | ||
63 | static void error_callback_bh(void *opaque) | ||
64 | { | ||
65 | struct BlockBackendAIOCB *acb = opaque; | ||
66 | + AioContext *ctx = bdrv_get_aio_context(acb->common.bs); | ||
67 | |||
68 | bdrv_dec_in_flight(acb->common.bs); | ||
69 | + aio_context_acquire(ctx); | ||
70 | acb->common.cb(acb->common.opaque, acb->ret); | ||
71 | + aio_context_release(ctx); | ||
72 | qemu_aio_unref(acb); | ||
73 | } | ||
74 | |||
75 | @@ -XXX,XX +XXX,XX @@ static void blk_aio_complete(BlkAioEmAIOCB *acb) | ||
76 | static void blk_aio_complete_bh(void *opaque) | ||
77 | { | ||
78 | BlkAioEmAIOCB *acb = opaque; | ||
79 | + AioContext *ctx = bdrv_get_aio_context(acb->common.bs); | ||
80 | |||
81 | assert(acb->has_returned); | ||
82 | + aio_context_acquire(ctx); | ||
83 | blk_aio_complete(acb); | ||
84 | + aio_context_release(ctx); | ||
85 | } | ||
86 | |||
87 | static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes, | ||
88 | diff --git a/block/curl.c b/block/curl.c | ||
89 | index XXXXXXX..XXXXXXX 100644 | ||
90 | --- a/block/curl.c | ||
91 | +++ b/block/curl.c | ||
92 | @@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p) | ||
93 | { | ||
94 | CURLState *state; | ||
95 | int running; | ||
96 | + int ret = -EINPROGRESS; | ||
97 | |||
98 | CURLAIOCB *acb = p; | ||
99 | - BDRVCURLState *s = acb->common.bs->opaque; | ||
100 | + BlockDriverState *bs = acb->common.bs; | ||
101 | + BDRVCURLState *s = bs->opaque; | ||
102 | + AioContext *ctx = bdrv_get_aio_context(bs); | ||
103 | |||
104 | size_t start = acb->sector_num * BDRV_SECTOR_SIZE; | ||
105 | size_t end; | ||
106 | |||
107 | + aio_context_acquire(ctx); | ||
108 | + | ||
109 | // In case we have the requested data already (e.g. read-ahead), | ||
110 | // we can just call the callback and be done. | ||
111 | switch (curl_find_buf(s, start, acb->nb_sectors * BDRV_SECTOR_SIZE, acb)) { | ||
112 | @@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p) | ||
113 | qemu_aio_unref(acb); | ||
114 | // fall through | ||
115 | case FIND_RET_WAIT: | ||
116 | - return; | ||
117 | + goto out; | ||
118 | default: | ||
119 | break; | ||
120 | } | ||
121 | @@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p) | ||
122 | // No cache found, so let's start a new request | ||
123 | state = curl_init_state(acb->common.bs, s); | ||
124 | if (!state) { | ||
125 | - acb->common.cb(acb->common.opaque, -EIO); | ||
126 | - qemu_aio_unref(acb); | ||
127 | - return; | ||
128 | + ret = -EIO; | ||
129 | + goto out; | ||
130 | } | ||
131 | |||
132 | acb->start = 0; | ||
133 | @@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p) | ||
134 | state->orig_buf = g_try_malloc(state->buf_len); | ||
135 | if (state->buf_len && state->orig_buf == NULL) { | ||
136 | curl_clean_state(state); | ||
137 | - acb->common.cb(acb->common.opaque, -ENOMEM); | ||
138 | - qemu_aio_unref(acb); | ||
139 | - return; | ||
140 | + ret = -ENOMEM; | ||
141 | + goto out; | ||
142 | } | ||
143 | state->acb[0] = acb; | ||
144 | |||
145 | @@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p) | ||
146 | |||
147 | /* Tell curl it needs to kick things off */ | ||
148 | curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running); | ||
149 | + | ||
150 | +out: | ||
151 | + if (ret != -EINPROGRESS) { | ||
152 | + acb->common.cb(acb->common.opaque, ret); | ||
153 | + qemu_aio_unref(acb); | ||
154 | + } | ||
155 | + aio_context_release(ctx); | ||
156 | } | ||
157 | |||
158 | static BlockAIOCB *curl_aio_readv(BlockDriverState *bs, | ||
15 | diff --git a/block/gluster.c b/block/gluster.c | 159 | diff --git a/block/gluster.c b/block/gluster.c |
16 | index XXXXXXX..XXXXXXX 100644 | 160 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/block/gluster.c | 161 | --- a/block/gluster.c |
18 | +++ b/block/gluster.c | 162 | +++ b/block/gluster.c |
19 | @@ -XXX,XX +XXX,XX @@ exit: | 163 | @@ -XXX,XX +XXX,XX @@ static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf, |
20 | * the specified offset) that are known to be in the same | 164 | return qemu_gluster_glfs_init(gconf, errp); |
21 | * allocated/unallocated state. | 165 | } |
22 | * | 166 | |
23 | - * 'bytes' is the max value 'pnum' should be set to. | 167 | -static void qemu_gluster_complete_aio(void *opaque) |
24 | + * 'bytes' is a soft cap for 'pnum'. If the information is free, 'pnum' may | 168 | -{ |
25 | + * well exceed it. | 169 | - GlusterAIOCB *acb = (GlusterAIOCB *)opaque; |
26 | * | 170 | - |
27 | * (Based on raw_co_block_status() from file-posix.c.) | 171 | - qemu_coroutine_enter(acb->coroutine); |
172 | -} | ||
173 | - | ||
174 | /* | ||
175 | * AIO callback routine called from GlusterFS thread. | ||
28 | */ | 176 | */ |
29 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs, | 177 | @@ -XXX,XX +XXX,XX @@ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg) |
30 | } else if (data == offset) { | 178 | acb->ret = -EIO; /* Partial read/write - fail it */ |
31 | /* On a data extent, compute bytes to the end of the extent, | 179 | } |
32 | * possibly including a partial sector at EOF. */ | 180 | |
33 | - *pnum = MIN(bytes, hole - offset); | 181 | - aio_bh_schedule_oneshot(acb->aio_context, qemu_gluster_complete_aio, acb); |
34 | + *pnum = hole - offset; | 182 | + aio_co_schedule(acb->aio_context, acb->coroutine); |
35 | 183 | } | |
36 | /* | 184 | |
37 | * We are not allowed to return partial sectors, though, so | 185 | static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags) |
38 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs, | 186 | diff --git a/block/io.c b/block/io.c |
39 | } else { | 187 | index XXXXXXX..XXXXXXX 100644 |
40 | /* On a hole, compute bytes to the beginning of the next extent. */ | 188 | --- a/block/io.c |
41 | assert(hole == offset); | 189 | +++ b/block/io.c |
42 | - *pnum = MIN(bytes, data - offset); | 190 | @@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque) |
43 | + *pnum = data - offset; | 191 | bdrv_dec_in_flight(bs); |
44 | ret = BDRV_BLOCK_ZERO; | 192 | bdrv_drained_begin(bs); |
45 | } | 193 | data->done = true; |
46 | 194 | - qemu_coroutine_enter(co); | |
195 | + aio_co_wake(co); | ||
196 | } | ||
197 | |||
198 | static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs) | ||
199 | @@ -XXX,XX +XXX,XX @@ static void bdrv_co_complete(BlockAIOCBCoroutine *acb) | ||
200 | static void bdrv_co_em_bh(void *opaque) | ||
201 | { | ||
202 | BlockAIOCBCoroutine *acb = opaque; | ||
203 | + BlockDriverState *bs = acb->common.bs; | ||
204 | + AioContext *ctx = bdrv_get_aio_context(bs); | ||
205 | |||
206 | assert(!acb->need_bh); | ||
207 | + aio_context_acquire(ctx); | ||
208 | bdrv_co_complete(acb); | ||
209 | + aio_context_release(ctx); | ||
210 | } | ||
211 | |||
212 | static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) | ||
213 | diff --git a/block/iscsi.c b/block/iscsi.c | ||
214 | index XXXXXXX..XXXXXXX 100644 | ||
215 | --- a/block/iscsi.c | ||
216 | +++ b/block/iscsi.c | ||
217 | @@ -XXX,XX +XXX,XX @@ static void | ||
218 | iscsi_bh_cb(void *p) | ||
219 | { | ||
220 | IscsiAIOCB *acb = p; | ||
221 | + AioContext *ctx = bdrv_get_aio_context(acb->common.bs); | ||
222 | |||
223 | qemu_bh_delete(acb->bh); | ||
224 | |||
225 | g_free(acb->buf); | ||
226 | acb->buf = NULL; | ||
227 | |||
228 | + aio_context_acquire(ctx); | ||
229 | acb->common.cb(acb->common.opaque, acb->status); | ||
230 | + aio_context_release(ctx); | ||
231 | |||
232 | if (acb->task != NULL) { | ||
233 | scsi_free_scsi_task(acb->task); | ||
234 | @@ -XXX,XX +XXX,XX @@ iscsi_schedule_bh(IscsiAIOCB *acb) | ||
235 | static void iscsi_co_generic_bh_cb(void *opaque) | ||
236 | { | ||
237 | struct IscsiTask *iTask = opaque; | ||
238 | + | ||
239 | iTask->complete = 1; | ||
240 | - qemu_coroutine_enter(iTask->co); | ||
241 | + aio_co_wake(iTask->co); | ||
242 | } | ||
243 | |||
244 | static void iscsi_retry_timer_expired(void *opaque) | ||
245 | diff --git a/block/linux-aio.c b/block/linux-aio.c | ||
246 | index XXXXXXX..XXXXXXX 100644 | ||
247 | --- a/block/linux-aio.c | ||
248 | +++ b/block/linux-aio.c | ||
249 | @@ -XXX,XX +XXX,XX @@ struct LinuxAioState { | ||
250 | io_context_t ctx; | ||
251 | EventNotifier e; | ||
252 | |||
253 | - /* io queue for submit at batch */ | ||
254 | + /* io queue for submit at batch. Protected by AioContext lock. */ | ||
255 | LaioQueue io_q; | ||
256 | |||
257 | - /* I/O completion processing */ | ||
258 | + /* I/O completion processing. Only runs in I/O thread. */ | ||
259 | QEMUBH *completion_bh; | ||
260 | int event_idx; | ||
261 | int event_max; | ||
262 | @@ -XXX,XX +XXX,XX @@ static inline ssize_t io_event_ret(struct io_event *ev) | ||
263 | */ | ||
264 | static void qemu_laio_process_completion(struct qemu_laiocb *laiocb) | ||
265 | { | ||
266 | + LinuxAioState *s = laiocb->ctx; | ||
267 | int ret; | ||
268 | |||
269 | ret = laiocb->ret; | ||
270 | @@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb) | ||
271 | } | ||
272 | |||
273 | laiocb->ret = ret; | ||
274 | + aio_context_acquire(s->aio_context); | ||
275 | if (laiocb->co) { | ||
276 | /* If the coroutine is already entered it must be in ioq_submit() and | ||
277 | * will notice laio->ret has been filled in when it eventually runs | ||
278 | @@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb) | ||
279 | laiocb->common.cb(laiocb->common.opaque, ret); | ||
280 | qemu_aio_unref(laiocb); | ||
281 | } | ||
282 | + aio_context_release(s->aio_context); | ||
283 | } | ||
284 | |||
285 | /** | ||
286 | @@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completions(LinuxAioState *s) | ||
287 | static void qemu_laio_process_completions_and_submit(LinuxAioState *s) | ||
288 | { | ||
289 | qemu_laio_process_completions(s); | ||
290 | + | ||
291 | + aio_context_acquire(s->aio_context); | ||
292 | if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { | ||
293 | ioq_submit(s); | ||
294 | } | ||
295 | + aio_context_release(s->aio_context); | ||
296 | } | ||
297 | |||
298 | static void qemu_laio_completion_bh(void *opaque) | ||
299 | @@ -XXX,XX +XXX,XX @@ static void qemu_laio_completion_cb(EventNotifier *e) | ||
300 | LinuxAioState *s = container_of(e, LinuxAioState, e); | ||
301 | |||
302 | if (event_notifier_test_and_clear(&s->e)) { | ||
303 | - aio_context_acquire(s->aio_context); | ||
304 | qemu_laio_process_completions_and_submit(s); | ||
305 | - aio_context_release(s->aio_context); | ||
306 | } | ||
307 | } | ||
308 | |||
309 | @@ -XXX,XX +XXX,XX @@ static bool qemu_laio_poll_cb(void *opaque) | ||
310 | return false; | ||
311 | } | ||
312 | |||
313 | - aio_context_acquire(s->aio_context); | ||
314 | qemu_laio_process_completions_and_submit(s); | ||
315 | - aio_context_release(s->aio_context); | ||
316 | return true; | ||
317 | } | ||
318 | |||
319 | @@ -XXX,XX +XXX,XX @@ void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context) | ||
320 | { | ||
321 | aio_set_event_notifier(old_context, &s->e, false, NULL, NULL); | ||
322 | qemu_bh_delete(s->completion_bh); | ||
323 | + s->aio_context = NULL; | ||
324 | } | ||
325 | |||
326 | void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context) | ||
327 | diff --git a/block/nfs.c b/block/nfs.c | ||
328 | index XXXXXXX..XXXXXXX 100644 | ||
329 | --- a/block/nfs.c | ||
330 | +++ b/block/nfs.c | ||
331 | @@ -XXX,XX +XXX,XX @@ static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task) | ||
332 | static void nfs_co_generic_bh_cb(void *opaque) | ||
333 | { | ||
334 | NFSRPC *task = opaque; | ||
335 | + | ||
336 | task->complete = 1; | ||
337 | - qemu_coroutine_enter(task->co); | ||
338 | + aio_co_wake(task->co); | ||
339 | } | ||
340 | |||
341 | static void | ||
342 | diff --git a/block/null.c b/block/null.c | ||
343 | index XXXXXXX..XXXXXXX 100644 | ||
344 | --- a/block/null.c | ||
345 | +++ b/block/null.c | ||
346 | @@ -XXX,XX +XXX,XX @@ static const AIOCBInfo null_aiocb_info = { | ||
347 | static void null_bh_cb(void *opaque) | ||
348 | { | ||
349 | NullAIOCB *acb = opaque; | ||
350 | + AioContext *ctx = bdrv_get_aio_context(acb->common.bs); | ||
351 | + | ||
352 | + aio_context_acquire(ctx); | ||
353 | acb->common.cb(acb->common.opaque, 0); | ||
354 | + aio_context_release(ctx); | ||
355 | qemu_aio_unref(acb); | ||
356 | } | ||
357 | |||
358 | diff --git a/block/qed.c b/block/qed.c | ||
359 | index XXXXXXX..XXXXXXX 100644 | ||
360 | --- a/block/qed.c | ||
361 | +++ b/block/qed.c | ||
362 | @@ -XXX,XX +XXX,XX @@ static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index, | ||
363 | static void qed_aio_complete_bh(void *opaque) | ||
364 | { | ||
365 | QEDAIOCB *acb = opaque; | ||
366 | + BDRVQEDState *s = acb_to_s(acb); | ||
367 | BlockCompletionFunc *cb = acb->common.cb; | ||
368 | void *user_opaque = acb->common.opaque; | ||
369 | int ret = acb->bh_ret; | ||
370 | @@ -XXX,XX +XXX,XX @@ static void qed_aio_complete_bh(void *opaque) | ||
371 | qemu_aio_unref(acb); | ||
372 | |||
373 | /* Invoke callback */ | ||
374 | + qed_acquire(s); | ||
375 | cb(user_opaque, ret); | ||
376 | + qed_release(s); | ||
377 | } | ||
378 | |||
379 | static void qed_aio_complete(QEDAIOCB *acb, int ret) | ||
380 | diff --git a/block/rbd.c b/block/rbd.c | ||
381 | index XXXXXXX..XXXXXXX 100644 | ||
382 | --- a/block/rbd.c | ||
383 | +++ b/block/rbd.c | ||
384 | @@ -XXX,XX +XXX,XX @@ shutdown: | ||
385 | static void qemu_rbd_complete_aio(RADOSCB *rcb) | ||
386 | { | ||
387 | RBDAIOCB *acb = rcb->acb; | ||
388 | + AioContext *ctx = bdrv_get_aio_context(acb->common.bs); | ||
389 | int64_t r; | ||
390 | |||
391 | r = rcb->ret; | ||
392 | @@ -XXX,XX +XXX,XX @@ static void qemu_rbd_complete_aio(RADOSCB *rcb) | ||
393 | qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); | ||
394 | } | ||
395 | qemu_vfree(acb->bounce); | ||
396 | + | ||
397 | + aio_context_acquire(ctx); | ||
398 | acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); | ||
399 | + aio_context_release(ctx); | ||
400 | |||
401 | qemu_aio_unref(acb); | ||
402 | } | ||
403 | diff --git a/dma-helpers.c b/dma-helpers.c | ||
404 | index XXXXXXX..XXXXXXX 100644 | ||
405 | --- a/dma-helpers.c | ||
406 | +++ b/dma-helpers.c | ||
407 | @@ -XXX,XX +XXX,XX @@ static void dma_blk_cb(void *opaque, int ret) | ||
408 | QEMU_ALIGN_DOWN(dbs->iov.size, dbs->align)); | ||
409 | } | ||
410 | |||
411 | + aio_context_acquire(dbs->ctx); | ||
412 | dbs->acb = dbs->io_func(dbs->offset, &dbs->iov, | ||
413 | dma_blk_cb, dbs, dbs->io_func_opaque); | ||
414 | + aio_context_release(dbs->ctx); | ||
415 | assert(dbs->acb); | ||
416 | } | ||
417 | |||
418 | diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c | ||
419 | index XXXXXXX..XXXXXXX 100644 | ||
420 | --- a/hw/block/virtio-blk.c | ||
421 | +++ b/hw/block/virtio-blk.c | ||
422 | @@ -XXX,XX +XXX,XX @@ static void virtio_blk_dma_restart_bh(void *opaque) | ||
423 | |||
424 | s->rq = NULL; | ||
425 | |||
426 | + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); | ||
427 | while (req) { | ||
428 | VirtIOBlockReq *next = req->next; | ||
429 | if (virtio_blk_handle_request(req, &mrb)) { | ||
430 | @@ -XXX,XX +XXX,XX @@ static void virtio_blk_dma_restart_bh(void *opaque) | ||
431 | if (mrb.num_reqs) { | ||
432 | virtio_blk_submit_multireq(s->blk, &mrb); | ||
433 | } | ||
434 | + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); | ||
435 | } | ||
436 | |||
437 | static void virtio_blk_dma_restart_cb(void *opaque, int running, | ||
438 | diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c | ||
439 | index XXXXXXX..XXXXXXX 100644 | ||
440 | --- a/hw/scsi/scsi-bus.c | ||
441 | +++ b/hw/scsi/scsi-bus.c | ||
442 | @@ -XXX,XX +XXX,XX @@ static void scsi_dma_restart_bh(void *opaque) | ||
443 | qemu_bh_delete(s->bh); | ||
444 | s->bh = NULL; | ||
445 | |||
446 | + aio_context_acquire(blk_get_aio_context(s->conf.blk)); | ||
447 | QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) { | ||
448 | scsi_req_ref(req); | ||
449 | if (req->retry) { | ||
450 | @@ -XXX,XX +XXX,XX @@ static void scsi_dma_restart_bh(void *opaque) | ||
451 | } | ||
452 | scsi_req_unref(req); | ||
453 | } | ||
454 | + aio_context_release(blk_get_aio_context(s->conf.blk)); | ||
455 | } | ||
456 | |||
457 | void scsi_req_retry(SCSIRequest *req) | ||
458 | diff --git a/util/async.c b/util/async.c | ||
459 | index XXXXXXX..XXXXXXX 100644 | ||
460 | --- a/util/async.c | ||
461 | +++ b/util/async.c | ||
462 | @@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx) | ||
463 | ret = 1; | ||
464 | } | ||
465 | bh->idle = 0; | ||
466 | - aio_context_acquire(ctx); | ||
467 | aio_bh_call(bh); | ||
468 | - aio_context_release(ctx); | ||
469 | } | ||
470 | if (bh->deleted) { | ||
471 | deleted = true; | ||
472 | @@ -XXX,XX +XXX,XX @@ static void co_schedule_bh_cb(void *opaque) | ||
473 | Coroutine *co = QSLIST_FIRST(&straight); | ||
474 | QSLIST_REMOVE_HEAD(&straight, co_scheduled_next); | ||
475 | trace_aio_co_schedule_bh_cb(ctx, co); | ||
476 | + aio_context_acquire(ctx); | ||
477 | qemu_coroutine_enter(co); | ||
478 | + aio_context_release(ctx); | ||
479 | } | ||
480 | } | ||
481 | |||
482 | diff --git a/util/thread-pool.c b/util/thread-pool.c | ||
483 | index XXXXXXX..XXXXXXX 100644 | ||
484 | --- a/util/thread-pool.c | ||
485 | +++ b/util/thread-pool.c | ||
486 | @@ -XXX,XX +XXX,XX @@ static void thread_pool_completion_bh(void *opaque) | ||
487 | ThreadPool *pool = opaque; | ||
488 | ThreadPoolElement *elem, *next; | ||
489 | |||
490 | + aio_context_acquire(pool->ctx); | ||
491 | restart: | ||
492 | QLIST_FOREACH_SAFE(elem, &pool->head, all, next) { | ||
493 | if (elem->state != THREAD_DONE) { | ||
494 | @@ -XXX,XX +XXX,XX @@ restart: | ||
495 | qemu_aio_unref(elem); | ||
496 | } | ||
497 | } | ||
498 | + aio_context_release(pool->ctx); | ||
499 | } | ||
500 | |||
501 | static void thread_pool_cancel(BlockAIOCB *acb) | ||
47 | -- | 502 | -- |
48 | 2.31.1 | 503 | 2.9.3 |
49 | 504 | ||
50 | 505 | diff view generated by jsdifflib |
1 | From: Stefano Garzarella <sgarzare@redhat.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | In mirror_iteration() we call mirror_wait_on_conflicts() with | 3 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
4 | `self` parameter set to NULL. | 4 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
5 | Reviewed-by: Fam Zheng <famz@redhat.com> | ||
6 | Reviewed-by: Daniel P. Berrange <berrange@redhat.com> | ||
7 | Message-id: 20170213135235.12274-16-pbonzini@redhat.com | ||
8 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
9 | --- | ||
10 | block/archipelago.c | 3 --- | ||
11 | block/block-backend.c | 7 ------- | ||
12 | block/curl.c | 2 +- | ||
13 | block/io.c | 6 +----- | ||
14 | block/iscsi.c | 3 --- | ||
15 | block/linux-aio.c | 5 +---- | ||
16 | block/mirror.c | 12 +++++++++--- | ||
17 | block/null.c | 8 -------- | ||
18 | block/qed-cluster.c | 2 ++ | ||
19 | block/qed-table.c | 12 ++++++++++-- | ||
20 | block/qed.c | 4 ++-- | ||
21 | block/rbd.c | 4 ---- | ||
22 | block/win32-aio.c | 3 --- | ||
23 | hw/block/virtio-blk.c | 12 +++++++++++- | ||
24 | hw/scsi/scsi-disk.c | 15 +++++++++++++++ | ||
25 | hw/scsi/scsi-generic.c | 20 +++++++++++++++++--- | ||
26 | util/thread-pool.c | 4 +++- | ||
27 | 17 files changed, 72 insertions(+), 50 deletions(-) | ||
5 | 28 | ||
6 | Starting from commit d44dae1a7c we dereference `self` pointer in | 29 | diff --git a/block/archipelago.c b/block/archipelago.c |
7 | mirror_wait_on_conflicts() without checks if it is not NULL. | 30 | index XXXXXXX..XXXXXXX 100644 |
8 | 31 | --- a/block/archipelago.c | |
9 | Backtrace: | 32 | +++ b/block/archipelago.c |
10 | Program terminated with signal SIGSEGV, Segmentation fault. | 33 | @@ -XXX,XX +XXX,XX @@ static void qemu_archipelago_complete_aio(void *opaque) |
11 | #0 mirror_wait_on_conflicts (self=0x0, s=<optimized out>, offset=<optimized out>, bytes=<optimized out>) | 34 | { |
12 | at ../block/mirror.c:172 | 35 | AIORequestData *reqdata = (AIORequestData *) opaque; |
13 | 172 self->waiting_for_op = op; | 36 | ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb; |
14 | [Current thread is 1 (Thread 0x7f0908931ec0 (LWP 380249))] | 37 | - AioContext *ctx = bdrv_get_aio_context(aio_cb->common.bs); |
15 | (gdb) bt | 38 | |
16 | #0 mirror_wait_on_conflicts (self=0x0, s=<optimized out>, offset=<optimized out>, bytes=<optimized out>) | 39 | - aio_context_acquire(ctx); |
17 | at ../block/mirror.c:172 | 40 | aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret); |
18 | #1 0x00005610c5d9d631 in mirror_run (job=0x5610c76a2c00, errp=<optimized out>) at ../block/mirror.c:491 | 41 | - aio_context_release(ctx); |
19 | #2 0x00005610c5d58726 in job_co_entry (opaque=0x5610c76a2c00) at ../job.c:917 | 42 | aio_cb->status = 0; |
20 | #3 0x00005610c5f046c6 in coroutine_trampoline (i0=<optimized out>, i1=<optimized out>) | 43 | |
21 | at ../util/coroutine-ucontext.c:173 | 44 | qemu_aio_unref(aio_cb); |
22 | #4 0x00007f0909975820 in ?? () at ../sysdeps/unix/sysv/linux/x86_64/__start_context.S:91 | 45 | diff --git a/block/block-backend.c b/block/block-backend.c |
23 | from /usr/lib64/libc.so.6 | 46 | index XXXXXXX..XXXXXXX 100644 |
24 | 47 | --- a/block/block-backend.c | |
25 | Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2001404 | 48 | +++ b/block/block-backend.c |
26 | Fixes: d44dae1a7c ("block/mirror: fix active mirror dead-lock in mirror_wait_on_conflicts") | 49 | @@ -XXX,XX +XXX,XX @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags) |
27 | Signed-off-by: Stefano Garzarella <sgarzare@redhat.com> | 50 | static void error_callback_bh(void *opaque) |
28 | Message-Id: <20210910124533.288318-1-sgarzare@redhat.com> | 51 | { |
29 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 52 | struct BlockBackendAIOCB *acb = opaque; |
30 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | 53 | - AioContext *ctx = bdrv_get_aio_context(acb->common.bs); |
31 | --- | 54 | |
32 | block/mirror.c | 25 ++++++++++++++++--------- | 55 | bdrv_dec_in_flight(acb->common.bs); |
33 | 1 file changed, 16 insertions(+), 9 deletions(-) | 56 | - aio_context_acquire(ctx); |
34 | 57 | acb->common.cb(acb->common.opaque, acb->ret); | |
58 | - aio_context_release(ctx); | ||
59 | qemu_aio_unref(acb); | ||
60 | } | ||
61 | |||
62 | @@ -XXX,XX +XXX,XX @@ static void blk_aio_complete(BlkAioEmAIOCB *acb) | ||
63 | static void blk_aio_complete_bh(void *opaque) | ||
64 | { | ||
65 | BlkAioEmAIOCB *acb = opaque; | ||
66 | - AioContext *ctx = bdrv_get_aio_context(acb->common.bs); | ||
67 | - | ||
68 | assert(acb->has_returned); | ||
69 | - aio_context_acquire(ctx); | ||
70 | blk_aio_complete(acb); | ||
71 | - aio_context_release(ctx); | ||
72 | } | ||
73 | |||
74 | static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes, | ||
75 | diff --git a/block/curl.c b/block/curl.c | ||
76 | index XXXXXXX..XXXXXXX 100644 | ||
77 | --- a/block/curl.c | ||
78 | +++ b/block/curl.c | ||
79 | @@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p) | ||
80 | curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running); | ||
81 | |||
82 | out: | ||
83 | + aio_context_release(ctx); | ||
84 | if (ret != -EINPROGRESS) { | ||
85 | acb->common.cb(acb->common.opaque, ret); | ||
86 | qemu_aio_unref(acb); | ||
87 | } | ||
88 | - aio_context_release(ctx); | ||
89 | } | ||
90 | |||
91 | static BlockAIOCB *curl_aio_readv(BlockDriverState *bs, | ||
92 | diff --git a/block/io.c b/block/io.c | ||
93 | index XXXXXXX..XXXXXXX 100644 | ||
94 | --- a/block/io.c | ||
95 | +++ b/block/io.c | ||
96 | @@ -XXX,XX +XXX,XX @@ static void bdrv_co_io_em_complete(void *opaque, int ret) | ||
97 | CoroutineIOCompletion *co = opaque; | ||
98 | |||
99 | co->ret = ret; | ||
100 | - qemu_coroutine_enter(co->coroutine); | ||
101 | + aio_co_wake(co->coroutine); | ||
102 | } | ||
103 | |||
104 | static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs, | ||
105 | @@ -XXX,XX +XXX,XX @@ static void bdrv_co_complete(BlockAIOCBCoroutine *acb) | ||
106 | static void bdrv_co_em_bh(void *opaque) | ||
107 | { | ||
108 | BlockAIOCBCoroutine *acb = opaque; | ||
109 | - BlockDriverState *bs = acb->common.bs; | ||
110 | - AioContext *ctx = bdrv_get_aio_context(bs); | ||
111 | |||
112 | assert(!acb->need_bh); | ||
113 | - aio_context_acquire(ctx); | ||
114 | bdrv_co_complete(acb); | ||
115 | - aio_context_release(ctx); | ||
116 | } | ||
117 | |||
118 | static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) | ||
119 | diff --git a/block/iscsi.c b/block/iscsi.c | ||
120 | index XXXXXXX..XXXXXXX 100644 | ||
121 | --- a/block/iscsi.c | ||
122 | +++ b/block/iscsi.c | ||
123 | @@ -XXX,XX +XXX,XX @@ static void | ||
124 | iscsi_bh_cb(void *p) | ||
125 | { | ||
126 | IscsiAIOCB *acb = p; | ||
127 | - AioContext *ctx = bdrv_get_aio_context(acb->common.bs); | ||
128 | |||
129 | qemu_bh_delete(acb->bh); | ||
130 | |||
131 | g_free(acb->buf); | ||
132 | acb->buf = NULL; | ||
133 | |||
134 | - aio_context_acquire(ctx); | ||
135 | acb->common.cb(acb->common.opaque, acb->status); | ||
136 | - aio_context_release(ctx); | ||
137 | |||
138 | if (acb->task != NULL) { | ||
139 | scsi_free_scsi_task(acb->task); | ||
140 | diff --git a/block/linux-aio.c b/block/linux-aio.c | ||
141 | index XXXXXXX..XXXXXXX 100644 | ||
142 | --- a/block/linux-aio.c | ||
143 | +++ b/block/linux-aio.c | ||
144 | @@ -XXX,XX +XXX,XX @@ static inline ssize_t io_event_ret(struct io_event *ev) | ||
145 | */ | ||
146 | static void qemu_laio_process_completion(struct qemu_laiocb *laiocb) | ||
147 | { | ||
148 | - LinuxAioState *s = laiocb->ctx; | ||
149 | int ret; | ||
150 | |||
151 | ret = laiocb->ret; | ||
152 | @@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb) | ||
153 | } | ||
154 | |||
155 | laiocb->ret = ret; | ||
156 | - aio_context_acquire(s->aio_context); | ||
157 | if (laiocb->co) { | ||
158 | /* If the coroutine is already entered it must be in ioq_submit() and | ||
159 | * will notice laio->ret has been filled in when it eventually runs | ||
160 | @@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb) | ||
161 | * that! | ||
162 | */ | ||
163 | if (!qemu_coroutine_entered(laiocb->co)) { | ||
164 | - qemu_coroutine_enter(laiocb->co); | ||
165 | + aio_co_wake(laiocb->co); | ||
166 | } | ||
167 | } else { | ||
168 | laiocb->common.cb(laiocb->common.opaque, ret); | ||
169 | qemu_aio_unref(laiocb); | ||
170 | } | ||
171 | - aio_context_release(s->aio_context); | ||
172 | } | ||
173 | |||
174 | /** | ||
35 | diff --git a/block/mirror.c b/block/mirror.c | 175 | diff --git a/block/mirror.c b/block/mirror.c |
36 | index XXXXXXX..XXXXXXX 100644 | 176 | index XXXXXXX..XXXXXXX 100644 |
37 | --- a/block/mirror.c | 177 | --- a/block/mirror.c |
38 | +++ b/block/mirror.c | 178 | +++ b/block/mirror.c |
39 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_wait_on_conflicts(MirrorOp *self, | 179 | @@ -XXX,XX +XXX,XX @@ static void mirror_write_complete(void *opaque, int ret) |
40 | if (ranges_overlap(self_start_chunk, self_nb_chunks, | 180 | { |
41 | op_start_chunk, op_nb_chunks)) | 181 | MirrorOp *op = opaque; |
42 | { | 182 | MirrorBlockJob *s = op->s; |
43 | - /* | 183 | + |
44 | - * If the operation is already (indirectly) waiting for us, or | 184 | + aio_context_acquire(blk_get_aio_context(s->common.blk)); |
45 | - * will wait for us as soon as it wakes up, then just go on | 185 | if (ret < 0) { |
46 | - * (instead of producing a deadlock in the former case). | 186 | BlockErrorAction action; |
47 | - */ | 187 | |
48 | - if (op->waiting_for_op) { | 188 | @@ -XXX,XX +XXX,XX @@ static void mirror_write_complete(void *opaque, int ret) |
49 | - continue; | ||
50 | + if (self) { | ||
51 | + /* | ||
52 | + * If the operation is already (indirectly) waiting for us, | ||
53 | + * or will wait for us as soon as it wakes up, then just go | ||
54 | + * on (instead of producing a deadlock in the former case). | ||
55 | + */ | ||
56 | + if (op->waiting_for_op) { | ||
57 | + continue; | ||
58 | + } | ||
59 | + | ||
60 | + self->waiting_for_op = op; | ||
61 | } | ||
62 | |||
63 | - self->waiting_for_op = op; | ||
64 | qemu_co_queue_wait(&op->waiting_requests, NULL); | ||
65 | - self->waiting_for_op = NULL; | ||
66 | + | ||
67 | + if (self) { | ||
68 | + self->waiting_for_op = NULL; | ||
69 | + } | ||
70 | + | ||
71 | break; | ||
72 | } | ||
73 | } | 189 | } |
190 | } | ||
191 | mirror_iteration_done(op, ret); | ||
192 | + aio_context_release(blk_get_aio_context(s->common.blk)); | ||
193 | } | ||
194 | |||
195 | static void mirror_read_complete(void *opaque, int ret) | ||
196 | { | ||
197 | MirrorOp *op = opaque; | ||
198 | MirrorBlockJob *s = op->s; | ||
199 | + | ||
200 | + aio_context_acquire(blk_get_aio_context(s->common.blk)); | ||
201 | if (ret < 0) { | ||
202 | BlockErrorAction action; | ||
203 | |||
204 | @@ -XXX,XX +XXX,XX @@ static void mirror_read_complete(void *opaque, int ret) | ||
205 | } | ||
206 | |||
207 | mirror_iteration_done(op, ret); | ||
208 | - return; | ||
209 | + } else { | ||
210 | + blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov, | ||
211 | + 0, mirror_write_complete, op); | ||
212 | } | ||
213 | - blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov, | ||
214 | - 0, mirror_write_complete, op); | ||
215 | + aio_context_release(blk_get_aio_context(s->common.blk)); | ||
216 | } | ||
217 | |||
218 | static inline void mirror_clip_sectors(MirrorBlockJob *s, | ||
219 | diff --git a/block/null.c b/block/null.c | ||
220 | index XXXXXXX..XXXXXXX 100644 | ||
221 | --- a/block/null.c | ||
222 | +++ b/block/null.c | ||
223 | @@ -XXX,XX +XXX,XX @@ static const AIOCBInfo null_aiocb_info = { | ||
224 | static void null_bh_cb(void *opaque) | ||
225 | { | ||
226 | NullAIOCB *acb = opaque; | ||
227 | - AioContext *ctx = bdrv_get_aio_context(acb->common.bs); | ||
228 | - | ||
229 | - aio_context_acquire(ctx); | ||
230 | acb->common.cb(acb->common.opaque, 0); | ||
231 | - aio_context_release(ctx); | ||
232 | qemu_aio_unref(acb); | ||
233 | } | ||
234 | |||
235 | static void null_timer_cb(void *opaque) | ||
236 | { | ||
237 | NullAIOCB *acb = opaque; | ||
238 | - AioContext *ctx = bdrv_get_aio_context(acb->common.bs); | ||
239 | - | ||
240 | - aio_context_acquire(ctx); | ||
241 | acb->common.cb(acb->common.opaque, 0); | ||
242 | - aio_context_release(ctx); | ||
243 | timer_deinit(&acb->timer); | ||
244 | qemu_aio_unref(acb); | ||
245 | } | ||
246 | diff --git a/block/qed-cluster.c b/block/qed-cluster.c | ||
247 | index XXXXXXX..XXXXXXX 100644 | ||
248 | --- a/block/qed-cluster.c | ||
249 | +++ b/block/qed-cluster.c | ||
250 | @@ -XXX,XX +XXX,XX @@ static void qed_find_cluster_cb(void *opaque, int ret) | ||
251 | unsigned int index; | ||
252 | unsigned int n; | ||
253 | |||
254 | + qed_acquire(s); | ||
255 | if (ret) { | ||
256 | goto out; | ||
257 | } | ||
258 | @@ -XXX,XX +XXX,XX @@ static void qed_find_cluster_cb(void *opaque, int ret) | ||
259 | |||
260 | out: | ||
261 | find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len); | ||
262 | + qed_release(s); | ||
263 | g_free(find_cluster_cb); | ||
264 | } | ||
265 | |||
266 | diff --git a/block/qed-table.c b/block/qed-table.c | ||
267 | index XXXXXXX..XXXXXXX 100644 | ||
268 | --- a/block/qed-table.c | ||
269 | +++ b/block/qed-table.c | ||
270 | @@ -XXX,XX +XXX,XX @@ static void qed_read_table_cb(void *opaque, int ret) | ||
271 | { | ||
272 | QEDReadTableCB *read_table_cb = opaque; | ||
273 | QEDTable *table = read_table_cb->table; | ||
274 | + BDRVQEDState *s = read_table_cb->s; | ||
275 | int noffsets = read_table_cb->qiov.size / sizeof(uint64_t); | ||
276 | int i; | ||
277 | |||
278 | @@ -XXX,XX +XXX,XX @@ static void qed_read_table_cb(void *opaque, int ret) | ||
279 | } | ||
280 | |||
281 | /* Byteswap offsets */ | ||
282 | + qed_acquire(s); | ||
283 | for (i = 0; i < noffsets; i++) { | ||
284 | table->offsets[i] = le64_to_cpu(table->offsets[i]); | ||
285 | } | ||
286 | + qed_release(s); | ||
287 | |||
288 | out: | ||
289 | /* Completion */ | ||
290 | - trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret); | ||
291 | + trace_qed_read_table_cb(s, read_table_cb->table, ret); | ||
292 | gencb_complete(&read_table_cb->gencb, ret); | ||
293 | } | ||
294 | |||
295 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
296 | static void qed_write_table_cb(void *opaque, int ret) | ||
297 | { | ||
298 | QEDWriteTableCB *write_table_cb = opaque; | ||
299 | + BDRVQEDState *s = write_table_cb->s; | ||
300 | |||
301 | - trace_qed_write_table_cb(write_table_cb->s, | ||
302 | + trace_qed_write_table_cb(s, | ||
303 | write_table_cb->orig_table, | ||
304 | write_table_cb->flush, | ||
305 | ret); | ||
306 | @@ -XXX,XX +XXX,XX @@ static void qed_write_table_cb(void *opaque, int ret) | ||
307 | if (write_table_cb->flush) { | ||
308 | /* We still need to flush first */ | ||
309 | write_table_cb->flush = false; | ||
310 | + qed_acquire(s); | ||
311 | bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb, | ||
312 | write_table_cb); | ||
313 | + qed_release(s); | ||
314 | return; | ||
315 | } | ||
316 | |||
317 | @@ -XXX,XX +XXX,XX @@ static void qed_read_l2_table_cb(void *opaque, int ret) | ||
318 | CachedL2Table *l2_table = request->l2_table; | ||
319 | uint64_t l2_offset = read_l2_table_cb->l2_offset; | ||
320 | |||
321 | + qed_acquire(s); | ||
322 | if (ret) { | ||
323 | /* can't trust loaded L2 table anymore */ | ||
324 | qed_unref_l2_cache_entry(l2_table); | ||
325 | @@ -XXX,XX +XXX,XX @@ static void qed_read_l2_table_cb(void *opaque, int ret) | ||
326 | request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); | ||
327 | assert(request->l2_table != NULL); | ||
328 | } | ||
329 | + qed_release(s); | ||
330 | |||
331 | gencb_complete(&read_l2_table_cb->gencb, ret); | ||
332 | } | ||
333 | diff --git a/block/qed.c b/block/qed.c | ||
334 | index XXXXXXX..XXXXXXX 100644 | ||
335 | --- a/block/qed.c | ||
336 | +++ b/block/qed.c | ||
337 | @@ -XXX,XX +XXX,XX @@ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t l | ||
338 | } | ||
339 | |||
340 | if (cb->co) { | ||
341 | - qemu_coroutine_enter(cb->co); | ||
342 | + aio_co_wake(cb->co); | ||
343 | } | ||
344 | } | ||
345 | |||
346 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret) | ||
347 | cb->done = true; | ||
348 | cb->ret = ret; | ||
349 | if (cb->co) { | ||
350 | - qemu_coroutine_enter(cb->co); | ||
351 | + aio_co_wake(cb->co); | ||
352 | } | ||
353 | } | ||
354 | |||
355 | diff --git a/block/rbd.c b/block/rbd.c | ||
356 | index XXXXXXX..XXXXXXX 100644 | ||
357 | --- a/block/rbd.c | ||
358 | +++ b/block/rbd.c | ||
359 | @@ -XXX,XX +XXX,XX @@ shutdown: | ||
360 | static void qemu_rbd_complete_aio(RADOSCB *rcb) | ||
361 | { | ||
362 | RBDAIOCB *acb = rcb->acb; | ||
363 | - AioContext *ctx = bdrv_get_aio_context(acb->common.bs); | ||
364 | int64_t r; | ||
365 | |||
366 | r = rcb->ret; | ||
367 | @@ -XXX,XX +XXX,XX @@ static void qemu_rbd_complete_aio(RADOSCB *rcb) | ||
368 | qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); | ||
369 | } | ||
370 | qemu_vfree(acb->bounce); | ||
371 | - | ||
372 | - aio_context_acquire(ctx); | ||
373 | acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); | ||
374 | - aio_context_release(ctx); | ||
375 | |||
376 | qemu_aio_unref(acb); | ||
377 | } | ||
378 | diff --git a/block/win32-aio.c b/block/win32-aio.c | ||
379 | index XXXXXXX..XXXXXXX 100644 | ||
380 | --- a/block/win32-aio.c | ||
381 | +++ b/block/win32-aio.c | ||
382 | @@ -XXX,XX +XXX,XX @@ static void win32_aio_process_completion(QEMUWin32AIOState *s, | ||
383 | qemu_vfree(waiocb->buf); | ||
384 | } | ||
385 | |||
386 | - | ||
387 | - aio_context_acquire(s->aio_ctx); | ||
388 | waiocb->common.cb(waiocb->common.opaque, ret); | ||
389 | - aio_context_release(s->aio_ctx); | ||
390 | qemu_aio_unref(waiocb); | ||
391 | } | ||
392 | |||
393 | diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c | ||
394 | index XXXXXXX..XXXXXXX 100644 | ||
395 | --- a/hw/block/virtio-blk.c | ||
396 | +++ b/hw/block/virtio-blk.c | ||
397 | @@ -XXX,XX +XXX,XX @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error, | ||
398 | static void virtio_blk_rw_complete(void *opaque, int ret) | ||
399 | { | ||
400 | VirtIOBlockReq *next = opaque; | ||
401 | + VirtIOBlock *s = next->dev; | ||
402 | |||
403 | + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); | ||
404 | while (next) { | ||
405 | VirtIOBlockReq *req = next; | ||
406 | next = req->mr_next; | ||
407 | @@ -XXX,XX +XXX,XX @@ static void virtio_blk_rw_complete(void *opaque, int ret) | ||
408 | block_acct_done(blk_get_stats(req->dev->blk), &req->acct); | ||
409 | virtio_blk_free_request(req); | ||
410 | } | ||
411 | + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); | ||
412 | } | ||
413 | |||
414 | static void virtio_blk_flush_complete(void *opaque, int ret) | ||
415 | { | ||
416 | VirtIOBlockReq *req = opaque; | ||
417 | + VirtIOBlock *s = req->dev; | ||
418 | |||
419 | + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); | ||
420 | if (ret) { | ||
421 | if (virtio_blk_handle_rw_error(req, -ret, 0)) { | ||
422 | - return; | ||
423 | + goto out; | ||
424 | } | ||
425 | } | ||
426 | |||
427 | virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); | ||
428 | block_acct_done(blk_get_stats(req->dev->blk), &req->acct); | ||
429 | virtio_blk_free_request(req); | ||
430 | + | ||
431 | +out: | ||
432 | + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); | ||
433 | } | ||
434 | |||
435 | #ifdef __linux__ | ||
436 | @@ -XXX,XX +XXX,XX @@ static void virtio_blk_ioctl_complete(void *opaque, int status) | ||
437 | virtio_stl_p(vdev, &scsi->data_len, hdr->dxfer_len); | ||
438 | |||
439 | out: | ||
440 | + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); | ||
441 | virtio_blk_req_complete(req, status); | ||
442 | virtio_blk_free_request(req); | ||
443 | + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); | ||
444 | g_free(ioctl_req); | ||
445 | } | ||
446 | |||
447 | diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c | ||
448 | index XXXXXXX..XXXXXXX 100644 | ||
449 | --- a/hw/scsi/scsi-disk.c | ||
450 | +++ b/hw/scsi/scsi-disk.c | ||
451 | @@ -XXX,XX +XXX,XX @@ static void scsi_aio_complete(void *opaque, int ret) | ||
452 | |||
453 | assert(r->req.aiocb != NULL); | ||
454 | r->req.aiocb = NULL; | ||
455 | + aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk)); | ||
456 | if (scsi_disk_req_check_error(r, ret, true)) { | ||
457 | goto done; | ||
458 | } | ||
459 | @@ -XXX,XX +XXX,XX @@ static void scsi_aio_complete(void *opaque, int ret) | ||
460 | scsi_req_complete(&r->req, GOOD); | ||
461 | |||
462 | done: | ||
463 | + aio_context_release(blk_get_aio_context(s->qdev.conf.blk)); | ||
464 | scsi_req_unref(&r->req); | ||
465 | } | ||
466 | |||
467 | @@ -XXX,XX +XXX,XX @@ static void scsi_dma_complete(void *opaque, int ret) | ||
468 | assert(r->req.aiocb != NULL); | ||
469 | r->req.aiocb = NULL; | ||
470 | |||
471 | + aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk)); | ||
472 | if (ret < 0) { | ||
473 | block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct); | ||
474 | } else { | ||
475 | block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); | ||
476 | } | ||
477 | scsi_dma_complete_noio(r, ret); | ||
478 | + aio_context_release(blk_get_aio_context(s->qdev.conf.blk)); | ||
479 | } | ||
480 | |||
481 | static void scsi_read_complete(void * opaque, int ret) | ||
482 | @@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret) | ||
483 | |||
484 | assert(r->req.aiocb != NULL); | ||
485 | r->req.aiocb = NULL; | ||
486 | + aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk)); | ||
487 | if (scsi_disk_req_check_error(r, ret, true)) { | ||
488 | goto done; | ||
489 | } | ||
490 | @@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret) | ||
491 | |||
492 | done: | ||
493 | scsi_req_unref(&r->req); | ||
494 | + aio_context_release(blk_get_aio_context(s->qdev.conf.blk)); | ||
495 | } | ||
496 | |||
497 | /* Actually issue a read to the block device. */ | ||
498 | @@ -XXX,XX +XXX,XX @@ static void scsi_do_read_cb(void *opaque, int ret) | ||
499 | assert (r->req.aiocb != NULL); | ||
500 | r->req.aiocb = NULL; | ||
501 | |||
502 | + aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk)); | ||
503 | if (ret < 0) { | ||
504 | block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct); | ||
505 | } else { | ||
506 | block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); | ||
507 | } | ||
508 | scsi_do_read(opaque, ret); | ||
509 | + aio_context_release(blk_get_aio_context(s->qdev.conf.blk)); | ||
510 | } | ||
511 | |||
512 | /* Read more data from scsi device into buffer. */ | ||
513 | @@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret) | ||
514 | assert (r->req.aiocb != NULL); | ||
515 | r->req.aiocb = NULL; | ||
516 | |||
517 | + aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk)); | ||
518 | if (ret < 0) { | ||
519 | block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct); | ||
520 | } else { | ||
521 | block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); | ||
522 | } | ||
523 | scsi_write_complete_noio(r, ret); | ||
524 | + aio_context_release(blk_get_aio_context(s->qdev.conf.blk)); | ||
525 | } | ||
526 | |||
527 | static void scsi_write_data(SCSIRequest *req) | ||
528 | @@ -XXX,XX +XXX,XX @@ static void scsi_unmap_complete(void *opaque, int ret) | ||
529 | { | ||
530 | UnmapCBData *data = opaque; | ||
531 | SCSIDiskReq *r = data->r; | ||
532 | + SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); | ||
533 | |||
534 | assert(r->req.aiocb != NULL); | ||
535 | r->req.aiocb = NULL; | ||
536 | |||
537 | + aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk)); | ||
538 | scsi_unmap_complete_noio(data, ret); | ||
539 | + aio_context_release(blk_get_aio_context(s->qdev.conf.blk)); | ||
540 | } | ||
541 | |||
542 | static void scsi_disk_emulate_unmap(SCSIDiskReq *r, uint8_t *inbuf) | ||
543 | @@ -XXX,XX +XXX,XX @@ static void scsi_write_same_complete(void *opaque, int ret) | ||
544 | |||
545 | assert(r->req.aiocb != NULL); | ||
546 | r->req.aiocb = NULL; | ||
547 | + aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk)); | ||
548 | if (scsi_disk_req_check_error(r, ret, true)) { | ||
549 | goto done; | ||
550 | } | ||
551 | @@ -XXX,XX +XXX,XX @@ done: | ||
552 | scsi_req_unref(&r->req); | ||
553 | qemu_vfree(data->iov.iov_base); | ||
554 | g_free(data); | ||
555 | + aio_context_release(blk_get_aio_context(s->qdev.conf.blk)); | ||
556 | } | ||
557 | |||
558 | static void scsi_disk_emulate_write_same(SCSIDiskReq *r, uint8_t *inbuf) | ||
559 | diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c | ||
560 | index XXXXXXX..XXXXXXX 100644 | ||
561 | --- a/hw/scsi/scsi-generic.c | ||
562 | +++ b/hw/scsi/scsi-generic.c | ||
563 | @@ -XXX,XX +XXX,XX @@ done: | ||
564 | static void scsi_command_complete(void *opaque, int ret) | ||
565 | { | ||
566 | SCSIGenericReq *r = (SCSIGenericReq *)opaque; | ||
567 | + SCSIDevice *s = r->req.dev; | ||
568 | |||
569 | assert(r->req.aiocb != NULL); | ||
570 | r->req.aiocb = NULL; | ||
571 | + | ||
572 | + aio_context_acquire(blk_get_aio_context(s->conf.blk)); | ||
573 | scsi_command_complete_noio(r, ret); | ||
574 | + aio_context_release(blk_get_aio_context(s->conf.blk)); | ||
575 | } | ||
576 | |||
577 | static int execute_command(BlockBackend *blk, | ||
578 | @@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret) | ||
579 | assert(r->req.aiocb != NULL); | ||
580 | r->req.aiocb = NULL; | ||
581 | |||
582 | + aio_context_acquire(blk_get_aio_context(s->conf.blk)); | ||
583 | + | ||
584 | if (ret || r->req.io_canceled) { | ||
585 | scsi_command_complete_noio(r, ret); | ||
586 | - return; | ||
587 | + goto done; | ||
588 | } | ||
589 | |||
590 | len = r->io_header.dxfer_len - r->io_header.resid; | ||
591 | @@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret) | ||
592 | r->len = -1; | ||
593 | if (len == 0) { | ||
594 | scsi_command_complete_noio(r, 0); | ||
595 | - return; | ||
596 | + goto done; | ||
597 | } | ||
598 | |||
599 | /* Snoop READ CAPACITY output to set the blocksize. */ | ||
600 | @@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret) | ||
601 | } | ||
602 | scsi_req_data(&r->req, len); | ||
603 | scsi_req_unref(&r->req); | ||
604 | + | ||
605 | +done: | ||
606 | + aio_context_release(blk_get_aio_context(s->conf.blk)); | ||
607 | } | ||
608 | |||
609 | /* Read more data from scsi device into buffer. */ | ||
610 | @@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret) | ||
611 | assert(r->req.aiocb != NULL); | ||
612 | r->req.aiocb = NULL; | ||
613 | |||
614 | + aio_context_acquire(blk_get_aio_context(s->conf.blk)); | ||
615 | + | ||
616 | if (ret || r->req.io_canceled) { | ||
617 | scsi_command_complete_noio(r, ret); | ||
618 | - return; | ||
619 | + goto done; | ||
620 | } | ||
621 | |||
622 | if (r->req.cmd.buf[0] == MODE_SELECT && r->req.cmd.buf[4] == 12 && | ||
623 | @@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret) | ||
624 | } | ||
625 | |||
626 | scsi_command_complete_noio(r, ret); | ||
627 | + | ||
628 | +done: | ||
629 | + aio_context_release(blk_get_aio_context(s->conf.blk)); | ||
630 | } | ||
631 | |||
632 | /* Write data to a scsi device. Returns nonzero on failure. | ||
633 | diff --git a/util/thread-pool.c b/util/thread-pool.c | ||
634 | index XXXXXXX..XXXXXXX 100644 | ||
635 | --- a/util/thread-pool.c | ||
636 | +++ b/util/thread-pool.c | ||
637 | @@ -XXX,XX +XXX,XX @@ restart: | ||
638 | */ | ||
639 | qemu_bh_schedule(pool->completion_bh); | ||
640 | |||
641 | + aio_context_release(pool->ctx); | ||
642 | elem->common.cb(elem->common.opaque, elem->ret); | ||
643 | + aio_context_acquire(pool->ctx); | ||
644 | qemu_aio_unref(elem); | ||
645 | goto restart; | ||
646 | } else { | ||
647 | @@ -XXX,XX +XXX,XX @@ static void thread_pool_co_cb(void *opaque, int ret) | ||
648 | ThreadPoolCo *co = opaque; | ||
649 | |||
650 | co->ret = ret; | ||
651 | - qemu_coroutine_enter(co->co); | ||
652 | + aio_co_wake(co->co); | ||
653 | } | ||
654 | |||
655 | int coroutine_fn thread_pool_submit_co(ThreadPool *pool, ThreadPoolFunc *func, | ||
74 | -- | 656 | -- |
75 | 2.31.1 | 657 | 2.9.3 |
76 | 658 | ||
77 | 659 | diff view generated by jsdifflib |
1 | From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | No logic change, just prepare for the following commit. While being | 3 | This patch prepares for the removal of unnecessary lockcnt inc/dec pairs. |
4 | here do also small grammar fix in a comment. | 4 | Extract the dispatching loop for file descriptor handlers into a new |
5 | function aio_dispatch_handlers, and then inline aio_dispatch into | ||
6 | aio_poll. | ||
5 | 7 | ||
6 | Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 8 | aio_dispatch can now become void. |
7 | Reviewed-by: Eric Blake <eblake@redhat.com> | 9 | |
8 | Reviewed-by: Hanna Reitz <hreitz@redhat.com> | 10 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
9 | Message-Id: <20210824101517.59802-3-vsementsov@virtuozzo.com> | 11 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
10 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | 12 | Reviewed-by: Fam Zheng <famz@redhat.com> |
13 | Reviewed-by: Daniel P. Berrange <berrange@redhat.com> | ||
14 | Message-id: 20170213135235.12274-17-pbonzini@redhat.com | ||
15 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
11 | --- | 16 | --- |
12 | block/qcow2-cluster.c | 49 ++++++++++++++++++++++++------------------- | 17 | include/block/aio.h | 6 +----- |
13 | 1 file changed, 28 insertions(+), 21 deletions(-) | 18 | util/aio-posix.c | 44 ++++++++++++++------------------------------ |
19 | util/aio-win32.c | 13 ++++--------- | ||
20 | util/async.c | 2 +- | ||
21 | 4 files changed, 20 insertions(+), 45 deletions(-) | ||
14 | 22 | ||
15 | diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c | 23 | diff --git a/include/block/aio.h b/include/block/aio.h |
16 | index XXXXXXX..XXXXXXX 100644 | 24 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/block/qcow2-cluster.c | 25 | --- a/include/block/aio.h |
18 | +++ b/block/qcow2-cluster.c | 26 | +++ b/include/block/aio.h |
19 | @@ -XXX,XX +XXX,XX @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, | 27 | @@ -XXX,XX +XXX,XX @@ bool aio_pending(AioContext *ctx); |
20 | 28 | /* Dispatch any pending callbacks from the GSource attached to the AioContext. | |
21 | if (end <= old_start || start >= old_end) { | 29 | * |
22 | /* No intersection */ | 30 | * This is used internally in the implementation of the GSource. |
23 | - } else { | 31 | - * |
24 | - if (start < old_start) { | 32 | - * @dispatch_fds: true to process fds, false to skip them |
25 | - /* Stop at the start of a running allocation */ | 33 | - * (can be used as an optimization by callers that know there |
26 | - bytes = old_start - start; | 34 | - * are no fds ready) |
27 | - } else { | 35 | */ |
28 | - bytes = 0; | 36 | -bool aio_dispatch(AioContext *ctx, bool dispatch_fds); |
29 | - } | 37 | +void aio_dispatch(AioContext *ctx); |
30 | + continue; | 38 | |
31 | + } | 39 | /* Progress in completing AIO work to occur. This can issue new pending |
32 | 40 | * aio as a result of executing I/O completion or bh callbacks. | |
33 | - /* Stop if already an l2meta exists. After yielding, it wouldn't | 41 | diff --git a/util/aio-posix.c b/util/aio-posix.c |
34 | - * be valid any more, so we'd have to clean up the old L2Metas | 42 | index XXXXXXX..XXXXXXX 100644 |
35 | - * and deal with requests depending on them before starting to | 43 | --- a/util/aio-posix.c |
36 | - * gather new ones. Not worth the trouble. */ | 44 | +++ b/util/aio-posix.c |
37 | - if (bytes == 0 && *m) { | 45 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx) |
38 | - *cur_bytes = 0; | 46 | AioHandler *node, *tmp; |
39 | - return 0; | 47 | bool progress = false; |
40 | - } | 48 | |
41 | + /* Conflict */ | 49 | - /* |
42 | 50 | - * We have to walk very carefully in case aio_set_fd_handler is | |
43 | - if (bytes == 0) { | 51 | - * called while we're walking. |
44 | - /* Wait for the dependency to complete. We need to recheck | 52 | - */ |
45 | - * the free/allocated clusters when we continue. */ | 53 | - qemu_lockcnt_inc(&ctx->list_lock); |
46 | - qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock); | 54 | - |
47 | - return -EAGAIN; | 55 | QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) { |
48 | - } | 56 | int revents; |
49 | + if (start < old_start) { | 57 | |
50 | + /* Stop at the start of a running allocation */ | 58 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx) |
51 | + bytes = old_start - start; | ||
52 | + } else { | ||
53 | + bytes = 0; | ||
54 | + } | ||
55 | + | ||
56 | + /* | ||
57 | + * Stop if an l2meta already exists. After yielding, it wouldn't | ||
58 | + * be valid any more, so we'd have to clean up the old L2Metas | ||
59 | + * and deal with requests depending on them before starting to | ||
60 | + * gather new ones. Not worth the trouble. | ||
61 | + */ | ||
62 | + if (bytes == 0 && *m) { | ||
63 | + *cur_bytes = 0; | ||
64 | + return 0; | ||
65 | + } | ||
66 | + | ||
67 | + if (bytes == 0) { | ||
68 | + /* | ||
69 | + * Wait for the dependency to complete. We need to recheck | ||
70 | + * the free/allocated clusters when we continue. | ||
71 | + */ | ||
72 | + qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock); | ||
73 | + return -EAGAIN; | ||
74 | } | 59 | } |
75 | } | 60 | } |
76 | 61 | ||
62 | - qemu_lockcnt_dec(&ctx->list_lock); | ||
63 | return progress; | ||
64 | } | ||
65 | |||
66 | -/* | ||
67 | - * Note that dispatch_fds == false has the side-effect of post-poning the | ||
68 | - * freeing of deleted handlers. | ||
69 | - */ | ||
70 | -bool aio_dispatch(AioContext *ctx, bool dispatch_fds) | ||
71 | +void aio_dispatch(AioContext *ctx) | ||
72 | { | ||
73 | - bool progress; | ||
74 | + aio_bh_poll(ctx); | ||
75 | |||
76 | - /* | ||
77 | - * If there are callbacks left that have been queued, we need to call them. | ||
78 | - * Do not call select in this case, because it is possible that the caller | ||
79 | - * does not need a complete flush (as is the case for aio_poll loops). | ||
80 | - */ | ||
81 | - progress = aio_bh_poll(ctx); | ||
82 | + qemu_lockcnt_inc(&ctx->list_lock); | ||
83 | + aio_dispatch_handlers(ctx); | ||
84 | + qemu_lockcnt_dec(&ctx->list_lock); | ||
85 | |||
86 | - if (dispatch_fds) { | ||
87 | - progress |= aio_dispatch_handlers(ctx); | ||
88 | - } | ||
89 | - | ||
90 | - /* Run our timers */ | ||
91 | - progress |= timerlistgroup_run_timers(&ctx->tlg); | ||
92 | - | ||
93 | - return progress; | ||
94 | + timerlistgroup_run_timers(&ctx->tlg); | ||
95 | } | ||
96 | |||
97 | /* These thread-local variables are used only in a small part of aio_poll | ||
98 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
99 | npfd = 0; | ||
100 | qemu_lockcnt_dec(&ctx->list_lock); | ||
101 | |||
102 | - /* Run dispatch even if there were no readable fds to run timers */ | ||
103 | - if (aio_dispatch(ctx, ret > 0)) { | ||
104 | - progress = true; | ||
105 | + progress |= aio_bh_poll(ctx); | ||
106 | + | ||
107 | + if (ret > 0) { | ||
108 | + qemu_lockcnt_inc(&ctx->list_lock); | ||
109 | + progress |= aio_dispatch_handlers(ctx); | ||
110 | + qemu_lockcnt_dec(&ctx->list_lock); | ||
111 | } | ||
112 | |||
113 | + progress |= timerlistgroup_run_timers(&ctx->tlg); | ||
114 | + | ||
115 | return progress; | ||
116 | } | ||
117 | |||
118 | diff --git a/util/aio-win32.c b/util/aio-win32.c | ||
119 | index XXXXXXX..XXXXXXX 100644 | ||
120 | --- a/util/aio-win32.c | ||
121 | +++ b/util/aio-win32.c | ||
122 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event) | ||
123 | return progress; | ||
124 | } | ||
125 | |||
126 | -bool aio_dispatch(AioContext *ctx, bool dispatch_fds) | ||
127 | +void aio_dispatch(AioContext *ctx) | ||
128 | { | ||
129 | - bool progress; | ||
130 | - | ||
131 | - progress = aio_bh_poll(ctx); | ||
132 | - if (dispatch_fds) { | ||
133 | - progress |= aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE); | ||
134 | - } | ||
135 | - progress |= timerlistgroup_run_timers(&ctx->tlg); | ||
136 | - return progress; | ||
137 | + aio_bh_poll(ctx); | ||
138 | + aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE); | ||
139 | + timerlistgroup_run_timers(&ctx->tlg); | ||
140 | } | ||
141 | |||
142 | bool aio_poll(AioContext *ctx, bool blocking) | ||
143 | diff --git a/util/async.c b/util/async.c | ||
144 | index XXXXXXX..XXXXXXX 100644 | ||
145 | --- a/util/async.c | ||
146 | +++ b/util/async.c | ||
147 | @@ -XXX,XX +XXX,XX @@ aio_ctx_dispatch(GSource *source, | ||
148 | AioContext *ctx = (AioContext *) source; | ||
149 | |||
150 | assert(callback == NULL); | ||
151 | - aio_dispatch(ctx, true); | ||
152 | + aio_dispatch(ctx); | ||
153 | return true; | ||
154 | } | ||
155 | |||
77 | -- | 156 | -- |
78 | 2.31.1 | 157 | 2.9.3 |
79 | 158 | ||
80 | 159 | diff view generated by jsdifflib |
1 | From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | - don't use same name for size in bytes and in entries | 3 | Pull the increment/decrement pair out of aio_bh_poll and into the |
4 | - use g_autofree for l2_table | 4 | callers. |
5 | - add whitespace | ||
6 | - fix block comment style | ||
7 | 5 | ||
8 | Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 6 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
9 | Reviewed-by: Eric Blake <eblake@redhat.com> | 7 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
10 | Reviewed-by: Hanna Reitz <hreitz@redhat.com> | 8 | Reviewed-by: Fam Zheng <famz@redhat.com> |
11 | Message-Id: <20210914122454.141075-2-vsementsov@virtuozzo.com> | 9 | Reviewed-by: Daniel P. Berrange <berrange@redhat.com> |
12 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | 10 | Message-id: 20170213135235.12274-18-pbonzini@redhat.com |
11 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
13 | --- | 12 | --- |
14 | block/qcow2-refcount.c | 47 +++++++++++++++++++++--------------------- | 13 | util/aio-posix.c | 8 +++----- |
15 | 1 file changed, 24 insertions(+), 23 deletions(-) | 14 | util/aio-win32.c | 8 ++++---- |
15 | util/async.c | 12 ++++++------ | ||
16 | 3 files changed, 13 insertions(+), 15 deletions(-) | ||
16 | 17 | ||
17 | diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c | 18 | diff --git a/util/aio-posix.c b/util/aio-posix.c |
18 | index XXXXXXX..XXXXXXX 100644 | 19 | index XXXXXXX..XXXXXXX 100644 |
19 | --- a/block/qcow2-refcount.c | 20 | --- a/util/aio-posix.c |
20 | +++ b/block/qcow2-refcount.c | 21 | +++ b/util/aio-posix.c |
21 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | 22 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx) |
22 | int flags, BdrvCheckMode fix, bool active) | 23 | |
24 | void aio_dispatch(AioContext *ctx) | ||
23 | { | 25 | { |
24 | BDRVQcow2State *s = bs->opaque; | 26 | + qemu_lockcnt_inc(&ctx->list_lock); |
25 | - uint64_t *l2_table, l2_entry; | 27 | aio_bh_poll(ctx); |
26 | + uint64_t l2_entry; | ||
27 | uint64_t next_contiguous_offset = 0; | ||
28 | - int i, l2_size, nb_csectors, ret; | ||
29 | + int i, nb_csectors, ret; | ||
30 | + size_t l2_size_bytes = s->l2_size * l2_entry_size(s); | ||
31 | + g_autofree uint64_t *l2_table = g_malloc(l2_size_bytes); | ||
32 | |||
33 | /* Read L2 table from disk */ | ||
34 | - l2_size = s->l2_size * l2_entry_size(s); | ||
35 | - l2_table = g_malloc(l2_size); | ||
36 | - | 28 | - |
37 | - ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size); | 29 | - qemu_lockcnt_inc(&ctx->list_lock); |
38 | + ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size_bytes); | 30 | aio_dispatch_handlers(ctx); |
39 | if (ret < 0) { | 31 | qemu_lockcnt_dec(&ctx->list_lock); |
40 | fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n"); | 32 | |
41 | res->check_errors++; | 33 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) |
42 | - goto fail; | ||
43 | + return ret; | ||
44 | } | 34 | } |
45 | 35 | ||
46 | /* Do the actual checks */ | 36 | npfd = 0; |
47 | - for(i = 0; i < s->l2_size; i++) { | 37 | - qemu_lockcnt_dec(&ctx->list_lock); |
48 | + for (i = 0; i < s->l2_size; i++) { | 38 | |
49 | l2_entry = get_l2_entry(s, l2_table, i); | 39 | progress |= aio_bh_poll(ctx); |
50 | 40 | ||
51 | switch (qcow2_get_cluster_type(bs, l2_entry)) { | 41 | if (ret > 0) { |
52 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | 42 | - qemu_lockcnt_inc(&ctx->list_lock); |
53 | l2_entry & QCOW2_COMPRESSED_SECTOR_MASK, | 43 | progress |= aio_dispatch_handlers(ctx); |
54 | nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE); | 44 | - qemu_lockcnt_dec(&ctx->list_lock); |
55 | if (ret < 0) { | 45 | } |
56 | - goto fail; | 46 | |
57 | + return ret; | 47 | + qemu_lockcnt_dec(&ctx->list_lock); |
58 | } | 48 | + |
59 | 49 | progress |= timerlistgroup_run_timers(&ctx->tlg); | |
60 | if (flags & CHECK_FRAG_INFO) { | 50 | |
61 | res->bfi.allocated_clusters++; | 51 | return progress; |
62 | res->bfi.compressed_clusters++; | 52 | diff --git a/util/aio-win32.c b/util/aio-win32.c |
63 | 53 | index XXXXXXX..XXXXXXX 100644 | |
64 | - /* Compressed clusters are fragmented by nature. Since they | 54 | --- a/util/aio-win32.c |
65 | + /* | 55 | +++ b/util/aio-win32.c |
66 | + * Compressed clusters are fragmented by nature. Since they | 56 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event) |
67 | * take up sub-sector space but we only have sector granularity | 57 | bool progress = false; |
68 | * I/O we need to re-read the same sectors even for adjacent | 58 | AioHandler *tmp; |
69 | * compressed clusters. | 59 | |
70 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | 60 | - qemu_lockcnt_inc(&ctx->list_lock); |
71 | if (ret < 0) { | 61 | - |
72 | fprintf(stderr, "ERROR: Overlap check failed\n"); | 62 | /* |
73 | res->check_errors++; | 63 | * We have to walk very carefully in case aio_set_fd_handler is |
74 | - /* Something is seriously wrong, so abort checking | 64 | * called while we're walking. |
75 | - * this L2 table */ | 65 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event) |
76 | - goto fail; | ||
77 | + /* | ||
78 | + * Something is seriously wrong, so abort checking | ||
79 | + * this L2 table. | ||
80 | + */ | ||
81 | + return ret; | ||
82 | } | ||
83 | |||
84 | ret = bdrv_pwrite_sync(bs->file, l2e_offset, | ||
85 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | ||
86 | fprintf(stderr, "ERROR: Failed to overwrite L2 " | ||
87 | "table entry: %s\n", strerror(-ret)); | ||
88 | res->check_errors++; | ||
89 | - /* Do not abort, continue checking the rest of this | ||
90 | - * L2 table's entries */ | ||
91 | + /* | ||
92 | + * Do not abort, continue checking the rest of this | ||
93 | + * L2 table's entries. | ||
94 | + */ | ||
95 | } else { | ||
96 | res->corruptions--; | ||
97 | res->corruptions_fixed++; | ||
98 | - /* Skip marking the cluster as used | ||
99 | - * (it is unused now) */ | ||
100 | + /* | ||
101 | + * Skip marking the cluster as used | ||
102 | + * (it is unused now). | ||
103 | + */ | ||
104 | continue; | ||
105 | } | ||
106 | } | ||
107 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | ||
108 | refcount_table_size, | ||
109 | offset, s->cluster_size); | ||
110 | if (ret < 0) { | ||
111 | - goto fail; | ||
112 | + return ret; | ||
113 | } | ||
114 | } | ||
115 | break; | ||
116 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | ||
117 | } | 66 | } |
118 | } | 67 | } |
119 | 68 | ||
120 | - g_free(l2_table); | 69 | - qemu_lockcnt_dec(&ctx->list_lock); |
121 | return 0; | 70 | return progress; |
71 | } | ||
72 | |||
73 | void aio_dispatch(AioContext *ctx) | ||
74 | { | ||
75 | + qemu_lockcnt_inc(&ctx->list_lock); | ||
76 | aio_bh_poll(ctx); | ||
77 | aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE); | ||
78 | + qemu_lockcnt_dec(&ctx->list_lock); | ||
79 | timerlistgroup_run_timers(&ctx->tlg); | ||
80 | } | ||
81 | |||
82 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
83 | } | ||
84 | } | ||
85 | |||
86 | - qemu_lockcnt_dec(&ctx->list_lock); | ||
87 | first = true; | ||
88 | |||
89 | /* ctx->notifier is always registered. */ | ||
90 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
91 | progress |= aio_dispatch_handlers(ctx, event); | ||
92 | } while (count > 0); | ||
93 | |||
94 | + qemu_lockcnt_dec(&ctx->list_lock); | ||
95 | + | ||
96 | progress |= timerlistgroup_run_timers(&ctx->tlg); | ||
97 | return progress; | ||
98 | } | ||
99 | diff --git a/util/async.c b/util/async.c | ||
100 | index XXXXXXX..XXXXXXX 100644 | ||
101 | --- a/util/async.c | ||
102 | +++ b/util/async.c | ||
103 | @@ -XXX,XX +XXX,XX @@ void aio_bh_call(QEMUBH *bh) | ||
104 | bh->cb(bh->opaque); | ||
105 | } | ||
106 | |||
107 | -/* Multiple occurrences of aio_bh_poll cannot be called concurrently */ | ||
108 | +/* Multiple occurrences of aio_bh_poll cannot be called concurrently. | ||
109 | + * The count in ctx->list_lock is incremented before the call, and is | ||
110 | + * not affected by the call. | ||
111 | + */ | ||
112 | int aio_bh_poll(AioContext *ctx) | ||
113 | { | ||
114 | QEMUBH *bh, **bhp, *next; | ||
115 | int ret; | ||
116 | bool deleted = false; | ||
117 | |||
118 | - qemu_lockcnt_inc(&ctx->list_lock); | ||
122 | - | 119 | - |
123 | -fail: | 120 | ret = 0; |
124 | - g_free(l2_table); | 121 | for (bh = atomic_rcu_read(&ctx->first_bh); bh; bh = next) { |
125 | - return ret; | 122 | next = atomic_rcu_read(&bh->next); |
123 | @@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx) | ||
124 | |||
125 | /* remove deleted bhs */ | ||
126 | if (!deleted) { | ||
127 | - qemu_lockcnt_dec(&ctx->list_lock); | ||
128 | return ret; | ||
129 | } | ||
130 | |||
131 | - if (qemu_lockcnt_dec_and_lock(&ctx->list_lock)) { | ||
132 | + if (qemu_lockcnt_dec_if_lock(&ctx->list_lock)) { | ||
133 | bhp = &ctx->first_bh; | ||
134 | while (*bhp) { | ||
135 | bh = *bhp; | ||
136 | @@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx) | ||
137 | bhp = &bh->next; | ||
138 | } | ||
139 | } | ||
140 | - qemu_lockcnt_unlock(&ctx->list_lock); | ||
141 | + qemu_lockcnt_inc_and_unlock(&ctx->list_lock); | ||
142 | } | ||
143 | return ret; | ||
126 | } | 144 | } |
127 | |||
128 | /* | ||
129 | -- | 145 | -- |
130 | 2.31.1 | 146 | 2.9.3 |
131 | 147 | ||
132 | 148 | diff view generated by jsdifflib |
1 | There is a comment above the BDS definition stating care must be taken | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | to consider handling newly added fields in bdrv_append(). | ||
3 | 2 | ||
4 | Actually, this comment should have said "bdrv_swap()" as of 4ddc07cac | 3 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
5 | (nine years ago), and in any case, bdrv_swap() was dropped in | 4 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
6 | 8e419aefa (six years ago). So no such care is necessary anymore. | 5 | Reviewed-by: Fam Zheng <famz@redhat.com> |
7 | 6 | Reviewed-by: Daniel P. Berrange <berrange@redhat.com> | |
8 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | 7 | Message-id: 20170213135235.12274-19-pbonzini@redhat.com |
9 | Reviewed-by: Eric Blake <eblake@redhat.com> | 8 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
10 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | ||
11 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | ||
12 | Message-Id: <20210812084148.14458-2-hreitz@redhat.com> | ||
13 | --- | 9 | --- |
14 | include/block/block_int.h | 6 ------ | 10 | include/block/block_int.h | 64 +++++++++++++++++++++++++----------------- |
15 | 1 file changed, 6 deletions(-) | 11 | include/sysemu/block-backend.h | 14 ++++++--- |
12 | 2 files changed, 49 insertions(+), 29 deletions(-) | ||
16 | 13 | ||
17 | diff --git a/include/block/block_int.h b/include/block/block_int.h | 14 | diff --git a/include/block/block_int.h b/include/block/block_int.h |
18 | index XXXXXXX..XXXXXXX 100644 | 15 | index XXXXXXX..XXXXXXX 100644 |
19 | --- a/include/block/block_int.h | 16 | --- a/include/block/block_int.h |
20 | +++ b/include/block/block_int.h | 17 | +++ b/include/block/block_int.h |
21 | @@ -XXX,XX +XXX,XX @@ struct BdrvChild { | 18 | @@ -XXX,XX +XXX,XX @@ struct BdrvChild { |
22 | QLIST_ENTRY(BdrvChild) next_parent; | 19 | * copied as well. |
20 | */ | ||
21 | struct BlockDriverState { | ||
22 | - int64_t total_sectors; /* if we are reading a disk image, give its | ||
23 | - size in sectors */ | ||
24 | + /* Protected by big QEMU lock or read-only after opening. No special | ||
25 | + * locking needed during I/O... | ||
26 | + */ | ||
27 | int open_flags; /* flags used to open the file, re-used for re-open */ | ||
28 | bool read_only; /* if true, the media is read only */ | ||
29 | bool encrypted; /* if true, the media is encrypted */ | ||
30 | @@ -XXX,XX +XXX,XX @@ struct BlockDriverState { | ||
31 | bool sg; /* if true, the device is a /dev/sg* */ | ||
32 | bool probed; /* if true, format was probed rather than specified */ | ||
33 | |||
34 | - int copy_on_read; /* if nonzero, copy read backing sectors into image. | ||
35 | - note this is a reference count */ | ||
36 | - | ||
37 | - CoQueue flush_queue; /* Serializing flush queue */ | ||
38 | - bool active_flush_req; /* Flush request in flight? */ | ||
39 | - unsigned int write_gen; /* Current data generation */ | ||
40 | - unsigned int flushed_gen; /* Flushed write generation */ | ||
41 | - | ||
42 | BlockDriver *drv; /* NULL means no media */ | ||
43 | void *opaque; | ||
44 | |||
45 | @@ -XXX,XX +XXX,XX @@ struct BlockDriverState { | ||
46 | BdrvChild *backing; | ||
47 | BdrvChild *file; | ||
48 | |||
49 | - /* Callback before write request is processed */ | ||
50 | - NotifierWithReturnList before_write_notifiers; | ||
51 | - | ||
52 | - /* number of in-flight requests; overall and serialising */ | ||
53 | - unsigned int in_flight; | ||
54 | - unsigned int serialising_in_flight; | ||
55 | - | ||
56 | - bool wakeup; | ||
57 | - | ||
58 | - /* Offset after the highest byte written to */ | ||
59 | - uint64_t wr_highest_offset; | ||
60 | - | ||
61 | /* I/O Limits */ | ||
62 | BlockLimits bl; | ||
63 | |||
64 | @@ -XXX,XX +XXX,XX @@ struct BlockDriverState { | ||
65 | QTAILQ_ENTRY(BlockDriverState) bs_list; | ||
66 | /* element of the list of monitor-owned BDS */ | ||
67 | QTAILQ_ENTRY(BlockDriverState) monitor_list; | ||
68 | - QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps; | ||
69 | int refcnt; | ||
70 | |||
71 | - QLIST_HEAD(, BdrvTrackedRequest) tracked_requests; | ||
72 | - | ||
73 | /* operation blockers */ | ||
74 | QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX]; | ||
75 | |||
76 | @@ -XXX,XX +XXX,XX @@ struct BlockDriverState { | ||
77 | /* The error object in use for blocking operations on backing_hd */ | ||
78 | Error *backing_blocker; | ||
79 | |||
80 | + /* Protected by AioContext lock */ | ||
81 | + | ||
82 | + /* If true, copy read backing sectors into image. Can be >1 if more | ||
83 | + * than one client has requested copy-on-read. | ||
84 | + */ | ||
85 | + int copy_on_read; | ||
86 | + | ||
87 | + /* If we are reading a disk image, give its size in sectors. | ||
88 | + * Generally read-only; it is written to by load_vmstate and save_vmstate, | ||
89 | + * but the block layer is quiescent during those. | ||
90 | + */ | ||
91 | + int64_t total_sectors; | ||
92 | + | ||
93 | + /* Callback before write request is processed */ | ||
94 | + NotifierWithReturnList before_write_notifiers; | ||
95 | + | ||
96 | + /* number of in-flight requests; overall and serialising */ | ||
97 | + unsigned int in_flight; | ||
98 | + unsigned int serialising_in_flight; | ||
99 | + | ||
100 | + bool wakeup; | ||
101 | + | ||
102 | + /* Offset after the highest byte written to */ | ||
103 | + uint64_t wr_highest_offset; | ||
104 | + | ||
105 | /* threshold limit for writes, in bytes. "High water mark". */ | ||
106 | uint64_t write_threshold_offset; | ||
107 | NotifierWithReturn write_threshold_notifier; | ||
108 | @@ -XXX,XX +XXX,XX @@ struct BlockDriverState { | ||
109 | /* counter for nested bdrv_io_plug */ | ||
110 | unsigned io_plugged; | ||
111 | |||
112 | + QLIST_HEAD(, BdrvTrackedRequest) tracked_requests; | ||
113 | + CoQueue flush_queue; /* Serializing flush queue */ | ||
114 | + bool active_flush_req; /* Flush request in flight? */ | ||
115 | + unsigned int write_gen; /* Current data generation */ | ||
116 | + unsigned int flushed_gen; /* Flushed write generation */ | ||
117 | + | ||
118 | + QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps; | ||
119 | + | ||
120 | + /* do we need to tell the quest if we have a volatile write cache? */ | ||
121 | + int enable_write_cache; | ||
122 | + | ||
123 | int quiesce_counter; | ||
23 | }; | 124 | }; |
24 | 125 | ||
25 | -/* | 126 | diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h |
26 | - * Note: the function bdrv_append() copies and swaps contents of | 127 | index XXXXXXX..XXXXXXX 100644 |
27 | - * BlockDriverStates, so if you add new fields to this struct, please | 128 | --- a/include/sysemu/block-backend.h |
28 | - * inspect bdrv_append() to determine if the new fields need to be | 129 | +++ b/include/sysemu/block-backend.h |
29 | - * copied as well. | 130 | @@ -XXX,XX +XXX,XX @@ typedef struct BlockDevOps { |
30 | - */ | 131 | * fields that must be public. This is in particular for QLIST_ENTRY() and |
31 | struct BlockDriverState { | 132 | * friends so that BlockBackends can be kept in lists outside block-backend.c */ |
32 | /* Protected by big QEMU lock or read-only after opening. No special | 133 | typedef struct BlockBackendPublic { |
33 | * locking needed during I/O... | 134 | - /* I/O throttling. |
135 | - * throttle_state tells us if this BlockBackend has I/O limits configured. | ||
136 | - * io_limits_disabled tells us if they are currently being enforced */ | ||
137 | + /* I/O throttling has its own locking, but also some fields are | ||
138 | + * protected by the AioContext lock. | ||
139 | + */ | ||
140 | + | ||
141 | + /* Protected by AioContext lock. */ | ||
142 | CoQueue throttled_reqs[2]; | ||
143 | + | ||
144 | + /* Nonzero if the I/O limits are currently being ignored; generally | ||
145 | + * it is zero. */ | ||
146 | unsigned int io_limits_disabled; | ||
147 | |||
148 | /* The following fields are protected by the ThrottleGroup lock. | ||
149 | - * See the ThrottleGroup documentation for details. */ | ||
150 | + * See the ThrottleGroup documentation for details. | ||
151 | + * throttle_state tells us if I/O limits are configured. */ | ||
152 | ThrottleState *throttle_state; | ||
153 | ThrottleTimers throttle_timers; | ||
154 | unsigned pending_reqs[2]; | ||
34 | -- | 155 | -- |
35 | 2.31.1 | 156 | 2.9.3 |
36 | 157 | ||
37 | 158 | diff view generated by jsdifflib |
1 | From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | We must not inactivate child when parent has write permissions on | 3 | This uses the lock-free mutex described in the paper '"Blocking without |
4 | it. | 4 | Locking", or LFTHREADS: A lock-free thread library' by Gidenstam and |
5 | 5 | Papatriantafilou. The same technique is used in OSv, and in fact | |
6 | Calling .bdrv_inactivate() doesn't help: actually only qcow2 has this | 6 | the code is essentially a conversion to C of OSv's code. |
7 | handler and it is used to flush caches, not for permission | 7 | |
8 | manipulations. | 8 | [Added missing coroutine_fn in tests/test-aio-multithread.c. |
9 | 9 | --Stefan] | |
10 | So, let's simply check cumulative parent permissions before | 10 | |
11 | inactivating the node. | 11 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
12 | 12 | Reviewed-by: Fam Zheng <famz@redhat.com> | |
13 | This commit fixes a crash when we do migration during backup: prior to | 13 | Message-id: 20170213181244.16297-2-pbonzini@redhat.com |
14 | the commit nothing prevents all nodes inactivation at migration finish | 14 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
15 | and following backup write to the target crashes on assertion | ||
16 | "assert(!(bs->open_flags & BDRV_O_INACTIVE));" in | ||
17 | bdrv_co_write_req_prepare(). | ||
18 | |||
19 | After the commit, we rely on the fact that copy-before-write filter | ||
20 | keeps write permission on target node to be able to write to it. So | ||
21 | inactivation fails and migration fails as expected. | ||
22 | |||
23 | Corresponding test now passes, so, enable it. | ||
24 | |||
25 | Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | ||
26 | Reviewed-by: Hanna Reitz <hreitz@redhat.com> | ||
27 | Message-Id: <20210911120027.8063-3-vsementsov@virtuozzo.com> | ||
28 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | ||
29 | --- | 15 | --- |
30 | block.c | 8 ++++++++ | 16 | include/qemu/coroutine.h | 17 ++++- |
31 | tests/qemu-iotests/tests/migrate-during-backup | 2 +- | 17 | tests/test-aio-multithread.c | 86 ++++++++++++++++++++++++ |
32 | 2 files changed, 9 insertions(+), 1 deletion(-) | 18 | util/qemu-coroutine-lock.c | 155 ++++++++++++++++++++++++++++++++++++++++--- |
33 | 19 | util/trace-events | 1 + | |
34 | diff --git a/block.c b/block.c | 20 | 4 files changed, 246 insertions(+), 13 deletions(-) |
21 | |||
22 | diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h | ||
35 | index XXXXXXX..XXXXXXX 100644 | 23 | index XXXXXXX..XXXXXXX 100644 |
36 | --- a/block.c | 24 | --- a/include/qemu/coroutine.h |
37 | +++ b/block.c | 25 | +++ b/include/qemu/coroutine.h |
38 | @@ -XXX,XX +XXX,XX @@ static int bdrv_inactivate_recurse(BlockDriverState *bs) | 26 | @@ -XXX,XX +XXX,XX @@ bool qemu_co_queue_empty(CoQueue *queue); |
27 | /** | ||
28 | * Provides a mutex that can be used to synchronise coroutines | ||
29 | */ | ||
30 | +struct CoWaitRecord; | ||
31 | typedef struct CoMutex { | ||
32 | - bool locked; | ||
33 | + /* Count of pending lockers; 0 for a free mutex, 1 for an | ||
34 | + * uncontended mutex. | ||
35 | + */ | ||
36 | + unsigned locked; | ||
37 | + | ||
38 | + /* A queue of waiters. Elements are added atomically in front of | ||
39 | + * from_push. to_pop is only populated, and popped from, by whoever | ||
40 | + * is in charge of the next wakeup. This can be an unlocker or, | ||
41 | + * through the handoff protocol, a locker that is about to go to sleep. | ||
42 | + */ | ||
43 | + QSLIST_HEAD(, CoWaitRecord) from_push, to_pop; | ||
44 | + | ||
45 | + unsigned handoff, sequence; | ||
46 | + | ||
47 | Coroutine *holder; | ||
48 | - CoQueue queue; | ||
49 | } CoMutex; | ||
50 | |||
51 | /** | ||
52 | diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c | ||
53 | index XXXXXXX..XXXXXXX 100644 | ||
54 | --- a/tests/test-aio-multithread.c | ||
55 | +++ b/tests/test-aio-multithread.c | ||
56 | @@ -XXX,XX +XXX,XX @@ static void test_multi_co_schedule_10(void) | ||
57 | test_multi_co_schedule(10); | ||
58 | } | ||
59 | |||
60 | +/* CoMutex thread-safety. */ | ||
61 | + | ||
62 | +static uint32_t atomic_counter; | ||
63 | +static uint32_t running; | ||
64 | +static uint32_t counter; | ||
65 | +static CoMutex comutex; | ||
66 | + | ||
67 | +static void coroutine_fn test_multi_co_mutex_entry(void *opaque) | ||
68 | +{ | ||
69 | + while (!atomic_mb_read(&now_stopping)) { | ||
70 | + qemu_co_mutex_lock(&comutex); | ||
71 | + counter++; | ||
72 | + qemu_co_mutex_unlock(&comutex); | ||
73 | + | ||
74 | + /* Increase atomic_counter *after* releasing the mutex. Otherwise | ||
75 | + * there is a chance (it happens about 1 in 3 runs) that the iothread | ||
76 | + * exits before the coroutine is woken up, causing a spurious | ||
77 | + * assertion failure. | ||
78 | + */ | ||
79 | + atomic_inc(&atomic_counter); | ||
80 | + } | ||
81 | + atomic_dec(&running); | ||
82 | +} | ||
83 | + | ||
84 | +static void test_multi_co_mutex(int threads, int seconds) | ||
85 | +{ | ||
86 | + int i; | ||
87 | + | ||
88 | + qemu_co_mutex_init(&comutex); | ||
89 | + counter = 0; | ||
90 | + atomic_counter = 0; | ||
91 | + now_stopping = false; | ||
92 | + | ||
93 | + create_aio_contexts(); | ||
94 | + assert(threads <= NUM_CONTEXTS); | ||
95 | + running = threads; | ||
96 | + for (i = 0; i < threads; i++) { | ||
97 | + Coroutine *co1 = qemu_coroutine_create(test_multi_co_mutex_entry, NULL); | ||
98 | + aio_co_schedule(ctx[i], co1); | ||
99 | + } | ||
100 | + | ||
101 | + g_usleep(seconds * 1000000); | ||
102 | + | ||
103 | + atomic_mb_set(&now_stopping, true); | ||
104 | + while (running > 0) { | ||
105 | + g_usleep(100000); | ||
106 | + } | ||
107 | + | ||
108 | + join_aio_contexts(); | ||
109 | + g_test_message("%d iterations/second\n", counter / seconds); | ||
110 | + g_assert_cmpint(counter, ==, atomic_counter); | ||
111 | +} | ||
112 | + | ||
113 | +/* Testing with NUM_CONTEXTS threads focuses on the queue. The mutex however | ||
114 | + * is too contended (and the threads spend too much time in aio_poll) | ||
115 | + * to actually stress the handoff protocol. | ||
116 | + */ | ||
117 | +static void test_multi_co_mutex_1(void) | ||
118 | +{ | ||
119 | + test_multi_co_mutex(NUM_CONTEXTS, 1); | ||
120 | +} | ||
121 | + | ||
122 | +static void test_multi_co_mutex_10(void) | ||
123 | +{ | ||
124 | + test_multi_co_mutex(NUM_CONTEXTS, 10); | ||
125 | +} | ||
126 | + | ||
127 | +/* Testing with fewer threads stresses the handoff protocol too. Still, the | ||
128 | + * case where the locker _can_ pick up a handoff is very rare, happening | ||
129 | + * about 10 times in 1 million, so increase the runtime a bit compared to | ||
130 | + * other "quick" testcases that only run for 1 second. | ||
131 | + */ | ||
132 | +static void test_multi_co_mutex_2_3(void) | ||
133 | +{ | ||
134 | + test_multi_co_mutex(2, 3); | ||
135 | +} | ||
136 | + | ||
137 | +static void test_multi_co_mutex_2_30(void) | ||
138 | +{ | ||
139 | + test_multi_co_mutex(2, 30); | ||
140 | +} | ||
141 | + | ||
142 | /* End of tests. */ | ||
143 | |||
144 | int main(int argc, char **argv) | ||
145 | @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) | ||
146 | g_test_add_func("/aio/multi/lifecycle", test_lifecycle); | ||
147 | if (g_test_quick()) { | ||
148 | g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1); | ||
149 | + g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_1); | ||
150 | + g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_3); | ||
151 | } else { | ||
152 | g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10); | ||
153 | + g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_10); | ||
154 | + g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_30); | ||
155 | } | ||
156 | return g_test_run(); | ||
157 | } | ||
158 | diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c | ||
159 | index XXXXXXX..XXXXXXX 100644 | ||
160 | --- a/util/qemu-coroutine-lock.c | ||
161 | +++ b/util/qemu-coroutine-lock.c | ||
162 | @@ -XXX,XX +XXX,XX @@ | ||
163 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
164 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
165 | * THE SOFTWARE. | ||
166 | + * | ||
167 | + * The lock-free mutex implementation is based on OSv | ||
168 | + * (core/lfmutex.cc, include/lockfree/mutex.hh). | ||
169 | + * Copyright (C) 2013 Cloudius Systems, Ltd. | ||
170 | */ | ||
171 | |||
172 | #include "qemu/osdep.h" | ||
173 | @@ -XXX,XX +XXX,XX @@ bool qemu_co_queue_empty(CoQueue *queue) | ||
174 | return QSIMPLEQ_FIRST(&queue->entries) == NULL; | ||
175 | } | ||
176 | |||
177 | +/* The wait records are handled with a multiple-producer, single-consumer | ||
178 | + * lock-free queue. There cannot be two concurrent pop_waiter() calls | ||
179 | + * because pop_waiter() can only be called while mutex->handoff is zero. | ||
180 | + * This can happen in three cases: | ||
181 | + * - in qemu_co_mutex_unlock, before the hand-off protocol has started. | ||
182 | + * In this case, qemu_co_mutex_lock will see mutex->handoff == 0 and | ||
183 | + * not take part in the handoff. | ||
184 | + * - in qemu_co_mutex_lock, if it steals the hand-off responsibility from | ||
185 | + * qemu_co_mutex_unlock. In this case, qemu_co_mutex_unlock will fail | ||
186 | + * the cmpxchg (it will see either 0 or the next sequence value) and | ||
187 | + * exit. The next hand-off cannot begin until qemu_co_mutex_lock has | ||
188 | + * woken up someone. | ||
189 | + * - in qemu_co_mutex_unlock, if it takes the hand-off token itself. | ||
190 | + * In this case another iteration starts with mutex->handoff == 0; | ||
191 | + * a concurrent qemu_co_mutex_lock will fail the cmpxchg, and | ||
192 | + * qemu_co_mutex_unlock will go back to case (1). | ||
193 | + * | ||
194 | + * The following functions manage this queue. | ||
195 | + */ | ||
196 | +typedef struct CoWaitRecord { | ||
197 | + Coroutine *co; | ||
198 | + QSLIST_ENTRY(CoWaitRecord) next; | ||
199 | +} CoWaitRecord; | ||
200 | + | ||
201 | +static void push_waiter(CoMutex *mutex, CoWaitRecord *w) | ||
202 | +{ | ||
203 | + w->co = qemu_coroutine_self(); | ||
204 | + QSLIST_INSERT_HEAD_ATOMIC(&mutex->from_push, w, next); | ||
205 | +} | ||
206 | + | ||
207 | +static void move_waiters(CoMutex *mutex) | ||
208 | +{ | ||
209 | + QSLIST_HEAD(, CoWaitRecord) reversed; | ||
210 | + QSLIST_MOVE_ATOMIC(&reversed, &mutex->from_push); | ||
211 | + while (!QSLIST_EMPTY(&reversed)) { | ||
212 | + CoWaitRecord *w = QSLIST_FIRST(&reversed); | ||
213 | + QSLIST_REMOVE_HEAD(&reversed, next); | ||
214 | + QSLIST_INSERT_HEAD(&mutex->to_pop, w, next); | ||
215 | + } | ||
216 | +} | ||
217 | + | ||
218 | +static CoWaitRecord *pop_waiter(CoMutex *mutex) | ||
219 | +{ | ||
220 | + CoWaitRecord *w; | ||
221 | + | ||
222 | + if (QSLIST_EMPTY(&mutex->to_pop)) { | ||
223 | + move_waiters(mutex); | ||
224 | + if (QSLIST_EMPTY(&mutex->to_pop)) { | ||
225 | + return NULL; | ||
226 | + } | ||
227 | + } | ||
228 | + w = QSLIST_FIRST(&mutex->to_pop); | ||
229 | + QSLIST_REMOVE_HEAD(&mutex->to_pop, next); | ||
230 | + return w; | ||
231 | +} | ||
232 | + | ||
233 | +static bool has_waiters(CoMutex *mutex) | ||
234 | +{ | ||
235 | + return QSLIST_EMPTY(&mutex->to_pop) || QSLIST_EMPTY(&mutex->from_push); | ||
236 | +} | ||
237 | + | ||
238 | void qemu_co_mutex_init(CoMutex *mutex) | ||
39 | { | 239 | { |
40 | BdrvChild *child, *parent; | 240 | memset(mutex, 0, sizeof(*mutex)); |
41 | int ret; | 241 | - qemu_co_queue_init(&mutex->queue); |
42 | + uint64_t cumulative_perms, cumulative_shared_perms; | 242 | } |
43 | 243 | ||
44 | if (!bs->drv) { | 244 | -void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex) |
45 | return -ENOMEDIUM; | 245 | +static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex) |
46 | @@ -XXX,XX +XXX,XX @@ static int bdrv_inactivate_recurse(BlockDriverState *bs) | 246 | { |
47 | } | 247 | Coroutine *self = qemu_coroutine_self(); |
248 | + CoWaitRecord w; | ||
249 | + unsigned old_handoff; | ||
250 | |||
251 | trace_qemu_co_mutex_lock_entry(mutex, self); | ||
252 | + w.co = self; | ||
253 | + push_waiter(mutex, &w); | ||
254 | |||
255 | - while (mutex->locked) { | ||
256 | - qemu_co_queue_wait(&mutex->queue); | ||
257 | + /* This is the "Responsibility Hand-Off" protocol; a lock() picks from | ||
258 | + * a concurrent unlock() the responsibility of waking somebody up. | ||
259 | + */ | ||
260 | + old_handoff = atomic_mb_read(&mutex->handoff); | ||
261 | + if (old_handoff && | ||
262 | + has_waiters(mutex) && | ||
263 | + atomic_cmpxchg(&mutex->handoff, old_handoff, 0) == old_handoff) { | ||
264 | + /* There can be no concurrent pops, because there can be only | ||
265 | + * one active handoff at a time. | ||
266 | + */ | ||
267 | + CoWaitRecord *to_wake = pop_waiter(mutex); | ||
268 | + Coroutine *co = to_wake->co; | ||
269 | + if (co == self) { | ||
270 | + /* We got the lock ourselves! */ | ||
271 | + assert(to_wake == &w); | ||
272 | + return; | ||
273 | + } | ||
274 | + | ||
275 | + aio_co_wake(co); | ||
48 | } | 276 | } |
49 | 277 | ||
50 | + bdrv_get_cumulative_perm(bs, &cumulative_perms, | 278 | - mutex->locked = true; |
51 | + &cumulative_shared_perms); | 279 | - mutex->holder = self; |
52 | + if (cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) { | 280 | - self->locks_held++; |
53 | + /* Our inactive parents still need write access. Inactivation failed. */ | 281 | - |
54 | + return -EPERM; | 282 | + qemu_coroutine_yield(); |
55 | + } | 283 | trace_qemu_co_mutex_lock_return(mutex, self); |
56 | + | 284 | } |
57 | bs->open_flags |= BDRV_O_INACTIVE; | 285 | |
58 | 286 | +void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex) | |
59 | /* | 287 | +{ |
60 | diff --git a/tests/qemu-iotests/tests/migrate-during-backup b/tests/qemu-iotests/tests/migrate-during-backup | 288 | + Coroutine *self = qemu_coroutine_self(); |
61 | index XXXXXXX..XXXXXXX 100755 | 289 | + |
62 | --- a/tests/qemu-iotests/tests/migrate-during-backup | 290 | + if (atomic_fetch_inc(&mutex->locked) == 0) { |
63 | +++ b/tests/qemu-iotests/tests/migrate-during-backup | 291 | + /* Uncontended. */ |
64 | @@ -XXX,XX +XXX,XX @@ | 292 | + trace_qemu_co_mutex_lock_uncontended(mutex, self); |
65 | #!/usr/bin/env python3 | 293 | + } else { |
66 | -# group: migration disabled | 294 | + qemu_co_mutex_lock_slowpath(mutex); |
67 | +# group: migration | 295 | + } |
68 | # | 296 | + mutex->holder = self; |
69 | # Copyright (c) 2021 Virtuozzo International GmbH | 297 | + self->locks_held++; |
70 | # | 298 | +} |
299 | + | ||
300 | void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex) | ||
301 | { | ||
302 | Coroutine *self = qemu_coroutine_self(); | ||
303 | |||
304 | trace_qemu_co_mutex_unlock_entry(mutex, self); | ||
305 | |||
306 | - assert(mutex->locked == true); | ||
307 | + assert(mutex->locked); | ||
308 | assert(mutex->holder == self); | ||
309 | assert(qemu_in_coroutine()); | ||
310 | |||
311 | - mutex->locked = false; | ||
312 | mutex->holder = NULL; | ||
313 | self->locks_held--; | ||
314 | - qemu_co_queue_next(&mutex->queue); | ||
315 | + if (atomic_fetch_dec(&mutex->locked) == 1) { | ||
316 | + /* No waiting qemu_co_mutex_lock(). Pfew, that was easy! */ | ||
317 | + return; | ||
318 | + } | ||
319 | + | ||
320 | + for (;;) { | ||
321 | + CoWaitRecord *to_wake = pop_waiter(mutex); | ||
322 | + unsigned our_handoff; | ||
323 | + | ||
324 | + if (to_wake) { | ||
325 | + Coroutine *co = to_wake->co; | ||
326 | + aio_co_wake(co); | ||
327 | + break; | ||
328 | + } | ||
329 | + | ||
330 | + /* Some concurrent lock() is in progress (we know this because | ||
331 | + * mutex->locked was >1) but it hasn't yet put itself on the wait | ||
332 | + * queue. Pick a sequence number for the handoff protocol (not 0). | ||
333 | + */ | ||
334 | + if (++mutex->sequence == 0) { | ||
335 | + mutex->sequence = 1; | ||
336 | + } | ||
337 | + | ||
338 | + our_handoff = mutex->sequence; | ||
339 | + atomic_mb_set(&mutex->handoff, our_handoff); | ||
340 | + if (!has_waiters(mutex)) { | ||
341 | + /* The concurrent lock has not added itself yet, so it | ||
342 | + * will be able to pick our handoff. | ||
343 | + */ | ||
344 | + break; | ||
345 | + } | ||
346 | + | ||
347 | + /* Try to do the handoff protocol ourselves; if somebody else has | ||
348 | + * already taken it, however, we're done and they're responsible. | ||
349 | + */ | ||
350 | + if (atomic_cmpxchg(&mutex->handoff, our_handoff, 0) != our_handoff) { | ||
351 | + break; | ||
352 | + } | ||
353 | + } | ||
354 | |||
355 | trace_qemu_co_mutex_unlock_return(mutex, self); | ||
356 | } | ||
357 | diff --git a/util/trace-events b/util/trace-events | ||
358 | index XXXXXXX..XXXXXXX 100644 | ||
359 | --- a/util/trace-events | ||
360 | +++ b/util/trace-events | ||
361 | @@ -XXX,XX +XXX,XX @@ qemu_coroutine_terminate(void *co) "self %p" | ||
362 | |||
363 | # util/qemu-coroutine-lock.c | ||
364 | qemu_co_queue_run_restart(void *co) "co %p" | ||
365 | +qemu_co_mutex_lock_uncontended(void *mutex, void *self) "mutex %p self %p" | ||
366 | qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p" | ||
367 | qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p" | ||
368 | qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p" | ||
71 | -- | 369 | -- |
72 | 2.31.1 | 370 | 2.9.3 |
73 | 371 | ||
74 | 372 | diff view generated by jsdifflib |
1 | From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Check subcluster bitmap of the l2 entry for different types of | 3 | Running a very small critical section on pthread_mutex_t and CoMutex |
4 | clusters: | 4 | shows that pthread_mutex_t is much faster because it doesn't actually |
5 | go to sleep. What happens is that the critical section is shorter | ||
6 | than the latency of entering the kernel and thus FUTEX_WAIT always | ||
7 | fails. With CoMutex there is no such latency but you still want to | ||
8 | avoid wait and wakeup. So introduce it artificially. | ||
5 | 9 | ||
6 | - for compressed it must be zero | 10 | This only works with one waiters; because CoMutex is fair, it will |
7 | - for allocated check consistency of two parts of the bitmap | 11 | always have more waits and wakeups than a pthread_mutex_t. |
8 | - for unallocated all subclusters should be unallocated | ||
9 | (or zero-plain) | ||
10 | 12 | ||
11 | Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 13 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
12 | Tested-by: Kirill Tkhai <ktkhai@virtuozzo.com> | 14 | Reviewed-by: Fam Zheng <famz@redhat.com> |
13 | Message-Id: <20210914122454.141075-7-vsementsov@virtuozzo.com> | 15 | Message-id: 20170213181244.16297-3-pbonzini@redhat.com |
14 | Reviewed-by: Eric Blake <eblake@redhat.com> | 16 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
15 | Reviewed-by: Hanna Reitz <hreitz@redhat.com> | ||
16 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | ||
17 | --- | 17 | --- |
18 | block/qcow2-refcount.c | 28 ++++++++++++++++++++++++++-- | 18 | include/qemu/coroutine.h | 5 +++++ |
19 | 1 file changed, 26 insertions(+), 2 deletions(-) | 19 | util/qemu-coroutine-lock.c | 51 ++++++++++++++++++++++++++++++++++++++++------ |
20 | util/qemu-coroutine.c | 2 +- | ||
21 | 3 files changed, 51 insertions(+), 7 deletions(-) | ||
20 | 22 | ||
21 | diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c | 23 | diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h |
22 | index XXXXXXX..XXXXXXX 100644 | 24 | index XXXXXXX..XXXXXXX 100644 |
23 | --- a/block/qcow2-refcount.c | 25 | --- a/include/qemu/coroutine.h |
24 | +++ b/block/qcow2-refcount.c | 26 | +++ b/include/qemu/coroutine.h |
25 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | 27 | @@ -XXX,XX +XXX,XX @@ typedef struct CoMutex { |
26 | int flags, BdrvCheckMode fix, bool active) | 28 | */ |
29 | unsigned locked; | ||
30 | |||
31 | + /* Context that is holding the lock. Useful to avoid spinning | ||
32 | + * when two coroutines on the same AioContext try to get the lock. :) | ||
33 | + */ | ||
34 | + AioContext *ctx; | ||
35 | + | ||
36 | /* A queue of waiters. Elements are added atomically in front of | ||
37 | * from_push. to_pop is only populated, and popped from, by whoever | ||
38 | * is in charge of the next wakeup. This can be an unlocker or, | ||
39 | diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c | ||
40 | index XXXXXXX..XXXXXXX 100644 | ||
41 | --- a/util/qemu-coroutine-lock.c | ||
42 | +++ b/util/qemu-coroutine-lock.c | ||
43 | @@ -XXX,XX +XXX,XX @@ | ||
44 | #include "qemu-common.h" | ||
45 | #include "qemu/coroutine.h" | ||
46 | #include "qemu/coroutine_int.h" | ||
47 | +#include "qemu/processor.h" | ||
48 | #include "qemu/queue.h" | ||
49 | #include "block/aio.h" | ||
50 | #include "trace.h" | ||
51 | @@ -XXX,XX +XXX,XX @@ void qemu_co_mutex_init(CoMutex *mutex) | ||
52 | memset(mutex, 0, sizeof(*mutex)); | ||
53 | } | ||
54 | |||
55 | -static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex) | ||
56 | +static void coroutine_fn qemu_co_mutex_wake(CoMutex *mutex, Coroutine *co) | ||
57 | +{ | ||
58 | + /* Read co before co->ctx; pairs with smp_wmb() in | ||
59 | + * qemu_coroutine_enter(). | ||
60 | + */ | ||
61 | + smp_read_barrier_depends(); | ||
62 | + mutex->ctx = co->ctx; | ||
63 | + aio_co_wake(co); | ||
64 | +} | ||
65 | + | ||
66 | +static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx, | ||
67 | + CoMutex *mutex) | ||
27 | { | 68 | { |
28 | BDRVQcow2State *s = bs->opaque; | 69 | Coroutine *self = qemu_coroutine_self(); |
29 | - uint64_t l2_entry; | 70 | CoWaitRecord w; |
30 | + uint64_t l2_entry, l2_bitmap; | 71 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex) |
31 | uint64_t next_contiguous_offset = 0; | 72 | if (co == self) { |
32 | int i, ret; | 73 | /* We got the lock ourselves! */ |
33 | size_t l2_size_bytes = s->l2_size * l2_entry_size(s); | 74 | assert(to_wake == &w); |
34 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | 75 | + mutex->ctx = ctx; |
35 | uint64_t coffset; | 76 | return; |
36 | int csize; | 77 | } |
37 | l2_entry = get_l2_entry(s, l2_table, i); | 78 | |
38 | + l2_bitmap = get_l2_bitmap(s, l2_table, i); | 79 | - aio_co_wake(co); |
39 | 80 | + qemu_co_mutex_wake(mutex, co); | |
40 | switch (qcow2_get_cluster_type(bs, l2_entry)) { | 81 | } |
41 | case QCOW2_CLUSTER_COMPRESSED: | 82 | |
42 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | 83 | qemu_coroutine_yield(); |
43 | break; | 84 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex) |
44 | } | 85 | |
45 | 86 | void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex) | |
46 | + if (l2_bitmap) { | 87 | { |
47 | + fprintf(stderr, "ERROR compressed cluster %d with non-zero " | 88 | + AioContext *ctx = qemu_get_current_aio_context(); |
48 | + "subcluster allocation bitmap, entry=0x%" PRIx64 "\n", | 89 | Coroutine *self = qemu_coroutine_self(); |
49 | + i, l2_entry); | 90 | + int waiters, i; |
50 | + res->corruptions++; | 91 | |
92 | - if (atomic_fetch_inc(&mutex->locked) == 0) { | ||
93 | + /* Running a very small critical section on pthread_mutex_t and CoMutex | ||
94 | + * shows that pthread_mutex_t is much faster because it doesn't actually | ||
95 | + * go to sleep. What happens is that the critical section is shorter | ||
96 | + * than the latency of entering the kernel and thus FUTEX_WAIT always | ||
97 | + * fails. With CoMutex there is no such latency but you still want to | ||
98 | + * avoid wait and wakeup. So introduce it artificially. | ||
99 | + */ | ||
100 | + i = 0; | ||
101 | +retry_fast_path: | ||
102 | + waiters = atomic_cmpxchg(&mutex->locked, 0, 1); | ||
103 | + if (waiters != 0) { | ||
104 | + while (waiters == 1 && ++i < 1000) { | ||
105 | + if (atomic_read(&mutex->ctx) == ctx) { | ||
51 | + break; | 106 | + break; |
52 | + } | 107 | + } |
108 | + if (atomic_read(&mutex->locked) == 0) { | ||
109 | + goto retry_fast_path; | ||
110 | + } | ||
111 | + cpu_relax(); | ||
112 | + } | ||
113 | + waiters = atomic_fetch_inc(&mutex->locked); | ||
114 | + } | ||
53 | + | 115 | + |
54 | /* Mark cluster as used */ | 116 | + if (waiters == 0) { |
55 | qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize); | 117 | /* Uncontended. */ |
56 | ret = qcow2_inc_refcounts_imrt( | 118 | trace_qemu_co_mutex_lock_uncontended(mutex, self); |
57 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | 119 | + mutex->ctx = ctx; |
58 | { | 120 | } else { |
59 | uint64_t offset = l2_entry & L2E_OFFSET_MASK; | 121 | - qemu_co_mutex_lock_slowpath(mutex); |
60 | 122 | + qemu_co_mutex_lock_slowpath(ctx, mutex); | |
61 | + if ((l2_bitmap >> 32) & l2_bitmap) { | 123 | } |
62 | + res->corruptions++; | 124 | mutex->holder = self; |
63 | + fprintf(stderr, "ERROR offset=%" PRIx64 ": Allocated " | 125 | self->locks_held++; |
64 | + "cluster has corrupted subcluster allocation bitmap\n", | 126 | @@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex) |
65 | + offset); | 127 | assert(mutex->holder == self); |
66 | + } | 128 | assert(qemu_in_coroutine()); |
67 | + | 129 | |
68 | /* Correct offsets are cluster aligned */ | 130 | + mutex->ctx = NULL; |
69 | if (offset_into_cluster(s, offset)) { | 131 | mutex->holder = NULL; |
70 | bool contains_data; | 132 | self->locks_held--; |
71 | res->corruptions++; | 133 | if (atomic_fetch_dec(&mutex->locked) == 1) { |
72 | 134 | @@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex) | |
73 | if (has_subclusters(s)) { | 135 | unsigned our_handoff; |
74 | - uint64_t l2_bitmap = get_l2_bitmap(s, l2_table, i); | 136 | |
75 | contains_data = (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC); | 137 | if (to_wake) { |
76 | } else { | 138 | - Coroutine *co = to_wake->co; |
77 | contains_data = !(l2_entry & QCOW_OFLAG_ZERO); | 139 | - aio_co_wake(co); |
78 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | 140 | + qemu_co_mutex_wake(mutex, to_wake->co); |
141 | break; | ||
79 | } | 142 | } |
80 | 143 | ||
81 | case QCOW2_CLUSTER_ZERO_PLAIN: | 144 | diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c |
82 | + /* Impossible when image has subclusters */ | 145 | index XXXXXXX..XXXXXXX 100644 |
83 | + assert(!l2_bitmap); | 146 | --- a/util/qemu-coroutine.c |
84 | + break; | 147 | +++ b/util/qemu-coroutine.c |
85 | + | 148 | @@ -XXX,XX +XXX,XX @@ void qemu_coroutine_enter(Coroutine *co) |
86 | case QCOW2_CLUSTER_UNALLOCATED: | 149 | co->ctx = qemu_get_current_aio_context(); |
87 | + if (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC) { | 150 | |
88 | + res->corruptions++; | 151 | /* Store co->ctx before anything that stores co. Matches |
89 | + fprintf(stderr, "ERROR: Unallocated " | 152 | - * barrier in aio_co_wake. |
90 | + "cluster has non-zero subcluster allocation map\n"); | 153 | + * barrier in aio_co_wake and qemu_co_mutex_wake. |
91 | + } | 154 | */ |
92 | break; | 155 | smp_wmb(); |
93 | 156 | ||
94 | default: | ||
95 | -- | 157 | -- |
96 | 2.31.1 | 158 | 2.9.3 |
97 | 159 | ||
98 | 160 | diff view generated by jsdifflib |
1 | 297 so far does not check the named tests, which reside in the tests/ | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | directory (i.e. full path tests/qemu-iotests/tests). Fix it. | 2 | |
3 | 3 | Add two implementations of the same benchmark as the previous patch, | |
4 | Thanks to the previous two commits, all named tests pass its scrutiny, | 4 | but using pthreads. One uses a normal QemuMutex, the other is Linux |
5 | so we do not have to add anything to SKIP_FILES. | 5 | only and implements a fair mutex based on MCS locks and futexes. |
6 | 6 | This shows that the slower performance of the 5-thread case is due to | |
7 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | 7 | the fairness of CoMutex, rather than to coroutines. If fairness does |
8 | Reviewed-by: Willian Rampazzo <willianr@redhat.com> | 8 | not matter, as is the case with two threads, CoMutex can actually be |
9 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 9 | faster than pthreads. |
10 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | 10 | |
11 | Message-Id: <20210902094017.32902-6-hreitz@redhat.com> | 11 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
12 | Reviewed-by: Fam Zheng <famz@redhat.com> | ||
13 | Message-id: 20170213181244.16297-4-pbonzini@redhat.com | ||
14 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
12 | --- | 15 | --- |
13 | tests/qemu-iotests/297 | 5 +++-- | 16 | tests/test-aio-multithread.c | 164 +++++++++++++++++++++++++++++++++++++++++++ |
14 | 1 file changed, 3 insertions(+), 2 deletions(-) | 17 | 1 file changed, 164 insertions(+) |
15 | 18 | ||
16 | diff --git a/tests/qemu-iotests/297 b/tests/qemu-iotests/297 | 19 | diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c |
17 | index XXXXXXX..XXXXXXX 100755 | 20 | index XXXXXXX..XXXXXXX 100644 |
18 | --- a/tests/qemu-iotests/297 | 21 | --- a/tests/test-aio-multithread.c |
19 | +++ b/tests/qemu-iotests/297 | 22 | +++ b/tests/test-aio-multithread.c |
20 | @@ -XXX,XX +XXX,XX @@ def is_python_file(filename): | 23 | @@ -XXX,XX +XXX,XX @@ static void test_multi_co_mutex_2_30(void) |
21 | 24 | test_multi_co_mutex(2, 30); | |
22 | 25 | } | |
23 | def run_linters(): | 26 | |
24 | - files = [filename for filename in (set(os.listdir('.')) - set(SKIP_FILES)) | 27 | +/* Same test with fair mutexes, for performance comparison. */ |
25 | - if is_python_file(filename)] | 28 | + |
26 | + named_tests = [f'tests/{entry}' for entry in os.listdir('tests')] | 29 | +#ifdef CONFIG_LINUX |
27 | + check_tests = set(os.listdir('.') + named_tests) - set(SKIP_FILES) | 30 | +#include "qemu/futex.h" |
28 | + files = [filename for filename in check_tests if is_python_file(filename)] | 31 | + |
29 | 32 | +/* The nodes for the mutex reside in this structure (on which we try to avoid | |
30 | iotests.logger.debug('Files to be checked:') | 33 | + * false sharing). The head of the mutex is in the "mutex_head" variable. |
31 | iotests.logger.debug(', '.join(sorted(files))) | 34 | + */ |
35 | +static struct { | ||
36 | + int next, locked; | ||
37 | + int padding[14]; | ||
38 | +} nodes[NUM_CONTEXTS] __attribute__((__aligned__(64))); | ||
39 | + | ||
40 | +static int mutex_head = -1; | ||
41 | + | ||
42 | +static void mcs_mutex_lock(void) | ||
43 | +{ | ||
44 | + int prev; | ||
45 | + | ||
46 | + nodes[id].next = -1; | ||
47 | + nodes[id].locked = 1; | ||
48 | + prev = atomic_xchg(&mutex_head, id); | ||
49 | + if (prev != -1) { | ||
50 | + atomic_set(&nodes[prev].next, id); | ||
51 | + qemu_futex_wait(&nodes[id].locked, 1); | ||
52 | + } | ||
53 | +} | ||
54 | + | ||
55 | +static void mcs_mutex_unlock(void) | ||
56 | +{ | ||
57 | + int next; | ||
58 | + if (nodes[id].next == -1) { | ||
59 | + if (atomic_read(&mutex_head) == id && | ||
60 | + atomic_cmpxchg(&mutex_head, id, -1) == id) { | ||
61 | + /* Last item in the list, exit. */ | ||
62 | + return; | ||
63 | + } | ||
64 | + while (atomic_read(&nodes[id].next) == -1) { | ||
65 | + /* mcs_mutex_lock did the xchg, but has not updated | ||
66 | + * nodes[prev].next yet. | ||
67 | + */ | ||
68 | + } | ||
69 | + } | ||
70 | + | ||
71 | + /* Wake up the next in line. */ | ||
72 | + next = nodes[id].next; | ||
73 | + nodes[next].locked = 0; | ||
74 | + qemu_futex_wake(&nodes[next].locked, 1); | ||
75 | +} | ||
76 | + | ||
77 | +static void test_multi_fair_mutex_entry(void *opaque) | ||
78 | +{ | ||
79 | + while (!atomic_mb_read(&now_stopping)) { | ||
80 | + mcs_mutex_lock(); | ||
81 | + counter++; | ||
82 | + mcs_mutex_unlock(); | ||
83 | + atomic_inc(&atomic_counter); | ||
84 | + } | ||
85 | + atomic_dec(&running); | ||
86 | +} | ||
87 | + | ||
88 | +static void test_multi_fair_mutex(int threads, int seconds) | ||
89 | +{ | ||
90 | + int i; | ||
91 | + | ||
92 | + assert(mutex_head == -1); | ||
93 | + counter = 0; | ||
94 | + atomic_counter = 0; | ||
95 | + now_stopping = false; | ||
96 | + | ||
97 | + create_aio_contexts(); | ||
98 | + assert(threads <= NUM_CONTEXTS); | ||
99 | + running = threads; | ||
100 | + for (i = 0; i < threads; i++) { | ||
101 | + Coroutine *co1 = qemu_coroutine_create(test_multi_fair_mutex_entry, NULL); | ||
102 | + aio_co_schedule(ctx[i], co1); | ||
103 | + } | ||
104 | + | ||
105 | + g_usleep(seconds * 1000000); | ||
106 | + | ||
107 | + atomic_mb_set(&now_stopping, true); | ||
108 | + while (running > 0) { | ||
109 | + g_usleep(100000); | ||
110 | + } | ||
111 | + | ||
112 | + join_aio_contexts(); | ||
113 | + g_test_message("%d iterations/second\n", counter / seconds); | ||
114 | + g_assert_cmpint(counter, ==, atomic_counter); | ||
115 | +} | ||
116 | + | ||
117 | +static void test_multi_fair_mutex_1(void) | ||
118 | +{ | ||
119 | + test_multi_fair_mutex(NUM_CONTEXTS, 1); | ||
120 | +} | ||
121 | + | ||
122 | +static void test_multi_fair_mutex_10(void) | ||
123 | +{ | ||
124 | + test_multi_fair_mutex(NUM_CONTEXTS, 10); | ||
125 | +} | ||
126 | +#endif | ||
127 | + | ||
128 | +/* Same test with pthread mutexes, for performance comparison and | ||
129 | + * portability. */ | ||
130 | + | ||
131 | +static QemuMutex mutex; | ||
132 | + | ||
133 | +static void test_multi_mutex_entry(void *opaque) | ||
134 | +{ | ||
135 | + while (!atomic_mb_read(&now_stopping)) { | ||
136 | + qemu_mutex_lock(&mutex); | ||
137 | + counter++; | ||
138 | + qemu_mutex_unlock(&mutex); | ||
139 | + atomic_inc(&atomic_counter); | ||
140 | + } | ||
141 | + atomic_dec(&running); | ||
142 | +} | ||
143 | + | ||
144 | +static void test_multi_mutex(int threads, int seconds) | ||
145 | +{ | ||
146 | + int i; | ||
147 | + | ||
148 | + qemu_mutex_init(&mutex); | ||
149 | + counter = 0; | ||
150 | + atomic_counter = 0; | ||
151 | + now_stopping = false; | ||
152 | + | ||
153 | + create_aio_contexts(); | ||
154 | + assert(threads <= NUM_CONTEXTS); | ||
155 | + running = threads; | ||
156 | + for (i = 0; i < threads; i++) { | ||
157 | + Coroutine *co1 = qemu_coroutine_create(test_multi_mutex_entry, NULL); | ||
158 | + aio_co_schedule(ctx[i], co1); | ||
159 | + } | ||
160 | + | ||
161 | + g_usleep(seconds * 1000000); | ||
162 | + | ||
163 | + atomic_mb_set(&now_stopping, true); | ||
164 | + while (running > 0) { | ||
165 | + g_usleep(100000); | ||
166 | + } | ||
167 | + | ||
168 | + join_aio_contexts(); | ||
169 | + g_test_message("%d iterations/second\n", counter / seconds); | ||
170 | + g_assert_cmpint(counter, ==, atomic_counter); | ||
171 | +} | ||
172 | + | ||
173 | +static void test_multi_mutex_1(void) | ||
174 | +{ | ||
175 | + test_multi_mutex(NUM_CONTEXTS, 1); | ||
176 | +} | ||
177 | + | ||
178 | +static void test_multi_mutex_10(void) | ||
179 | +{ | ||
180 | + test_multi_mutex(NUM_CONTEXTS, 10); | ||
181 | +} | ||
182 | + | ||
183 | /* End of tests. */ | ||
184 | |||
185 | int main(int argc, char **argv) | ||
186 | @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) | ||
187 | g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1); | ||
188 | g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_1); | ||
189 | g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_3); | ||
190 | +#ifdef CONFIG_LINUX | ||
191 | + g_test_add_func("/aio/multi/mutex/mcs", test_multi_fair_mutex_1); | ||
192 | +#endif | ||
193 | + g_test_add_func("/aio/multi/mutex/pthread", test_multi_mutex_1); | ||
194 | } else { | ||
195 | g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10); | ||
196 | g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_10); | ||
197 | g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_30); | ||
198 | +#ifdef CONFIG_LINUX | ||
199 | + g_test_add_func("/aio/multi/mutex/mcs", test_multi_fair_mutex_10); | ||
200 | +#endif | ||
201 | + g_test_add_func("/aio/multi/mutex/pthread", test_multi_mutex_10); | ||
202 | } | ||
203 | return g_test_run(); | ||
204 | } | ||
32 | -- | 205 | -- |
33 | 2.31.1 | 206 | 2.9.3 |
34 | 207 | ||
35 | 208 | diff view generated by jsdifflib |
1 | As we have attempted before | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | (https://lists.gnu.org/archive/html/qemu-devel/2019-01/msg06451.html, | ||
3 | "file-posix: Cache lseek result for data regions"; | ||
4 | https://lists.nongnu.org/archive/html/qemu-block/2021-02/msg00934.html, | ||
5 | "file-posix: Cache next hole"), this patch seeks to reduce the number of | ||
6 | SEEK_DATA/HOLE operations the file-posix driver has to perform. The | ||
7 | main difference is that this time it is implemented as part of the | ||
8 | general block layer code. | ||
9 | 2 | ||
10 | The problem we face is that on some filesystems or in some | 3 | This will avoid forward references in the next patch. It is also |
11 | circumstances, SEEK_DATA/HOLE is unreasonably slow. Given the | 4 | more logical because CoQueue is not anymore the basic primitive. |
12 | implementation is outside of qemu, there is little we can do about its | ||
13 | performance. | ||
14 | 5 | ||
15 | We have already introduced the want_zero parameter to | 6 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
16 | bdrv_co_block_status() to reduce the number of SEEK_DATA/HOLE calls | 7 | Reviewed-by: Fam Zheng <famz@redhat.com> |
17 | unless we really want zero information; but sometimes we do want that | 8 | Message-id: 20170213181244.16297-5-pbonzini@redhat.com |
18 | information, because for files that consist largely of zero areas, | 9 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
19 | special-casing those areas can give large performance boosts. So the | 10 | --- |
20 | real problem is with files that consist largely of data, so that | 11 | include/qemu/coroutine.h | 89 ++++++++++++++++++++++++------------------------ |
21 | inquiring the block status does not gain us much performance, but where | 12 | 1 file changed, 44 insertions(+), 45 deletions(-) |
22 | such an inquiry itself takes a lot of time. | ||
23 | 13 | ||
24 | To address this, we want to cache data regions. Most of the time, when | 14 | diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h |
25 | bad performance is reported, it is in places where the image is iterated | ||
26 | over from start to end (qemu-img convert or the mirror job), so a simple | ||
27 | yet effective solution is to cache only the current data region. | ||
28 | |||
29 | (Note that only caching data regions but not zero regions means that | ||
30 | returning false information from the cache is not catastrophic: Treating | ||
31 | zeroes as data is fine. While we try to invalidate the cache on zero | ||
32 | writes and discards, such incongruences may still occur when there are | ||
33 | other processes writing to the image.) | ||
34 | |||
35 | We only use the cache for nodes without children (i.e. protocol nodes), | ||
36 | because that is where the problem is: Drivers that rely on block-status | ||
37 | implementations outside of qemu (e.g. SEEK_DATA/HOLE). | ||
38 | |||
39 | Resolves: https://gitlab.com/qemu-project/qemu/-/issues/307 | ||
40 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | ||
41 | Message-Id: <20210812084148.14458-3-hreitz@redhat.com> | ||
42 | Reviewed-by: Eric Blake <eblake@redhat.com> | ||
43 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | ||
44 | [hreitz: Added `local_file == bs` assertion, as suggested by Vladimir] | ||
45 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | ||
46 | --- | ||
47 | include/block/block_int.h | 50 ++++++++++++++++++++++++ | ||
48 | block.c | 80 +++++++++++++++++++++++++++++++++++++++ | ||
49 | block/io.c | 68 +++++++++++++++++++++++++++++++-- | ||
50 | 3 files changed, 195 insertions(+), 3 deletions(-) | ||
51 | |||
52 | diff --git a/include/block/block_int.h b/include/block/block_int.h | ||
53 | index XXXXXXX..XXXXXXX 100644 | 15 | index XXXXXXX..XXXXXXX 100644 |
54 | --- a/include/block/block_int.h | 16 | --- a/include/qemu/coroutine.h |
55 | +++ b/include/block/block_int.h | 17 | +++ b/include/qemu/coroutine.h |
56 | @@ -XXX,XX +XXX,XX @@ | 18 | @@ -XXX,XX +XXX,XX @@ bool qemu_in_coroutine(void); |
57 | #include "qemu/hbitmap.h" | ||
58 | #include "block/snapshot.h" | ||
59 | #include "qemu/throttle.h" | ||
60 | +#include "qemu/rcu.h" | ||
61 | |||
62 | #define BLOCK_FLAG_LAZY_REFCOUNTS 8 | ||
63 | |||
64 | @@ -XXX,XX +XXX,XX @@ struct BdrvChild { | ||
65 | QLIST_ENTRY(BdrvChild) next_parent; | ||
66 | }; | ||
67 | |||
68 | +/* | ||
69 | + * Allows bdrv_co_block_status() to cache one data region for a | ||
70 | + * protocol node. | ||
71 | + * | ||
72 | + * @valid: Whether the cache is valid (should be accessed with atomic | ||
73 | + * functions so this can be reset by RCU readers) | ||
74 | + * @data_start: Offset where we know (or strongly assume) is data | ||
75 | + * @data_end: Offset where the data region ends (which is not necessarily | ||
76 | + * the start of a zeroed region) | ||
77 | + */ | ||
78 | +typedef struct BdrvBlockStatusCache { | ||
79 | + struct rcu_head rcu; | ||
80 | + | ||
81 | + bool valid; | ||
82 | + int64_t data_start; | ||
83 | + int64_t data_end; | ||
84 | +} BdrvBlockStatusCache; | ||
85 | + | ||
86 | struct BlockDriverState { | ||
87 | /* Protected by big QEMU lock or read-only after opening. No special | ||
88 | * locking needed during I/O... | ||
89 | @@ -XXX,XX +XXX,XX @@ struct BlockDriverState { | ||
90 | |||
91 | /* BdrvChild links to this node may never be frozen */ | ||
92 | bool never_freeze; | ||
93 | + | ||
94 | + /* Lock for block-status cache RCU writers */ | ||
95 | + CoMutex bsc_modify_lock; | ||
96 | + /* Always non-NULL, but must only be dereferenced under an RCU read guard */ | ||
97 | + BdrvBlockStatusCache *block_status_cache; | ||
98 | }; | ||
99 | |||
100 | struct BlockBackendRootState { | ||
101 | @@ -XXX,XX +XXX,XX @@ static inline BlockDriverState *bdrv_primary_bs(BlockDriverState *bs) | ||
102 | */ | 19 | */ |
103 | void bdrv_drain_all_end_quiesce(BlockDriverState *bs); | 20 | bool qemu_coroutine_entered(Coroutine *co); |
104 | 21 | ||
105 | +/** | 22 | - |
106 | + * Check whether the given offset is in the cached block-status data | 23 | -/** |
107 | + * region. | 24 | - * CoQueues are a mechanism to queue coroutines in order to continue executing |
108 | + * | 25 | - * them later. They provide the fundamental primitives on which coroutine locks |
109 | + * If it is, and @pnum is not NULL, *pnum is set to | 26 | - * are built. |
110 | + * `bsc.data_end - offset`, i.e. how many bytes, starting from | 27 | - */ |
111 | + * @offset, are data (according to the cache). | 28 | -typedef struct CoQueue { |
112 | + * Otherwise, *pnum is not touched. | 29 | - QSIMPLEQ_HEAD(, Coroutine) entries; |
113 | + */ | 30 | -} CoQueue; |
114 | +bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum); | 31 | - |
32 | -/** | ||
33 | - * Initialise a CoQueue. This must be called before any other operation is used | ||
34 | - * on the CoQueue. | ||
35 | - */ | ||
36 | -void qemu_co_queue_init(CoQueue *queue); | ||
37 | - | ||
38 | -/** | ||
39 | - * Adds the current coroutine to the CoQueue and transfers control to the | ||
40 | - * caller of the coroutine. | ||
41 | - */ | ||
42 | -void coroutine_fn qemu_co_queue_wait(CoQueue *queue); | ||
43 | - | ||
44 | -/** | ||
45 | - * Restarts the next coroutine in the CoQueue and removes it from the queue. | ||
46 | - * | ||
47 | - * Returns true if a coroutine was restarted, false if the queue is empty. | ||
48 | - */ | ||
49 | -bool coroutine_fn qemu_co_queue_next(CoQueue *queue); | ||
50 | - | ||
51 | -/** | ||
52 | - * Restarts all coroutines in the CoQueue and leaves the queue empty. | ||
53 | - */ | ||
54 | -void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue); | ||
55 | - | ||
56 | -/** | ||
57 | - * Enter the next coroutine in the queue | ||
58 | - */ | ||
59 | -bool qemu_co_enter_next(CoQueue *queue); | ||
60 | - | ||
61 | -/** | ||
62 | - * Checks if the CoQueue is empty. | ||
63 | - */ | ||
64 | -bool qemu_co_queue_empty(CoQueue *queue); | ||
65 | - | ||
66 | - | ||
67 | /** | ||
68 | * Provides a mutex that can be used to synchronise coroutines | ||
69 | */ | ||
70 | @@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex); | ||
71 | */ | ||
72 | void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex); | ||
73 | |||
115 | + | 74 | + |
116 | +/** | 75 | +/** |
117 | + * If [offset, offset + bytes) overlaps with the currently cached | 76 | + * CoQueues are a mechanism to queue coroutines in order to continue executing |
118 | + * block-status region, invalidate the cache. | 77 | + * them later. |
119 | + * | ||
120 | + * (To be used by I/O paths that cause data regions to be zero or | ||
121 | + * holes.) | ||
122 | + */ | 78 | + */ |
123 | +void bdrv_bsc_invalidate_range(BlockDriverState *bs, | 79 | +typedef struct CoQueue { |
124 | + int64_t offset, int64_t bytes); | 80 | + QSIMPLEQ_HEAD(, Coroutine) entries; |
81 | +} CoQueue; | ||
125 | + | 82 | + |
126 | +/** | 83 | +/** |
127 | + * Mark the range [offset, offset + bytes) as a data region. | 84 | + * Initialise a CoQueue. This must be called before any other operation is used |
85 | + * on the CoQueue. | ||
128 | + */ | 86 | + */ |
129 | +void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes); | 87 | +void qemu_co_queue_init(CoQueue *queue); |
130 | + | ||
131 | #endif /* BLOCK_INT_H */ | ||
132 | diff --git a/block.c b/block.c | ||
133 | index XXXXXXX..XXXXXXX 100644 | ||
134 | --- a/block.c | ||
135 | +++ b/block.c | ||
136 | @@ -XXX,XX +XXX,XX @@ | ||
137 | #include "qemu/timer.h" | ||
138 | #include "qemu/cutils.h" | ||
139 | #include "qemu/id.h" | ||
140 | +#include "qemu/range.h" | ||
141 | +#include "qemu/rcu.h" | ||
142 | #include "block/coroutines.h" | ||
143 | |||
144 | #ifdef CONFIG_BSD | ||
145 | @@ -XXX,XX +XXX,XX @@ BlockDriverState *bdrv_new(void) | ||
146 | |||
147 | qemu_co_queue_init(&bs->flush_queue); | ||
148 | |||
149 | + qemu_co_mutex_init(&bs->bsc_modify_lock); | ||
150 | + bs->block_status_cache = g_new0(BdrvBlockStatusCache, 1); | ||
151 | + | ||
152 | for (i = 0; i < bdrv_drain_all_count; i++) { | ||
153 | bdrv_drained_begin(bs); | ||
154 | } | ||
155 | @@ -XXX,XX +XXX,XX @@ static void bdrv_close(BlockDriverState *bs) | ||
156 | bs->explicit_options = NULL; | ||
157 | qobject_unref(bs->full_open_options); | ||
158 | bs->full_open_options = NULL; | ||
159 | + g_free(bs->block_status_cache); | ||
160 | + bs->block_status_cache = NULL; | ||
161 | |||
162 | bdrv_release_named_dirty_bitmaps(bs); | ||
163 | assert(QLIST_EMPTY(&bs->dirty_bitmaps)); | ||
164 | @@ -XXX,XX +XXX,XX @@ BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs) | ||
165 | { | ||
166 | return bdrv_skip_filters(bdrv_cow_bs(bdrv_skip_filters(bs))); | ||
167 | } | ||
168 | + | 88 | + |
169 | +/** | 89 | +/** |
170 | + * Check whether [offset, offset + bytes) overlaps with the cached | 90 | + * Adds the current coroutine to the CoQueue and transfers control to the |
171 | + * block-status data region. | 91 | + * caller of the coroutine. |
172 | + * | ||
173 | + * If so, and @pnum is not NULL, set *pnum to `bsc.data_end - offset`, | ||
174 | + * which is what bdrv_bsc_is_data()'s interface needs. | ||
175 | + * Otherwise, *pnum is not touched. | ||
176 | + */ | 92 | + */ |
177 | +static bool bdrv_bsc_range_overlaps_locked(BlockDriverState *bs, | 93 | +void coroutine_fn qemu_co_queue_wait(CoQueue *queue); |
178 | + int64_t offset, int64_t bytes, | ||
179 | + int64_t *pnum) | ||
180 | +{ | ||
181 | + BdrvBlockStatusCache *bsc = qatomic_rcu_read(&bs->block_status_cache); | ||
182 | + bool overlaps; | ||
183 | + | ||
184 | + overlaps = | ||
185 | + qatomic_read(&bsc->valid) && | ||
186 | + ranges_overlap(offset, bytes, bsc->data_start, | ||
187 | + bsc->data_end - bsc->data_start); | ||
188 | + | ||
189 | + if (overlaps && pnum) { | ||
190 | + *pnum = bsc->data_end - offset; | ||
191 | + } | ||
192 | + | ||
193 | + return overlaps; | ||
194 | +} | ||
195 | + | 94 | + |
196 | +/** | 95 | +/** |
197 | + * See block_int.h for this function's documentation. | 96 | + * Restarts the next coroutine in the CoQueue and removes it from the queue. |
97 | + * | ||
98 | + * Returns true if a coroutine was restarted, false if the queue is empty. | ||
198 | + */ | 99 | + */ |
199 | +bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum) | 100 | +bool coroutine_fn qemu_co_queue_next(CoQueue *queue); |
200 | +{ | ||
201 | + RCU_READ_LOCK_GUARD(); | ||
202 | + | ||
203 | + return bdrv_bsc_range_overlaps_locked(bs, offset, 1, pnum); | ||
204 | +} | ||
205 | + | 101 | + |
206 | +/** | 102 | +/** |
207 | + * See block_int.h for this function's documentation. | 103 | + * Restarts all coroutines in the CoQueue and leaves the queue empty. |
208 | + */ | 104 | + */ |
209 | +void bdrv_bsc_invalidate_range(BlockDriverState *bs, | 105 | +void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue); |
210 | + int64_t offset, int64_t bytes) | ||
211 | +{ | ||
212 | + RCU_READ_LOCK_GUARD(); | ||
213 | + | ||
214 | + if (bdrv_bsc_range_overlaps_locked(bs, offset, bytes, NULL)) { | ||
215 | + qatomic_set(&bs->block_status_cache->valid, false); | ||
216 | + } | ||
217 | +} | ||
218 | + | 106 | + |
219 | +/** | 107 | +/** |
220 | + * See block_int.h for this function's documentation. | 108 | + * Enter the next coroutine in the queue |
221 | + */ | 109 | + */ |
222 | +void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes) | 110 | +bool qemu_co_enter_next(CoQueue *queue); |
223 | +{ | ||
224 | + BdrvBlockStatusCache *new_bsc = g_new(BdrvBlockStatusCache, 1); | ||
225 | + BdrvBlockStatusCache *old_bsc; | ||
226 | + | 111 | + |
227 | + *new_bsc = (BdrvBlockStatusCache) { | 112 | +/** |
228 | + .valid = true, | 113 | + * Checks if the CoQueue is empty. |
229 | + .data_start = offset, | 114 | + */ |
230 | + .data_end = offset + bytes, | 115 | +bool qemu_co_queue_empty(CoQueue *queue); |
231 | + }; | ||
232 | + | 116 | + |
233 | + QEMU_LOCK_GUARD(&bs->bsc_modify_lock); | ||
234 | + | 117 | + |
235 | + old_bsc = qatomic_rcu_read(&bs->block_status_cache); | 118 | typedef struct CoRwlock { |
236 | + qatomic_rcu_set(&bs->block_status_cache, new_bsc); | 119 | bool writer; |
237 | + if (old_bsc) { | 120 | int reader; |
238 | + g_free_rcu(old_bsc, rcu); | ||
239 | + } | ||
240 | +} | ||
241 | diff --git a/block/io.c b/block/io.c | ||
242 | index XXXXXXX..XXXXXXX 100644 | ||
243 | --- a/block/io.c | ||
244 | +++ b/block/io.c | ||
245 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, | ||
246 | return -ENOTSUP; | ||
247 | } | ||
248 | |||
249 | + /* Invalidate the cached block-status data range if this write overlaps */ | ||
250 | + bdrv_bsc_invalidate_range(bs, offset, bytes); | ||
251 | + | ||
252 | assert(alignment % bs->bl.request_alignment == 0); | ||
253 | head = offset % alignment; | ||
254 | tail = (offset + bytes) % alignment; | ||
255 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, | ||
256 | aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; | ||
257 | |||
258 | if (bs->drv->bdrv_co_block_status) { | ||
259 | - ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, | ||
260 | - aligned_bytes, pnum, &local_map, | ||
261 | - &local_file); | ||
262 | + /* | ||
263 | + * Use the block-status cache only for protocol nodes: Format | ||
264 | + * drivers are generally quick to inquire the status, but protocol | ||
265 | + * drivers often need to get information from outside of qemu, so | ||
266 | + * we do not have control over the actual implementation. There | ||
267 | + * have been cases where inquiring the status took an unreasonably | ||
268 | + * long time, and we can do nothing in qemu to fix it. | ||
269 | + * This is especially problematic for images with large data areas, | ||
270 | + * because finding the few holes in them and giving them special | ||
271 | + * treatment does not gain much performance. Therefore, we try to | ||
272 | + * cache the last-identified data region. | ||
273 | + * | ||
274 | + * Second, limiting ourselves to protocol nodes allows us to assume | ||
275 | + * the block status for data regions to be DATA | OFFSET_VALID, and | ||
276 | + * that the host offset is the same as the guest offset. | ||
277 | + * | ||
278 | + * Note that it is possible that external writers zero parts of | ||
279 | + * the cached regions without the cache being invalidated, and so | ||
280 | + * we may report zeroes as data. This is not catastrophic, | ||
281 | + * however, because reporting zeroes as data is fine. | ||
282 | + */ | ||
283 | + if (QLIST_EMPTY(&bs->children) && | ||
284 | + bdrv_bsc_is_data(bs, aligned_offset, pnum)) | ||
285 | + { | ||
286 | + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; | ||
287 | + local_file = bs; | ||
288 | + local_map = aligned_offset; | ||
289 | + } else { | ||
290 | + ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, | ||
291 | + aligned_bytes, pnum, &local_map, | ||
292 | + &local_file); | ||
293 | + | ||
294 | + /* | ||
295 | + * Note that checking QLIST_EMPTY(&bs->children) is also done when | ||
296 | + * the cache is queried above. Technically, we do not need to check | ||
297 | + * it here; the worst that can happen is that we fill the cache for | ||
298 | + * non-protocol nodes, and then it is never used. However, filling | ||
299 | + * the cache requires an RCU update, so double check here to avoid | ||
300 | + * such an update if possible. | ||
301 | + */ | ||
302 | + if (ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) && | ||
303 | + QLIST_EMPTY(&bs->children)) | ||
304 | + { | ||
305 | + /* | ||
306 | + * When a protocol driver reports BLOCK_OFFSET_VALID, the | ||
307 | + * returned local_map value must be the same as the offset we | ||
308 | + * have passed (aligned_offset), and local_bs must be the node | ||
309 | + * itself. | ||
310 | + * Assert this, because we follow this rule when reading from | ||
311 | + * the cache (see the `local_file = bs` and | ||
312 | + * `local_map = aligned_offset` assignments above), and the | ||
313 | + * result the cache delivers must be the same as the driver | ||
314 | + * would deliver. | ||
315 | + */ | ||
316 | + assert(local_file == bs); | ||
317 | + assert(local_map == aligned_offset); | ||
318 | + bdrv_bsc_fill(bs, aligned_offset, *pnum); | ||
319 | + } | ||
320 | + } | ||
321 | } else { | ||
322 | /* Default code for filters */ | ||
323 | |||
324 | @@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, | ||
325 | return 0; | ||
326 | } | ||
327 | |||
328 | + /* Invalidate the cached block-status data range if this discard overlaps */ | ||
329 | + bdrv_bsc_invalidate_range(bs, offset, bytes); | ||
330 | + | ||
331 | /* Discard is advisory, but some devices track and coalesce | ||
332 | * unaligned requests, so we must pass everything down rather than | ||
333 | * round here. Still, most devices will just silently ignore | ||
334 | -- | 121 | -- |
335 | 2.31.1 | 122 | 2.9.3 |
336 | 123 | ||
337 | 124 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | .bdrv_co_block_status() implementations are free to return a *pnum that | ||
2 | exceeds @bytes, because bdrv_co_block_status() in block/io.c will clamp | ||
3 | *pnum as necessary. | ||
4 | 1 | ||
5 | On the other hand, if drivers' implementations return values for *pnum | ||
6 | that are as large as possible, our recently introduced block-status | ||
7 | cache will become more effective. | ||
8 | |||
9 | So, make a note in block_int.h that @bytes is no upper limit for *pnum. | ||
10 | |||
11 | Suggested-by: Eric Blake <eblake@redhat.com> | ||
12 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | ||
13 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | ||
14 | Message-Id: <20210812084148.14458-4-hreitz@redhat.com> | ||
15 | Reviewed-by: Eric Blake <eblake@redhat.com> | ||
16 | --- | ||
17 | include/block/block_int.h | 9 +++++++++ | ||
18 | 1 file changed, 9 insertions(+) | ||
19 | |||
20 | diff --git a/include/block/block_int.h b/include/block/block_int.h | ||
21 | index XXXXXXX..XXXXXXX 100644 | ||
22 | --- a/include/block/block_int.h | ||
23 | +++ b/include/block/block_int.h | ||
24 | @@ -XXX,XX +XXX,XX @@ struct BlockDriver { | ||
25 | * clamped to bdrv_getlength() and aligned to request_alignment, | ||
26 | * as well as non-NULL pnum, map, and file; in turn, the driver | ||
27 | * must return an error or set pnum to an aligned non-zero value. | ||
28 | + * | ||
29 | + * Note that @bytes is just a hint on how big of a region the | ||
30 | + * caller wants to inspect. It is not a limit on *pnum. | ||
31 | + * Implementations are free to return larger values of *pnum if | ||
32 | + * doing so does not incur a performance penalty. | ||
33 | + * | ||
34 | + * block/io.c's bdrv_co_block_status() will utilize an unclamped | ||
35 | + * *pnum value for the block-status cache on protocol nodes, prior | ||
36 | + * to clamping *pnum for return to its caller. | ||
37 | */ | ||
38 | int coroutine_fn (*bdrv_co_block_status)(BlockDriverState *bs, | ||
39 | bool want_zero, int64_t offset, int64_t bytes, int64_t *pnum, | ||
40 | -- | ||
41 | 2.31.1 | ||
42 | |||
43 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | bdrv_co_block_status() does it for us, we do not need to do it here. | ||
2 | 1 | ||
3 | The advantage of not capping *pnum is that bdrv_co_block_status() can | ||
4 | cache larger data regions than requested by its caller. | ||
5 | |||
6 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | ||
7 | Reviewed-by: Eric Blake <eblake@redhat.com> | ||
8 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | ||
9 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | ||
10 | Message-Id: <20210812084148.14458-5-hreitz@redhat.com> | ||
11 | --- | ||
12 | block/file-posix.c | 7 ++++--- | ||
13 | 1 file changed, 4 insertions(+), 3 deletions(-) | ||
14 | |||
15 | diff --git a/block/file-posix.c b/block/file-posix.c | ||
16 | index XXXXXXX..XXXXXXX 100644 | ||
17 | --- a/block/file-posix.c | ||
18 | +++ b/block/file-posix.c | ||
19 | @@ -XXX,XX +XXX,XX @@ static int find_allocation(BlockDriverState *bs, off_t start, | ||
20 | * the specified offset) that are known to be in the same | ||
21 | * allocated/unallocated state. | ||
22 | * | ||
23 | - * 'bytes' is the max value 'pnum' should be set to. | ||
24 | + * 'bytes' is a soft cap for 'pnum'. If the information is free, 'pnum' may | ||
25 | + * well exceed it. | ||
26 | */ | ||
27 | static int coroutine_fn raw_co_block_status(BlockDriverState *bs, | ||
28 | bool want_zero, | ||
29 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs, | ||
30 | } else if (data == offset) { | ||
31 | /* On a data extent, compute bytes to the end of the extent, | ||
32 | * possibly including a partial sector at EOF. */ | ||
33 | - *pnum = MIN(bytes, hole - offset); | ||
34 | + *pnum = hole - offset; | ||
35 | |||
36 | /* | ||
37 | * We are not allowed to return partial sectors, though, so | ||
38 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs, | ||
39 | } else { | ||
40 | /* On a hole, compute bytes to the beginning of the next extent. */ | ||
41 | assert(hole == offset); | ||
42 | - *pnum = MIN(bytes, data - offset); | ||
43 | + *pnum = data - offset; | ||
44 | ret = BDRV_BLOCK_ZERO; | ||
45 | } | ||
46 | *map = offset; | ||
47 | -- | ||
48 | 2.31.1 | ||
49 | |||
50 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | As of recently, pylint complains when `open()` calls are missing an | ||
2 | `encoding=` specified. Everything we have should be UTF-8 (and in fact, | ||
3 | everything should be UTF-8, period (exceptions apply)), so use that. | ||
4 | 1 | ||
5 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | ||
6 | Message-Id: <20210824153540.177128-2-hreitz@redhat.com> | ||
7 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
8 | Reviewed-by: John Snow <jsnow@redhat.com> | ||
9 | --- | ||
10 | tests/qemu-iotests/297 | 2 +- | ||
11 | tests/qemu-iotests/iotests.py | 8 +++++--- | ||
12 | 2 files changed, 6 insertions(+), 4 deletions(-) | ||
13 | |||
14 | diff --git a/tests/qemu-iotests/297 b/tests/qemu-iotests/297 | ||
15 | index XXXXXXX..XXXXXXX 100755 | ||
16 | --- a/tests/qemu-iotests/297 | ||
17 | +++ b/tests/qemu-iotests/297 | ||
18 | @@ -XXX,XX +XXX,XX @@ def is_python_file(filename): | ||
19 | if filename.endswith('.py'): | ||
20 | return True | ||
21 | |||
22 | - with open(filename) as f: | ||
23 | + with open(filename, encoding='utf-8') as f: | ||
24 | try: | ||
25 | first_line = f.readline() | ||
26 | return re.match('^#!.*python', first_line) is not None | ||
27 | diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py | ||
28 | index XXXXXXX..XXXXXXX 100644 | ||
29 | --- a/tests/qemu-iotests/iotests.py | ||
30 | +++ b/tests/qemu-iotests/iotests.py | ||
31 | @@ -XXX,XX +XXX,XX @@ def _post_shutdown(self) -> None: | ||
32 | return | ||
33 | valgrind_filename = f"{test_dir}/{self._popen.pid}.valgrind" | ||
34 | if self.exitcode() == 99: | ||
35 | - with open(valgrind_filename) as f: | ||
36 | + with open(valgrind_filename, encoding='utf-8') as f: | ||
37 | print(f.read()) | ||
38 | else: | ||
39 | os.remove(valgrind_filename) | ||
40 | @@ -XXX,XX +XXX,XX @@ def notrun(reason): | ||
41 | # Each test in qemu-iotests has a number ("seq") | ||
42 | seq = os.path.basename(sys.argv[0]) | ||
43 | |||
44 | - with open('%s/%s.notrun' % (output_dir, seq), 'w') as outfile: | ||
45 | + with open('%s/%s.notrun' % (output_dir, seq), 'w', encoding='utf-8') \ | ||
46 | + as outfile: | ||
47 | outfile.write(reason + '\n') | ||
48 | logger.warning("%s not run: %s", seq, reason) | ||
49 | sys.exit(0) | ||
50 | @@ -XXX,XX +XXX,XX @@ def case_notrun(reason): | ||
51 | # Each test in qemu-iotests has a number ("seq") | ||
52 | seq = os.path.basename(sys.argv[0]) | ||
53 | |||
54 | - with open('%s/%s.casenotrun' % (output_dir, seq), 'a') as outfile: | ||
55 | + with open('%s/%s.casenotrun' % (output_dir, seq), 'a', encoding='utf-8') \ | ||
56 | + as outfile: | ||
57 | outfile.write(' [case not run] ' + reason + '\n') | ||
58 | |||
59 | def _verify_image_format(supported_fmts: Sequence[str] = (), | ||
60 | -- | ||
61 | 2.31.1 | ||
62 | |||
63 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | pylint proposes using `[]` instead of `list()` and `{}` instead of | ||
2 | `dict()`, because it is faster. That seems simple enough, so heed its | ||
3 | advice. | ||
4 | 1 | ||
5 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | ||
6 | Message-Id: <20210824153540.177128-3-hreitz@redhat.com> | ||
7 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | ||
8 | --- | ||
9 | tests/qemu-iotests/iotests.py | 4 ++-- | ||
10 | 1 file changed, 2 insertions(+), 2 deletions(-) | ||
11 | |||
12 | diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py | ||
13 | index XXXXXXX..XXXXXXX 100644 | ||
14 | --- a/tests/qemu-iotests/iotests.py | ||
15 | +++ b/tests/qemu-iotests/iotests.py | ||
16 | @@ -XXX,XX +XXX,XX @@ def hmp_qemu_io(self, drive: str, cmd: str, | ||
17 | |||
18 | def flatten_qmp_object(self, obj, output=None, basestr=''): | ||
19 | if output is None: | ||
20 | - output = dict() | ||
21 | + output = {} | ||
22 | if isinstance(obj, list): | ||
23 | for i, item in enumerate(obj): | ||
24 | self.flatten_qmp_object(item, output, basestr + str(i) + '.') | ||
25 | @@ -XXX,XX +XXX,XX @@ def flatten_qmp_object(self, obj, output=None, basestr=''): | ||
26 | |||
27 | def qmp_to_opts(self, obj): | ||
28 | obj = self.flatten_qmp_object(obj) | ||
29 | - output_list = list() | ||
30 | + output_list = [] | ||
31 | for key in obj: | ||
32 | output_list += [key + '=' + obj[key]] | ||
33 | return ','.join(output_list) | ||
34 | -- | ||
35 | 2.31.1 | ||
36 | |||
37 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | 169 and 199 have been renamed and moved to tests/ (commit a44be0334be: | ||
2 | "iotests: rename and move 169 and 199 tests"), so we can drop them from | ||
3 | the skip list. | ||
4 | 1 | ||
5 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | ||
6 | Reviewed-by: Willian Rampazzo <willianr@redhat.com> | ||
7 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | ||
8 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | ||
9 | Message-Id: <20210902094017.32902-2-hreitz@redhat.com> | ||
10 | --- | ||
11 | tests/qemu-iotests/297 | 2 +- | ||
12 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
13 | |||
14 | diff --git a/tests/qemu-iotests/297 b/tests/qemu-iotests/297 | ||
15 | index XXXXXXX..XXXXXXX 100755 | ||
16 | --- a/tests/qemu-iotests/297 | ||
17 | +++ b/tests/qemu-iotests/297 | ||
18 | @@ -XXX,XX +XXX,XX @@ import iotests | ||
19 | SKIP_FILES = ( | ||
20 | '030', '040', '041', '044', '045', '055', '056', '057', '065', '093', | ||
21 | '096', '118', '124', '132', '136', '139', '147', '148', '149', | ||
22 | - '151', '152', '155', '163', '165', '169', '194', '196', '199', '202', | ||
23 | + '151', '152', '155', '163', '165', '194', '196', '202', | ||
24 | '203', '205', '206', '207', '208', '210', '211', '212', '213', '216', | ||
25 | '218', '219', '224', '228', '234', '235', '236', '237', '238', | ||
26 | '240', '242', '245', '246', '248', '255', '256', '257', '258', '260', | ||
27 | -- | ||
28 | 2.31.1 | ||
29 | |||
30 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | pylint complains that discards1_sha256 and all_discards_sha256 are first | ||
2 | set in non-__init__ methods. | ||
3 | 1 | ||
4 | These variables are not really class-variables anyway, so let them | ||
5 | instead be returned by start_postcopy(), thus silencing pylint. | ||
6 | |||
7 | Suggested-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | ||
8 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | ||
9 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | ||
10 | Message-Id: <20210902094017.32902-3-hreitz@redhat.com> | ||
11 | --- | ||
12 | .../tests/migrate-bitmaps-postcopy-test | 13 +++++++------ | ||
13 | 1 file changed, 7 insertions(+), 6 deletions(-) | ||
14 | |||
15 | diff --git a/tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test b/tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test | ||
16 | index XXXXXXX..XXXXXXX 100755 | ||
17 | --- a/tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test | ||
18 | +++ b/tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test | ||
19 | @@ -XXX,XX +XXX,XX @@ class TestDirtyBitmapPostcopyMigration(iotests.QMPTestCase): | ||
20 | |||
21 | result = self.vm_a.qmp('x-debug-block-dirty-bitmap-sha256', | ||
22 | node='drive0', name='bitmap0') | ||
23 | - self.discards1_sha256 = result['return']['sha256'] | ||
24 | + discards1_sha256 = result['return']['sha256'] | ||
25 | |||
26 | # Check, that updating the bitmap by discards works | ||
27 | - assert self.discards1_sha256 != empty_sha256 | ||
28 | + assert discards1_sha256 != empty_sha256 | ||
29 | |||
30 | # We want to calculate resulting sha256. Do it in bitmap0, so, disable | ||
31 | # other bitmaps | ||
32 | @@ -XXX,XX +XXX,XX @@ class TestDirtyBitmapPostcopyMigration(iotests.QMPTestCase): | ||
33 | |||
34 | result = self.vm_a.qmp('x-debug-block-dirty-bitmap-sha256', | ||
35 | node='drive0', name='bitmap0') | ||
36 | - self.all_discards_sha256 = result['return']['sha256'] | ||
37 | + all_discards_sha256 = result['return']['sha256'] | ||
38 | |||
39 | # Now, enable some bitmaps, to be updated during migration | ||
40 | for i in range(2, nb_bitmaps, 2): | ||
41 | @@ -XXX,XX +XXX,XX @@ class TestDirtyBitmapPostcopyMigration(iotests.QMPTestCase): | ||
42 | |||
43 | event_resume = self.vm_b.event_wait('RESUME') | ||
44 | self.vm_b_events.append(event_resume) | ||
45 | - return event_resume | ||
46 | + return (event_resume, discards1_sha256, all_discards_sha256) | ||
47 | |||
48 | def test_postcopy_success(self): | ||
49 | - event_resume = self.start_postcopy() | ||
50 | + event_resume, discards1_sha256, all_discards_sha256 = \ | ||
51 | + self.start_postcopy() | ||
52 | |||
53 | # enabled bitmaps should be updated | ||
54 | apply_discards(self.vm_b, discards2) | ||
55 | @@ -XXX,XX +XXX,XX @@ class TestDirtyBitmapPostcopyMigration(iotests.QMPTestCase): | ||
56 | for i in range(0, nb_bitmaps, 5): | ||
57 | result = self.vm_b.qmp('x-debug-block-dirty-bitmap-sha256', | ||
58 | node='drive0', name='bitmap{}'.format(i)) | ||
59 | - sha = self.discards1_sha256 if i % 2 else self.all_discards_sha256 | ||
60 | + sha = discards1_sha256 if i % 2 else all_discards_sha256 | ||
61 | self.assert_qmp(result, 'return/sha256', sha) | ||
62 | |||
63 | def test_early_shutdown_destination(self): | ||
64 | -- | ||
65 | 2.31.1 | ||
66 | |||
67 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | There are a couple of things pylint takes issue with: | ||
2 | - The "time" import is unused | ||
3 | - The import order (iotests should come last) | ||
4 | - get_bitmap_hash() doesn't use @self and so should be a function | ||
5 | - Semicolons at the end of some lines | ||
6 | - Parentheses after "if" | ||
7 | - Some lines are too long (80 characters instead of 79) | ||
8 | - inject_test_case()'s @name parameter shadows a top-level @name | ||
9 | variable | ||
10 | - "lambda self: mc(self)" were equivalent to just "mc", but in | ||
11 | inject_test_case(), it is not equivalent, so add a comment and disable | ||
12 | the warning locally | ||
13 | - Always put two empty lines after a function | ||
14 | - f'exec: cat > /dev/null' does not need to be an f-string | ||
15 | 1 | ||
16 | Fix them. | ||
17 | |||
18 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | ||
19 | Message-Id: <20210902094017.32902-4-hreitz@redhat.com> | ||
20 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | ||
21 | --- | ||
22 | tests/qemu-iotests/tests/migrate-bitmaps-test | 43 +++++++++++-------- | ||
23 | 1 file changed, 25 insertions(+), 18 deletions(-) | ||
24 | |||
25 | diff --git a/tests/qemu-iotests/tests/migrate-bitmaps-test b/tests/qemu-iotests/tests/migrate-bitmaps-test | ||
26 | index XXXXXXX..XXXXXXX 100755 | ||
27 | --- a/tests/qemu-iotests/tests/migrate-bitmaps-test | ||
28 | +++ b/tests/qemu-iotests/tests/migrate-bitmaps-test | ||
29 | @@ -XXX,XX +XXX,XX @@ | ||
30 | # | ||
31 | |||
32 | import os | ||
33 | -import iotests | ||
34 | -import time | ||
35 | import itertools | ||
36 | import operator | ||
37 | import re | ||
38 | +import iotests | ||
39 | from iotests import qemu_img, qemu_img_create, Timeout | ||
40 | |||
41 | |||
42 | @@ -XXX,XX +XXX,XX @@ mig_cmd = 'exec: cat > ' + mig_file | ||
43 | incoming_cmd = 'exec: cat ' + mig_file | ||
44 | |||
45 | |||
46 | +def get_bitmap_hash(vm): | ||
47 | + result = vm.qmp('x-debug-block-dirty-bitmap-sha256', | ||
48 | + node='drive0', name='bitmap0') | ||
49 | + return result['return']['sha256'] | ||
50 | + | ||
51 | + | ||
52 | class TestDirtyBitmapMigration(iotests.QMPTestCase): | ||
53 | def tearDown(self): | ||
54 | self.vm_a.shutdown() | ||
55 | @@ -XXX,XX +XXX,XX @@ class TestDirtyBitmapMigration(iotests.QMPTestCase): | ||
56 | params['persistent'] = True | ||
57 | |||
58 | result = vm.qmp('block-dirty-bitmap-add', **params) | ||
59 | - self.assert_qmp(result, 'return', {}); | ||
60 | - | ||
61 | - def get_bitmap_hash(self, vm): | ||
62 | - result = vm.qmp('x-debug-block-dirty-bitmap-sha256', | ||
63 | - node='drive0', name='bitmap0') | ||
64 | - return result['return']['sha256'] | ||
65 | + self.assert_qmp(result, 'return', {}) | ||
66 | |||
67 | def check_bitmap(self, vm, sha256): | ||
68 | result = vm.qmp('x-debug-block-dirty-bitmap-sha256', | ||
69 | node='drive0', name='bitmap0') | ||
70 | if sha256: | ||
71 | - self.assert_qmp(result, 'return/sha256', sha256); | ||
72 | + self.assert_qmp(result, 'return/sha256', sha256) | ||
73 | else: | ||
74 | self.assert_qmp(result, 'error/desc', | ||
75 | - "Dirty bitmap 'bitmap0' not found"); | ||
76 | + "Dirty bitmap 'bitmap0' not found") | ||
77 | |||
78 | def do_test_migration_resume_source(self, persistent, migrate_bitmaps): | ||
79 | granularity = 512 | ||
80 | @@ -XXX,XX +XXX,XX @@ class TestDirtyBitmapMigration(iotests.QMPTestCase): | ||
81 | self.add_bitmap(self.vm_a, granularity, persistent) | ||
82 | for r in regions: | ||
83 | self.vm_a.hmp_qemu_io('drive0', 'write %d %d' % r) | ||
84 | - sha256 = self.get_bitmap_hash(self.vm_a) | ||
85 | + sha256 = get_bitmap_hash(self.vm_a) | ||
86 | |||
87 | result = self.vm_a.qmp('migrate', uri=mig_cmd) | ||
88 | while True: | ||
89 | @@ -XXX,XX +XXX,XX @@ class TestDirtyBitmapMigration(iotests.QMPTestCase): | ||
90 | break | ||
91 | while True: | ||
92 | result = self.vm_a.qmp('query-status') | ||
93 | - if (result['return']['status'] == 'postmigrate'): | ||
94 | + if result['return']['status'] == 'postmigrate': | ||
95 | break | ||
96 | |||
97 | # test that bitmap is still here | ||
98 | @@ -XXX,XX +XXX,XX @@ class TestDirtyBitmapMigration(iotests.QMPTestCase): | ||
99 | self.add_bitmap(self.vm_a, granularity, persistent) | ||
100 | for r in regions: | ||
101 | self.vm_a.hmp_qemu_io('drive0', 'write %d %d' % r) | ||
102 | - sha256 = self.get_bitmap_hash(self.vm_a) | ||
103 | + sha256 = get_bitmap_hash(self.vm_a) | ||
104 | |||
105 | if pre_shutdown: | ||
106 | self.vm_a.shutdown() | ||
107 | @@ -XXX,XX +XXX,XX @@ class TestDirtyBitmapMigration(iotests.QMPTestCase): | ||
108 | self.check_bitmap(self.vm_b, sha256 if persistent else False) | ||
109 | |||
110 | |||
111 | -def inject_test_case(klass, name, method, *args, **kwargs): | ||
112 | +def inject_test_case(klass, suffix, method, *args, **kwargs): | ||
113 | mc = operator.methodcaller(method, *args, **kwargs) | ||
114 | - setattr(klass, 'test_' + method + name, lambda self: mc(self)) | ||
115 | + # We want to add a function attribute to `klass`, so that it is | ||
116 | + # correctly converted to a method on instantiation. The | ||
117 | + # methodcaller object `mc` is a callable, not a function, so we | ||
118 | + # need the lambda to turn it into a function. | ||
119 | + # pylint: disable=unnecessary-lambda | ||
120 | + setattr(klass, 'test_' + method + suffix, lambda self: mc(self)) | ||
121 | + | ||
122 | |||
123 | for cmb in list(itertools.product((True, False), repeat=5)): | ||
124 | name = ('_' if cmb[0] else '_not_') + 'persistent_' | ||
125 | name += ('_' if cmb[1] else '_not_') + 'migbitmap_' | ||
126 | name += '_online' if cmb[2] else '_offline' | ||
127 | name += '_shared' if cmb[3] else '_nonshared' | ||
128 | - if (cmb[4]): | ||
129 | + if cmb[4]: | ||
130 | name += '__pre_shutdown' | ||
131 | |||
132 | inject_test_case(TestDirtyBitmapMigration, name, 'do_test_migration', | ||
133 | @@ -XXX,XX +XXX,XX @@ class TestDirtyBitmapBackingMigration(iotests.QMPTestCase): | ||
134 | self.assert_qmp(result, 'return', {}) | ||
135 | |||
136 | # Check that the bitmaps are there | ||
137 | - for node in self.vm.qmp('query-named-block-nodes', flat=True)['return']: | ||
138 | + nodes = self.vm.qmp('query-named-block-nodes', flat=True)['return'] | ||
139 | + for node in nodes: | ||
140 | if 'node0' in node['node-name']: | ||
141 | self.assert_qmp(node, 'dirty-bitmaps[0]/name', 'bmap0') | ||
142 | |||
143 | @@ -XXX,XX +XXX,XX @@ class TestDirtyBitmapBackingMigration(iotests.QMPTestCase): | ||
144 | """ | ||
145 | Continue the source after migration. | ||
146 | """ | ||
147 | - result = self.vm.qmp('migrate', uri=f'exec: cat > /dev/null') | ||
148 | + result = self.vm.qmp('migrate', uri='exec: cat > /dev/null') | ||
149 | self.assert_qmp(result, 'return', {}) | ||
150 | |||
151 | with Timeout(10, 'Migration timeout'): | ||
152 | -- | ||
153 | 2.31.1 | ||
154 | |||
155 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | The AbnormalShutdown exception class is not in qemu.machine, but in | ||
2 | qemu.machine.machine. (qemu.machine.AbnormalShutdown was enough for | ||
3 | Python to find it in order to run this test, but pylint complains about | ||
4 | it.) | ||
5 | 1 | ||
6 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | ||
7 | Message-Id: <20210902094017.32902-5-hreitz@redhat.com> | ||
8 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | ||
9 | --- | ||
10 | tests/qemu-iotests/tests/mirror-top-perms | 2 +- | ||
11 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
12 | |||
13 | diff --git a/tests/qemu-iotests/tests/mirror-top-perms b/tests/qemu-iotests/tests/mirror-top-perms | ||
14 | index XXXXXXX..XXXXXXX 100755 | ||
15 | --- a/tests/qemu-iotests/tests/mirror-top-perms | ||
16 | +++ b/tests/qemu-iotests/tests/mirror-top-perms | ||
17 | @@ -XXX,XX +XXX,XX @@ class TestMirrorTopPerms(iotests.QMPTestCase): | ||
18 | def tearDown(self): | ||
19 | try: | ||
20 | self.vm.shutdown() | ||
21 | - except qemu.machine.AbnormalShutdown: | ||
22 | + except qemu.machine.machine.AbnormalShutdown: | ||
23 | pass | ||
24 | |||
25 | if self.vm_b is not None: | ||
26 | -- | ||
27 | 2.31.1 | ||
28 | |||
29 | diff view generated by jsdifflib |
1 | From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | There is no conflict and no dependency if we have parallel writes to | 3 | All that CoQueue needs in order to become thread-safe is help |
4 | different subclusters of one cluster when the cluster itself is already | 4 | from an external mutex. Add this to the API. |
5 | allocated. So, relax extra dependency. | 5 | |
6 | 6 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | |
7 | Measure performance: | 7 | Reviewed-by: Fam Zheng <famz@redhat.com> |
8 | First, prepare build/qemu-img-old and build/qemu-img-new binaries. | 8 | Message-id: 20170213181244.16297-6-pbonzini@redhat.com |
9 | 9 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | |
10 | cd scripts/simplebench | ||
11 | ./img_bench_templater.py | ||
12 | |||
13 | Paste the following to stdin of running script: | ||
14 | |||
15 | qemu_img=../../build/qemu-img-{old|new} | ||
16 | $qemu_img create -f qcow2 -o extended_l2=on /ssd/x.qcow2 1G | ||
17 | $qemu_img bench -c 100000 -d 8 [-s 2K|-s 2K -o 512|-s $((1024*2+512))] \ | ||
18 | -w -t none -n /ssd/x.qcow2 | ||
19 | |||
20 | The result: | ||
21 | |||
22 | All results are in seconds | ||
23 | |||
24 | ------------------ --------- --------- | ||
25 | old new | ||
26 | -s 2K 6.7 ± 15% 6.2 ± 12% | ||
27 | -7% | ||
28 | -s 2K -o 512 13 ± 3% 11 ± 5% | ||
29 | -16% | ||
30 | -s $((1024*2+512)) 9.5 ± 4% 8.4 | ||
31 | -12% | ||
32 | ------------------ --------- --------- | ||
33 | |||
34 | So small writes are more independent now and that helps to keep deeper | ||
35 | io queue which improves performance. | ||
36 | |||
37 | 271 iotest output becomes racy for three allocations in one cluster. | ||
38 | The second and third writes may finish in a different order. Second and | ||
39 | third requests don't depend on each other any more. Still, they both | ||
40 | depend on the first request anyway. Filter out the second and third write | ||
41 | offsets to cover both possible outputs. | ||
42 | |||
43 | Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | ||
44 | Message-Id: <20210824101517.59802-4-vsementsov@virtuozzo.com> | ||
45 | Reviewed-by: Eric Blake <eblake@redhat.com> | ||
46 | Reviewed-by: Hanna Reitz <hreitz@redhat.com> | ||
47 | [hreitz: s/ an / and /] | ||
48 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | ||
49 | --- | 10 | --- |
50 | block/qcow2-cluster.c | 11 +++++++++++ | 11 | include/qemu/coroutine.h | 8 +++++--- |
51 | tests/qemu-iotests/271 | 5 ++++- | 12 | block/backup.c | 2 +- |
52 | tests/qemu-iotests/271.out | 4 ++-- | 13 | block/io.c | 4 ++-- |
53 | 3 files changed, 17 insertions(+), 3 deletions(-) | 14 | block/nbd-client.c | 2 +- |
54 | 15 | block/qcow2-cluster.c | 4 +--- | |
16 | block/sheepdog.c | 2 +- | ||
17 | block/throttle-groups.c | 2 +- | ||
18 | hw/9pfs/9p.c | 2 +- | ||
19 | util/qemu-coroutine-lock.c | 24 +++++++++++++++++++++--- | ||
20 | 9 files changed, 34 insertions(+), 16 deletions(-) | ||
21 | |||
22 | diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h | ||
23 | index XXXXXXX..XXXXXXX 100644 | ||
24 | --- a/include/qemu/coroutine.h | ||
25 | +++ b/include/qemu/coroutine.h | ||
26 | @@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex); | ||
27 | |||
28 | /** | ||
29 | * CoQueues are a mechanism to queue coroutines in order to continue executing | ||
30 | - * them later. | ||
31 | + * them later. They are similar to condition variables, but they need help | ||
32 | + * from an external mutex in order to maintain thread-safety. | ||
33 | */ | ||
34 | typedef struct CoQueue { | ||
35 | QSIMPLEQ_HEAD(, Coroutine) entries; | ||
36 | @@ -XXX,XX +XXX,XX @@ void qemu_co_queue_init(CoQueue *queue); | ||
37 | |||
38 | /** | ||
39 | * Adds the current coroutine to the CoQueue and transfers control to the | ||
40 | - * caller of the coroutine. | ||
41 | + * caller of the coroutine. The mutex is unlocked during the wait and | ||
42 | + * locked again afterwards. | ||
43 | */ | ||
44 | -void coroutine_fn qemu_co_queue_wait(CoQueue *queue); | ||
45 | +void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex); | ||
46 | |||
47 | /** | ||
48 | * Restarts the next coroutine in the CoQueue and removes it from the queue. | ||
49 | diff --git a/block/backup.c b/block/backup.c | ||
50 | index XXXXXXX..XXXXXXX 100644 | ||
51 | --- a/block/backup.c | ||
52 | +++ b/block/backup.c | ||
53 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job, | ||
54 | retry = false; | ||
55 | QLIST_FOREACH(req, &job->inflight_reqs, list) { | ||
56 | if (end > req->start && start < req->end) { | ||
57 | - qemu_co_queue_wait(&req->wait_queue); | ||
58 | + qemu_co_queue_wait(&req->wait_queue, NULL); | ||
59 | retry = true; | ||
60 | break; | ||
61 | } | ||
62 | diff --git a/block/io.c b/block/io.c | ||
63 | index XXXXXXX..XXXXXXX 100644 | ||
64 | --- a/block/io.c | ||
65 | +++ b/block/io.c | ||
66 | @@ -XXX,XX +XXX,XX @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) | ||
67 | * (instead of producing a deadlock in the former case). */ | ||
68 | if (!req->waiting_for) { | ||
69 | self->waiting_for = req; | ||
70 | - qemu_co_queue_wait(&req->wait_queue); | ||
71 | + qemu_co_queue_wait(&req->wait_queue, NULL); | ||
72 | self->waiting_for = NULL; | ||
73 | retry = true; | ||
74 | waited = true; | ||
75 | @@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) | ||
76 | |||
77 | /* Wait until any previous flushes are completed */ | ||
78 | while (bs->active_flush_req) { | ||
79 | - qemu_co_queue_wait(&bs->flush_queue); | ||
80 | + qemu_co_queue_wait(&bs->flush_queue, NULL); | ||
81 | } | ||
82 | |||
83 | bs->active_flush_req = true; | ||
84 | diff --git a/block/nbd-client.c b/block/nbd-client.c | ||
85 | index XXXXXXX..XXXXXXX 100644 | ||
86 | --- a/block/nbd-client.c | ||
87 | +++ b/block/nbd-client.c | ||
88 | @@ -XXX,XX +XXX,XX @@ static void nbd_coroutine_start(NBDClientSession *s, | ||
89 | /* Poor man semaphore. The free_sema is locked when no other request | ||
90 | * can be accepted, and unlocked after receiving one reply. */ | ||
91 | if (s->in_flight == MAX_NBD_REQUESTS) { | ||
92 | - qemu_co_queue_wait(&s->free_sema); | ||
93 | + qemu_co_queue_wait(&s->free_sema, NULL); | ||
94 | assert(s->in_flight < MAX_NBD_REQUESTS); | ||
95 | } | ||
96 | s->in_flight++; | ||
55 | diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c | 97 | diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c |
56 | index XXXXXXX..XXXXXXX 100644 | 98 | index XXXXXXX..XXXXXXX 100644 |
57 | --- a/block/qcow2-cluster.c | 99 | --- a/block/qcow2-cluster.c |
58 | +++ b/block/qcow2-cluster.c | 100 | +++ b/block/qcow2-cluster.c |
59 | @@ -XXX,XX +XXX,XX @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, | 101 | @@ -XXX,XX +XXX,XX @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, |
60 | continue; | 102 | if (bytes == 0) { |
103 | /* Wait for the dependency to complete. We need to recheck | ||
104 | * the free/allocated clusters when we continue. */ | ||
105 | - qemu_co_mutex_unlock(&s->lock); | ||
106 | - qemu_co_queue_wait(&old_alloc->dependent_requests); | ||
107 | - qemu_co_mutex_lock(&s->lock); | ||
108 | + qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock); | ||
109 | return -EAGAIN; | ||
110 | } | ||
61 | } | 111 | } |
62 | 112 | diff --git a/block/sheepdog.c b/block/sheepdog.c | |
63 | + if (old_alloc->keep_old_clusters && | 113 | index XXXXXXX..XXXXXXX 100644 |
64 | + (end <= l2meta_cow_start(old_alloc) || | 114 | --- a/block/sheepdog.c |
65 | + start >= l2meta_cow_end(old_alloc))) | 115 | +++ b/block/sheepdog.c |
66 | + { | 116 | @@ -XXX,XX +XXX,XX @@ static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *acb) |
67 | + /* | 117 | retry: |
68 | + * Clusters intersect but COW areas don't. And cluster itself is | 118 | QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) { |
69 | + * already allocated. So, there is no actual conflict. | 119 | if (AIOCBOverlapping(acb, cb)) { |
70 | + */ | 120 | - qemu_co_queue_wait(&s->overlapping_queue); |
71 | + continue; | 121 | + qemu_co_queue_wait(&s->overlapping_queue, NULL); |
72 | + } | 122 | goto retry; |
123 | } | ||
124 | } | ||
125 | diff --git a/block/throttle-groups.c b/block/throttle-groups.c | ||
126 | index XXXXXXX..XXXXXXX 100644 | ||
127 | --- a/block/throttle-groups.c | ||
128 | +++ b/block/throttle-groups.c | ||
129 | @@ -XXX,XX +XXX,XX @@ void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk, | ||
130 | if (must_wait || blkp->pending_reqs[is_write]) { | ||
131 | blkp->pending_reqs[is_write]++; | ||
132 | qemu_mutex_unlock(&tg->lock); | ||
133 | - qemu_co_queue_wait(&blkp->throttled_reqs[is_write]); | ||
134 | + qemu_co_queue_wait(&blkp->throttled_reqs[is_write], NULL); | ||
135 | qemu_mutex_lock(&tg->lock); | ||
136 | blkp->pending_reqs[is_write]--; | ||
137 | } | ||
138 | diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c | ||
139 | index XXXXXXX..XXXXXXX 100644 | ||
140 | --- a/hw/9pfs/9p.c | ||
141 | +++ b/hw/9pfs/9p.c | ||
142 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn v9fs_flush(void *opaque) | ||
143 | /* | ||
144 | * Wait for pdu to complete. | ||
145 | */ | ||
146 | - qemu_co_queue_wait(&cancel_pdu->complete); | ||
147 | + qemu_co_queue_wait(&cancel_pdu->complete, NULL); | ||
148 | cancel_pdu->cancelled = 0; | ||
149 | pdu_free(cancel_pdu); | ||
150 | } | ||
151 | diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c | ||
152 | index XXXXXXX..XXXXXXX 100644 | ||
153 | --- a/util/qemu-coroutine-lock.c | ||
154 | +++ b/util/qemu-coroutine-lock.c | ||
155 | @@ -XXX,XX +XXX,XX @@ void qemu_co_queue_init(CoQueue *queue) | ||
156 | QSIMPLEQ_INIT(&queue->entries); | ||
157 | } | ||
158 | |||
159 | -void coroutine_fn qemu_co_queue_wait(CoQueue *queue) | ||
160 | +void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex) | ||
161 | { | ||
162 | Coroutine *self = qemu_coroutine_self(); | ||
163 | QSIMPLEQ_INSERT_TAIL(&queue->entries, self, co_queue_next); | ||
73 | + | 164 | + |
74 | /* Conflict */ | 165 | + if (mutex) { |
75 | 166 | + qemu_co_mutex_unlock(mutex); | |
76 | if (start < old_start) { | 167 | + } |
77 | diff --git a/tests/qemu-iotests/271 b/tests/qemu-iotests/271 | 168 | + |
78 | index XXXXXXX..XXXXXXX 100755 | 169 | + /* There is no race condition here. Other threads will call |
79 | --- a/tests/qemu-iotests/271 | 170 | + * aio_co_schedule on our AioContext, which can reenter this |
80 | +++ b/tests/qemu-iotests/271 | 171 | + * coroutine but only after this yield and after the main loop |
81 | @@ -XXX,XX +XXX,XX @@ EOF | 172 | + * has gone through the next iteration. |
173 | + */ | ||
174 | qemu_coroutine_yield(); | ||
175 | assert(qemu_in_coroutine()); | ||
176 | + | ||
177 | + /* TODO: OSv implements wait morphing here, where the wakeup | ||
178 | + * primitive automatically places the woken coroutine on the | ||
179 | + * mutex's queue. This avoids the thundering herd effect. | ||
180 | + */ | ||
181 | + if (mutex) { | ||
182 | + qemu_co_mutex_lock(mutex); | ||
183 | + } | ||
82 | } | 184 | } |
83 | 185 | ||
84 | _make_test_img -o extended_l2=on 1M | 186 | /** |
85 | -_concurrent_io | $QEMU_IO | _filter_qemu_io | 187 | @@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_rdlock(CoRwlock *lock) |
86 | +# Second and third writes in _concurrent_io() are independent and may finish in | 188 | Coroutine *self = qemu_coroutine_self(); |
87 | +# different order. So, filter offset out to match both possible variants. | 189 | |
88 | +_concurrent_io | $QEMU_IO | _filter_qemu_io | \ | 190 | while (lock->writer) { |
89 | + $SED -e 's/\(20480\|40960\)/OFFSET/' | 191 | - qemu_co_queue_wait(&lock->queue); |
90 | _concurrent_verify | $QEMU_IO | _filter_qemu_io | 192 | + qemu_co_queue_wait(&lock->queue, NULL); |
91 | 193 | } | |
92 | # success, all done | 194 | lock->reader++; |
93 | diff --git a/tests/qemu-iotests/271.out b/tests/qemu-iotests/271.out | 195 | self->locks_held++; |
94 | index XXXXXXX..XXXXXXX 100644 | 196 | @@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_wrlock(CoRwlock *lock) |
95 | --- a/tests/qemu-iotests/271.out | 197 | Coroutine *self = qemu_coroutine_self(); |
96 | +++ b/tests/qemu-iotests/271.out | 198 | |
97 | @@ -XXX,XX +XXX,XX @@ blkdebug: Suspended request 'A' | 199 | while (lock->writer || lock->reader) { |
98 | blkdebug: Resuming request 'A' | 200 | - qemu_co_queue_wait(&lock->queue); |
99 | wrote 2048/2048 bytes at offset 30720 | 201 | + qemu_co_queue_wait(&lock->queue, NULL); |
100 | 2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | 202 | } |
101 | -wrote 2048/2048 bytes at offset 20480 | 203 | lock->writer = true; |
102 | +wrote 2048/2048 bytes at offset OFFSET | 204 | self->locks_held++; |
103 | 2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | ||
104 | -wrote 2048/2048 bytes at offset 40960 | ||
105 | +wrote 2048/2048 bytes at offset OFFSET | ||
106 | 2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | ||
107 | *** done | ||
108 | -- | 205 | -- |
109 | 2.31.1 | 206 | 2.9.3 |
110 | 207 | ||
111 | 208 | diff view generated by jsdifflib |
1 | From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | - use g_autofree for l1_table | 3 | This adds a CoMutex around the existing CoQueue. Because the write-side |
4 | - better name for size in bytes variable | 4 | can just take CoMutex, the old "writer" field is not necessary anymore. |
5 | - reduce code blocks nesting | 5 | Instead of removing it altogether, count the number of pending writers |
6 | - whitespaces, braces, newlines | 6 | during a read-side critical section and forbid further readers from |
7 | entering. | ||
7 | 8 | ||
8 | Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 9 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
9 | Reviewed-by: Hanna Reitz <hreitz@redhat.com> | 10 | Reviewed-by: Fam Zheng <famz@redhat.com> |
10 | Message-Id: <20210914122454.141075-9-vsementsov@virtuozzo.com> | 11 | Message-id: 20170213181244.16297-7-pbonzini@redhat.com |
11 | Signed-off-by: Hanna Reitz <hreitz@redhat.com> | 12 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
12 | --- | 13 | --- |
13 | block/qcow2-refcount.c | 98 +++++++++++++++++++++--------------------- | 14 | include/qemu/coroutine.h | 3 ++- |
14 | 1 file changed, 50 insertions(+), 48 deletions(-) | 15 | util/qemu-coroutine-lock.c | 35 ++++++++++++++++++++++++----------- |
16 | 2 files changed, 26 insertions(+), 12 deletions(-) | ||
15 | 17 | ||
16 | diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c | 18 | diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h |
17 | index XXXXXXX..XXXXXXX 100644 | 19 | index XXXXXXX..XXXXXXX 100644 |
18 | --- a/block/qcow2-refcount.c | 20 | --- a/include/qemu/coroutine.h |
19 | +++ b/block/qcow2-refcount.c | 21 | +++ b/include/qemu/coroutine.h |
20 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l1(BlockDriverState *bs, | 22 | @@ -XXX,XX +XXX,XX @@ bool qemu_co_queue_empty(CoQueue *queue); |
21 | int flags, BdrvCheckMode fix, bool active) | 23 | |
24 | |||
25 | typedef struct CoRwlock { | ||
26 | - bool writer; | ||
27 | + int pending_writer; | ||
28 | int reader; | ||
29 | + CoMutex mutex; | ||
30 | CoQueue queue; | ||
31 | } CoRwlock; | ||
32 | |||
33 | diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c | ||
34 | index XXXXXXX..XXXXXXX 100644 | ||
35 | --- a/util/qemu-coroutine-lock.c | ||
36 | +++ b/util/qemu-coroutine-lock.c | ||
37 | @@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_init(CoRwlock *lock) | ||
22 | { | 38 | { |
23 | BDRVQcow2State *s = bs->opaque; | 39 | memset(lock, 0, sizeof(*lock)); |
24 | - uint64_t *l1_table = NULL, l2_offset, l1_size2; | 40 | qemu_co_queue_init(&lock->queue); |
25 | + size_t l1_size_bytes = l1_size * L1E_SIZE; | 41 | + qemu_co_mutex_init(&lock->mutex); |
26 | + g_autofree uint64_t *l1_table = NULL; | 42 | } |
27 | + uint64_t l2_offset; | 43 | |
28 | int i, ret; | 44 | void qemu_co_rwlock_rdlock(CoRwlock *lock) |
29 | 45 | { | |
30 | - l1_size2 = l1_size * L1E_SIZE; | 46 | Coroutine *self = qemu_coroutine_self(); |
31 | + if (!l1_size) { | 47 | |
32 | + return 0; | 48 | - while (lock->writer) { |
33 | + } | 49 | - qemu_co_queue_wait(&lock->queue, NULL); |
34 | 50 | + qemu_co_mutex_lock(&lock->mutex); | |
35 | /* Mark L1 table as used */ | 51 | + /* For fairness, wait if a writer is in line. */ |
36 | ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size, | 52 | + while (lock->pending_writer) { |
37 | - l1_table_offset, l1_size2); | 53 | + qemu_co_queue_wait(&lock->queue, &lock->mutex); |
38 | + l1_table_offset, l1_size_bytes); | 54 | } |
39 | if (ret < 0) { | 55 | lock->reader++; |
40 | - goto fail; | 56 | + qemu_co_mutex_unlock(&lock->mutex); |
41 | + return ret; | ||
42 | + } | ||
43 | + | 57 | + |
44 | + l1_table = g_try_malloc(l1_size_bytes); | 58 | + /* The rest of the read-side critical section is run without the mutex. */ |
45 | + if (l1_table == NULL) { | 59 | self->locks_held++; |
46 | + res->check_errors++; | 60 | } |
47 | + return -ENOMEM; | 61 | |
48 | } | 62 | @@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_unlock(CoRwlock *lock) |
49 | 63 | Coroutine *self = qemu_coroutine_self(); | |
50 | /* Read L1 table entries from disk */ | 64 | |
51 | - if (l1_size2 > 0) { | 65 | assert(qemu_in_coroutine()); |
52 | - l1_table = g_try_malloc(l1_size2); | 66 | - if (lock->writer) { |
53 | - if (l1_table == NULL) { | 67 | - lock->writer = false; |
54 | - ret = -ENOMEM; | 68 | + if (!lock->reader) { |
55 | - res->check_errors++; | 69 | + /* The critical section started in qemu_co_rwlock_wrlock. */ |
56 | - goto fail; | 70 | qemu_co_queue_restart_all(&lock->queue); |
57 | - } | 71 | } else { |
58 | - ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); | 72 | + self->locks_held--; |
59 | - if (ret < 0) { | ||
60 | - fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); | ||
61 | - res->check_errors++; | ||
62 | - goto fail; | ||
63 | - } | ||
64 | - for(i = 0;i < l1_size; i++) | ||
65 | - be64_to_cpus(&l1_table[i]); | ||
66 | + ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size_bytes); | ||
67 | + if (ret < 0) { | ||
68 | + fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); | ||
69 | + res->check_errors++; | ||
70 | + return ret; | ||
71 | + } | ||
72 | + | 73 | + |
73 | + for (i = 0; i < l1_size; i++) { | 74 | + qemu_co_mutex_lock(&lock->mutex); |
74 | + be64_to_cpus(&l1_table[i]); | 75 | lock->reader--; |
75 | } | 76 | assert(lock->reader >= 0); |
76 | 77 | /* Wakeup only one waiting writer */ | |
77 | /* Do the actual checks */ | 78 | @@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_unlock(CoRwlock *lock) |
78 | - for(i = 0; i < l1_size; i++) { | 79 | qemu_co_queue_next(&lock->queue); |
79 | - l2_offset = l1_table[i]; | ||
80 | - if (l2_offset) { | ||
81 | - /* Mark L2 table as used */ | ||
82 | - l2_offset &= L1E_OFFSET_MASK; | ||
83 | - ret = qcow2_inc_refcounts_imrt(bs, res, | ||
84 | - refcount_table, refcount_table_size, | ||
85 | - l2_offset, s->cluster_size); | ||
86 | - if (ret < 0) { | ||
87 | - goto fail; | ||
88 | - } | ||
89 | + for (i = 0; i < l1_size; i++) { | ||
90 | + if (!l1_table[i]) { | ||
91 | + continue; | ||
92 | + } | ||
93 | |||
94 | - /* L2 tables are cluster aligned */ | ||
95 | - if (offset_into_cluster(s, l2_offset)) { | ||
96 | - fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " | ||
97 | - "cluster aligned; L1 entry corrupted\n", l2_offset); | ||
98 | - res->corruptions++; | ||
99 | - } | ||
100 | + l2_offset = l1_table[i] & L1E_OFFSET_MASK; | ||
101 | |||
102 | - /* Process and check L2 entries */ | ||
103 | - ret = check_refcounts_l2(bs, res, refcount_table, | ||
104 | - refcount_table_size, l2_offset, flags, | ||
105 | - fix, active); | ||
106 | - if (ret < 0) { | ||
107 | - goto fail; | ||
108 | - } | ||
109 | + /* Mark L2 table as used */ | ||
110 | + ret = qcow2_inc_refcounts_imrt(bs, res, | ||
111 | + refcount_table, refcount_table_size, | ||
112 | + l2_offset, s->cluster_size); | ||
113 | + if (ret < 0) { | ||
114 | + return ret; | ||
115 | + } | ||
116 | + | ||
117 | + /* L2 tables are cluster aligned */ | ||
118 | + if (offset_into_cluster(s, l2_offset)) { | ||
119 | + fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " | ||
120 | + "cluster aligned; L1 entry corrupted\n", l2_offset); | ||
121 | + res->corruptions++; | ||
122 | + } | ||
123 | + | ||
124 | + /* Process and check L2 entries */ | ||
125 | + ret = check_refcounts_l2(bs, res, refcount_table, | ||
126 | + refcount_table_size, l2_offset, flags, | ||
127 | + fix, active); | ||
128 | + if (ret < 0) { | ||
129 | + return ret; | ||
130 | } | 80 | } |
131 | } | 81 | } |
132 | - g_free(l1_table); | 82 | - self->locks_held--; |
133 | - return 0; | 83 | + qemu_co_mutex_unlock(&lock->mutex); |
134 | |||
135 | -fail: | ||
136 | - g_free(l1_table); | ||
137 | - return ret; | ||
138 | + return 0; | ||
139 | } | 84 | } |
140 | 85 | ||
141 | /* | 86 | void qemu_co_rwlock_wrlock(CoRwlock *lock) |
87 | { | ||
88 | - Coroutine *self = qemu_coroutine_self(); | ||
89 | - | ||
90 | - while (lock->writer || lock->reader) { | ||
91 | - qemu_co_queue_wait(&lock->queue, NULL); | ||
92 | + qemu_co_mutex_lock(&lock->mutex); | ||
93 | + lock->pending_writer++; | ||
94 | + while (lock->reader) { | ||
95 | + qemu_co_queue_wait(&lock->queue, &lock->mutex); | ||
96 | } | ||
97 | - lock->writer = true; | ||
98 | - self->locks_held++; | ||
99 | + lock->pending_writer--; | ||
100 | + | ||
101 | + /* The rest of the write-side critical section is run with | ||
102 | + * the mutex taken, so that lock->reader remains zero. | ||
103 | + * There is no need to update self->locks_held. | ||
104 | + */ | ||
105 | } | ||
142 | -- | 106 | -- |
143 | 2.31.1 | 107 | 2.9.3 |
144 | 108 | ||
145 | 109 | diff view generated by jsdifflib |