Series comparison

-[Qemu-devel] [PULL 0/2] Block patches
+[Qemu-devel] [PULL v2 00/24] Block patches
-The following changes since commit ca4e667dbf431d4a2a5a619cde79d30dd2ac3eb2:
+The following changes since commit 56f9e46b841c7be478ca038d8d4085d776ab4b0d:
-  Merge remote-tracking branch 'remotes/kraxel/tags/usb-20170717-pull-request' into staging (2017-07-17 17:54:17 +0100)
+  Merge remote-tracking branch 'remotes/armbru/tags/pull-qapi-2017-02-20' into staging (2017-02-20 17:42:47 +0000)
 are available in the git repository at:
-  git://github.com/codyprime/qemu-kvm-jtc.git tags/block-pull-request
+  git://github.com/stefanha/qemu.git tags/block-pull-request
-for you to fetch changes up to 8508eee740c78d1465e25dad7c3e06137485dfbc:
+for you to fetch changes up to a7b91d35bab97a2d3e779d0c64c9b837b52a6cf7:
-  live-block-ops.txt: Rename, rewrite, and improve it (2017-07-18 00:11:01 -0400)
+  coroutine-lock: make CoRwlock thread-safe and fair (2017-02-21 11:39:40 +0000)
 ----------------------------------------------------------------
-Block patches (documentation)
+Pull request
 v2:
  * Rebased to resolve scsi conflicts
 ----------------------------------------------------------------
-Kashyap Chamarthy (2):
+Paolo Bonzini (24):
-  bitmaps.md: Convert to rST; move it into 'interop' dir
+  block: move AioContext, QEMUTimer, main-loop to libqemuutil
-  live-block-ops.txt: Rename, rewrite, and improve it
+  aio: introduce aio_co_schedule and aio_co_wake
   block-backend: allow blk_prw from coroutine context
   test-thread-pool: use generic AioContext infrastructure
   io: add methods to set I/O handlers on AioContext
   io: make qio_channel_yield aware of AioContexts
   nbd: convert to use qio_channel_yield
   coroutine-lock: reschedule coroutine on the AioContext it was running
     on
   blkdebug: reschedule coroutine on the AioContext it is running on
   qed: introduce qed_aio_start_io and qed_aio_next_io_cb
   aio: push aio_context_acquire/release down to dispatching
   block: explicitly acquire aiocontext in timers that need it
   block: explicitly acquire aiocontext in callbacks that need it
   block: explicitly acquire aiocontext in bottom halves that need it
   block: explicitly acquire aiocontext in aio callbacks that need it
   aio-posix: partially inline aio_dispatch into aio_poll
   async: remove unnecessary inc/dec pairs
   block: document fields protected by AioContext lock
   coroutine-lock: make CoMutex thread-safe
   coroutine-lock: add limited spinning to CoMutex
   test-aio-multithread: add performance comparison with thread-based
     mutexes
   coroutine-lock: place CoMutex before CoQueue in header
   coroutine-lock: add mutex argument to CoQueue APIs
   coroutine-lock: make CoRwlock thread-safe and fair
- docs/devel/bitmaps.md                  |  505 ---------------
+ Makefile.objs                       |   4 -
- docs/interop/bitmaps.rst               |  555 ++++++++++++++++
+ stubs/Makefile.objs                 |   1 +
- docs/interop/live-block-operations.rst | 1088 ++++++++++++++++++++++++++++++++
+ tests/Makefile.include              |  19 +-
- docs/live-block-ops.txt                |   72 ---
+ util/Makefile.objs                  |   6 +-
-files changed, 1643 insertions(+), 577 deletions(-)
+ block/nbd-client.h                  |   2 +-
- delete mode 100644 docs/devel/bitmaps.md
+ block/qed.h                         |   3 +
- create mode 100644 docs/interop/bitmaps.rst
+ include/block/aio.h                 |  38 ++-
- create mode 100644 docs/interop/live-block-operations.rst
+ include/block/block_int.h           |  64 +++--
- delete mode 100644 docs/live-block-ops.txt
+ include/io/channel.h                |  72 +++++-
  include/qemu/coroutine.h            |  84 ++++---
  include/qemu/coroutine_int.h        |  11 +-
  include/sysemu/block-backend.h      |  14 +-
  tests/iothread.h                    |  25 ++
  block/backup.c                      |   2 +-
  block/blkdebug.c                    |   9 +-
  block/blkreplay.c                   |   2 +-
  block/block-backend.c               |  13 +-
  block/curl.c                        |  44 +++-
  block/gluster.c                     |   9 +-
  block/io.c                          |  42 +---
  block/iscsi.c                       |  15 +-
  block/linux-aio.c                   |  10 +-
  block/mirror.c                      |  12 +-
  block/nbd-client.c                  | 119 +++++----
  block/nfs.c                         |   9 +-
  block/qcow2-cluster.c               |   4 +-
  block/qed-cluster.c                 |   2 +
  block/qed-table.c                   |  12 +-
  block/qed.c                         |  58 +++--
  block/sheepdog.c                    |  31 +--
  block/ssh.c                         |  29 +--
  block/throttle-groups.c             |   4 +-
  block/win32-aio.c                   |   9 +-
  dma-helpers.c                       |   2 +
  hw/9pfs/9p.c                        |   2 +-
  hw/block/virtio-blk.c               |  19 +-
  hw/scsi/scsi-bus.c                  |   2 +
  hw/scsi/scsi-disk.c                 |  15 ++
  hw/scsi/scsi-generic.c              |  20 +-
  hw/scsi/virtio-scsi.c               |   7 +
  io/channel-command.c                |  13 +
  io/channel-file.c                   |  11 +
  io/channel-socket.c                 |  16 +-
  io/channel-tls.c                    |  12 +
  io/channel-watch.c                  |   6 +
  io/channel.c                        |  97 ++++++--
  nbd/client.c                        |   2 +-
  nbd/common.c                        |   9 +-
  nbd/server.c                        |  94 +++-----
  stubs/linux-aio.c                   |  32 +++
  stubs/set-fd-handler.c              |  11 -
  tests/iothread.c                    |  91 +++++++
  tests/test-aio-multithread.c        | 463 ++++++++++++++++++++++++++++++++++++
  tests/test-thread-pool.c            |  12 +-
  aio-posix.c => util/aio-posix.c     |  62 ++---
  aio-win32.c => util/aio-win32.c     |  30 +--
  util/aiocb.c                        |  55 +++++
  async.c => util/async.c             |  84 ++++++-
  iohandler.c => util/iohandler.c     |   0
  main-loop.c => util/main-loop.c     |   0
  util/qemu-coroutine-lock.c          | 254 ++++++++++++++++++--
  util/qemu-coroutine-sleep.c         |   2 +-
  util/qemu-coroutine.c               |   8 +
  qemu-timer.c => util/qemu-timer.c   |   0
  thread-pool.c => util/thread-pool.c |   8 +-
  trace-events                        |  11 -
  util/trace-events                   |  17 +-
 files changed, 1712 insertions(+), 533 deletions(-)
  create mode 100644 tests/iothread.h
  create mode 100644 stubs/linux-aio.c
  create mode 100644 tests/iothread.c
  create mode 100644 tests/test-aio-multithread.c
  rename aio-posix.c => util/aio-posix.c (94%)
  rename aio-win32.c => util/aio-win32.c (95%)
  create mode 100644 util/aiocb.c
  rename async.c => util/async.c (82%)
  rename iohandler.c => util/iohandler.c (100%)
  rename main-loop.c => util/main-loop.c (100%)
  rename qemu-timer.c => util/qemu-timer.c (100%)
  rename thread-pool.c => util/thread-pool.c (97%)
 --
-.9.4
+.9.3

-New patch
+[Qemu-devel] [PULL v2 01/24] block: move AioContext, QEMUTimer, main-loop to libqemuutil
+From: Paolo Bonzini <pbonzini@redhat.com>
 AioContext is fairly self-contained; the only dependency is QEMUTimer, but
 that in turn doesn't need anything else.  So move them out of block-obj-y
 to avoid introducing a dependency from io/ to block-obj-y.
 main-loop and its dependency iohandler also need to be moved, because
 later in this series io/ will call iohandler_get_aio_context.
 [Changed copyright "the QEMU team" to "other QEMU contributors" as
 suggested by Daniel Berrange and agreed by Paolo.
 --Stefan]
 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Message-id: 20170213135235.12274-2-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  Makefile.objs                       |  4 ---
  stubs/Makefile.objs                 |  1 +
  tests/Makefile.include              | 11 ++++----
  util/Makefile.objs                  |  6 +++-
  block/io.c                          | 29 -------------------
  stubs/linux-aio.c                   | 32 +++++++++++++++++++++
  stubs/set-fd-handler.c              | 11 --------
  aio-posix.c => util/aio-posix.c     |  2 +-
  aio-win32.c => util/aio-win32.c     |  0
  util/aiocb.c                        | 55 +++++++++++++++++++++++++++++++++++++
  async.c => util/async.c             |  3 +-
  iohandler.c => util/iohandler.c     |  0
  main-loop.c => util/main-loop.c     |  0
  qemu-timer.c => util/qemu-timer.c   |  0
  thread-pool.c => util/thread-pool.c |  2 +-
  trace-events                        | 11 --------
  util/trace-events                   | 11 ++++++++
 files changed, 114 insertions(+), 64 deletions(-)
  create mode 100644 stubs/linux-aio.c
  rename aio-posix.c => util/aio-posix.c (99%)
  rename aio-win32.c => util/aio-win32.c (100%)
  create mode 100644 util/aiocb.c
  rename async.c => util/async.c (99%)
  rename iohandler.c => util/iohandler.c (100%)
  rename main-loop.c => util/main-loop.c (100%)
  rename qemu-timer.c => util/qemu-timer.c (100%)
  rename thread-pool.c => util/thread-pool.c (99%)
 diff --git a/Makefile.objs b/Makefile.objs
 index XXXXXXX..XXXXXXX 100644
 --- a/Makefile.objs
 +++ b/Makefile.objs
@@ -XXX,XX +XXX,XX @@ chardev-obj-y = chardev/
  #######################################################################
  # block-obj-y is code used by both qemu system emulation and qemu-img
 -block-obj-y = async.o thread-pool.o
  block-obj-y += nbd/
  block-obj-y += block.o blockjob.o
 -block-obj-y += main-loop.o iohandler.o qemu-timer.o
 -block-obj-$(CONFIG_POSIX) += aio-posix.o
 -block-obj-$(CONFIG_WIN32) += aio-win32.o
  block-obj-y += block/
  block-obj-y += qemu-io-cmds.o
  block-obj-$(CONFIG_REPLICATION) += replication.o
 diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
 index XXXXXXX..XXXXXXX 100644
 --- a/stubs/Makefile.objs
 +++ b/stubs/Makefile.objs
@@ -XXX,XX +XXX,XX @@ stub-obj-y += get-vm-name.o
  stub-obj-y += iothread.o
  stub-obj-y += iothread-lock.o
  stub-obj-y += is-daemonized.o
 +stub-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
  stub-obj-y += machine-init-done.o
  stub-obj-y += migr-blocker.o
  stub-obj-y += monitor.o
 diff --git a/tests/Makefile.include b/tests/Makefile.include
 index XXXXXXX..XXXXXXX 100644
 --- a/tests/Makefile.include
 +++ b/tests/Makefile.include
@@ -XXX,XX +XXX,XX @@ check-unit-y += tests/test-visitor-serialization$(EXESUF)
  check-unit-y += tests/test-iov$(EXESUF)
  gcov-files-test-iov-y = util/iov.c
  check-unit-y += tests/test-aio$(EXESUF)
 +gcov-files-test-aio-y = util/async.c util/qemu-timer.o
 +gcov-files-test-aio-$(CONFIG_WIN32) += util/aio-win32.c
 +gcov-files-test-aio-$(CONFIG_POSIX) += util/aio-posix.c
  check-unit-y += tests/test-throttle$(EXESUF)
  gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c
  gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c
@@ -XXX,XX +XXX,XX @@ tests/check-qjson$(EXESUF): tests/check-qjson.o $(test-util-obj-y)
  tests/check-qom-interface$(EXESUF): tests/check-qom-interface.o $(test-qom-obj-y)
  tests/check-qom-proplist$(EXESUF): tests/check-qom-proplist.o $(test-qom-obj-y)
 -tests/test-char$(EXESUF): tests/test-char.o qemu-timer.o \
 -    $(test-util-obj-y) $(qtest-obj-y) $(test-block-obj-y) $(chardev-obj-y)
 +tests/test-char$(EXESUF): tests/test-char.o $(test-util-obj-y) $(qtest-obj-y) $(test-io-obj-y) $(chardev-obj-y)
  tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
  tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
  tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
@@ -XXX,XX +XXX,XX @@ tests/test-vmstate$(EXESUF): tests/test-vmstate.o \
      migration/vmstate.o migration/qemu-file.o \
          migration/qemu-file-channel.o migration/qjson.o \
      $(test-io-obj-y)
 -tests/test-timed-average$(EXESUF): tests/test-timed-average.o qemu-timer.o \
 -    $(test-util-obj-y)
 +tests/test-timed-average$(EXESUF): tests/test-timed-average.o $(test-util-obj-y)
  tests/test-base64$(EXESUF): tests/test-base64.o \
      libqemuutil.a libqemustub.a
  tests/ptimer-test$(EXESUF): tests/ptimer-test.o tests/ptimer-test-stubs.o hw/core/ptimer.o libqemustub.a
@@ -XXX,XX +XXX,XX @@ tests/usb-hcd-ehci-test$(EXESUF): tests/usb-hcd-ehci-test.o $(libqos-usb-obj-y)
  tests/usb-hcd-xhci-test$(EXESUF): tests/usb-hcd-xhci-test.o $(libqos-usb-obj-y)
  tests/pc-cpu-test$(EXESUF): tests/pc-cpu-test.o
  tests/postcopy-test$(EXESUF): tests/postcopy-test.o
 -tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-timer.o \
 +tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o $(test-util-obj-y) \
      $(qtest-obj-y) $(test-io-obj-y) $(libqos-virtio-obj-y) $(libqos-pc-obj-y) \
      $(chardev-obj-y)
  tests/qemu-iotests/socket_scm_helper$(EXESUF): tests/qemu-iotests/socket_scm_helper.o
 diff --git a/util/Makefile.objs b/util/Makefile.objs
 index XXXXXXX..XXXXXXX 100644
 --- a/util/Makefile.objs
 +++ b/util/Makefile.objs
@@ -XXX,XX +XXX,XX @@
  util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o
  util-obj-y += bufferiszero.o
  util-obj-y += lockcnt.o
 +util-obj-y += aiocb.o async.o thread-pool.o qemu-timer.o
 +util-obj-y += main-loop.o iohandler.o
 +util-obj-$(CONFIG_POSIX) += aio-posix.o
  util-obj-$(CONFIG_POSIX) += compatfd.o
  util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
  util-obj-$(CONFIG_POSIX) += mmap-alloc.o
  util-obj-$(CONFIG_POSIX) += oslib-posix.o
  util-obj-$(CONFIG_POSIX) += qemu-openpty.o
  util-obj-$(CONFIG_POSIX) += qemu-thread-posix.o
 -util-obj-$(CONFIG_WIN32) += event_notifier-win32.o
  util-obj-$(CONFIG_POSIX) += memfd.o
 +util-obj-$(CONFIG_WIN32) += aio-win32.o
 +util-obj-$(CONFIG_WIN32) += event_notifier-win32.o
  util-obj-$(CONFIG_WIN32) += oslib-win32.o
  util-obj-$(CONFIG_WIN32) += qemu-thread-win32.o
  util-obj-y += envlist.o path.o module.o
 diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/io.c
 +++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
      return &acb->common;
  }
 -void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
 -                   BlockCompletionFunc *cb, void *opaque)
 -{
 -    BlockAIOCB *acb;
 -
 -    acb = g_malloc(aiocb_info->aiocb_size);
 -    acb->aiocb_info = aiocb_info;
 -    acb->bs = bs;
 -    acb->cb = cb;
 -    acb->opaque = opaque;
 -    acb->refcnt = 1;
 -    return acb;
 -}
 -
 -void qemu_aio_ref(void *p)
 -{
 -    BlockAIOCB *acb = p;
 -    acb->refcnt++;
 -}
 -
 -void qemu_aio_unref(void *p)
 -{
 -    BlockAIOCB *acb = p;
 -    assert(acb->refcnt > 0);
 -    if (--acb->refcnt == 0) {
 -        g_free(acb);
 -    }
 -}
 -
  /**************************************************************/
  /* Coroutine block device emulation */
 diff --git a/stubs/linux-aio.c b/stubs/linux-aio.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/stubs/linux-aio.c
@@ -XXX,XX +XXX,XX @@
 +/*
 + * Linux native AIO support.
 + *
 + * Copyright (C) 2009 IBM, Corp.
 + * Copyright (C) 2009 Red Hat, Inc.
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
 + */
 +#include "qemu/osdep.h"
 +#include "block/aio.h"
 +#include "block/raw-aio.h"
 +
 +void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
 +{
 +    abort();
 +}
 +
 +void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
 +{
 +    abort();
 +}
 +
 +LinuxAioState *laio_init(void)
 +{
 +    abort();
 +}
 +
 +void laio_cleanup(LinuxAioState *s)
 +{
 +    abort();
 +}
 diff --git a/stubs/set-fd-handler.c b/stubs/set-fd-handler.c
 index XXXXXXX..XXXXXXX 100644
 --- a/stubs/set-fd-handler.c
 +++ b/stubs/set-fd-handler.c
@@ -XXX,XX +XXX,XX @@ void qemu_set_fd_handler(int fd,
  {
      abort();
  }
 -
 -void aio_set_fd_handler(AioContext *ctx,
 -                        int fd,
 -                        bool is_external,
 -                        IOHandler *io_read,
 -                        IOHandler *io_write,
 -                        AioPollFn *io_poll,
 -                        void *opaque)
 -{
 -    abort();
 -}
 diff --git a/aio-posix.c b/util/aio-posix.c
 similarity index 99%
 rename from aio-posix.c
 rename to util/aio-posix.c
 index XXXXXXX..XXXXXXX 100644
 --- a/aio-posix.c
 +++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/rcu_queue.h"
  #include "qemu/sockets.h"
  #include "qemu/cutils.h"
 -#include "trace-root.h"
 +#include "trace.h"
  #ifdef CONFIG_EPOLL_CREATE1
  #include <sys/epoll.h>
  #endif
 diff --git a/aio-win32.c b/util/aio-win32.c
 similarity index 100%
 rename from aio-win32.c
 rename to util/aio-win32.c
 diff --git a/util/aiocb.c b/util/aiocb.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/util/aiocb.c
@@ -XXX,XX +XXX,XX @@
 +/*
 + * BlockAIOCB allocation
 + *
 + * Copyright (c) 2003-2017 Fabrice Bellard and other QEMU contributors
 + *
 + * Permission is hereby granted, free of charge, to any person obtaining a copy
 + * of this software and associated documentation files (the "Software"), to deal
 + * in the Software without restriction, including without limitation the rights
 + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 + * copies of the Software, and to permit persons to whom the Software is
 + * furnished to do so, subject to the following conditions:
 + *
 + * The above copyright notice and this permission notice shall be included in
 + * all copies or substantial portions of the Software.
 + *
 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 + * THE SOFTWARE.
 + */
 +
 +#include "qemu/osdep.h"
 +#include "block/aio.h"
 +
 +void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
 +                   BlockCompletionFunc *cb, void *opaque)
 +{
 +    BlockAIOCB *acb;
 +
 +    acb = g_malloc(aiocb_info->aiocb_size);
 +    acb->aiocb_info = aiocb_info;
 +    acb->bs = bs;
 +    acb->cb = cb;
 +    acb->opaque = opaque;
 +    acb->refcnt = 1;
 +    return acb;
 +}
 +
 +void qemu_aio_ref(void *p)
 +{
 +    BlockAIOCB *acb = p;
 +    acb->refcnt++;
 +}
 +
 +void qemu_aio_unref(void *p)
 +{
 +    BlockAIOCB *acb = p;
 +    assert(acb->refcnt > 0);
 +    if (--acb->refcnt == 0) {
 +        g_free(acb);
 +    }
 +}
 diff --git a/async.c b/util/async.c
 similarity index 99%
 rename from async.c
 rename to util/async.c
 index XXXXXXX..XXXXXXX 100644
 --- a/async.c
 +++ b/util/async.c
@@ -XXX,XX +XXX,XX @@
  /*
 - * QEMU System Emulator
 + * Data plane event loop
   *
   * Copyright (c) 2003-2008 Fabrice Bellard
 + * Copyright (c) 2009-2017 QEMU contributors
   *
   * Permission is hereby granted, free of charge, to any person obtaining a copy
   * of this software and associated documentation files (the "Software"), to deal
 diff --git a/iohandler.c b/util/iohandler.c
 similarity index 100%
 rename from iohandler.c
 rename to util/iohandler.c
 diff --git a/main-loop.c b/util/main-loop.c
 similarity index 100%
 rename from main-loop.c
 rename to util/main-loop.c
 diff --git a/qemu-timer.c b/util/qemu-timer.c
 similarity index 100%
 rename from qemu-timer.c
 rename to util/qemu-timer.c
 diff --git a/thread-pool.c b/util/thread-pool.c
 similarity index 99%
 rename from thread-pool.c
 rename to util/thread-pool.c
 index XXXXXXX..XXXXXXX 100644
 --- a/thread-pool.c
 +++ b/util/thread-pool.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/queue.h"
  #include "qemu/thread.h"
  #include "qemu/coroutine.h"
 -#include "trace-root.h"
 +#include "trace.h"
  #include "block/thread-pool.h"
  #include "qemu/main-loop.h"
 diff --git a/trace-events b/trace-events
 index XXXXXXX..XXXXXXX 100644
 --- a/trace-events
 +++ b/trace-events
@@ -XXX,XX +XXX,XX @@
  #
  # The <format-string> should be a sprintf()-compatible format string.
 -# aio-posix.c
 -run_poll_handlers_begin(void *ctx, int64_t max_ns) "ctx %p max_ns %"PRId64
 -run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d"
 -poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
 -poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
 -
 -# thread-pool.c
 -thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
 -thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
 -thread_pool_cancel(void *req, void *opaque) "req %p opaque %p"
 -
  # ioport.c
  cpu_in(unsigned int addr, char size, unsigned int val) "addr %#x(%c) value %u"
  cpu_out(unsigned int addr, char size, unsigned int val) "addr %#x(%c) value %u"
 diff --git a/util/trace-events b/util/trace-events
 index XXXXXXX..XXXXXXX 100644
 --- a/util/trace-events
 +++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@
  # See docs/tracing.txt for syntax documentation.
 +# util/aio-posix.c
 +run_poll_handlers_begin(void *ctx, int64_t max_ns) "ctx %p max_ns %"PRId64
 +run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d"
 +poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
 +poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
 +
 +# util/thread-pool.c
 +thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
 +thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
 +thread_pool_cancel(void *req, void *opaque) "req %p opaque %p"
 +
  # util/buffer.c
  buffer_resize(const char *buf, size_t olen, size_t len) "%s: old %zd, new %zd"
  buffer_move_empty(const char *buf, size_t len, const char *from) "%s: %zd bytes from %s"
 --
 .9.3

-New patch
+[Qemu-devel] [PULL v2 02/24] aio: introduce aio_co_schedule and aio_co_wake
+From: Paolo Bonzini <pbonzini@redhat.com>
 aio_co_wake provides the infrastructure to start a coroutine on a "home"
 AioContext.  It will be used by CoMutex and CoQueue, so that coroutines
 don't jump from one context to another when they go to sleep on a
 mutex or waitqueue.  However, it can also be used as a more efficient
 alternative to one-shot bottom halves, and saves the effort of tracking
 which AioContext a coroutine is running on.
 aio_co_schedule is the part of aio_co_wake that starts a coroutine
 on a remote AioContext, but it is also useful to implement e.g.
 bdrv_set_aio_context callbacks.
 The implementation of aio_co_schedule is based on a lock-free
 multiple-producer, single-consumer queue.  The multiple producers use
 cmpxchg to add to a LIFO stack.  The consumer (a per-AioContext bottom
 half) grabs all items added so far, inverts the list to make it FIFO,
 and goes through it one item at a time until it's empty.  The data
 structure was inspired by OSv, which uses it in the very code we'll
 "port" to QEMU for the thread-safe CoMutex.
 Most of the new code is really tests.
 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Message-id: 20170213135235.12274-3-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  tests/Makefile.include       |   8 +-
  include/block/aio.h          |  32 +++++++
  include/qemu/coroutine_int.h |  11 ++-
  tests/iothread.h             |  25 +++++
  tests/iothread.c             |  91 ++++++++++++++++++
  tests/test-aio-multithread.c | 213 +++++++++++++++++++++++++++++++++++++++++++
  util/async.c                 |  65 +++++++++++++
  util/qemu-coroutine.c        |   8 ++
  util/trace-events            |   4 +
 files changed, 453 insertions(+), 4 deletions(-)
  create mode 100644 tests/iothread.h
  create mode 100644 tests/iothread.c
  create mode 100644 tests/test-aio-multithread.c
 diff --git a/tests/Makefile.include b/tests/Makefile.include
 index XXXXXXX..XXXXXXX 100644
 --- a/tests/Makefile.include
 +++ b/tests/Makefile.include
@@ -XXX,XX +XXX,XX @@ check-unit-y += tests/test-aio$(EXESUF)
  gcov-files-test-aio-y = util/async.c util/qemu-timer.o
  gcov-files-test-aio-$(CONFIG_WIN32) += util/aio-win32.c
  gcov-files-test-aio-$(CONFIG_POSIX) += util/aio-posix.c
 +check-unit-y += tests/test-aio-multithread$(EXESUF)
 +gcov-files-test-aio-multithread-y = $(gcov-files-test-aio-y)
 +gcov-files-test-aio-multithread-y += util/qemu-coroutine.c tests/iothread.c
  check-unit-y += tests/test-throttle$(EXESUF)
 -gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c
 -gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c
  check-unit-y += tests/test-thread-pool$(EXESUF)
  gcov-files-test-thread-pool-y = thread-pool.c
  gcov-files-test-hbitmap-y = util/hbitmap.c
@@ -XXX,XX +XXX,XX @@ test-qapi-obj-y = tests/test-qapi-visit.o tests/test-qapi-types.o \
      $(test-qom-obj-y)
  test-crypto-obj-y = $(crypto-obj-y) $(test-qom-obj-y)
  test-io-obj-y = $(io-obj-y) $(test-crypto-obj-y)
 -test-block-obj-y = $(block-obj-y) $(test-io-obj-y)
 +test-block-obj-y = $(block-obj-y) $(test-io-obj-y) tests/iothread.o
  tests/check-qint$(EXESUF): tests/check-qint.o $(test-util-obj-y)
  tests/check-qstring$(EXESUF): tests/check-qstring.o $(test-util-obj-y)
@@ -XXX,XX +XXX,XX @@ tests/check-qom-proplist$(EXESUF): tests/check-qom-proplist.o $(test-qom-obj-y)
  tests/test-char$(EXESUF): tests/test-char.o $(test-util-obj-y) $(qtest-obj-y) $(test-io-obj-y) $(chardev-obj-y)
  tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
  tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
 +tests/test-aio-multithread$(EXESUF): tests/test-aio-multithread.o $(test-block-obj-y)
  tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
  tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y)
  tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y)
 diff --git a/include/block/aio.h b/include/block/aio.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/block/aio.h
 +++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ typedef void QEMUBHFunc(void *opaque);
  typedef bool AioPollFn(void *opaque);
  typedef void IOHandler(void *opaque);
 +struct Coroutine;
  struct ThreadPool;
  struct LinuxAioState;
@@ -XXX,XX +XXX,XX @@ struct AioContext {
      bool notified;
      EventNotifier notifier;
 +    QSLIST_HEAD(, Coroutine) scheduled_coroutines;
 +    QEMUBH *co_schedule_bh;
 +
      /* Thread pool for performing work and receiving completion callbacks.
       * Has its own locking.
       */
@@ -XXX,XX +XXX,XX @@ static inline bool aio_node_check(AioContext *ctx, bool is_external)
  }
  /**
 + * aio_co_schedule:
 + * @ctx: the aio context
 + * @co: the coroutine
 + *
 + * Start a coroutine on a remote AioContext.
 + *
 + * The coroutine must not be entered by anyone else while aio_co_schedule()
 + * is active.  In addition the coroutine must have yielded unless ctx
 + * is the context in which the coroutine is running (i.e. the value of
 + * qemu_get_current_aio_context() from the coroutine itself).
 + */
 +void aio_co_schedule(AioContext *ctx, struct Coroutine *co);
 +
 +/**
 + * aio_co_wake:
 + * @co: the coroutine
 + *
 + * Restart a coroutine on the AioContext where it was running last, thus
 + * preventing coroutines from jumping from one context to another when they
 + * go to sleep.
 + *
 + * aio_co_wake may be executed either in coroutine or non-coroutine
 + * context.  The coroutine must not be entered by anyone else while
 + * aio_co_wake() is active.
 + */
 +void aio_co_wake(struct Coroutine *co);
 +
 +/**
   * Return the AioContext whose event loop runs in the current thread.
   *
   * If called from an IOThread this will be the IOThread's AioContext.  If
 diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/coroutine_int.h
 +++ b/include/qemu/coroutine_int.h
@@ -XXX,XX +XXX,XX @@ struct Coroutine {
      CoroutineEntry *entry;
      void *entry_arg;
      Coroutine *caller;
 +
 +    /* Only used when the coroutine has terminated.  */
      QSLIST_ENTRY(Coroutine) pool_next;
 +
      size_t locks_held;
 -    /* Coroutines that should be woken up when we yield or terminate */
 +    /* Coroutines that should be woken up when we yield or terminate.
 +     * Only used when the coroutine is running.
 +     */
      QSIMPLEQ_HEAD(, Coroutine) co_queue_wakeup;
 +
 +    /* Only used when the coroutine has yielded.  */
 +    AioContext *ctx;
      QSIMPLEQ_ENTRY(Coroutine) co_queue_next;
 +    QSLIST_ENTRY(Coroutine) co_scheduled_next;
  };
  Coroutine *qemu_coroutine_new(void);
 diff --git a/tests/iothread.h b/tests/iothread.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tests/iothread.h
@@ -XXX,XX +XXX,XX @@
 +/*
 + * Event loop thread implementation for unit tests
 + *
 + * Copyright Red Hat Inc., 2013, 2016
 + *
 + * Authors:
 + *  Stefan Hajnoczi   <stefanha@redhat.com>
 + *  Paolo Bonzini     <pbonzini@redhat.com>
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
 + */
 +#ifndef TEST_IOTHREAD_H
 +#define TEST_IOTHREAD_H
 +
 +#include "block/aio.h"
 +#include "qemu/thread.h"
 +
 +typedef struct IOThread IOThread;
 +
 +IOThread *iothread_new(void);
 +void iothread_join(IOThread *iothread);
 +AioContext *iothread_get_aio_context(IOThread *iothread);
 +
 +#endif
 diff --git a/tests/iothread.c b/tests/iothread.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tests/iothread.c
@@ -XXX,XX +XXX,XX @@
 +/*
 + * Event loop thread implementation for unit tests
 + *
 + * Copyright Red Hat Inc., 2013, 2016
 + *
 + * Authors:
 + *  Stefan Hajnoczi   <stefanha@redhat.com>
 + *  Paolo Bonzini     <pbonzini@redhat.com>
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
 + *
 + */
 +
 +#include "qemu/osdep.h"
 +#include "qapi/error.h"
 +#include "block/aio.h"
 +#include "qemu/main-loop.h"
 +#include "qemu/rcu.h"
 +#include "iothread.h"
 +
 +struct IOThread {
 +    AioContext *ctx;
 +
 +    QemuThread thread;
 +    QemuMutex init_done_lock;
 +    QemuCond init_done_cond;    /* is thread initialization done? */
 +    bool stopping;
 +};
 +
 +static __thread IOThread *my_iothread;
 +
 +AioContext *qemu_get_current_aio_context(void)
 +{
 +    return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
 +}
 +
 +static void *iothread_run(void *opaque)
 +{
 +    IOThread *iothread = opaque;
 +
 +    rcu_register_thread();
 +
 +    my_iothread = iothread;
 +    qemu_mutex_lock(&iothread->init_done_lock);
 +    iothread->ctx = aio_context_new(&error_abort);
 +    qemu_cond_signal(&iothread->init_done_cond);
 +    qemu_mutex_unlock(&iothread->init_done_lock);
 +
 +    while (!atomic_read(&iothread->stopping)) {
 +        aio_poll(iothread->ctx, true);
 +    }
 +
 +    rcu_unregister_thread();
 +    return NULL;
 +}
 +
 +void iothread_join(IOThread *iothread)
 +{
 +    iothread->stopping = true;
 +    aio_notify(iothread->ctx);
 +    qemu_thread_join(&iothread->thread);
 +    qemu_cond_destroy(&iothread->init_done_cond);
 +    qemu_mutex_destroy(&iothread->init_done_lock);
 +    aio_context_unref(iothread->ctx);
 +    g_free(iothread);
 +}
 +
 +IOThread *iothread_new(void)
 +{
 +    IOThread *iothread = g_new0(IOThread, 1);
 +
 +    qemu_mutex_init(&iothread->init_done_lock);
 +    qemu_cond_init(&iothread->init_done_cond);
 +    qemu_thread_create(&iothread->thread, NULL, iothread_run,
 +                       iothread, QEMU_THREAD_JOINABLE);
 +
 +    /* Wait for initialization to complete */
 +    qemu_mutex_lock(&iothread->init_done_lock);
 +    while (iothread->ctx == NULL) {
 +        qemu_cond_wait(&iothread->init_done_cond,
 +                       &iothread->init_done_lock);
 +    }
 +    qemu_mutex_unlock(&iothread->init_done_lock);
 +    return iothread;
 +}
 +
 +AioContext *iothread_get_aio_context(IOThread *iothread)
 +{
 +    return iothread->ctx;
 +}
 diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tests/test-aio-multithread.c
@@ -XXX,XX +XXX,XX @@
 +/*
 + * AioContext multithreading tests
 + *
 + * Copyright Red Hat, Inc. 2016
 + *
 + * Authors:
 + *  Paolo Bonzini    <pbonzini@redhat.com>
 + *
 + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 + * See the COPYING.LIB file in the top-level directory.
 + */
 +
 +#include "qemu/osdep.h"
 +#include <glib.h>
 +#include "block/aio.h"
 +#include "qapi/error.h"
 +#include "qemu/coroutine.h"
 +#include "qemu/thread.h"
 +#include "qemu/error-report.h"
 +#include "iothread.h"
 +
 +/* AioContext management */
 +
 +#define NUM_CONTEXTS 5
 +
 +static IOThread *threads[NUM_CONTEXTS];
 +static AioContext *ctx[NUM_CONTEXTS];
 +static __thread int id = -1;
 +
 +static QemuEvent done_event;
 +
 +/* Run a function synchronously on a remote iothread. */
 +
 +typedef struct CtxRunData {
 +    QEMUBHFunc *cb;
 +    void *arg;
 +} CtxRunData;
 +
 +static void ctx_run_bh_cb(void *opaque)
 +{
 +    CtxRunData *data = opaque;
 +
 +    data->cb(data->arg);
 +    qemu_event_set(&done_event);
 +}
 +
 +static void ctx_run(int i, QEMUBHFunc *cb, void *opaque)
 +{
 +    CtxRunData data = {
 +        .cb = cb,
 +        .arg = opaque
 +    };
 +
 +    qemu_event_reset(&done_event);
 +    aio_bh_schedule_oneshot(ctx[i], ctx_run_bh_cb, &data);
 +    qemu_event_wait(&done_event);
 +}
 +
 +/* Starting the iothreads. */
 +
 +static void set_id_cb(void *opaque)
 +{
 +    int *i = opaque;
 +
 +    id = *i;
 +}
 +
 +static void create_aio_contexts(void)
 +{
 +    int i;
 +
 +    for (i = 0; i < NUM_CONTEXTS; i++) {
 +        threads[i] = iothread_new();
 +        ctx[i] = iothread_get_aio_context(threads[i]);
 +    }
 +
 +    qemu_event_init(&done_event, false);
 +    for (i = 0; i < NUM_CONTEXTS; i++) {
 +        ctx_run(i, set_id_cb, &i);
 +    }
 +}
 +
 +/* Stopping the iothreads. */
 +
 +static void join_aio_contexts(void)
 +{
 +    int i;
 +
 +    for (i = 0; i < NUM_CONTEXTS; i++) {
 +        aio_context_ref(ctx[i]);
 +    }
 +    for (i = 0; i < NUM_CONTEXTS; i++) {
 +        iothread_join(threads[i]);
 +    }
 +    for (i = 0; i < NUM_CONTEXTS; i++) {
 +        aio_context_unref(ctx[i]);
 +    }
 +    qemu_event_destroy(&done_event);
 +}
 +
 +/* Basic test for the stuff above. */
 +
 +static void test_lifecycle(void)
 +{
 +    create_aio_contexts();
 +    join_aio_contexts();
 +}
 +
 +/* aio_co_schedule test.  */
 +
 +static Coroutine *to_schedule[NUM_CONTEXTS];
 +
 +static bool now_stopping;
 +
 +static int count_retry;
 +static int count_here;
 +static int count_other;
 +
 +static bool schedule_next(int n)
 +{
 +    Coroutine *co;
 +
 +    co = atomic_xchg(&to_schedule[n], NULL);
 +    if (!co) {
 +        atomic_inc(&count_retry);
 +        return false;
 +    }
 +
 +    if (n == id) {
 +        atomic_inc(&count_here);
 +    } else {
 +        atomic_inc(&count_other);
 +    }
 +
 +    aio_co_schedule(ctx[n], co);
 +    return true;
 +}
 +
 +static void finish_cb(void *opaque)
 +{
 +    schedule_next(id);
 +}
 +
 +static coroutine_fn void test_multi_co_schedule_entry(void *opaque)
 +{
 +    g_assert(to_schedule[id] == NULL);
 +    atomic_mb_set(&to_schedule[id], qemu_coroutine_self());
 +
 +    while (!atomic_mb_read(&now_stopping)) {
 +        int n;
 +
 +        n = g_test_rand_int_range(0, NUM_CONTEXTS);
 +        schedule_next(n);
 +        qemu_coroutine_yield();
 +
 +        g_assert(to_schedule[id] == NULL);
 +        atomic_mb_set(&to_schedule[id], qemu_coroutine_self());
 +    }
 +}
 +
 +
 +static void test_multi_co_schedule(int seconds)
 +{
 +    int i;
 +
 +    count_here = count_other = count_retry = 0;
 +    now_stopping = false;
 +
 +    create_aio_contexts();
 +    for (i = 0; i < NUM_CONTEXTS; i++) {
 +        Coroutine *co1 = qemu_coroutine_create(test_multi_co_schedule_entry, NULL);
 +        aio_co_schedule(ctx[i], co1);
 +    }
 +
 +    g_usleep(seconds * 1000000);
 +
 +    atomic_mb_set(&now_stopping, true);
 +    for (i = 0; i < NUM_CONTEXTS; i++) {
 +        ctx_run(i, finish_cb, NULL);
 +        to_schedule[i] = NULL;
 +    }
 +
 +    join_aio_contexts();
 +    g_test_message("scheduled %d, queued %d, retry %d, total %d\n",
 +                  count_other, count_here, count_retry,
 +                  count_here + count_other + count_retry);
 +}
 +
 +static void test_multi_co_schedule_1(void)
 +{
 +    test_multi_co_schedule(1);
 +}
 +
 +static void test_multi_co_schedule_10(void)
 +{
 +    test_multi_co_schedule(10);
 +}
 +
 +/* End of tests.  */
 +
 +int main(int argc, char **argv)
 +{
 +    init_clocks();
 +
 +    g_test_init(&argc, &argv, NULL);
 +    g_test_add_func("/aio/multi/lifecycle", test_lifecycle);
 +    if (g_test_quick()) {
 +        g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1);
 +    } else {
 +        g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10);
 +    }
 +    return g_test_run();
 +}
 diff --git a/util/async.c b/util/async.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/async.c
 +++ b/util/async.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/main-loop.h"
  #include "qemu/atomic.h"
  #include "block/raw-aio.h"
 +#include "qemu/coroutine_int.h"
 +#include "trace.h"
  /***********************************************************/
  /* bottom halves (can be seen as timers which expire ASAP) */
@@ -XXX,XX +XXX,XX @@ aio_ctx_finalize(GSource     *source)
      }
  #endif
 +    assert(QSLIST_EMPTY(&ctx->scheduled_coroutines));
 +    qemu_bh_delete(ctx->co_schedule_bh);
 +
      qemu_lockcnt_lock(&ctx->list_lock);
      assert(!qemu_lockcnt_count(&ctx->list_lock));
      while (ctx->first_bh) {
@@ -XXX,XX +XXX,XX @@ static bool event_notifier_poll(void *opaque)
      return atomic_read(&ctx->notified);
  }
 +static void co_schedule_bh_cb(void *opaque)
 +{
 +    AioContext *ctx = opaque;
 +    QSLIST_HEAD(, Coroutine) straight, reversed;
 +
 +    QSLIST_MOVE_ATOMIC(&reversed, &ctx->scheduled_coroutines);
 +    QSLIST_INIT(&straight);
 +
 +    while (!QSLIST_EMPTY(&reversed)) {
 +        Coroutine *co = QSLIST_FIRST(&reversed);
 +        QSLIST_REMOVE_HEAD(&reversed, co_scheduled_next);
 +        QSLIST_INSERT_HEAD(&straight, co, co_scheduled_next);
 +    }
 +
 +    while (!QSLIST_EMPTY(&straight)) {
 +        Coroutine *co = QSLIST_FIRST(&straight);
 +        QSLIST_REMOVE_HEAD(&straight, co_scheduled_next);
 +        trace_aio_co_schedule_bh_cb(ctx, co);
 +        qemu_coroutine_enter(co);
 +    }
 +}
 +
  AioContext *aio_context_new(Error **errp)
  {
      int ret;
@@ -XXX,XX +XXX,XX @@ AioContext *aio_context_new(Error **errp)
      }
      g_source_set_can_recurse(&ctx->source, true);
      qemu_lockcnt_init(&ctx->list_lock);
 +
 +    ctx->co_schedule_bh = aio_bh_new(ctx, co_schedule_bh_cb, ctx);
 +    QSLIST_INIT(&ctx->scheduled_coroutines);
 +
      aio_set_event_notifier(ctx, &ctx->notifier,
                             false,
                             (EventNotifierHandler *)
@@ -XXX,XX +XXX,XX @@ fail:
      return NULL;
  }
 +void aio_co_schedule(AioContext *ctx, Coroutine *co)
 +{
 +    trace_aio_co_schedule(ctx, co);
 +    QSLIST_INSERT_HEAD_ATOMIC(&ctx->scheduled_coroutines,
 +                              co, co_scheduled_next);
 +    qemu_bh_schedule(ctx->co_schedule_bh);
 +}
 +
 +void aio_co_wake(struct Coroutine *co)
 +{
 +    AioContext *ctx;
 +
 +    /* Read coroutine before co->ctx.  Matches smp_wmb in
 +     * qemu_coroutine_enter.
 +     */
 +    smp_read_barrier_depends();
 +    ctx = atomic_read(&co->ctx);
 +
 +    if (ctx != qemu_get_current_aio_context()) {
 +        aio_co_schedule(ctx, co);
 +        return;
 +    }
 +
 +    if (qemu_in_coroutine()) {
 +        Coroutine *self = qemu_coroutine_self();
 +        assert(self != co);
 +        QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, co, co_queue_next);
 +    } else {
 +        aio_context_acquire(ctx);
 +        qemu_coroutine_enter(co);
 +        aio_context_release(ctx);
 +    }
 +}
 +
  void aio_context_ref(AioContext *ctx)
  {
      g_source_ref(&ctx->source);
 diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/qemu-coroutine.c
 +++ b/util/qemu-coroutine.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/atomic.h"
  #include "qemu/coroutine.h"
  #include "qemu/coroutine_int.h"
 +#include "block/aio.h"
  enum {
      POOL_BATCH_SIZE = 64,
@@ -XXX,XX +XXX,XX @@ void qemu_coroutine_enter(Coroutine *co)
      }
      co->caller = self;
 +    co->ctx = qemu_get_current_aio_context();
 +
 +    /* Store co->ctx before anything that stores co.  Matches
 +     * barrier in aio_co_wake.
 +     */
 +    smp_wmb();
 +
      ret = qemu_coroutine_switch(self, co, COROUTINE_ENTER);
      qemu_co_queue_run_restart(co);
 diff --git a/util/trace-events b/util/trace-events
 index XXXXXXX..XXXXXXX 100644
 --- a/util/trace-events
 +++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d"
  poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
  poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
 +# util/async.c
 +aio_co_schedule(void *ctx, void *co) "ctx %p co %p"
 +aio_co_schedule_bh_cb(void *ctx, void *co) "ctx %p co %p"
 +
  # util/thread-pool.c
  thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
  thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
 --
 .9.3

-New patch
+[Qemu-devel] [PULL v2 03/24] block-backend: allow blk_prw from coroutine context
+From: Paolo Bonzini <pbonzini@redhat.com>
+qcow2_create2 calls this.  Do not run a nested event loop, as that
+breaks when aio_co_wake tries to queue the coroutine on the co_queue_wakeup
+list of the currently running one.
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
+Message-id: 20170213135235.12274-4-pbonzini@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ block/block-backend.c | 12 ++++++++----
+file changed, 8 insertions(+), 4 deletions(-)
+diff --git a/block/block-backend.c b/block/block-backend.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/block-backend.c
++++ b/block/block-backend.c
+@@ -XXX,XX +XXX,XX @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
+ {
+     QEMUIOVector qiov;
+     struct iovec iov;
+-    Coroutine *co;
+     BlkRwCo rwco;
+     iov = (struct iovec) {
+@@ -XXX,XX +XXX,XX @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
+         .ret    = NOT_DONE,
+     };
+-    co = qemu_coroutine_create(co_entry, &rwco);
+-    qemu_coroutine_enter(co);
+-    BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
++    if (qemu_in_coroutine()) {
++        /* Fast-path if already in coroutine context */
++        co_entry(&rwco);
++    } else {
++        Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
++        qemu_coroutine_enter(co);
++        BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
++    }
+     return rwco.ret;
+ }
+--
+.9.3

-New patch
+[Qemu-devel] [PULL v2 04/24] test-thread-pool: use generic AioContext infrastructure
+From: Paolo Bonzini <pbonzini@redhat.com>
+Once the thread pool starts using aio_co_wake, it will also need
+qemu_get_current_aio_context().  Make test-thread-pool create
+an AioContext with qemu_init_main_loop, so that stubs/iothread.c
+and tests/iothread.c can provide the rest.
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
+Message-id: 20170213135235.12274-5-pbonzini@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ tests/test-thread-pool.c | 12 +++---------
+file changed, 3 insertions(+), 9 deletions(-)
+diff --git a/tests/test-thread-pool.c b/tests/test-thread-pool.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tests/test-thread-pool.c
++++ b/tests/test-thread-pool.c
+@@ -XXX,XX +XXX,XX @@
+ #include "qapi/error.h"
+ #include "qemu/timer.h"
+ #include "qemu/error-report.h"
++#include "qemu/main-loop.h"
+ static AioContext *ctx;
+ static ThreadPool *pool;
+@@ -XXX,XX +XXX,XX @@ static void test_cancel_async(void)
+ int main(int argc, char **argv)
+ {
+     int ret;
+-    Error *local_error = NULL;
+-    init_clocks();
+-
+-    ctx = aio_context_new(&local_error);
+-    if (!ctx) {
+-        error_reportf_err(local_error, "Failed to create AIO Context: ");
+-        exit(1);
+-    }
++    qemu_init_main_loop(&error_abort);
++    ctx = qemu_get_current_aio_context();
+     pool = aio_get_thread_pool(ctx);
+     g_test_init(&argc, &argv, NULL);
+@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
+     ret = g_test_run();
+-    aio_context_unref(ctx);
+     return ret;
+ }
+--
+.9.3

-New patch
+[Qemu-devel] [PULL v2 05/24] io: add methods to set I/O handlers on AioContext
+From: Paolo Bonzini <pbonzini@redhat.com>
 This is in preparation for making qio_channel_yield work on
 AioContexts other than the main one.
 Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Message-id: 20170213135235.12274-6-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  include/io/channel.h | 25 +++++++++++++++++++++++++
  io/channel-command.c | 13 +++++++++++++
  io/channel-file.c    | 11 +++++++++++
  io/channel-socket.c  | 16 +++++++++++-----
  io/channel-tls.c     | 12 ++++++++++++
  io/channel-watch.c   |  6 ++++++
  io/channel.c         | 11 +++++++++++
 files changed, 89 insertions(+), 5 deletions(-)
 diff --git a/include/io/channel.h b/include/io/channel.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/io/channel.h
 +++ b/include/io/channel.h
@@ -XXX,XX +XXX,XX @@
  #include "qemu-common.h"
  #include "qom/object.h"
 +#include "block/aio.h"
  #define TYPE_QIO_CHANNEL "qio-channel"
  #define QIO_CHANNEL(obj)                                    \
@@ -XXX,XX +XXX,XX @@ struct QIOChannelClass {
                       off_t offset,
                       int whence,
                       Error **errp);
 +    void (*io_set_aio_fd_handler)(QIOChannel *ioc,
 +                                  AioContext *ctx,
 +                                  IOHandler *io_read,
 +                                  IOHandler *io_write,
 +                                  void *opaque);
  };
  /* General I/O handling functions */
@@ -XXX,XX +XXX,XX @@ void qio_channel_yield(QIOChannel *ioc,
  void qio_channel_wait(QIOChannel *ioc,
                        GIOCondition condition);
 +/**
 + * qio_channel_set_aio_fd_handler:
 + * @ioc: the channel object
 + * @ctx: the AioContext to set the handlers on
 + * @io_read: the read handler
 + * @io_write: the write handler
 + * @opaque: the opaque value passed to the handler
 + *
 + * This is used internally by qio_channel_yield().  It can
 + * be used by channel implementations to forward the handlers
 + * to another channel (e.g. from #QIOChannelTLS to the
 + * underlying socket).
 + */
 +void qio_channel_set_aio_fd_handler(QIOChannel *ioc,
 +                                    AioContext *ctx,
 +                                    IOHandler *io_read,
 +                                    IOHandler *io_write,
 +                                    void *opaque);
 +
  #endif /* QIO_CHANNEL_H */
 diff --git a/io/channel-command.c b/io/channel-command.c
 index XXXXXXX..XXXXXXX 100644
 --- a/io/channel-command.c
 +++ b/io/channel-command.c
@@ -XXX,XX +XXX,XX @@ static int qio_channel_command_close(QIOChannel *ioc,
  }
 +static void qio_channel_command_set_aio_fd_handler(QIOChannel *ioc,
 +                                                   AioContext *ctx,
 +                                                   IOHandler *io_read,
 +                                                   IOHandler *io_write,
 +                                                   void *opaque)
 +{
 +    QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc);
 +    aio_set_fd_handler(ctx, cioc->readfd, false, io_read, NULL, NULL, opaque);
 +    aio_set_fd_handler(ctx, cioc->writefd, false, NULL, io_write, NULL, opaque);
 +}
 +
 +
  static GSource *qio_channel_command_create_watch(QIOChannel *ioc,
                                                   GIOCondition condition)
  {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_command_class_init(ObjectClass *klass,
      ioc_klass->io_set_blocking = qio_channel_command_set_blocking;
      ioc_klass->io_close = qio_channel_command_close;
      ioc_klass->io_create_watch = qio_channel_command_create_watch;
 +    ioc_klass->io_set_aio_fd_handler = qio_channel_command_set_aio_fd_handler;
  }
  static const TypeInfo qio_channel_command_info = {
 diff --git a/io/channel-file.c b/io/channel-file.c
 index XXXXXXX..XXXXXXX 100644
 --- a/io/channel-file.c
 +++ b/io/channel-file.c
@@ -XXX,XX +XXX,XX @@ static int qio_channel_file_close(QIOChannel *ioc,
  }
 +static void qio_channel_file_set_aio_fd_handler(QIOChannel *ioc,
 +                                                AioContext *ctx,
 +                                                IOHandler *io_read,
 +                                                IOHandler *io_write,
 +                                                void *opaque)
 +{
 +    QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc);
 +    aio_set_fd_handler(ctx, fioc->fd, false, io_read, io_write, NULL, opaque);
 +}
 +
  static GSource *qio_channel_file_create_watch(QIOChannel *ioc,
                                                GIOCondition condition)
  {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_file_class_init(ObjectClass *klass,
      ioc_klass->io_seek = qio_channel_file_seek;
      ioc_klass->io_close = qio_channel_file_close;
      ioc_klass->io_create_watch = qio_channel_file_create_watch;
 +    ioc_klass->io_set_aio_fd_handler = qio_channel_file_set_aio_fd_handler;
  }
  static const TypeInfo qio_channel_file_info = {
 diff --git a/io/channel-socket.c b/io/channel-socket.c
 index XXXXXXX..XXXXXXX 100644
 --- a/io/channel-socket.c
 +++ b/io/channel-socket.c
@@ -XXX,XX +XXX,XX @@ qio_channel_socket_set_blocking(QIOChannel *ioc,
          qemu_set_block(sioc->fd);
      } else {
          qemu_set_nonblock(sioc->fd);
 -#ifdef WIN32
 -        WSAEventSelect(sioc->fd, ioc->event,
 -                       FD_READ | FD_ACCEPT | FD_CLOSE |
 -                       FD_CONNECT | FD_WRITE | FD_OOB);
 -#endif
      }
      return 0;
  }
@@ -XXX,XX +XXX,XX @@ qio_channel_socket_shutdown(QIOChannel *ioc,
      return 0;
  }
 +static void qio_channel_socket_set_aio_fd_handler(QIOChannel *ioc,
 +                                                  AioContext *ctx,
 +                                                  IOHandler *io_read,
 +                                                  IOHandler *io_write,
 +                                                  void *opaque)
 +{
 +    QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
 +    aio_set_fd_handler(ctx, sioc->fd, false, io_read, io_write, NULL, opaque);
 +}
 +
  static GSource *qio_channel_socket_create_watch(QIOChannel *ioc,
                                                  GIOCondition condition)
  {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_socket_class_init(ObjectClass *klass,
      ioc_klass->io_set_cork = qio_channel_socket_set_cork;
      ioc_klass->io_set_delay = qio_channel_socket_set_delay;
      ioc_klass->io_create_watch = qio_channel_socket_create_watch;
 +    ioc_klass->io_set_aio_fd_handler = qio_channel_socket_set_aio_fd_handler;
  }
  static const TypeInfo qio_channel_socket_info = {
 diff --git a/io/channel-tls.c b/io/channel-tls.c
 index XXXXXXX..XXXXXXX 100644
 --- a/io/channel-tls.c
 +++ b/io/channel-tls.c
@@ -XXX,XX +XXX,XX @@ static int qio_channel_tls_close(QIOChannel *ioc,
      return qio_channel_close(tioc->master, errp);
  }
 +static void qio_channel_tls_set_aio_fd_handler(QIOChannel *ioc,
 +                                               AioContext *ctx,
 +                                               IOHandler *io_read,
 +                                               IOHandler *io_write,
 +                                               void *opaque)
 +{
 +    QIOChannelTLS *tioc = QIO_CHANNEL_TLS(ioc);
 +
 +    qio_channel_set_aio_fd_handler(tioc->master, ctx, io_read, io_write, opaque);
 +}
 +
  static GSource *qio_channel_tls_create_watch(QIOChannel *ioc,
                                               GIOCondition condition)
  {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_tls_class_init(ObjectClass *klass,
      ioc_klass->io_close = qio_channel_tls_close;
      ioc_klass->io_shutdown = qio_channel_tls_shutdown;
      ioc_klass->io_create_watch = qio_channel_tls_create_watch;
 +    ioc_klass->io_set_aio_fd_handler = qio_channel_tls_set_aio_fd_handler;
  }
  static const TypeInfo qio_channel_tls_info = {
 diff --git a/io/channel-watch.c b/io/channel-watch.c
 index XXXXXXX..XXXXXXX 100644
 --- a/io/channel-watch.c
 +++ b/io/channel-watch.c
@@ -XXX,XX +XXX,XX @@ GSource *qio_channel_create_socket_watch(QIOChannel *ioc,
      GSource *source;
      QIOChannelSocketSource *ssource;
 +#ifdef WIN32
 +    WSAEventSelect(socket, ioc->event,
 +                   FD_READ | FD_ACCEPT | FD_CLOSE |
 +                   FD_CONNECT | FD_WRITE | FD_OOB);
 +#endif
 +
      source = g_source_new(&qio_channel_socket_source_funcs,
                            sizeof(QIOChannelSocketSource));
      ssource = (QIOChannelSocketSource *)source;
 diff --git a/io/channel.c b/io/channel.c
 index XXXXXXX..XXXXXXX 100644
 --- a/io/channel.c
 +++ b/io/channel.c
@@ -XXX,XX +XXX,XX @@ GSource *qio_channel_create_watch(QIOChannel *ioc,
  }
 +void qio_channel_set_aio_fd_handler(QIOChannel *ioc,
 +                                    AioContext *ctx,
 +                                    IOHandler *io_read,
 +                                    IOHandler *io_write,
 +                                    void *opaque)
 +{
 +    QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
 +
 +    klass->io_set_aio_fd_handler(ioc, ctx, io_read, io_write, opaque);
 +}
 +
  guint qio_channel_add_watch(QIOChannel *ioc,
                              GIOCondition condition,
                              QIOChannelFunc func,
 --
 .9.3

-New patch
+[Qemu-devel] [PULL v2 06/24] io: make qio_channel_yield aware of AioContexts
+From: Paolo Bonzini <pbonzini@redhat.com>
 Support separate coroutines for reading and writing, and place the
 read/write handlers on the AioContext that the QIOChannel is registered
 with.
 Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Message-id: 20170213135235.12274-7-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  include/io/channel.h | 47 ++++++++++++++++++++++++++--
  io/channel.c         | 86 +++++++++++++++++++++++++++++++++++++++-------------
 files changed, 109 insertions(+), 24 deletions(-)
 diff --git a/include/io/channel.h b/include/io/channel.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/io/channel.h
 +++ b/include/io/channel.h
@@ -XXX,XX +XXX,XX @@
  #include "qemu-common.h"
  #include "qom/object.h"
 +#include "qemu/coroutine.h"
  #include "block/aio.h"
  #define TYPE_QIO_CHANNEL "qio-channel"
@@ -XXX,XX +XXX,XX @@ struct QIOChannel {
      Object parent;
      unsigned int features; /* bitmask of QIOChannelFeatures */
      char *name;
 +    AioContext *ctx;
 +    Coroutine *read_coroutine;
 +    Coroutine *write_coroutine;
  #ifdef _WIN32
      HANDLE event; /* For use with GSource on Win32 */
  #endif
@@ -XXX,XX +XXX,XX @@ guint qio_channel_add_watch(QIOChannel *ioc,
  /**
 + * qio_channel_attach_aio_context:
 + * @ioc: the channel object
 + * @ctx: the #AioContext to set the handlers on
 + *
 + * Request that qio_channel_yield() sets I/O handlers on
 + * the given #AioContext.  If @ctx is %NULL, qio_channel_yield()
 + * uses QEMU's main thread event loop.
 + *
 + * You can move a #QIOChannel from one #AioContext to another even if
 + * I/O handlers are set for a coroutine.  However, #QIOChannel provides
 + * no synchronization between the calls to qio_channel_yield() and
 + * qio_channel_attach_aio_context().
 + *
 + * Therefore you should first call qio_channel_detach_aio_context()
 + * to ensure that the coroutine is not entered concurrently.  Then,
 + * while the coroutine has yielded, call qio_channel_attach_aio_context(),
 + * and then aio_co_schedule() to place the coroutine on the new
 + * #AioContext.  The calls to qio_channel_detach_aio_context()
 + * and qio_channel_attach_aio_context() should be protected with
 + * aio_context_acquire() and aio_context_release().
 + */
 +void qio_channel_attach_aio_context(QIOChannel *ioc,
 +                                    AioContext *ctx);
 +
 +/**
 + * qio_channel_detach_aio_context:
 + * @ioc: the channel object
 + *
 + * Disable any I/O handlers set by qio_channel_yield().  With the
 + * help of aio_co_schedule(), this allows moving a coroutine that was
 + * paused by qio_channel_yield() to another context.
 + */
 +void qio_channel_detach_aio_context(QIOChannel *ioc);
 +
 +/**
   * qio_channel_yield:
   * @ioc: the channel object
   * @condition: the I/O condition to wait for
   *
 - * Yields execution from the current coroutine until
 - * the condition indicated by @condition becomes
 - * available.
 + * Yields execution from the current coroutine until the condition
 + * indicated by @condition becomes available.  @condition must
 + * be either %G_IO_IN or %G_IO_OUT; it cannot contain both.  In
 + * addition, no two coroutines can be waiting on the same condition
 + * and channel at the same time.
   *
   * This must only be called from coroutine context
   */
 diff --git a/io/channel.c b/io/channel.c
 index XXXXXXX..XXXXXXX 100644
 --- a/io/channel.c
 +++ b/io/channel.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "io/channel.h"
  #include "qapi/error.h"
 -#include "qemu/coroutine.h"
 +#include "qemu/main-loop.h"
  bool qio_channel_has_feature(QIOChannel *ioc,
                               QIOChannelFeature feature)
@@ -XXX,XX +XXX,XX @@ off_t qio_channel_io_seek(QIOChannel *ioc,
  }
 -typedef struct QIOChannelYieldData QIOChannelYieldData;
 -struct QIOChannelYieldData {
 -    QIOChannel *ioc;
 -    Coroutine *co;
 -};
 +static void qio_channel_set_aio_fd_handlers(QIOChannel *ioc);
 +static void qio_channel_restart_read(void *opaque)
 +{
 +    QIOChannel *ioc = opaque;
 +    Coroutine *co = ioc->read_coroutine;
 +
 +    ioc->read_coroutine = NULL;
 +    qio_channel_set_aio_fd_handlers(ioc);
 +    aio_co_wake(co);
 +}
 -static gboolean qio_channel_yield_enter(QIOChannel *ioc,
 -                                        GIOCondition condition,
 -                                        gpointer opaque)
 +static void qio_channel_restart_write(void *opaque)
  {
 -    QIOChannelYieldData *data = opaque;
 -    qemu_coroutine_enter(data->co);
 -    return FALSE;
 +    QIOChannel *ioc = opaque;
 +    Coroutine *co = ioc->write_coroutine;
 +
 +    ioc->write_coroutine = NULL;
 +    qio_channel_set_aio_fd_handlers(ioc);
 +    aio_co_wake(co);
  }
 +static void qio_channel_set_aio_fd_handlers(QIOChannel *ioc)
 +{
 +    IOHandler *rd_handler = NULL, *wr_handler = NULL;
 +    AioContext *ctx;
 +
 +    if (ioc->read_coroutine) {
 +        rd_handler = qio_channel_restart_read;
 +    }
 +    if (ioc->write_coroutine) {
 +        wr_handler = qio_channel_restart_write;
 +    }
 +
 +    ctx = ioc->ctx ? ioc->ctx : iohandler_get_aio_context();
 +    qio_channel_set_aio_fd_handler(ioc, ctx, rd_handler, wr_handler, ioc);
 +}
 +
 +void qio_channel_attach_aio_context(QIOChannel *ioc,
 +                                    AioContext *ctx)
 +{
 +    AioContext *old_ctx;
 +    if (ioc->ctx == ctx) {
 +        return;
 +    }
 +
 +    old_ctx = ioc->ctx ? ioc->ctx : iohandler_get_aio_context();
 +    qio_channel_set_aio_fd_handler(ioc, old_ctx, NULL, NULL, NULL);
 +    ioc->ctx = ctx;
 +    qio_channel_set_aio_fd_handlers(ioc);
 +}
 +
 +void qio_channel_detach_aio_context(QIOChannel *ioc)
 +{
 +    ioc->read_coroutine = NULL;
 +    ioc->write_coroutine = NULL;
 +    qio_channel_set_aio_fd_handlers(ioc);
 +    ioc->ctx = NULL;
 +}
  void coroutine_fn qio_channel_yield(QIOChannel *ioc,
                                      GIOCondition condition)
  {
 -    QIOChannelYieldData data;
 -
      assert(qemu_in_coroutine());
 -    data.ioc = ioc;
 -    data.co = qemu_coroutine_self();
 -    qio_channel_add_watch(ioc,
 -                          condition,
 -                          qio_channel_yield_enter,
 -                          &data,
 -                          NULL);
 +    if (condition == G_IO_IN) {
 +        assert(!ioc->read_coroutine);
 +        ioc->read_coroutine = qemu_coroutine_self();
 +    } else if (condition == G_IO_OUT) {
 +        assert(!ioc->write_coroutine);
 +        ioc->write_coroutine = qemu_coroutine_self();
 +    } else {
 +        abort();
 +    }
 +    qio_channel_set_aio_fd_handlers(ioc);
      qemu_coroutine_yield();
  }
 --
 .9.3

-New patch
+[Qemu-devel] [PULL v2 07/24] nbd: convert to use qio_channel_yield
+From: Paolo Bonzini <pbonzini@redhat.com>
+In the client, read the reply headers from a coroutine, switching the
+read side between the "read header" coroutine and the I/O coroutine that
+reads the body of the reply.
+In the server, if the server can read more requests it will create a new
+"read request" coroutine as soon as a request has been read.  Otherwise,
+the new coroutine is created in nbd_request_put.
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
+Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
+Message-id: 20170213135235.12274-8-pbonzini@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ block/nbd-client.h |   2 +-
+ block/nbd-client.c | 117 ++++++++++++++++++++++++-----------------------------
+ nbd/client.c       |   2 +-
+ nbd/common.c       |   9 +----
+ nbd/server.c       |  94 +++++++++++++-----------------------------
+ 5 files changed, 83 insertions(+), 141 deletions(-)
+diff --git a/block/nbd-client.h b/block/nbd-client.h
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nbd-client.h
++++ b/block/nbd-client.h
+@@ -XXX,XX +XXX,XX @@ typedef struct NBDClientSession {
+     CoMutex send_mutex;
+     CoQueue free_sema;
+-    Coroutine *send_coroutine;
++    Coroutine *read_reply_co;
+     int in_flight;
+     Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
+diff --git a/block/nbd-client.c b/block/nbd-client.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nbd-client.c
++++ b/block/nbd-client.c
+@@ -XXX,XX +XXX,XX @@
+ #define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs))
+ #define INDEX_TO_HANDLE(bs, index)  ((index)  ^ ((uint64_t)(intptr_t)bs))
+-static void nbd_recv_coroutines_enter_all(NBDClientSession *s)
++static void nbd_recv_coroutines_enter_all(BlockDriverState *bs)
+ {
++    NBDClientSession *s = nbd_get_client_session(bs);
+     int i;
+     for (i = 0; i < MAX_NBD_REQUESTS; i++) {
+@@ -XXX,XX +XXX,XX @@ static void nbd_recv_coroutines_enter_all(NBDClientSession *s)
+             qemu_coroutine_enter(s->recv_coroutine[i]);
+         }
+     }
++    BDRV_POLL_WHILE(bs, s->read_reply_co);
+ }
+ static void nbd_teardown_connection(BlockDriverState *bs)
+@@ -XXX,XX +XXX,XX @@ static void nbd_teardown_connection(BlockDriverState *bs)
+     qio_channel_shutdown(client->ioc,
+                          QIO_CHANNEL_SHUTDOWN_BOTH,
+                          NULL);
+-    nbd_recv_coroutines_enter_all(client);
++    nbd_recv_coroutines_enter_all(bs);
+     nbd_client_detach_aio_context(bs);
+     object_unref(OBJECT(client->sioc));
+@@ -XXX,XX +XXX,XX @@ static void nbd_teardown_connection(BlockDriverState *bs)
+     client->ioc = NULL;
+ }
+-static void nbd_reply_ready(void *opaque)
++static coroutine_fn void nbd_read_reply_entry(void *opaque)
+ {
+-    BlockDriverState *bs = opaque;
+-    NBDClientSession *s = nbd_get_client_session(bs);
++    NBDClientSession *s = opaque;
+     uint64_t i;
+     int ret;
+-    if (!s->ioc) { /* Already closed */
+-        return;
+-    }
+-
+-    if (s->reply.handle == 0) {
+-        /* No reply already in flight.  Fetch a header.  It is possible
+-         * that another thread has done the same thing in parallel, so
+-         * the socket is not readable anymore.
+-         */
++    for (;;) {
++        assert(s->reply.handle == 0);
+         ret = nbd_receive_reply(s->ioc, &s->reply);
+-        if (ret == -EAGAIN) {
+-            return;
+-        }
+         if (ret < 0) {
+-            s->reply.handle = 0;
+-            goto fail;
++            break;
+         }
+-    }
+-    /* There's no need for a mutex on the receive side, because the
+-     * handler acts as a synchronization point and ensures that only
+-     * one coroutine is called until the reply finishes.  */
+-    i = HANDLE_TO_INDEX(s, s->reply.handle);
+-    if (i >= MAX_NBD_REQUESTS) {
+-        goto fail;
+-    }
++        /* There's no need for a mutex on the receive side, because the
++         * handler acts as a synchronization point and ensures that only
++         * one coroutine is called until the reply finishes.
++         */
++        i = HANDLE_TO_INDEX(s, s->reply.handle);
++        if (i >= MAX_NBD_REQUESTS || !s->recv_coroutine[i]) {
++            break;
++        }
+-    if (s->recv_coroutine[i]) {
+-        qemu_coroutine_enter(s->recv_coroutine[i]);
+-        return;
++        /* We're woken up by the recv_coroutine itself.  Note that there
++         * is no race between yielding and reentering read_reply_co.  This
++         * is because:
++         *
++         * - if recv_coroutine[i] runs on the same AioContext, it is only
++         *   entered after we yield
++         *
++         * - if recv_coroutine[i] runs on a different AioContext, reentering
++         *   read_reply_co happens through a bottom half, which can only
++         *   run after we yield.
++         */
++        aio_co_wake(s->recv_coroutine[i]);
++        qemu_coroutine_yield();
+     }
+-
+-fail:
+-    nbd_teardown_connection(bs);
+-}
+-
+-static void nbd_restart_write(void *opaque)
+-{
+-    BlockDriverState *bs = opaque;
+-
+-    qemu_coroutine_enter(nbd_get_client_session(bs)->send_coroutine);
++    s->read_reply_co = NULL;
+ }
+ static int nbd_co_send_request(BlockDriverState *bs,
+@@ -XXX,XX +XXX,XX @@ static int nbd_co_send_request(BlockDriverState *bs,
+                                QEMUIOVector *qiov)
+ {
+     NBDClientSession *s = nbd_get_client_session(bs);
+-    AioContext *aio_context;
+     int rc, ret, i;
+     qemu_co_mutex_lock(&s->send_mutex);
+@@ -XXX,XX +XXX,XX @@ static int nbd_co_send_request(BlockDriverState *bs,
+         return -EPIPE;
+     }
+-    s->send_coroutine = qemu_coroutine_self();
+-    aio_context = bdrv_get_aio_context(bs);
+-
+-    aio_set_fd_handler(aio_context, s->sioc->fd, false,
+-                       nbd_reply_ready, nbd_restart_write, NULL, bs);
+     if (qiov) {
+         qio_channel_set_cork(s->ioc, true);
+         rc = nbd_send_request(s->ioc, request);
+@@ -XXX,XX +XXX,XX @@ static int nbd_co_send_request(BlockDriverState *bs,
+     } else {
+         rc = nbd_send_request(s->ioc, request);
+     }
+-    aio_set_fd_handler(aio_context, s->sioc->fd, false,
+-                       nbd_reply_ready, NULL, NULL, bs);
+-    s->send_coroutine = NULL;
+     qemu_co_mutex_unlock(&s->send_mutex);
+     return rc;
+ }
+@@ -XXX,XX +XXX,XX @@ static void nbd_co_receive_reply(NBDClientSession *s,
+ {
+     int ret;
+-    /* Wait until we're woken up by the read handler.  TODO: perhaps
+-     * peek at the next reply and avoid yielding if it's ours?  */
++    /* Wait until we're woken up by nbd_read_reply_entry.  */
+     qemu_coroutine_yield();
+     *reply = s->reply;
+     if (reply->handle != request->handle ||
+@@ -XXX,XX +XXX,XX @@ static void nbd_coroutine_start(NBDClientSession *s,
+     /* s->recv_coroutine[i] is set as soon as we get the send_lock.  */
+ }
+-static void nbd_coroutine_end(NBDClientSession *s,
++static void nbd_coroutine_end(BlockDriverState *bs,
+                               NBDRequest *request)
+ {
++    NBDClientSession *s = nbd_get_client_session(bs);
+     int i = HANDLE_TO_INDEX(s, request->handle);
++
+     s->recv_coroutine[i] = NULL;
+-    if (s->in_flight-- == MAX_NBD_REQUESTS) {
+-        qemu_co_queue_next(&s->free_sema);
++    s->in_flight--;
++    qemu_co_queue_next(&s->free_sema);
++
++    /* Kick the read_reply_co to get the next reply.  */
++    if (s->read_reply_co) {
++        aio_co_wake(s->read_reply_co);
+     }
+ }
+@@ -XXX,XX +XXX,XX @@ int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
+     } else {
+         nbd_co_receive_reply(client, &request, &reply, qiov);
+     }
+-    nbd_coroutine_end(client, &request);
++    nbd_coroutine_end(bs, &request);
+     return -reply.error;
+ }
+@@ -XXX,XX +XXX,XX @@ int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
+     } else {
+         nbd_co_receive_reply(client, &request, &reply, NULL);
+     }
+-    nbd_coroutine_end(client, &request);
++    nbd_coroutine_end(bs, &request);
+     return -reply.error;
+ }
+@@ -XXX,XX +XXX,XX @@ int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
+     } else {
+         nbd_co_receive_reply(client, &request, &reply, NULL);
+     }
+-    nbd_coroutine_end(client, &request);
++    nbd_coroutine_end(bs, &request);
+     return -reply.error;
+ }
+@@ -XXX,XX +XXX,XX @@ int nbd_client_co_flush(BlockDriverState *bs)
+     } else {
+         nbd_co_receive_reply(client, &request, &reply, NULL);
+     }
+-    nbd_coroutine_end(client, &request);
++    nbd_coroutine_end(bs, &request);
+     return -reply.error;
+ }
+@@ -XXX,XX +XXX,XX @@ int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count)
+     } else {
+         nbd_co_receive_reply(client, &request, &reply, NULL);
+     }
+-    nbd_coroutine_end(client, &request);
++    nbd_coroutine_end(bs, &request);
+     return -reply.error;
+ }
+ void nbd_client_detach_aio_context(BlockDriverState *bs)
+ {
+-    aio_set_fd_handler(bdrv_get_aio_context(bs),
+-                       nbd_get_client_session(bs)->sioc->fd,
+-                       false, NULL, NULL, NULL, NULL);
++    NBDClientSession *client = nbd_get_client_session(bs);
++    qio_channel_detach_aio_context(QIO_CHANNEL(client->sioc));
+ }
+ void nbd_client_attach_aio_context(BlockDriverState *bs,
+                                    AioContext *new_context)
+ {
+-    aio_set_fd_handler(new_context, nbd_get_client_session(bs)->sioc->fd,
+-                       false, nbd_reply_ready, NULL, NULL, bs);
++    NBDClientSession *client = nbd_get_client_session(bs);
++    qio_channel_attach_aio_context(QIO_CHANNEL(client->sioc), new_context);
++    aio_co_schedule(new_context, client->read_reply_co);
+ }
+ void nbd_client_close(BlockDriverState *bs)
+@@ -XXX,XX +XXX,XX @@ int nbd_client_init(BlockDriverState *bs,
+     /* Now that we're connected, set the socket to be non-blocking and
+      * kick the reply mechanism.  */
+     qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL);
+-
++    client->read_reply_co = qemu_coroutine_create(nbd_read_reply_entry, client);
+     nbd_client_attach_aio_context(bs, bdrv_get_aio_context(bs));
+     logout("Established connection with NBD server\n");
+diff --git a/nbd/client.c b/nbd/client.c
+index XXXXXXX..XXXXXXX 100644
+--- a/nbd/client.c
++++ b/nbd/client.c
+@@ -XXX,XX +XXX,XX @@ ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply)
+     ssize_t ret;
+     ret = read_sync(ioc, buf, sizeof(buf));
+-    if (ret < 0) {
++    if (ret <= 0) {
+         return ret;
+     }
+diff --git a/nbd/common.c b/nbd/common.c
+index XXXXXXX..XXXXXXX 100644
+--- a/nbd/common.c
++++ b/nbd/common.c
+@@ -XXX,XX +XXX,XX @@ ssize_t nbd_wr_syncv(QIOChannel *ioc,
+         }
+         if (len == QIO_CHANNEL_ERR_BLOCK) {
+             if (qemu_in_coroutine()) {
+-                /* XXX figure out if we can create a variant on
+-                 * qio_channel_yield() that works with AIO contexts
+-                 * and consider using that in this branch */
+-                qemu_coroutine_yield();
+-            } else if (done) {
+-                /* XXX this is needed by nbd_reply_ready.  */
+-                qio_channel_wait(ioc,
+-                                 do_read ? G_IO_IN : G_IO_OUT);
++                qio_channel_yield(ioc, do_read ? G_IO_IN : G_IO_OUT);
+             } else {
+                 return -EAGAIN;
+             }
+diff --git a/nbd/server.c b/nbd/server.c
+index XXXXXXX..XXXXXXX 100644
+--- a/nbd/server.c
++++ b/nbd/server.c
+@@ -XXX,XX +XXX,XX @@ struct NBDClient {
+     CoMutex send_lock;
+     Coroutine *send_coroutine;
+-    bool can_read;
+-
+     QTAILQ_ENTRY(NBDClient) next;
+     int nb_requests;
+     bool closing;
+@@ -XXX,XX +XXX,XX @@ struct NBDClient {
+ /* That's all folks */
+-static void nbd_set_handlers(NBDClient *client);
+-static void nbd_unset_handlers(NBDClient *client);
+-static void nbd_update_can_read(NBDClient *client);
++static void nbd_client_receive_next_request(NBDClient *client);
+ static gboolean nbd_negotiate_continue(QIOChannel *ioc,
+                                        GIOCondition condition,
+@@ -XXX,XX +XXX,XX @@ void nbd_client_put(NBDClient *client)
+          */
+         assert(client->closing);
+-        nbd_unset_handlers(client);
++        qio_channel_detach_aio_context(client->ioc);
+         object_unref(OBJECT(client->sioc));
+         object_unref(OBJECT(client->ioc));
+         if (client->tlscreds) {
+@@ -XXX,XX +XXX,XX @@ static NBDRequestData *nbd_request_get(NBDClient *client)
+     assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
+     client->nb_requests++;
+-    nbd_update_can_read(client);
+     req = g_new0(NBDRequestData, 1);
+     nbd_client_get(client);
+@@ -XXX,XX +XXX,XX @@ static void nbd_request_put(NBDRequestData *req)
+     g_free(req);
+     client->nb_requests--;
+-    nbd_update_can_read(client);
++    nbd_client_receive_next_request(client);
++
+     nbd_client_put(client);
+ }
+@@ -XXX,XX +XXX,XX @@ static void blk_aio_attached(AioContext *ctx, void *opaque)
+     exp->ctx = ctx;
+     QTAILQ_FOREACH(client, &exp->clients, next) {
+-        nbd_set_handlers(client);
++        qio_channel_attach_aio_context(client->ioc, ctx);
++        if (client->recv_coroutine) {
++            aio_co_schedule(ctx, client->recv_coroutine);
++        }
++        if (client->send_coroutine) {
++            aio_co_schedule(ctx, client->send_coroutine);
++        }
+     }
+ }
+@@ -XXX,XX +XXX,XX @@ static void blk_aio_detach(void *opaque)
+     TRACE("Export %s: Detaching clients from AIO context %p\n", exp->name, exp->ctx);
+     QTAILQ_FOREACH(client, &exp->clients, next) {
+-        nbd_unset_handlers(client);
++        qio_channel_detach_aio_context(client->ioc);
+     }
+     exp->ctx = NULL;
+@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply,
+     g_assert(qemu_in_coroutine());
+     qemu_co_mutex_lock(&client->send_lock);
+     client->send_coroutine = qemu_coroutine_self();
+-    nbd_set_handlers(client);
+     if (!len) {
+         rc = nbd_send_reply(client->ioc, reply);
+@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply,
+     }
+     client->send_coroutine = NULL;
+-    nbd_set_handlers(client);
+     qemu_co_mutex_unlock(&client->send_lock);
+     return rc;
+ }
+@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_receive_request(NBDRequestData *req,
+     ssize_t rc;
+     g_assert(qemu_in_coroutine());
+-    client->recv_coroutine = qemu_coroutine_self();
+-    nbd_update_can_read(client);
+-
++    assert(client->recv_coroutine == qemu_coroutine_self());
+     rc = nbd_receive_request(client->ioc, request);
+     if (rc < 0) {
+         if (rc != -EAGAIN) {
+@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_receive_request(NBDRequestData *req,
+ out:
+     client->recv_coroutine = NULL;
+-    nbd_update_can_read(client);
++    nbd_client_receive_next_request(client);
+     return rc;
+ }
+-static void nbd_trip(void *opaque)
++/* Owns a reference to the NBDClient passed as opaque.  */
++static coroutine_fn void nbd_trip(void *opaque)
+ {
+     NBDClient *client = opaque;
+     NBDExport *exp = client->exp;
+     NBDRequestData *req;
+-    NBDRequest request;
++    NBDRequest request = { 0 };    /* GCC thinks it can be used uninitialized */
+     NBDReply reply;
+     ssize_t ret;
+     int flags;
+     TRACE("Reading request.");
+     if (client->closing) {
++        nbd_client_put(client);
+         return;
+     }
+@@ -XXX,XX +XXX,XX @@ static void nbd_trip(void *opaque)
+ done:
+     nbd_request_put(req);
++    nbd_client_put(client);
+     return;
+ out:
+     nbd_request_put(req);
+     client_close(client);
++    nbd_client_put(client);
+ }
+-static void nbd_read(void *opaque)
++static void nbd_client_receive_next_request(NBDClient *client)
+ {
+-    NBDClient *client = opaque;
+-
+-    if (client->recv_coroutine) {
+-        qemu_coroutine_enter(client->recv_coroutine);
+-    } else {
+-        qemu_coroutine_enter(qemu_coroutine_create(nbd_trip, client));
+-    }
+-}
+-
+-static void nbd_restart_write(void *opaque)
+-{
+-    NBDClient *client = opaque;
+-
+-    qemu_coroutine_enter(client->send_coroutine);
+-}
+-
+-static void nbd_set_handlers(NBDClient *client)
+-{
+-    if (client->exp && client->exp->ctx) {
+-        aio_set_fd_handler(client->exp->ctx, client->sioc->fd, true,
+-                           client->can_read ? nbd_read : NULL,
+-                           client->send_coroutine ? nbd_restart_write : NULL,
+-                           NULL, client);
+-    }
+-}
+-
+-static void nbd_unset_handlers(NBDClient *client)
+-{
+-    if (client->exp && client->exp->ctx) {
+-        aio_set_fd_handler(client->exp->ctx, client->sioc->fd, true, NULL,
+-                           NULL, NULL, NULL);
+-    }
+-}
+-
+-static void nbd_update_can_read(NBDClient *client)
+-{
+-    bool can_read = client->recv_coroutine ||
+-                    client->nb_requests < MAX_NBD_REQUESTS;
+-
+-    if (can_read != client->can_read) {
+-        client->can_read = can_read;
+-        nbd_set_handlers(client);
+-
+-        /* There is no need to invoke aio_notify(), since aio_set_fd_handler()
+-         * in nbd_set_handlers() will have taken care of that */
++    if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS) {
++        nbd_client_get(client);
++        client->recv_coroutine = qemu_coroutine_create(nbd_trip, client);
++        aio_co_schedule(client->exp->ctx, client->recv_coroutine);
+     }
+ }
+@@ -XXX,XX +XXX,XX @@ static coroutine_fn void nbd_co_client_start(void *opaque)
+         goto out;
+     }
+     qemu_co_mutex_init(&client->send_lock);
+-    nbd_set_handlers(client);
+     if (exp) {
+         QTAILQ_INSERT_TAIL(&exp->clients, client, next);
+     }
++
++    nbd_client_receive_next_request(client);
++
+ out:
+     g_free(data);
+ }
+@@ -XXX,XX +XXX,XX @@ void nbd_client_new(NBDExport *exp,
+     object_ref(OBJECT(client->sioc));
+     client->ioc = QIO_CHANNEL(sioc);
+     object_ref(OBJECT(client->ioc));
+-    client->can_read = true;
+     client->close = close_fn;
+     data->client = client;
+--
+.9.3

-New patch
+[Qemu-devel] [PULL v2 08/24] coroutine-lock: reschedule coroutine on the AioContext it was running on
+From: Paolo Bonzini <pbonzini@redhat.com>
+As a small step towards the introduction of multiqueue, we want
+coroutines to remain on the same AioContext that started them,
+unless they are moved explicitly with e.g. aio_co_schedule.  This patch
+prevents coroutines from switching AioContext when they use a CoMutex.
+For now it does not make much of a difference, because the CoMutex
+is not thread-safe and the AioContext itself is used to protect the
+CoMutex from concurrent access.  However, this is going to change.
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
+Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
+Message-id: 20170213135235.12274-9-pbonzini@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ util/qemu-coroutine-lock.c | 5 ++---
+ util/trace-events          | 1 -
+ 2 files changed, 2 insertions(+), 4 deletions(-)
+diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/qemu-coroutine-lock.c
++++ b/util/qemu-coroutine-lock.c
+@@ -XXX,XX +XXX,XX @@
+ #include "qemu/coroutine.h"
+ #include "qemu/coroutine_int.h"
+ #include "qemu/queue.h"
++#include "block/aio.h"
+ #include "trace.h"
+ void qemu_co_queue_init(CoQueue *queue)
+@@ -XXX,XX +XXX,XX @@ void qemu_co_queue_run_restart(Coroutine *co)
+ static bool qemu_co_queue_do_restart(CoQueue *queue, bool single)
+ {
+-    Coroutine *self = qemu_coroutine_self();
+     Coroutine *next;
+     if (QSIMPLEQ_EMPTY(&queue->entries)) {
+@@ -XXX,XX +XXX,XX @@ static bool qemu_co_queue_do_restart(CoQueue *queue, bool single)
+     while ((next = QSIMPLEQ_FIRST(&queue->entries)) != NULL) {
+         QSIMPLEQ_REMOVE_HEAD(&queue->entries, co_queue_next);
+-        QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, next, co_queue_next);
+-        trace_qemu_co_queue_next(next);
++        aio_co_wake(next);
+         if (single) {
+             break;
+         }
+diff --git a/util/trace-events b/util/trace-events
+index XXXXXXX..XXXXXXX 100644
+--- a/util/trace-events
++++ b/util/trace-events
+@@ -XXX,XX +XXX,XX @@ qemu_coroutine_terminate(void *co) "self %p"
+ # util/qemu-coroutine-lock.c
+ qemu_co_queue_run_restart(void *co) "co %p"
+-qemu_co_queue_next(void *nxt) "next %p"
+ qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p"
+ qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p"
+ qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p"
+--
+.9.3

-New patch
+[Qemu-devel] [PULL v2 09/24] blkdebug: reschedule coroutine on the AioContext it is running on
+From: Paolo Bonzini <pbonzini@redhat.com>
+Keep the coroutine on the same AioContext.  Without this change,
+there would be a race between yielding the coroutine and reentering it.
+While the race cannot happen now, because the code only runs from a single
+AioContext, this will change with multiqueue support in the block layer.
+While doing the change, replace custom bottom half with aio_co_schedule.
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
+Message-id: 20170213135235.12274-10-pbonzini@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ block/blkdebug.c | 9 +--------
+ 1 file changed, 1 insertion(+), 8 deletions(-)
+diff --git a/block/blkdebug.c b/block/blkdebug.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/blkdebug.c
++++ b/block/blkdebug.c
+@@ -XXX,XX +XXX,XX @@ out:
+     return ret;
+ }
+-static void error_callback_bh(void *opaque)
+-{
+-    Coroutine *co = opaque;
+-    qemu_coroutine_enter(co);
+-}
+-
+ static int inject_error(BlockDriverState *bs, BlkdebugRule *rule)
+ {
+     BDRVBlkdebugState *s = bs->opaque;
+@@ -XXX,XX +XXX,XX @@ static int inject_error(BlockDriverState *bs, BlkdebugRule *rule)
+     }
+     if (!immediately) {
+-        aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), error_callback_bh,
+-                                qemu_coroutine_self());
++        aio_co_schedule(qemu_get_current_aio_context(), qemu_coroutine_self());
+         qemu_coroutine_yield();
+     }
+--
+.9.3

-New patch
+[Qemu-devel] [PULL v2 10/24] qed: introduce qed_aio_start_io and qed_aio_next_io_cb
+From: Paolo Bonzini <pbonzini@redhat.com>
+qed_aio_start_io and qed_aio_next_io will not have to acquire/release
+the AioContext, while qed_aio_next_io_cb will.  Split the functionality
+and gain a little type-safety in the process.
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
+Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
+Message-id: 20170213135235.12274-11-pbonzini@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ block/qed.c | 39 +++++++++++++++++++++++++--------------
+file changed, 25 insertions(+), 14 deletions(-)
+diff --git a/block/qed.c b/block/qed.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/qed.c
++++ b/block/qed.c
+@@ -XXX,XX +XXX,XX @@ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
+     return l2_table;
+ }
+-static void qed_aio_next_io(void *opaque, int ret);
++static void qed_aio_next_io(QEDAIOCB *acb, int ret);
++
++static void qed_aio_start_io(QEDAIOCB *acb)
++{
++    qed_aio_next_io(acb, 0);
++}
++
++static void qed_aio_next_io_cb(void *opaque, int ret)
++{
++    QEDAIOCB *acb = opaque;
++
++    qed_aio_next_io(acb, ret);
++}
+ static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
+ {
+@@ -XXX,XX +XXX,XX @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
+     acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
+     if (acb) {
+-        qed_aio_next_io(acb, 0);
++        qed_aio_start_io(acb);
+     }
+ }
+@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
+         QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next);
+         acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
+         if (acb) {
+-            qed_aio_next_io(acb, 0);
++            qed_aio_start_io(acb);
+         } else if (s->header.features & QED_F_NEED_CHECK) {
+             qed_start_need_check_timer(s);
+         }
+@@ -XXX,XX +XXX,XX @@ static void qed_commit_l2_update(void *opaque, int ret)
+     acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
+     assert(acb->request.l2_table != NULL);
+-    qed_aio_next_io(opaque, ret);
++    qed_aio_next_io(acb, ret);
+ }
+ /**
+@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
+     if (need_alloc) {
+         /* Write out the whole new L2 table */
+         qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true,
+-                            qed_aio_write_l1_update, acb);
++                           qed_aio_write_l1_update, acb);
+     } else {
+         /* Write out only the updated part of the L2 table */
+         qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false,
+-                            qed_aio_next_io, acb);
++                           qed_aio_next_io_cb, acb);
+     }
+     return;
+@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_main(void *opaque, int ret)
+     }
+     if (acb->find_cluster_ret == QED_CLUSTER_FOUND) {
+-        next_fn = qed_aio_next_io;
++        next_fn = qed_aio_next_io_cb;
+     } else {
+         if (s->bs->backing) {
+             next_fn = qed_aio_write_flush_before_l2_update;
+@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
+     if (acb->flags & QED_AIOCB_ZERO) {
+         /* Skip ahead if the clusters are already zero */
+         if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
+-            qed_aio_next_io(acb, 0);
++            qed_aio_start_io(acb);
+             return;
+         }
+@@ -XXX,XX +XXX,XX @@ static void qed_aio_read_data(void *opaque, int ret,
+     /* Handle zero cluster and backing file reads */
+     if (ret == QED_CLUSTER_ZERO) {
+         qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
+-        qed_aio_next_io(acb, 0);
++        qed_aio_start_io(acb);
+         return;
+     } else if (ret != QED_CLUSTER_FOUND) {
+         qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
+-                              &acb->backing_qiov, qed_aio_next_io, acb);
++                              &acb->backing_qiov, qed_aio_next_io_cb, acb);
+         return;
+     }
+     BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
+     bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE,
+                    &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
+-                   qed_aio_next_io, acb);
++                   qed_aio_next_io_cb, acb);
+     return;
+ err:
+@@ -XXX,XX +XXX,XX @@ err:
+ /**
+  * Begin next I/O or complete the request
+  */
+-static void qed_aio_next_io(void *opaque, int ret)
++static void qed_aio_next_io(QEDAIOCB *acb, int ret)
+ {
+-    QEDAIOCB *acb = opaque;
+     BDRVQEDState *s = acb_to_s(acb);
+     QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
+                                 qed_aio_write_data : qed_aio_read_data;
+@@ -XXX,XX +XXX,XX @@ static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
+     qemu_iovec_init(&acb->cur_qiov, qiov->niov);
+     /* Start request */
+-    qed_aio_next_io(acb, 0);
++    qed_aio_start_io(acb);
+     return &acb->common;
+ }
+--
+.9.3

-New patch
+[Qemu-devel] [PULL v2 11/24] aio: push aio_context_acquire/release down to dispatching
+From: Paolo Bonzini <pbonzini@redhat.com>
+The AioContext data structures are now protected by list_lock and/or
+they are walked with FOREACH_RCU primitives.  There is no need anymore
+to acquire the AioContext for the entire duration of aio_dispatch.
+Instead, just acquire it before and after invoking the callbacks.
+The next step is then to push it further down.
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
+Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
+Message-id: 20170213135235.12274-12-pbonzini@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ util/aio-posix.c | 25 +++++++++++--------------
+ util/aio-win32.c | 15 +++++++--------
+ util/async.c     |  2 ++
+ 3 files changed, 20 insertions(+), 22 deletions(-)
+diff --git a/util/aio-posix.c b/util/aio-posix.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/aio-posix.c
++++ b/util/aio-posix.c
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
+             (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
+             aio_node_check(ctx, node->is_external) &&
+             node->io_read) {
++            aio_context_acquire(ctx);
+             node->io_read(node->opaque);
++            aio_context_release(ctx);
+             /* aio_notify() does not count as progress */
+             if (node->opaque != &ctx->notifier) {
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
+             (revents & (G_IO_OUT | G_IO_ERR)) &&
+             aio_node_check(ctx, node->is_external) &&
+             node->io_write) {
++            aio_context_acquire(ctx);
+             node->io_write(node->opaque);
++            aio_context_release(ctx);
+             progress = true;
+         }
+@@ -XXX,XX +XXX,XX @@ bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
+     }
+     /* Run our timers */
++    aio_context_acquire(ctx);
+     progress |= timerlistgroup_run_timers(&ctx->tlg);
++    aio_context_release(ctx);
+     return progress;
+ }
+@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
+     int64_t timeout;
+     int64_t start = 0;
+-    aio_context_acquire(ctx);
+-    progress = false;
+-
+     /* aio_notify can avoid the expensive event_notifier_set if
+      * everything (file descriptors, bottom halves, timers) will
+      * be re-evaluated before the next blocking poll().  This is
+@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
+         start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+     }
+-    if (try_poll_mode(ctx, blocking)) {
+-        progress = true;
+-    } else {
++    aio_context_acquire(ctx);
++    progress = try_poll_mode(ctx, blocking);
++    aio_context_release(ctx);
++
++    if (!progress) {
+         assert(npfd == 0);
+         /* fill pollfds */
+@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
+         timeout = blocking ? aio_compute_timeout(ctx) : 0;
+         /* wait until next event */
+-        if (timeout) {
+-            aio_context_release(ctx);
+-        }
+         if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
+             AioHandler epoll_handler;
+@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
+         } else  {
+             ret = qemu_poll_ns(pollfds, npfd, timeout);
+         }
+-        if (timeout) {
+-            aio_context_acquire(ctx);
+-        }
+     }
+     if (blocking) {
+@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
+         progress = true;
+     }
+-    aio_context_release(ctx);
+-
+     return progress;
+ }
+diff --git a/util/aio-win32.c b/util/aio-win32.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/aio-win32.c
++++ b/util/aio-win32.c
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
+             (revents || event_notifier_get_handle(node->e) == event) &&
+             node->io_notify) {
+             node->pfd.revents = 0;
++            aio_context_acquire(ctx);
+             node->io_notify(node->e);
++            aio_context_release(ctx);
+             /* aio_notify() does not count as progress */
+             if (node->e != &ctx->notifier) {
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
+             (node->io_read || node->io_write)) {
+             node->pfd.revents = 0;
+             if ((revents & G_IO_IN) && node->io_read) {
++                aio_context_acquire(ctx);
+                 node->io_read(node->opaque);
++                aio_context_release(ctx);
+                 progress = true;
+             }
+             if ((revents & G_IO_OUT) && node->io_write) {
++                aio_context_acquire(ctx);
+                 node->io_write(node->opaque);
++                aio_context_release(ctx);
+                 progress = true;
+             }
+@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
+     int count;
+     int timeout;
+-    aio_context_acquire(ctx);
+     progress = false;
+     /* aio_notify can avoid the expensive event_notifier_set if
+@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
+         timeout = blocking && !have_select_revents
+             ? qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)) : 0;
+-        if (timeout) {
+-            aio_context_release(ctx);
+-        }
+         ret = WaitForMultipleObjects(count, events, FALSE, timeout);
+         if (blocking) {
+             assert(first);
+             atomic_sub(&ctx->notify_me, 2);
+         }
+-        if (timeout) {
+-            aio_context_acquire(ctx);
+-        }
+         if (first) {
+             aio_notify_accept(ctx);
+@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
+         progress |= aio_dispatch_handlers(ctx, event);
+     } while (count > 0);
++    aio_context_acquire(ctx);
+     progress |= timerlistgroup_run_timers(&ctx->tlg);
+-
+     aio_context_release(ctx);
+     return progress;
+ }
+diff --git a/util/async.c b/util/async.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/async.c
++++ b/util/async.c
+@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
+                 ret = 1;
+             }
+             bh->idle = 0;
++            aio_context_acquire(ctx);
+             aio_bh_call(bh);
++            aio_context_release(ctx);
+         }
+         if (bh->deleted) {
+             deleted = true;
+--
+.9.3

-New patch
+[Qemu-devel] [PULL v2 12/24] block: explicitly acquire aiocontext in timers that need it
+From: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
 Message-id: 20170213135235.12274-13-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  block/qed.h                 |  3 +++
  block/curl.c                |  2 ++
  block/io.c                  |  5 +++++
  block/iscsi.c               |  8 ++++++--
  block/null.c                |  4 ++++
  block/qed.c                 | 12 ++++++++++++
  block/throttle-groups.c     |  2 ++
  util/aio-posix.c            |  2 --
  util/aio-win32.c            |  2 --
  util/qemu-coroutine-sleep.c |  2 +-
  10 files changed, 35 insertions(+), 7 deletions(-)
 diff --git a/block/qed.h b/block/qed.h
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qed.h
 +++ b/block/qed.h
@@ -XXX,XX +XXX,XX @@ enum {
   */
  typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len);
 +void qed_acquire(BDRVQEDState *s);
 +void qed_release(BDRVQEDState *s);
 +
  /**
   * Generic callback for chaining async callbacks
   */
 diff --git a/block/curl.c b/block/curl.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/curl.c
 +++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static void curl_multi_timeout_do(void *arg)
          return;
      }
 +    aio_context_acquire(s->aio_context);
      curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
      curl_multi_check_completion(s);
 +    aio_context_release(s->aio_context);
  #else
      abort();
  #endif
 diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/io.c
 +++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_aio_cancel(BlockAIOCB *acb)
          if (acb->aiocb_info->get_aio_context) {
              aio_poll(acb->aiocb_info->get_aio_context(acb), true);
          } else if (acb->bs) {
 +            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
 +             * assert that we're not using an I/O thread.  Thread-safe
 +             * code should use bdrv_aio_cancel_async exclusively.
 +             */
 +            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
              aio_poll(bdrv_get_aio_context(acb->bs), true);
          } else {
              abort();
 diff --git a/block/iscsi.c b/block/iscsi.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/iscsi.c
 +++ b/block/iscsi.c
@@ -XXX,XX +XXX,XX @@ static void iscsi_retry_timer_expired(void *opaque)
      struct IscsiTask *iTask = opaque;
      iTask->complete = 1;
      if (iTask->co) {
 -        qemu_coroutine_enter(iTask->co);
 +        aio_co_wake(iTask->co);
      }
  }
@@ -XXX,XX +XXX,XX @@ static void iscsi_nop_timed_event(void *opaque)
  {
      IscsiLun *iscsilun = opaque;
 +    aio_context_acquire(iscsilun->aio_context);
      if (iscsi_get_nops_in_flight(iscsilun->iscsi) >= MAX_NOP_FAILURES) {
          error_report("iSCSI: NOP timeout. Reconnecting...");
          iscsilun->request_timed_out = true;
      } else if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) {
          error_report("iSCSI: failed to sent NOP-Out. Disabling NOP messages.");
 -        return;
 +        goto out;
      }
      timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL);
      iscsi_set_events(iscsilun);
 +
 +out:
 +    aio_context_release(iscsilun->aio_context);
  }
  static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp)
 diff --git a/block/null.c b/block/null.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/null.c
 +++ b/block/null.c
@@ -XXX,XX +XXX,XX @@ static void null_bh_cb(void *opaque)
  static void null_timer_cb(void *opaque)
  {
      NullAIOCB *acb = opaque;
 +    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
 +
 +    aio_context_acquire(ctx);
      acb->common.cb(acb->common.opaque, 0);
 +    aio_context_release(ctx);
      timer_deinit(&acb->timer);
      qemu_aio_unref(acb);
  }
 diff --git a/block/qed.c b/block/qed.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qed.c
 +++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static void qed_need_check_timer_cb(void *opaque)
      trace_qed_need_check_timer_cb(s);
 +    qed_acquire(s);
      qed_plug_allocating_write_reqs(s);
      /* Ensure writes are on disk before clearing flag */
      bdrv_aio_flush(s->bs->file->bs, qed_clear_need_check, s);
 +    qed_release(s);
 +}
 +
 +void qed_acquire(BDRVQEDState *s)
 +{
 +    aio_context_acquire(bdrv_get_aio_context(s->bs));
 +}
 +
 +void qed_release(BDRVQEDState *s)
 +{
 +    aio_context_release(bdrv_get_aio_context(s->bs));
  }
  static void qed_start_need_check_timer(BDRVQEDState *s)
 diff --git a/block/throttle-groups.c b/block/throttle-groups.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/throttle-groups.c
 +++ b/block/throttle-groups.c
@@ -XXX,XX +XXX,XX @@ static void timer_cb(BlockBackend *blk, bool is_write)
      qemu_mutex_unlock(&tg->lock);
      /* Run the request that was waiting for this timer */
 +    aio_context_acquire(blk_get_aio_context(blk));
      empty_queue = !qemu_co_enter_next(&blkp->throttled_reqs[is_write]);
 +    aio_context_release(blk_get_aio_context(blk));
      /* If the request queue was empty then we have to take care of
       * scheduling the next one */
 diff --git a/util/aio-posix.c b/util/aio-posix.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/aio-posix.c
 +++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
      }
      /* Run our timers */
 -    aio_context_acquire(ctx);
      progress |= timerlistgroup_run_timers(&ctx->tlg);
 -    aio_context_release(ctx);
      return progress;
  }
 diff --git a/util/aio-win32.c b/util/aio-win32.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/aio-win32.c
 +++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
          progress |= aio_dispatch_handlers(ctx, event);
      } while (count > 0);
 -    aio_context_acquire(ctx);
      progress |= timerlistgroup_run_timers(&ctx->tlg);
 -    aio_context_release(ctx);
      return progress;
  }
 diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/qemu-coroutine-sleep.c
 +++ b/util/qemu-coroutine-sleep.c
@@ -XXX,XX +XXX,XX @@ static void co_sleep_cb(void *opaque)
  {
      CoSleepCB *sleep_cb = opaque;
 -    qemu_coroutine_enter(sleep_cb->co);
 +    aio_co_wake(sleep_cb->co);
  }
  void coroutine_fn co_aio_sleep_ns(AioContext *ctx, QEMUClockType type,
 --
 .9.3

-New patch
+[Qemu-devel] [PULL v2 13/24] block: explicitly acquire aiocontext in callbacks that need it
+From: Paolo Bonzini <pbonzini@redhat.com>
+This covers both file descriptor callbacks and polling callbacks,
+since they execute related code.
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
+Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
+Message-id: 20170213135235.12274-14-pbonzini@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ block/curl.c          | 16 +++++++++++++---
+ block/iscsi.c         |  4 ++++
+ block/linux-aio.c     |  4 ++++
+ block/nfs.c           |  6 ++++++
+ block/sheepdog.c      | 29 +++++++++++++++--------------
+ block/ssh.c           | 29 +++++++++--------------------
+ block/win32-aio.c     | 10 ++++++----
+ hw/block/virtio-blk.c |  5 ++++-
+ hw/scsi/virtio-scsi.c |  7 +++++++
+ util/aio-posix.c      |  7 -------
+ util/aio-win32.c      |  6 ------
+ 11 files changed, 68 insertions(+), 55 deletions(-)
+diff --git a/block/curl.c b/block/curl.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/curl.c
++++ b/block/curl.c
+@@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s)
+     }
+ }
+-static void curl_multi_do(void *arg)
++static void curl_multi_do_locked(CURLState *s)
+ {
+-    CURLState *s = (CURLState *)arg;
+     CURLSocket *socket, *next_socket;
+     int running;
+     int r;
+@@ -XXX,XX +XXX,XX @@ static void curl_multi_do(void *arg)
+     }
+ }
++static void curl_multi_do(void *arg)
++{
++    CURLState *s = (CURLState *)arg;
++
++    aio_context_acquire(s->s->aio_context);
++    curl_multi_do_locked(s);
++    aio_context_release(s->s->aio_context);
++}
++
+ static void curl_multi_read(void *arg)
+ {
+     CURLState *s = (CURLState *)arg;
+-    curl_multi_do(arg);
++    aio_context_acquire(s->s->aio_context);
++    curl_multi_do_locked(s);
+     curl_multi_check_completion(s->s);
++    aio_context_release(s->s->aio_context);
+ }
+ static void curl_multi_timeout_do(void *arg)
+diff --git a/block/iscsi.c b/block/iscsi.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/iscsi.c
++++ b/block/iscsi.c
+@@ -XXX,XX +XXX,XX @@ iscsi_process_read(void *arg)
+     IscsiLun *iscsilun = arg;
+     struct iscsi_context *iscsi = iscsilun->iscsi;
++    aio_context_acquire(iscsilun->aio_context);
+     iscsi_service(iscsi, POLLIN);
+     iscsi_set_events(iscsilun);
++    aio_context_release(iscsilun->aio_context);
+ }
+ static void
+@@ -XXX,XX +XXX,XX @@ iscsi_process_write(void *arg)
+     IscsiLun *iscsilun = arg;
+     struct iscsi_context *iscsi = iscsilun->iscsi;
++    aio_context_acquire(iscsilun->aio_context);
+     iscsi_service(iscsi, POLLOUT);
+     iscsi_set_events(iscsilun);
++    aio_context_release(iscsilun->aio_context);
+ }
+ static int64_t sector_lun2qemu(int64_t sector, IscsiLun *iscsilun)
+diff --git a/block/linux-aio.c b/block/linux-aio.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/linux-aio.c
++++ b/block/linux-aio.c
+@@ -XXX,XX +XXX,XX @@ static void qemu_laio_completion_cb(EventNotifier *e)
+     LinuxAioState *s = container_of(e, LinuxAioState, e);
+     if (event_notifier_test_and_clear(&s->e)) {
++        aio_context_acquire(s->aio_context);
+         qemu_laio_process_completions_and_submit(s);
++        aio_context_release(s->aio_context);
+     }
+ }
+@@ -XXX,XX +XXX,XX @@ static bool qemu_laio_poll_cb(void *opaque)
+         return false;
+     }
++    aio_context_acquire(s->aio_context);
+     qemu_laio_process_completions_and_submit(s);
++    aio_context_release(s->aio_context);
+     return true;
+ }
+diff --git a/block/nfs.c b/block/nfs.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nfs.c
++++ b/block/nfs.c
+@@ -XXX,XX +XXX,XX @@ static void nfs_set_events(NFSClient *client)
+ static void nfs_process_read(void *arg)
+ {
+     NFSClient *client = arg;
++
++    aio_context_acquire(client->aio_context);
+     nfs_service(client->context, POLLIN);
+     nfs_set_events(client);
++    aio_context_release(client->aio_context);
+ }
+ static void nfs_process_write(void *arg)
+ {
+     NFSClient *client = arg;
++
++    aio_context_acquire(client->aio_context);
+     nfs_service(client->context, POLLOUT);
+     nfs_set_events(client);
++    aio_context_release(client->aio_context);
+ }
+ static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
+diff --git a/block/sheepdog.c b/block/sheepdog.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/sheepdog.c
++++ b/block/sheepdog.c
+@@ -XXX,XX +XXX,XX @@ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
+     return ret;
+ }
+-static void restart_co_req(void *opaque)
+-{
+-    Coroutine *co = opaque;
+-
+-    qemu_coroutine_enter(co);
+-}
+-
+ typedef struct SheepdogReqCo {
+     int sockfd;
+     BlockDriverState *bs;
+@@ -XXX,XX +XXX,XX @@ typedef struct SheepdogReqCo {
+     unsigned int *rlen;
+     int ret;
+     bool finished;
++    Coroutine *co;
+ } SheepdogReqCo;
++static void restart_co_req(void *opaque)
++{
++    SheepdogReqCo *srco = opaque;
++
++    aio_co_wake(srco->co);
++}
++
+ static coroutine_fn void do_co_req(void *opaque)
+ {
+     int ret;
+-    Coroutine *co;
+     SheepdogReqCo *srco = opaque;
+     int sockfd = srco->sockfd;
+     SheepdogReq *hdr = srco->hdr;
+@@ -XXX,XX +XXX,XX @@ static coroutine_fn void do_co_req(void *opaque)
+     unsigned int *wlen = srco->wlen;
+     unsigned int *rlen = srco->rlen;
+-    co = qemu_coroutine_self();
++    srco->co = qemu_coroutine_self();
+     aio_set_fd_handler(srco->aio_context, sockfd, false,
+-                       NULL, restart_co_req, NULL, co);
++                       NULL, restart_co_req, NULL, srco);
+     ret = send_co_req(sockfd, hdr, data, wlen);
+     if (ret < 0) {
+@@ -XXX,XX +XXX,XX @@ static coroutine_fn void do_co_req(void *opaque)
+     }
+     aio_set_fd_handler(srco->aio_context, sockfd, false,
+-                       restart_co_req, NULL, NULL, co);
++                       restart_co_req, NULL, NULL, srco);
+     ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
+     if (ret != sizeof(*hdr)) {
+@@ -XXX,XX +XXX,XX @@ out:
+     aio_set_fd_handler(srco->aio_context, sockfd, false,
+                        NULL, NULL, NULL, NULL);
++    srco->co = NULL;
+     srco->ret = ret;
+     srco->finished = true;
+     if (srco->bs) {
+@@ -XXX,XX +XXX,XX @@ static void coroutine_fn aio_read_response(void *opaque)
+          * We've finished all requests which belong to the AIOCB, so
+          * we can switch back to sd_co_readv/writev now.
+          */
+-        qemu_coroutine_enter(acb->coroutine);
++        aio_co_wake(acb->coroutine);
+     }
+     return;
+@@ -XXX,XX +XXX,XX @@ static void co_read_response(void *opaque)
+         s->co_recv = qemu_coroutine_create(aio_read_response, opaque);
+     }
+-    qemu_coroutine_enter(s->co_recv);
++    aio_co_wake(s->co_recv);
+ }
+ static void co_write_request(void *opaque)
+ {
+     BDRVSheepdogState *s = opaque;
+-    qemu_coroutine_enter(s->co_send);
++    aio_co_wake(s->co_send);
+ }
+ /*
+diff --git a/block/ssh.c b/block/ssh.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/ssh.c
++++ b/block/ssh.c
+@@ -XXX,XX +XXX,XX @@ static void restart_coroutine(void *opaque)
+     DPRINTF("co=%p", co);
+-    qemu_coroutine_enter(co);
++    aio_co_wake(co);
+ }
+-static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs)
++/* A non-blocking call returned EAGAIN, so yield, ensuring the
++ * handlers are set up so that we'll be rescheduled when there is an
++ * interesting event on the socket.
++ */
++static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs)
+ {
+     int r;
+     IOHandler *rd_handler = NULL, *wr_handler = NULL;
+@@ -XXX,XX +XXX,XX @@ static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs)
+     aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock,
+                        false, rd_handler, wr_handler, NULL, co);
+-}
+-
+-static coroutine_fn void clear_fd_handler(BDRVSSHState *s,
+-                                          BlockDriverState *bs)
+-{
+-    DPRINTF("s->sock=%d", s->sock);
+-    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock,
+-                       false, NULL, NULL, NULL, NULL);
+-}
+-
+-/* A non-blocking call returned EAGAIN, so yield, ensuring the
+- * handlers are set up so that we'll be rescheduled when there is an
+- * interesting event on the socket.
+- */
+-static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs)
+-{
+-    set_fd_handler(s, bs);
+     qemu_coroutine_yield();
+-    clear_fd_handler(s, bs);
++    DPRINTF("s->sock=%d - back", s->sock);
++    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, false,
++                       NULL, NULL, NULL, NULL);
+ }
+ /* SFTP has a function `libssh2_sftp_seek64' which seeks to a position
+diff --git a/block/win32-aio.c b/block/win32-aio.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/win32-aio.c
++++ b/block/win32-aio.c
+@@ -XXX,XX +XXX,XX @@ struct QEMUWin32AIOState {
+     HANDLE hIOCP;
+     EventNotifier e;
+     int count;
+-    bool is_aio_context_attached;
++    AioContext *aio_ctx;
+ };
+ typedef struct QEMUWin32AIOCB {
+@@ -XXX,XX +XXX,XX @@ static void win32_aio_process_completion(QEMUWin32AIOState *s,
+     }
++    aio_context_acquire(s->aio_ctx);
+     waiocb->common.cb(waiocb->common.opaque, ret);
++    aio_context_release(s->aio_ctx);
+     qemu_aio_unref(waiocb);
+ }
+@@ -XXX,XX +XXX,XX @@ void win32_aio_detach_aio_context(QEMUWin32AIOState *aio,
+                                   AioContext *old_context)
+ {
+     aio_set_event_notifier(old_context, &aio->e, false, NULL, NULL);
+-    aio->is_aio_context_attached = false;
++    aio->aio_ctx = NULL;
+ }
+ void win32_aio_attach_aio_context(QEMUWin32AIOState *aio,
+                                   AioContext *new_context)
+ {
+-    aio->is_aio_context_attached = true;
++    aio->aio_ctx = new_context;
+     aio_set_event_notifier(new_context, &aio->e, false,
+                            win32_aio_completion_cb, NULL);
+ }
+@@ -XXX,XX +XXX,XX @@ out_free_state:
+ void win32_aio_cleanup(QEMUWin32AIOState *aio)
+ {
+-    assert(!aio->is_aio_context_attached);
++    assert(!aio->aio_ctx);
+     CloseHandle(aio->hIOCP);
+     event_notifier_cleanup(&aio->e);
+     g_free(aio);
+diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/block/virtio-blk.c
++++ b/hw/block/virtio-blk.c
+@@ -XXX,XX +XXX,XX @@ static void virtio_blk_ioctl_complete(void *opaque, int status)
+ {
+     VirtIOBlockIoctlReq *ioctl_req = opaque;
+     VirtIOBlockReq *req = ioctl_req->req;
+-    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
++    VirtIOBlock *s = req->dev;
++    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+     struct virtio_scsi_inhdr *scsi;
+     struct sg_io_hdr *hdr;
+@@ -XXX,XX +XXX,XX @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
+     MultiReqBuffer mrb = {};
+     bool progress = false;
++    aio_context_acquire(blk_get_aio_context(s->blk));
+     blk_io_plug(s->blk);
+     do {
+@@ -XXX,XX +XXX,XX @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
+     }
+     blk_io_unplug(s->blk);
++    aio_context_release(blk_get_aio_context(s->blk));
+     return progress;
+ }
+diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/scsi/virtio-scsi.c
++++ b/hw/scsi/virtio-scsi.c
+@@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq)
+     VirtIOSCSIReq *req;
+     bool progress = false;
++    virtio_scsi_acquire(s);
+     while ((req = virtio_scsi_pop_req(s, vq))) {
+         progress = true;
+         virtio_scsi_handle_ctrl_req(s, req);
+     }
++    virtio_scsi_release(s);
+     return progress;
+ }
+@@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
+     QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs);
++    virtio_scsi_acquire(s);
+     do {
+         virtio_queue_set_notification(vq, 0);
+@@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
+     QTAILQ_FOREACH_SAFE(req, &reqs, next, next) {
+         virtio_scsi_handle_cmd_req_submit(s, req);
+     }
++    virtio_scsi_release(s);
+     return progress;
+ }
+@@ -XXX,XX +XXX,XX @@ out:
+ bool virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq)
+ {
++    virtio_scsi_acquire(s);
+     if (s->events_dropped) {
+         virtio_scsi_push_event(s, NULL, VIRTIO_SCSI_T_NO_EVENT, 0);
++        virtio_scsi_release(s);
+         return true;
+     }
++    virtio_scsi_release(s);
+     return false;
+ }
+diff --git a/util/aio-posix.c b/util/aio-posix.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/aio-posix.c
++++ b/util/aio-posix.c
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
+             (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
+             aio_node_check(ctx, node->is_external) &&
+             node->io_read) {
+-            aio_context_acquire(ctx);
+             node->io_read(node->opaque);
+-            aio_context_release(ctx);
+             /* aio_notify() does not count as progress */
+             if (node->opaque != &ctx->notifier) {
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
+             (revents & (G_IO_OUT | G_IO_ERR)) &&
+             aio_node_check(ctx, node->is_external) &&
+             node->io_write) {
+-            aio_context_acquire(ctx);
+             node->io_write(node->opaque);
+-            aio_context_release(ctx);
+             progress = true;
+         }
+@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
+         start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+     }
+-    aio_context_acquire(ctx);
+     progress = try_poll_mode(ctx, blocking);
+-    aio_context_release(ctx);
+-
+     if (!progress) {
+         assert(npfd == 0);
+diff --git a/util/aio-win32.c b/util/aio-win32.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/aio-win32.c
++++ b/util/aio-win32.c
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
+             (revents || event_notifier_get_handle(node->e) == event) &&
+             node->io_notify) {
+             node->pfd.revents = 0;
+-            aio_context_acquire(ctx);
+             node->io_notify(node->e);
+-            aio_context_release(ctx);
+             /* aio_notify() does not count as progress */
+             if (node->e != &ctx->notifier) {
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
+             (node->io_read || node->io_write)) {
+             node->pfd.revents = 0;
+             if ((revents & G_IO_IN) && node->io_read) {
+-                aio_context_acquire(ctx);
+                 node->io_read(node->opaque);
+-                aio_context_release(ctx);
+                 progress = true;
+             }
+             if ((revents & G_IO_OUT) && node->io_write) {
+-                aio_context_acquire(ctx);
+                 node->io_write(node->opaque);
+-                aio_context_release(ctx);
+                 progress = true;
+             }
+--
+.9.3

-New patch
+[Qemu-devel] [PULL v2 14/24] block: explicitly acquire aiocontext in bottom halves that need it
+From: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
+Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
+Message-id: 20170213135235.12274-15-pbonzini@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ block/archipelago.c   |  3 +++
+ block/blkreplay.c     |  2 +-
+ block/block-backend.c |  6 ++++++
+ block/curl.c          | 26 ++++++++++++++++++--------
+ block/gluster.c       |  9 +--------
+ block/io.c            |  6 +++++-
+ block/iscsi.c         |  6 +++++-
+ block/linux-aio.c     | 15 +++++++++------
+ block/nfs.c           |  3 ++-
+ block/null.c          |  4 ++++
+ block/qed.c           |  3 +++
+ block/rbd.c           |  4 ++++
+ dma-helpers.c         |  2 ++
+ hw/block/virtio-blk.c |  2 ++
+ hw/scsi/scsi-bus.c    |  2 ++
+ util/async.c          |  4 ++--
+ util/thread-pool.c    |  2 ++
+ 17 files changed, 71 insertions(+), 28 deletions(-)
+diff --git a/block/archipelago.c b/block/archipelago.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/archipelago.c
++++ b/block/archipelago.c
+@@ -XXX,XX +XXX,XX @@ static void qemu_archipelago_complete_aio(void *opaque)
+ {
+     AIORequestData *reqdata = (AIORequestData *) opaque;
+     ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb;
++    AioContext *ctx = bdrv_get_aio_context(aio_cb->common.bs);
++    aio_context_acquire(ctx);
+     aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret);
++    aio_context_release(ctx);
+     aio_cb->status = 0;
+     qemu_aio_unref(aio_cb);
+diff --git a/block/blkreplay.c b/block/blkreplay.c
+index XXXXXXX..XXXXXXX 100755
+--- a/block/blkreplay.c
++++ b/block/blkreplay.c
+@@ -XXX,XX +XXX,XX @@ static int64_t blkreplay_getlength(BlockDriverState *bs)
+ static void blkreplay_bh_cb(void *opaque)
+ {
+     Request *req = opaque;
+-    qemu_coroutine_enter(req->co);
++    aio_co_wake(req->co);
+     qemu_bh_delete(req->bh);
+     g_free(req);
+ }
+diff --git a/block/block-backend.c b/block/block-backend.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/block-backend.c
++++ b/block/block-backend.c
+@@ -XXX,XX +XXX,XX @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
+ static void error_callback_bh(void *opaque)
+ {
+     struct BlockBackendAIOCB *acb = opaque;
++    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
+     bdrv_dec_in_flight(acb->common.bs);
++    aio_context_acquire(ctx);
+     acb->common.cb(acb->common.opaque, acb->ret);
++    aio_context_release(ctx);
+     qemu_aio_unref(acb);
+ }
+@@ -XXX,XX +XXX,XX @@ static void blk_aio_complete(BlkAioEmAIOCB *acb)
+ static void blk_aio_complete_bh(void *opaque)
+ {
+     BlkAioEmAIOCB *acb = opaque;
++    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
+     assert(acb->has_returned);
++    aio_context_acquire(ctx);
+     blk_aio_complete(acb);
++    aio_context_release(ctx);
+ }
+ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
+diff --git a/block/curl.c b/block/curl.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/curl.c
++++ b/block/curl.c
+@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
+ {
+     CURLState *state;
+     int running;
++    int ret = -EINPROGRESS;
+     CURLAIOCB *acb = p;
+-    BDRVCURLState *s = acb->common.bs->opaque;
++    BlockDriverState *bs = acb->common.bs;
++    BDRVCURLState *s = bs->opaque;
++    AioContext *ctx = bdrv_get_aio_context(bs);
+     size_t start = acb->sector_num * BDRV_SECTOR_SIZE;
+     size_t end;
++    aio_context_acquire(ctx);
++
+     // In case we have the requested data already (e.g. read-ahead),
+     // we can just call the callback and be done.
+     switch (curl_find_buf(s, start, acb->nb_sectors * BDRV_SECTOR_SIZE, acb)) {
+@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
+             qemu_aio_unref(acb);
+             // fall through
+         case FIND_RET_WAIT:
+-            return;
++            goto out;
+         default:
+             break;
+     }
+@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
+     // No cache found, so let's start a new request
+     state = curl_init_state(acb->common.bs, s);
+     if (!state) {
+-        acb->common.cb(acb->common.opaque, -EIO);
+-        qemu_aio_unref(acb);
+-        return;
++        ret = -EIO;
++        goto out;
+     }
+     acb->start = 0;
+@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
+     state->orig_buf = g_try_malloc(state->buf_len);
+     if (state->buf_len && state->orig_buf == NULL) {
+         curl_clean_state(state);
+-        acb->common.cb(acb->common.opaque, -ENOMEM);
+-        qemu_aio_unref(acb);
+-        return;
++        ret = -ENOMEM;
++        goto out;
+     }
+     state->acb[0] = acb;
+@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
+     /* Tell curl it needs to kick things off */
+     curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
++
++out:
++    if (ret != -EINPROGRESS) {
++        acb->common.cb(acb->common.opaque, ret);
++        qemu_aio_unref(acb);
++    }
++    aio_context_release(ctx);
+ }
+ static BlockAIOCB *curl_aio_readv(BlockDriverState *bs,
+diff --git a/block/gluster.c b/block/gluster.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/gluster.c
++++ b/block/gluster.c
+@@ -XXX,XX +XXX,XX @@ static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf,
+     return qemu_gluster_glfs_init(gconf, errp);
+ }
+-static void qemu_gluster_complete_aio(void *opaque)
+-{
+-    GlusterAIOCB *acb = (GlusterAIOCB *)opaque;
+-
+-    qemu_coroutine_enter(acb->coroutine);
+-}
+-
+ /*
+  * AIO callback routine called from GlusterFS thread.
+  */
+@@ -XXX,XX +XXX,XX @@ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
+         acb->ret = -EIO; /* Partial read/write - fail it */
+     }
+-    aio_bh_schedule_oneshot(acb->aio_context, qemu_gluster_complete_aio, acb);
++    aio_co_schedule(acb->aio_context, acb->coroutine);
+ }
+ static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags)
+diff --git a/block/io.c b/block/io.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/io.c
++++ b/block/io.c
+@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
+     bdrv_dec_in_flight(bs);
+     bdrv_drained_begin(bs);
+     data->done = true;
+-    qemu_coroutine_enter(co);
++    aio_co_wake(co);
+ }
+ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
+@@ -XXX,XX +XXX,XX @@ static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
+ static void bdrv_co_em_bh(void *opaque)
+ {
+     BlockAIOCBCoroutine *acb = opaque;
++    BlockDriverState *bs = acb->common.bs;
++    AioContext *ctx = bdrv_get_aio_context(bs);
+     assert(!acb->need_bh);
++    aio_context_acquire(ctx);
+     bdrv_co_complete(acb);
++    aio_context_release(ctx);
+ }
+ static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
+diff --git a/block/iscsi.c b/block/iscsi.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/iscsi.c
++++ b/block/iscsi.c
+@@ -XXX,XX +XXX,XX @@ static void
+ iscsi_bh_cb(void *p)
+ {
+     IscsiAIOCB *acb = p;
++    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
+     qemu_bh_delete(acb->bh);
+     g_free(acb->buf);
+     acb->buf = NULL;
++    aio_context_acquire(ctx);
+     acb->common.cb(acb->common.opaque, acb->status);
++    aio_context_release(ctx);
+     if (acb->task != NULL) {
+         scsi_free_scsi_task(acb->task);
+@@ -XXX,XX +XXX,XX @@ iscsi_schedule_bh(IscsiAIOCB *acb)
+ static void iscsi_co_generic_bh_cb(void *opaque)
+ {
+     struct IscsiTask *iTask = opaque;
++
+     iTask->complete = 1;
+-    qemu_coroutine_enter(iTask->co);
++    aio_co_wake(iTask->co);
+ }
+ static void iscsi_retry_timer_expired(void *opaque)
+diff --git a/block/linux-aio.c b/block/linux-aio.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/linux-aio.c
++++ b/block/linux-aio.c
+@@ -XXX,XX +XXX,XX @@ struct LinuxAioState {
+     io_context_t ctx;
+     EventNotifier e;
+-    /* io queue for submit at batch */
++    /* io queue for submit at batch.  Protected by AioContext lock. */
+     LaioQueue io_q;
+-    /* I/O completion processing */
++    /* I/O completion processing.  Only runs in I/O thread.  */
+     QEMUBH *completion_bh;
+     int event_idx;
+     int event_max;
+@@ -XXX,XX +XXX,XX @@ static inline ssize_t io_event_ret(struct io_event *ev)
+  */
+ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
+ {
++    LinuxAioState *s = laiocb->ctx;
+     int ret;
+     ret = laiocb->ret;
+@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
+     }
+     laiocb->ret = ret;
++    aio_context_acquire(s->aio_context);
+     if (laiocb->co) {
+         /* If the coroutine is already entered it must be in ioq_submit() and
+          * will notice laio->ret has been filled in when it eventually runs
+@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
+         laiocb->common.cb(laiocb->common.opaque, ret);
+         qemu_aio_unref(laiocb);
+     }
++    aio_context_release(s->aio_context);
+ }
+ /**
+@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completions(LinuxAioState *s)
+ static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
+ {
+     qemu_laio_process_completions(s);
++
++    aio_context_acquire(s->aio_context);
+     if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
+         ioq_submit(s);
+     }
++    aio_context_release(s->aio_context);
+ }
+ static void qemu_laio_completion_bh(void *opaque)
+@@ -XXX,XX +XXX,XX @@ static void qemu_laio_completion_cb(EventNotifier *e)
+     LinuxAioState *s = container_of(e, LinuxAioState, e);
+     if (event_notifier_test_and_clear(&s->e)) {
+-        aio_context_acquire(s->aio_context);
+         qemu_laio_process_completions_and_submit(s);
+-        aio_context_release(s->aio_context);
+     }
+ }
+@@ -XXX,XX +XXX,XX @@ static bool qemu_laio_poll_cb(void *opaque)
+         return false;
+     }
+-    aio_context_acquire(s->aio_context);
+     qemu_laio_process_completions_and_submit(s);
+-    aio_context_release(s->aio_context);
+     return true;
+ }
+@@ -XXX,XX +XXX,XX @@ void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
+ {
+     aio_set_event_notifier(old_context, &s->e, false, NULL, NULL);
+     qemu_bh_delete(s->completion_bh);
++    s->aio_context = NULL;
+ }
+ void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
+diff --git a/block/nfs.c b/block/nfs.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/nfs.c
++++ b/block/nfs.c
+@@ -XXX,XX +XXX,XX @@ static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
+ static void nfs_co_generic_bh_cb(void *opaque)
+ {
+     NFSRPC *task = opaque;
++
+     task->complete = 1;
+-    qemu_coroutine_enter(task->co);
++    aio_co_wake(task->co);
+ }
+ static void
+diff --git a/block/null.c b/block/null.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/null.c
++++ b/block/null.c
+@@ -XXX,XX +XXX,XX @@ static const AIOCBInfo null_aiocb_info = {
+ static void null_bh_cb(void *opaque)
+ {
+     NullAIOCB *acb = opaque;
++    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
++
++    aio_context_acquire(ctx);
+     acb->common.cb(acb->common.opaque, 0);
++    aio_context_release(ctx);
+     qemu_aio_unref(acb);
+ }
+diff --git a/block/qed.c b/block/qed.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/qed.c
++++ b/block/qed.c
+@@ -XXX,XX +XXX,XX @@ static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
+ static void qed_aio_complete_bh(void *opaque)
+ {
+     QEDAIOCB *acb = opaque;
++    BDRVQEDState *s = acb_to_s(acb);
+     BlockCompletionFunc *cb = acb->common.cb;
+     void *user_opaque = acb->common.opaque;
+     int ret = acb->bh_ret;
+@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete_bh(void *opaque)
+     qemu_aio_unref(acb);
+     /* Invoke callback */
++    qed_acquire(s);
+     cb(user_opaque, ret);
++    qed_release(s);
+ }
+ static void qed_aio_complete(QEDAIOCB *acb, int ret)
+diff --git a/block/rbd.c b/block/rbd.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/rbd.c
++++ b/block/rbd.c
+@@ -XXX,XX +XXX,XX @@ shutdown:
+ static void qemu_rbd_complete_aio(RADOSCB *rcb)
+ {
+     RBDAIOCB *acb = rcb->acb;
++    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
+     int64_t r;
+     r = rcb->ret;
+@@ -XXX,XX +XXX,XX @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
+         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
+     }
+     qemu_vfree(acb->bounce);
++
++    aio_context_acquire(ctx);
+     acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
++    aio_context_release(ctx);
+     qemu_aio_unref(acb);
+ }
+diff --git a/dma-helpers.c b/dma-helpers.c
+index XXXXXXX..XXXXXXX 100644
+--- a/dma-helpers.c
++++ b/dma-helpers.c
+@@ -XXX,XX +XXX,XX @@ static void dma_blk_cb(void *opaque, int ret)
+                                 QEMU_ALIGN_DOWN(dbs->iov.size, dbs->align));
+     }
++    aio_context_acquire(dbs->ctx);
+     dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
+                             dma_blk_cb, dbs, dbs->io_func_opaque);
++    aio_context_release(dbs->ctx);
+     assert(dbs->acb);
+ }
+diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/block/virtio-blk.c
++++ b/hw/block/virtio-blk.c
+@@ -XXX,XX +XXX,XX @@ static void virtio_blk_dma_restart_bh(void *opaque)
+     s->rq = NULL;
++    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+     while (req) {
+         VirtIOBlockReq *next = req->next;
+         if (virtio_blk_handle_request(req, &mrb)) {
+@@ -XXX,XX +XXX,XX @@ static void virtio_blk_dma_restart_bh(void *opaque)
+     if (mrb.num_reqs) {
+         virtio_blk_submit_multireq(s->blk, &mrb);
+     }
++    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+ }
+ static void virtio_blk_dma_restart_cb(void *opaque, int running,
+diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/scsi/scsi-bus.c
++++ b/hw/scsi/scsi-bus.c
+@@ -XXX,XX +XXX,XX @@ static void scsi_dma_restart_bh(void *opaque)
+     qemu_bh_delete(s->bh);
+     s->bh = NULL;
++    aio_context_acquire(blk_get_aio_context(s->conf.blk));
+     QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) {
+         scsi_req_ref(req);
+         if (req->retry) {
+@@ -XXX,XX +XXX,XX @@ static void scsi_dma_restart_bh(void *opaque)
+         }
+         scsi_req_unref(req);
+     }
++    aio_context_release(blk_get_aio_context(s->conf.blk));
+ }
+ void scsi_req_retry(SCSIRequest *req)
+diff --git a/util/async.c b/util/async.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/async.c
++++ b/util/async.c
+@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
+                 ret = 1;
+             }
+             bh->idle = 0;
+-            aio_context_acquire(ctx);
+             aio_bh_call(bh);
+-            aio_context_release(ctx);
+         }
+         if (bh->deleted) {
+             deleted = true;
+@@ -XXX,XX +XXX,XX @@ static void co_schedule_bh_cb(void *opaque)
+         Coroutine *co = QSLIST_FIRST(&straight);
+         QSLIST_REMOVE_HEAD(&straight, co_scheduled_next);
+         trace_aio_co_schedule_bh_cb(ctx, co);
++        aio_context_acquire(ctx);
+         qemu_coroutine_enter(co);
++        aio_context_release(ctx);
+     }
+ }
+diff --git a/util/thread-pool.c b/util/thread-pool.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/thread-pool.c
++++ b/util/thread-pool.c
+@@ -XXX,XX +XXX,XX @@ static void thread_pool_completion_bh(void *opaque)
+     ThreadPool *pool = opaque;
+     ThreadPoolElement *elem, *next;
++    aio_context_acquire(pool->ctx);
+ restart:
+     QLIST_FOREACH_SAFE(elem, &pool->head, all, next) {
+         if (elem->state != THREAD_DONE) {
+@@ -XXX,XX +XXX,XX @@ restart:
+             qemu_aio_unref(elem);
+         }
+     }
++    aio_context_release(pool->ctx);
+ }
+ static void thread_pool_cancel(BlockAIOCB *acb)
+--
+.9.3

-New patch
+[Qemu-devel] [PULL v2 15/24] block: explicitly acquire aiocontext in aio callbacks that need it
+From: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
+Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
+Message-id: 20170213135235.12274-16-pbonzini@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ block/archipelago.c    |  3 ---
+ block/block-backend.c  |  7 -------
+ block/curl.c           |  2 +-
+ block/io.c             |  6 +-----
+ block/iscsi.c          |  3 ---
+ block/linux-aio.c      |  5 +----
+ block/mirror.c         | 12 +++++++++---
+ block/null.c           |  8 --------
+ block/qed-cluster.c    |  2 ++
+ block/qed-table.c      | 12 ++++++++++--
+ block/qed.c            |  4 ++--
+ block/rbd.c            |  4 ----
+ block/win32-aio.c      |  3 ---
+ hw/block/virtio-blk.c  | 12 +++++++++++-
+ hw/scsi/scsi-disk.c    | 15 +++++++++++++++
+ hw/scsi/scsi-generic.c | 20 +++++++++++++++++---
+ util/thread-pool.c     |  4 +++-
+ 17 files changed, 72 insertions(+), 50 deletions(-)
+diff --git a/block/archipelago.c b/block/archipelago.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/archipelago.c
++++ b/block/archipelago.c
+@@ -XXX,XX +XXX,XX @@ static void qemu_archipelago_complete_aio(void *opaque)
+ {
+     AIORequestData *reqdata = (AIORequestData *) opaque;
+     ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb;
+-    AioContext *ctx = bdrv_get_aio_context(aio_cb->common.bs);
+-    aio_context_acquire(ctx);
+     aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret);
+-    aio_context_release(ctx);
+     aio_cb->status = 0;
+     qemu_aio_unref(aio_cb);
+diff --git a/block/block-backend.c b/block/block-backend.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/block-backend.c
++++ b/block/block-backend.c
+@@ -XXX,XX +XXX,XX @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
+ static void error_callback_bh(void *opaque)
+ {
+     struct BlockBackendAIOCB *acb = opaque;
+-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
+     bdrv_dec_in_flight(acb->common.bs);
+-    aio_context_acquire(ctx);
+     acb->common.cb(acb->common.opaque, acb->ret);
+-    aio_context_release(ctx);
+     qemu_aio_unref(acb);
+ }
+@@ -XXX,XX +XXX,XX @@ static void blk_aio_complete(BlkAioEmAIOCB *acb)
+ static void blk_aio_complete_bh(void *opaque)
+ {
+     BlkAioEmAIOCB *acb = opaque;
+-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
+-
+     assert(acb->has_returned);
+-    aio_context_acquire(ctx);
+     blk_aio_complete(acb);
+-    aio_context_release(ctx);
+ }
+ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
+diff --git a/block/curl.c b/block/curl.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/curl.c
++++ b/block/curl.c
+@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
+     curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
+ out:
++    aio_context_release(ctx);
+     if (ret != -EINPROGRESS) {
+         acb->common.cb(acb->common.opaque, ret);
+         qemu_aio_unref(acb);
+     }
+-    aio_context_release(ctx);
+ }
+ static BlockAIOCB *curl_aio_readv(BlockDriverState *bs,
+diff --git a/block/io.c b/block/io.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/io.c
++++ b/block/io.c
+@@ -XXX,XX +XXX,XX @@ static void bdrv_co_io_em_complete(void *opaque, int ret)
+     CoroutineIOCompletion *co = opaque;
+     co->ret = ret;
+-    qemu_coroutine_enter(co->coroutine);
++    aio_co_wake(co->coroutine);
+ }
+ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
+@@ -XXX,XX +XXX,XX @@ static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
+ static void bdrv_co_em_bh(void *opaque)
+ {
+     BlockAIOCBCoroutine *acb = opaque;
+-    BlockDriverState *bs = acb->common.bs;
+-    AioContext *ctx = bdrv_get_aio_context(bs);
+     assert(!acb->need_bh);
+-    aio_context_acquire(ctx);
+     bdrv_co_complete(acb);
+-    aio_context_release(ctx);
+ }
+ static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
+diff --git a/block/iscsi.c b/block/iscsi.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/iscsi.c
++++ b/block/iscsi.c
+@@ -XXX,XX +XXX,XX @@ static void
+ iscsi_bh_cb(void *p)
+ {
+     IscsiAIOCB *acb = p;
+-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
+     qemu_bh_delete(acb->bh);
+     g_free(acb->buf);
+     acb->buf = NULL;
+-    aio_context_acquire(ctx);
+     acb->common.cb(acb->common.opaque, acb->status);
+-    aio_context_release(ctx);
+     if (acb->task != NULL) {
+         scsi_free_scsi_task(acb->task);
+diff --git a/block/linux-aio.c b/block/linux-aio.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/linux-aio.c
++++ b/block/linux-aio.c
+@@ -XXX,XX +XXX,XX @@ static inline ssize_t io_event_ret(struct io_event *ev)
+  */
+ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
+ {
+-    LinuxAioState *s = laiocb->ctx;
+     int ret;
+     ret = laiocb->ret;
+@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
+     }
+     laiocb->ret = ret;
+-    aio_context_acquire(s->aio_context);
+     if (laiocb->co) {
+         /* If the coroutine is already entered it must be in ioq_submit() and
+          * will notice laio->ret has been filled in when it eventually runs
+@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
+          * that!
+          */
+         if (!qemu_coroutine_entered(laiocb->co)) {
+-            qemu_coroutine_enter(laiocb->co);
++            aio_co_wake(laiocb->co);
+         }
+     } else {
+         laiocb->common.cb(laiocb->common.opaque, ret);
+         qemu_aio_unref(laiocb);
+     }
+-    aio_context_release(s->aio_context);
+ }
+ /**
+diff --git a/block/mirror.c b/block/mirror.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/mirror.c
++++ b/block/mirror.c
+@@ -XXX,XX +XXX,XX @@ static void mirror_write_complete(void *opaque, int ret)
+ {
+     MirrorOp *op = opaque;
+     MirrorBlockJob *s = op->s;
++
++    aio_context_acquire(blk_get_aio_context(s->common.blk));
+     if (ret < 0) {
+         BlockErrorAction action;
+@@ -XXX,XX +XXX,XX @@ static void mirror_write_complete(void *opaque, int ret)
+         }
+     }
+     mirror_iteration_done(op, ret);
++    aio_context_release(blk_get_aio_context(s->common.blk));
+ }
+ static void mirror_read_complete(void *opaque, int ret)
+ {
+     MirrorOp *op = opaque;
+     MirrorBlockJob *s = op->s;
++
++    aio_context_acquire(blk_get_aio_context(s->common.blk));
+     if (ret < 0) {
+         BlockErrorAction action;
+@@ -XXX,XX +XXX,XX @@ static void mirror_read_complete(void *opaque, int ret)
+         }
+         mirror_iteration_done(op, ret);
+-        return;
++    } else {
++        blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
++                        0, mirror_write_complete, op);
+     }
+-    blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
+-                    0, mirror_write_complete, op);
++    aio_context_release(blk_get_aio_context(s->common.blk));
+ }
+ static inline void mirror_clip_sectors(MirrorBlockJob *s,
+diff --git a/block/null.c b/block/null.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/null.c
++++ b/block/null.c
+@@ -XXX,XX +XXX,XX @@ static const AIOCBInfo null_aiocb_info = {
+ static void null_bh_cb(void *opaque)
+ {
+     NullAIOCB *acb = opaque;
+-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
+-
+-    aio_context_acquire(ctx);
+     acb->common.cb(acb->common.opaque, 0);
+-    aio_context_release(ctx);
+     qemu_aio_unref(acb);
+ }
+ static void null_timer_cb(void *opaque)
+ {
+     NullAIOCB *acb = opaque;
+-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
+-
+-    aio_context_acquire(ctx);
+     acb->common.cb(acb->common.opaque, 0);
+-    aio_context_release(ctx);
+     timer_deinit(&acb->timer);
+     qemu_aio_unref(acb);
+ }
+diff --git a/block/qed-cluster.c b/block/qed-cluster.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/qed-cluster.c
++++ b/block/qed-cluster.c
+@@ -XXX,XX +XXX,XX @@ static void qed_find_cluster_cb(void *opaque, int ret)
+     unsigned int index;
+     unsigned int n;
++    qed_acquire(s);
+     if (ret) {
+         goto out;
+     }
+@@ -XXX,XX +XXX,XX @@ static void qed_find_cluster_cb(void *opaque, int ret)
+ out:
+     find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len);
++    qed_release(s);
+     g_free(find_cluster_cb);
+ }
+diff --git a/block/qed-table.c b/block/qed-table.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/qed-table.c
++++ b/block/qed-table.c
+@@ -XXX,XX +XXX,XX @@ static void qed_read_table_cb(void *opaque, int ret)
+ {
+     QEDReadTableCB *read_table_cb = opaque;
+     QEDTable *table = read_table_cb->table;
++    BDRVQEDState *s = read_table_cb->s;
+     int noffsets = read_table_cb->qiov.size / sizeof(uint64_t);
+     int i;
+@@ -XXX,XX +XXX,XX @@ static void qed_read_table_cb(void *opaque, int ret)
+     }
+     /* Byteswap offsets */
++    qed_acquire(s);
+     for (i = 0; i < noffsets; i++) {
+         table->offsets[i] = le64_to_cpu(table->offsets[i]);
+     }
++    qed_release(s);
+ out:
+     /* Completion */
+-    trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret);
++    trace_qed_read_table_cb(s, read_table_cb->table, ret);
+     gencb_complete(&read_table_cb->gencb, ret);
+ }
+@@ -XXX,XX +XXX,XX @@ typedef struct {
+ static void qed_write_table_cb(void *opaque, int ret)
+ {
+     QEDWriteTableCB *write_table_cb = opaque;
++    BDRVQEDState *s = write_table_cb->s;
+-    trace_qed_write_table_cb(write_table_cb->s,
++    trace_qed_write_table_cb(s,
+                              write_table_cb->orig_table,
+                              write_table_cb->flush,
+                              ret);
+@@ -XXX,XX +XXX,XX @@ static void qed_write_table_cb(void *opaque, int ret)
+     if (write_table_cb->flush) {
+         /* We still need to flush first */
+         write_table_cb->flush = false;
++        qed_acquire(s);
+         bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb,
+                        write_table_cb);
++        qed_release(s);
+         return;
+     }
+@@ -XXX,XX +XXX,XX @@ static void qed_read_l2_table_cb(void *opaque, int ret)
+     CachedL2Table *l2_table = request->l2_table;
+     uint64_t l2_offset = read_l2_table_cb->l2_offset;
++    qed_acquire(s);
+     if (ret) {
+         /* can't trust loaded L2 table anymore */
+         qed_unref_l2_cache_entry(l2_table);
+@@ -XXX,XX +XXX,XX @@ static void qed_read_l2_table_cb(void *opaque, int ret)
+         request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
+         assert(request->l2_table != NULL);
+     }
++    qed_release(s);
+     gencb_complete(&read_l2_table_cb->gencb, ret);
+ }
+diff --git a/block/qed.c b/block/qed.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/qed.c
++++ b/block/qed.c
+@@ -XXX,XX +XXX,XX @@ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t l
+     }
+     if (cb->co) {
+-        qemu_coroutine_enter(cb->co);
++        aio_co_wake(cb->co);
+     }
+ }
+@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret)
+     cb->done = true;
+     cb->ret = ret;
+     if (cb->co) {
+-        qemu_coroutine_enter(cb->co);
++        aio_co_wake(cb->co);
+     }
+ }
+diff --git a/block/rbd.c b/block/rbd.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/rbd.c
++++ b/block/rbd.c
+@@ -XXX,XX +XXX,XX @@ shutdown:
+ static void qemu_rbd_complete_aio(RADOSCB *rcb)
+ {
+     RBDAIOCB *acb = rcb->acb;
+-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
+     int64_t r;
+     r = rcb->ret;
+@@ -XXX,XX +XXX,XX @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
+         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
+     }
+     qemu_vfree(acb->bounce);
+-
+-    aio_context_acquire(ctx);
+     acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
+-    aio_context_release(ctx);
+     qemu_aio_unref(acb);
+ }
+diff --git a/block/win32-aio.c b/block/win32-aio.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/win32-aio.c
++++ b/block/win32-aio.c
+@@ -XXX,XX +XXX,XX @@ static void win32_aio_process_completion(QEMUWin32AIOState *s,
+         qemu_vfree(waiocb->buf);
+     }
+-
+-    aio_context_acquire(s->aio_ctx);
+     waiocb->common.cb(waiocb->common.opaque, ret);
+-    aio_context_release(s->aio_ctx);
+     qemu_aio_unref(waiocb);
+ }
+diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/block/virtio-blk.c
++++ b/hw/block/virtio-blk.c
+@@ -XXX,XX +XXX,XX @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
+ static void virtio_blk_rw_complete(void *opaque, int ret)
+ {
+     VirtIOBlockReq *next = opaque;
++    VirtIOBlock *s = next->dev;
++    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+     while (next) {
+         VirtIOBlockReq *req = next;
+         next = req->mr_next;
+@@ -XXX,XX +XXX,XX @@ static void virtio_blk_rw_complete(void *opaque, int ret)
+         block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
+         virtio_blk_free_request(req);
+     }
++    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+ }
+ static void virtio_blk_flush_complete(void *opaque, int ret)
+ {
+     VirtIOBlockReq *req = opaque;
++    VirtIOBlock *s = req->dev;
++    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+     if (ret) {
+         if (virtio_blk_handle_rw_error(req, -ret, 0)) {
+-            return;
++            goto out;
+         }
+     }
+     virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
+     block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
+     virtio_blk_free_request(req);
++
++out:
++    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+ }
+ #ifdef __linux__
+@@ -XXX,XX +XXX,XX @@ static void virtio_blk_ioctl_complete(void *opaque, int status)
+     virtio_stl_p(vdev, &scsi->data_len, hdr->dxfer_len);
+ out:
++    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+     virtio_blk_req_complete(req, status);
+     virtio_blk_free_request(req);
++    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+     g_free(ioctl_req);
+ }
+diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/scsi/scsi-disk.c
++++ b/hw/scsi/scsi-disk.c
+@@ -XXX,XX +XXX,XX @@ static void scsi_aio_complete(void *opaque, int ret)
+     assert(r->req.aiocb != NULL);
+     r->req.aiocb = NULL;
++    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
+     if (scsi_disk_req_check_error(r, ret, true)) {
+         goto done;
+     }
+@@ -XXX,XX +XXX,XX @@ static void scsi_aio_complete(void *opaque, int ret)
+     scsi_req_complete(&r->req, GOOD);
+ done:
++    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
+     scsi_req_unref(&r->req);
+ }
+@@ -XXX,XX +XXX,XX @@ static void scsi_dma_complete(void *opaque, int ret)
+     assert(r->req.aiocb != NULL);
+     r->req.aiocb = NULL;
++    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
+     if (ret < 0) {
+         block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
+     } else {
+         block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
+     }
+     scsi_dma_complete_noio(r, ret);
++    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
+ }
+ static void scsi_read_complete(void * opaque, int ret)
+@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
+     assert(r->req.aiocb != NULL);
+     r->req.aiocb = NULL;
++    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
+     if (scsi_disk_req_check_error(r, ret, true)) {
+         goto done;
+     }
+@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
+ done:
+     scsi_req_unref(&r->req);
++    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
+ }
+ /* Actually issue a read to the block device.  */
+@@ -XXX,XX +XXX,XX @@ static void scsi_do_read_cb(void *opaque, int ret)
+     assert (r->req.aiocb != NULL);
+     r->req.aiocb = NULL;
++    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
+     if (ret < 0) {
+         block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
+     } else {
+         block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
+     }
+     scsi_do_read(opaque, ret);
++    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
+ }
+ /* Read more data from scsi device into buffer.  */
+@@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret)
+     assert (r->req.aiocb != NULL);
+     r->req.aiocb = NULL;
++    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
+     if (ret < 0) {
+         block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
+     } else {
+         block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
+     }
+     scsi_write_complete_noio(r, ret);
++    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
+ }
+ static void scsi_write_data(SCSIRequest *req)
+@@ -XXX,XX +XXX,XX @@ static void scsi_unmap_complete(void *opaque, int ret)
+ {
+     UnmapCBData *data = opaque;
+     SCSIDiskReq *r = data->r;
++    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+     assert(r->req.aiocb != NULL);
+     r->req.aiocb = NULL;
++    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
+     scsi_unmap_complete_noio(data, ret);
++    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
+ }
+ static void scsi_disk_emulate_unmap(SCSIDiskReq *r, uint8_t *inbuf)
+@@ -XXX,XX +XXX,XX @@ static void scsi_write_same_complete(void *opaque, int ret)
+     assert(r->req.aiocb != NULL);
+     r->req.aiocb = NULL;
++    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
+     if (scsi_disk_req_check_error(r, ret, true)) {
+         goto done;
+     }
+@@ -XXX,XX +XXX,XX @@ done:
+     scsi_req_unref(&r->req);
+     qemu_vfree(data->iov.iov_base);
+     g_free(data);
++    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
+ }
+ static void scsi_disk_emulate_write_same(SCSIDiskReq *r, uint8_t *inbuf)
+diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/scsi/scsi-generic.c
++++ b/hw/scsi/scsi-generic.c
+@@ -XXX,XX +XXX,XX @@ done:
+ static void scsi_command_complete(void *opaque, int ret)
+ {
+     SCSIGenericReq *r = (SCSIGenericReq *)opaque;
++    SCSIDevice *s = r->req.dev;
+     assert(r->req.aiocb != NULL);
+     r->req.aiocb = NULL;
++
++    aio_context_acquire(blk_get_aio_context(s->conf.blk));
+     scsi_command_complete_noio(r, ret);
++    aio_context_release(blk_get_aio_context(s->conf.blk));
+ }
+ static int execute_command(BlockBackend *blk,
+@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
+     assert(r->req.aiocb != NULL);
+     r->req.aiocb = NULL;
++    aio_context_acquire(blk_get_aio_context(s->conf.blk));
++
+     if (ret || r->req.io_canceled) {
+         scsi_command_complete_noio(r, ret);
+-        return;
++        goto done;
+     }
+     len = r->io_header.dxfer_len - r->io_header.resid;
+@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
+     r->len = -1;
+     if (len == 0) {
+         scsi_command_complete_noio(r, 0);
+-        return;
++        goto done;
+     }
+     /* Snoop READ CAPACITY output to set the blocksize.  */
+@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
+     }
+     scsi_req_data(&r->req, len);
+     scsi_req_unref(&r->req);
++
++done:
++    aio_context_release(blk_get_aio_context(s->conf.blk));
+ }
+ /* Read more data from scsi device into buffer.  */
+@@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret)
+     assert(r->req.aiocb != NULL);
+     r->req.aiocb = NULL;
++    aio_context_acquire(blk_get_aio_context(s->conf.blk));
++
+     if (ret || r->req.io_canceled) {
+         scsi_command_complete_noio(r, ret);
+-        return;
++        goto done;
+     }
+     if (r->req.cmd.buf[0] == MODE_SELECT && r->req.cmd.buf[4] == 12 &&
+@@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret)
+     }
+     scsi_command_complete_noio(r, ret);
++
++done:
++    aio_context_release(blk_get_aio_context(s->conf.blk));
+ }
+ /* Write data to a scsi device.  Returns nonzero on failure.
+diff --git a/util/thread-pool.c b/util/thread-pool.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/thread-pool.c
++++ b/util/thread-pool.c
+@@ -XXX,XX +XXX,XX @@ restart:
+              */
+             qemu_bh_schedule(pool->completion_bh);
++            aio_context_release(pool->ctx);
+             elem->common.cb(elem->common.opaque, elem->ret);
++            aio_context_acquire(pool->ctx);
+             qemu_aio_unref(elem);
+             goto restart;
+         } else {
+@@ -XXX,XX +XXX,XX @@ static void thread_pool_co_cb(void *opaque, int ret)
+     ThreadPoolCo *co = opaque;
+     co->ret = ret;
+-    qemu_coroutine_enter(co->co);
++    aio_co_wake(co->co);
+ }
+ int coroutine_fn thread_pool_submit_co(ThreadPool *pool, ThreadPoolFunc *func,
+--
+.9.3

-New patch
+[Qemu-devel] [PULL v2 16/24] aio-posix: partially inline aio_dispatch into aio_poll
+From: Paolo Bonzini <pbonzini@redhat.com>
+This patch prepares for the removal of unnecessary lockcnt inc/dec pairs.
+Extract the dispatching loop for file descriptor handlers into a new
+function aio_dispatch_handlers, and then inline aio_dispatch into
+aio_poll.
+aio_dispatch can now become void.
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
+Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
+Message-id: 20170213135235.12274-17-pbonzini@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ include/block/aio.h |  6 +-----
+ util/aio-posix.c    | 44 ++++++++++++++------------------------------
+ util/aio-win32.c    | 13 ++++---------
+ util/async.c        |  2 +-
+files changed, 20 insertions(+), 45 deletions(-)
+diff --git a/include/block/aio.h b/include/block/aio.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/block/aio.h
++++ b/include/block/aio.h
+@@ -XXX,XX +XXX,XX @@ bool aio_pending(AioContext *ctx);
+ /* Dispatch any pending callbacks from the GSource attached to the AioContext.
+  *
+  * This is used internally in the implementation of the GSource.
+- *
+- * @dispatch_fds: true to process fds, false to skip them
+- *                (can be used as an optimization by callers that know there
+- *                are no fds ready)
+  */
+-bool aio_dispatch(AioContext *ctx, bool dispatch_fds);
++void aio_dispatch(AioContext *ctx);
+ /* Progress in completing AIO work to occur.  This can issue new pending
+  * aio as a result of executing I/O completion or bh callbacks.
+diff --git a/util/aio-posix.c b/util/aio-posix.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/aio-posix.c
++++ b/util/aio-posix.c
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
+     AioHandler *node, *tmp;
+     bool progress = false;
+-    /*
+-     * We have to walk very carefully in case aio_set_fd_handler is
+-     * called while we're walking.
+-     */
+-    qemu_lockcnt_inc(&ctx->list_lock);
+-
+     QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
+         int revents;
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
+         }
+     }
+-    qemu_lockcnt_dec(&ctx->list_lock);
+     return progress;
+ }
+-/*
+- * Note that dispatch_fds == false has the side-effect of post-poning the
+- * freeing of deleted handlers.
+- */
+-bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
++void aio_dispatch(AioContext *ctx)
+ {
+-    bool progress;
++    aio_bh_poll(ctx);
+-    /*
+-     * If there are callbacks left that have been queued, we need to call them.
+-     * Do not call select in this case, because it is possible that the caller
+-     * does not need a complete flush (as is the case for aio_poll loops).
+-     */
+-    progress = aio_bh_poll(ctx);
++    qemu_lockcnt_inc(&ctx->list_lock);
++    aio_dispatch_handlers(ctx);
++    qemu_lockcnt_dec(&ctx->list_lock);
+-    if (dispatch_fds) {
+-        progress |= aio_dispatch_handlers(ctx);
+-    }
+-
+-    /* Run our timers */
+-    progress |= timerlistgroup_run_timers(&ctx->tlg);
+-
+-    return progress;
++    timerlistgroup_run_timers(&ctx->tlg);
+ }
+ /* These thread-local variables are used only in a small part of aio_poll
+@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
+     npfd = 0;
+     qemu_lockcnt_dec(&ctx->list_lock);
+-    /* Run dispatch even if there were no readable fds to run timers */
+-    if (aio_dispatch(ctx, ret > 0)) {
+-        progress = true;
++    progress |= aio_bh_poll(ctx);
++
++    if (ret > 0) {
++        qemu_lockcnt_inc(&ctx->list_lock);
++        progress |= aio_dispatch_handlers(ctx);
++        qemu_lockcnt_dec(&ctx->list_lock);
+     }
++    progress |= timerlistgroup_run_timers(&ctx->tlg);
++
+     return progress;
+ }
+diff --git a/util/aio-win32.c b/util/aio-win32.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/aio-win32.c
++++ b/util/aio-win32.c
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
+     return progress;
+ }
+-bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
++void aio_dispatch(AioContext *ctx)
+ {
+-    bool progress;
+-
+-    progress = aio_bh_poll(ctx);
+-    if (dispatch_fds) {
+-        progress |= aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
+-    }
+-    progress |= timerlistgroup_run_timers(&ctx->tlg);
+-    return progress;
++    aio_bh_poll(ctx);
++    aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
++    timerlistgroup_run_timers(&ctx->tlg);
+ }
+ bool aio_poll(AioContext *ctx, bool blocking)
+diff --git a/util/async.c b/util/async.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/async.c
++++ b/util/async.c
+@@ -XXX,XX +XXX,XX @@ aio_ctx_dispatch(GSource     *source,
+     AioContext *ctx = (AioContext *) source;
+     assert(callback == NULL);
+-    aio_dispatch(ctx, true);
++    aio_dispatch(ctx);
+     return true;
+ }
+--
+.9.3

-New patch
+[Qemu-devel] [PULL v2 17/24] async: remove unnecessary inc/dec pairs
+From: Paolo Bonzini <pbonzini@redhat.com>
+Pull the increment/decrement pair out of aio_bh_poll and into the
+callers.
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
+Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
+Message-id: 20170213135235.12274-18-pbonzini@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ util/aio-posix.c |  8 +++-----
+ util/aio-win32.c |  8 ++++----
+ util/async.c     | 12 ++++++------
+files changed, 13 insertions(+), 15 deletions(-)
+diff --git a/util/aio-posix.c b/util/aio-posix.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/aio-posix.c
++++ b/util/aio-posix.c
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
+ void aio_dispatch(AioContext *ctx)
+ {
++    qemu_lockcnt_inc(&ctx->list_lock);
+     aio_bh_poll(ctx);
+-
+-    qemu_lockcnt_inc(&ctx->list_lock);
+     aio_dispatch_handlers(ctx);
+     qemu_lockcnt_dec(&ctx->list_lock);
+@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
+     }
+     npfd = 0;
+-    qemu_lockcnt_dec(&ctx->list_lock);
+     progress |= aio_bh_poll(ctx);
+     if (ret > 0) {
+-        qemu_lockcnt_inc(&ctx->list_lock);
+         progress |= aio_dispatch_handlers(ctx);
+-        qemu_lockcnt_dec(&ctx->list_lock);
+     }
++    qemu_lockcnt_dec(&ctx->list_lock);
++
+     progress |= timerlistgroup_run_timers(&ctx->tlg);
+     return progress;
+diff --git a/util/aio-win32.c b/util/aio-win32.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/aio-win32.c
++++ b/util/aio-win32.c
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
+     bool progress = false;
+     AioHandler *tmp;
+-    qemu_lockcnt_inc(&ctx->list_lock);
+-
+     /*
+      * We have to walk very carefully in case aio_set_fd_handler is
+      * called while we're walking.
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
+         }
+     }
+-    qemu_lockcnt_dec(&ctx->list_lock);
+     return progress;
+ }
+ void aio_dispatch(AioContext *ctx)
+ {
++    qemu_lockcnt_inc(&ctx->list_lock);
+     aio_bh_poll(ctx);
+     aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
++    qemu_lockcnt_dec(&ctx->list_lock);
+     timerlistgroup_run_timers(&ctx->tlg);
+ }
+@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
+         }
+     }
+-    qemu_lockcnt_dec(&ctx->list_lock);
+     first = true;
+     /* ctx->notifier is always registered.  */
+@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
+         progress |= aio_dispatch_handlers(ctx, event);
+     } while (count > 0);
++    qemu_lockcnt_dec(&ctx->list_lock);
++
+     progress |= timerlistgroup_run_timers(&ctx->tlg);
+     return progress;
+ }
+diff --git a/util/async.c b/util/async.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/async.c
++++ b/util/async.c
+@@ -XXX,XX +XXX,XX @@ void aio_bh_call(QEMUBH *bh)
+     bh->cb(bh->opaque);
+ }
+-/* Multiple occurrences of aio_bh_poll cannot be called concurrently */
++/* Multiple occurrences of aio_bh_poll cannot be called concurrently.
++ * The count in ctx->list_lock is incremented before the call, and is
++ * not affected by the call.
++ */
+ int aio_bh_poll(AioContext *ctx)
+ {
+     QEMUBH *bh, **bhp, *next;
+     int ret;
+     bool deleted = false;
+-    qemu_lockcnt_inc(&ctx->list_lock);
+-
+     ret = 0;
+     for (bh = atomic_rcu_read(&ctx->first_bh); bh; bh = next) {
+         next = atomic_rcu_read(&bh->next);
+@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
+     /* remove deleted bhs */
+     if (!deleted) {
+-        qemu_lockcnt_dec(&ctx->list_lock);
+         return ret;
+     }
+-    if (qemu_lockcnt_dec_and_lock(&ctx->list_lock)) {
++    if (qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
+         bhp = &ctx->first_bh;
+         while (*bhp) {
+             bh = *bhp;
+@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
+                 bhp = &bh->next;
+             }
+         }
+-        qemu_lockcnt_unlock(&ctx->list_lock);
++        qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
+     }
+     return ret;
+ }
+--
+.9.3

-New patch
+[Qemu-devel] [PULL v2 18/24] block: document fields protected by AioContext lock
+From: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
+Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
+Message-id: 20170213135235.12274-19-pbonzini@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ include/block/block_int.h      | 64 +++++++++++++++++++++++++-----------------
+ include/sysemu/block-backend.h | 14 ++++++---
+files changed, 49 insertions(+), 29 deletions(-)
+diff --git a/include/block/block_int.h b/include/block/block_int.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/block/block_int.h
++++ b/include/block/block_int.h
+@@ -XXX,XX +XXX,XX @@ struct BdrvChild {
+  * copied as well.
+  */
+ struct BlockDriverState {
+-    int64_t total_sectors; /* if we are reading a disk image, give its
+-                              size in sectors */
++    /* Protected by big QEMU lock or read-only after opening.  No special
++     * locking needed during I/O...
++     */
+     int open_flags; /* flags used to open the file, re-used for re-open */
+     bool read_only; /* if true, the media is read only */
+     bool encrypted; /* if true, the media is encrypted */
+@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
+     bool sg;        /* if true, the device is a /dev/sg* */
+     bool probed;    /* if true, format was probed rather than specified */
+-    int copy_on_read; /* if nonzero, copy read backing sectors into image.
+-                         note this is a reference count */
+-
+-    CoQueue flush_queue;            /* Serializing flush queue */
+-    bool active_flush_req;          /* Flush request in flight? */
+-    unsigned int write_gen;         /* Current data generation */
+-    unsigned int flushed_gen;       /* Flushed write generation */
+-
+     BlockDriver *drv; /* NULL means no media */
+     void *opaque;
+@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
+     BdrvChild *backing;
+     BdrvChild *file;
+-    /* Callback before write request is processed */
+-    NotifierWithReturnList before_write_notifiers;
+-
+-    /* number of in-flight requests; overall and serialising */
+-    unsigned int in_flight;
+-    unsigned int serialising_in_flight;
+-
+-    bool wakeup;
+-
+-    /* Offset after the highest byte written to */
+-    uint64_t wr_highest_offset;
+-
+     /* I/O Limits */
+     BlockLimits bl;
+@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
+     QTAILQ_ENTRY(BlockDriverState) bs_list;
+     /* element of the list of monitor-owned BDS */
+     QTAILQ_ENTRY(BlockDriverState) monitor_list;
+-    QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
+     int refcnt;
+-    QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
+-
+     /* operation blockers */
+     QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX];
+@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
+     /* The error object in use for blocking operations on backing_hd */
+     Error *backing_blocker;
++    /* Protected by AioContext lock */
++
++    /* If true, copy read backing sectors into image.  Can be >1 if more
++     * than one client has requested copy-on-read.
++     */
++    int copy_on_read;
++
++    /* If we are reading a disk image, give its size in sectors.
++     * Generally read-only; it is written to by load_vmstate and save_vmstate,
++     * but the block layer is quiescent during those.
++     */
++    int64_t total_sectors;
++
++    /* Callback before write request is processed */
++    NotifierWithReturnList before_write_notifiers;
++
++    /* number of in-flight requests; overall and serialising */
++    unsigned int in_flight;
++    unsigned int serialising_in_flight;
++
++    bool wakeup;
++
++    /* Offset after the highest byte written to */
++    uint64_t wr_highest_offset;
++
+     /* threshold limit for writes, in bytes. "High water mark". */
+     uint64_t write_threshold_offset;
+     NotifierWithReturn write_threshold_notifier;
+@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
+     /* counter for nested bdrv_io_plug */
+     unsigned io_plugged;
++    QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
++    CoQueue flush_queue;                  /* Serializing flush queue */
++    bool active_flush_req;                /* Flush request in flight? */
++    unsigned int write_gen;               /* Current data generation */
++    unsigned int flushed_gen;             /* Flushed write generation */
++
++    QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
++
++    /* do we need to tell the guest if we have a volatile write cache? */
++    int enable_write_cache;
++
+     int quiesce_counter;
+ };
+diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/sysemu/block-backend.h
++++ b/include/sysemu/block-backend.h
+@@ -XXX,XX +XXX,XX @@ typedef struct BlockDevOps {
+  * fields that must be public. This is in particular for QLIST_ENTRY() and
+  * friends so that BlockBackends can be kept in lists outside block-backend.c */
+ typedef struct BlockBackendPublic {
+-    /* I/O throttling.
+-     * throttle_state tells us if this BlockBackend has I/O limits configured.
+-     * io_limits_disabled tells us if they are currently being enforced */
++    /* I/O throttling has its own locking, but also some fields are
++     * protected by the AioContext lock.
++     */
++
++    /* Protected by AioContext lock.  */
+     CoQueue      throttled_reqs[2];
++
++    /* Nonzero if the I/O limits are currently being ignored; generally
++     * it is zero.  */
+     unsigned int io_limits_disabled;
+     /* The following fields are protected by the ThrottleGroup lock.
+-     * See the ThrottleGroup documentation for details. */
++     * See the ThrottleGroup documentation for details.
++     * throttle_state tells us if I/O limits are configured. */
+     ThrottleState *throttle_state;
+     ThrottleTimers throttle_timers;
+     unsigned       pending_reqs[2];
+--
+.9.3

-New patch
+[Qemu-devel] [PULL v2 19/24] coroutine-lock: make CoMutex thread-safe
+From: Paolo Bonzini <pbonzini@redhat.com>
 This uses the lock-free mutex described in the paper '"Blocking without
 Locking", or LFTHREADS: A lock-free thread library' by Gidenstam and
 Papatriantafilou.  The same technique is used in OSv, and in fact
 the code is essentially a conversion to C of OSv's code.
 [Added missing coroutine_fn in tests/test-aio-multithread.c.
 --Stefan]
 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Message-id: 20170213181244.16297-2-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  include/qemu/coroutine.h     |  17 ++++-
  tests/test-aio-multithread.c |  86 ++++++++++++++++++++++++
  util/qemu-coroutine-lock.c   | 155 ++++++++++++++++++++++++++++++++++++++++---
  util/trace-events            |   1 +
 files changed, 246 insertions(+), 13 deletions(-)
 diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/coroutine.h
 +++ b/include/qemu/coroutine.h
@@ -XXX,XX +XXX,XX @@ bool qemu_co_queue_empty(CoQueue *queue);
  /**
   * Provides a mutex that can be used to synchronise coroutines
   */
 +struct CoWaitRecord;
  typedef struct CoMutex {
 -    bool locked;
 +    /* Count of pending lockers; 0 for a free mutex, 1 for an
 +     * uncontended mutex.
 +     */
 +    unsigned locked;
 +
 +    /* A queue of waiters.  Elements are added atomically in front of
 +     * from_push.  to_pop is only populated, and popped from, by whoever
 +     * is in charge of the next wakeup.  This can be an unlocker or,
 +     * through the handoff protocol, a locker that is about to go to sleep.
 +     */
 +    QSLIST_HEAD(, CoWaitRecord) from_push, to_pop;
 +
 +    unsigned handoff, sequence;
 +
      Coroutine *holder;
 -    CoQueue queue;
  } CoMutex;
  /**
 diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tests/test-aio-multithread.c
 +++ b/tests/test-aio-multithread.c
@@ -XXX,XX +XXX,XX @@ static void test_multi_co_schedule_10(void)
      test_multi_co_schedule(10);
  }
 +/* CoMutex thread-safety.  */
 +
 +static uint32_t atomic_counter;
 +static uint32_t running;
 +static uint32_t counter;
 +static CoMutex comutex;
 +
 +static void coroutine_fn test_multi_co_mutex_entry(void *opaque)
 +{
 +    while (!atomic_mb_read(&now_stopping)) {
 +        qemu_co_mutex_lock(&comutex);
 +        counter++;
 +        qemu_co_mutex_unlock(&comutex);
 +
 +        /* Increase atomic_counter *after* releasing the mutex.  Otherwise
 +         * there is a chance (it happens about 1 in 3 runs) that the iothread
 +         * exits before the coroutine is woken up, causing a spurious
 +         * assertion failure.
 +         */
 +        atomic_inc(&atomic_counter);
 +    }
 +    atomic_dec(&running);
 +}
 +
 +static void test_multi_co_mutex(int threads, int seconds)
 +{
 +    int i;
 +
 +    qemu_co_mutex_init(&comutex);
 +    counter = 0;
 +    atomic_counter = 0;
 +    now_stopping = false;
 +
 +    create_aio_contexts();
 +    assert(threads <= NUM_CONTEXTS);
 +    running = threads;
 +    for (i = 0; i < threads; i++) {
 +        Coroutine *co1 = qemu_coroutine_create(test_multi_co_mutex_entry, NULL);
 +        aio_co_schedule(ctx[i], co1);
 +    }
 +
 +    g_usleep(seconds * 1000000);
 +
 +    atomic_mb_set(&now_stopping, true);
 +    while (running > 0) {
 +        g_usleep(100000);
 +    }
 +
 +    join_aio_contexts();
 +    g_test_message("%d iterations/second\n", counter / seconds);
 +    g_assert_cmpint(counter, ==, atomic_counter);
 +}
 +
 +/* Testing with NUM_CONTEXTS threads focuses on the queue.  The mutex however
 + * is too contended (and the threads spend too much time in aio_poll)
 + * to actually stress the handoff protocol.
 + */
 +static void test_multi_co_mutex_1(void)
 +{
 +    test_multi_co_mutex(NUM_CONTEXTS, 1);
 +}
 +
 +static void test_multi_co_mutex_10(void)
 +{
 +    test_multi_co_mutex(NUM_CONTEXTS, 10);
 +}
 +
 +/* Testing with fewer threads stresses the handoff protocol too.  Still, the
 + * case where the locker _can_ pick up a handoff is very rare, happening
 + * about 10 times in 1 million, so increase the runtime a bit compared to
 + * other "quick" testcases that only run for 1 second.
 + */
 +static void test_multi_co_mutex_2_3(void)
 +{
 +    test_multi_co_mutex(2, 3);
 +}
 +
 +static void test_multi_co_mutex_2_30(void)
 +{
 +    test_multi_co_mutex(2, 30);
 +}
 +
  /* End of tests.  */
  int main(int argc, char **argv)
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
      g_test_add_func("/aio/multi/lifecycle", test_lifecycle);
      if (g_test_quick()) {
          g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1);
 +        g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_1);
 +        g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_3);
      } else {
          g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10);
 +        g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_10);
 +        g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_30);
      }
      return g_test_run();
  }
 diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/qemu-coroutine-lock.c
 +++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@
   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
   * THE SOFTWARE.
 + *
 + * The lock-free mutex implementation is based on OSv
 + * (core/lfmutex.cc, include/lockfree/mutex.hh).
 + * Copyright (C) 2013 Cloudius Systems, Ltd.
   */
  #include "qemu/osdep.h"
@@ -XXX,XX +XXX,XX @@ bool qemu_co_queue_empty(CoQueue *queue)
      return QSIMPLEQ_FIRST(&queue->entries) == NULL;
  }
 +/* The wait records are handled with a multiple-producer, single-consumer
 + * lock-free queue.  There cannot be two concurrent pop_waiter() calls
 + * because pop_waiter() can only be called while mutex->handoff is zero.
 + * This can happen in three cases:
 + * - in qemu_co_mutex_unlock, before the hand-off protocol has started.
 + *   In this case, qemu_co_mutex_lock will see mutex->handoff == 0 and
 + *   not take part in the handoff.
 + * - in qemu_co_mutex_lock, if it steals the hand-off responsibility from
 + *   qemu_co_mutex_unlock.  In this case, qemu_co_mutex_unlock will fail
 + *   the cmpxchg (it will see either 0 or the next sequence value) and
 + *   exit.  The next hand-off cannot begin until qemu_co_mutex_lock has
 + *   woken up someone.
 + * - in qemu_co_mutex_unlock, if it takes the hand-off token itself.
 + *   In this case another iteration starts with mutex->handoff == 0;
 + *   a concurrent qemu_co_mutex_lock will fail the cmpxchg, and
 + *   qemu_co_mutex_unlock will go back to case (1).
 + *
 + * The following functions manage this queue.
 + */
 +typedef struct CoWaitRecord {
 +    Coroutine *co;
 +    QSLIST_ENTRY(CoWaitRecord) next;
 +} CoWaitRecord;
 +
 +static void push_waiter(CoMutex *mutex, CoWaitRecord *w)
 +{
 +    w->co = qemu_coroutine_self();
 +    QSLIST_INSERT_HEAD_ATOMIC(&mutex->from_push, w, next);
 +}
 +
 +static void move_waiters(CoMutex *mutex)
 +{
 +    QSLIST_HEAD(, CoWaitRecord) reversed;
 +    QSLIST_MOVE_ATOMIC(&reversed, &mutex->from_push);
 +    while (!QSLIST_EMPTY(&reversed)) {
 +        CoWaitRecord *w = QSLIST_FIRST(&reversed);
 +        QSLIST_REMOVE_HEAD(&reversed, next);
 +        QSLIST_INSERT_HEAD(&mutex->to_pop, w, next);
 +    }
 +}
 +
 +static CoWaitRecord *pop_waiter(CoMutex *mutex)
 +{
 +    CoWaitRecord *w;
 +
 +    if (QSLIST_EMPTY(&mutex->to_pop)) {
 +        move_waiters(mutex);
 +        if (QSLIST_EMPTY(&mutex->to_pop)) {
 +            return NULL;
 +        }
 +    }
 +    w = QSLIST_FIRST(&mutex->to_pop);
 +    QSLIST_REMOVE_HEAD(&mutex->to_pop, next);
 +    return w;
 +}
 +
 +/* Return true if at least one wait record is queued on the mutex, either
 + * already moved to to_pop or still pending in from_push.
 + */
 +static bool has_waiters(CoMutex *mutex)
 +{
 +    return !QSLIST_EMPTY(&mutex->to_pop) || !QSLIST_EMPTY(&mutex->from_push);
 +}
 +
  void qemu_co_mutex_init(CoMutex *mutex)
  {
      memset(mutex, 0, sizeof(*mutex));
 -    qemu_co_queue_init(&mutex->queue);
  }
 -void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
 +static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
  {
      Coroutine *self = qemu_coroutine_self();
 +    CoWaitRecord w;
 +    unsigned old_handoff;
      trace_qemu_co_mutex_lock_entry(mutex, self);
 +    w.co = self;
 +    push_waiter(mutex, &w);
 -    while (mutex->locked) {
 -        qemu_co_queue_wait(&mutex->queue);
 +    /* This is the "Responsibility Hand-Off" protocol; a lock() picks from
 +     * a concurrent unlock() the responsibility of waking somebody up.
 +     */
 +    old_handoff = atomic_mb_read(&mutex->handoff);
 +    if (old_handoff &&
 +        has_waiters(mutex) &&
 +        atomic_cmpxchg(&mutex->handoff, old_handoff, 0) == old_handoff) {
 +        /* There can be no concurrent pops, because there can be only
 +         * one active handoff at a time.
 +         */
 +        CoWaitRecord *to_wake = pop_waiter(mutex);
 +        Coroutine *co = to_wake->co;
 +        if (co == self) {
 +            /* We got the lock ourselves!  */
 +            assert(to_wake == &w);
 +            return;
 +        }
 +
 +        aio_co_wake(co);
      }
 -    mutex->locked = true;
 -    mutex->holder = self;
 -    self->locks_held++;
 -
 +    qemu_coroutine_yield();
      trace_qemu_co_mutex_lock_return(mutex, self);
  }
 +void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
 +{
 +    Coroutine *self = qemu_coroutine_self();
 +
 +    if (atomic_fetch_inc(&mutex->locked) == 0) {
 +        /* Uncontended.  */
 +        trace_qemu_co_mutex_lock_uncontended(mutex, self);
 +    } else {
 +        qemu_co_mutex_lock_slowpath(mutex);
 +    }
 +    mutex->holder = self;
 +    self->locks_held++;
 +}
 +
  void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
  {
      Coroutine *self = qemu_coroutine_self();
      trace_qemu_co_mutex_unlock_entry(mutex, self);
 -    assert(mutex->locked == true);
 +    assert(mutex->locked);
      assert(mutex->holder == self);
      assert(qemu_in_coroutine());
 -    mutex->locked = false;
      mutex->holder = NULL;
      self->locks_held--;
 -    qemu_co_queue_next(&mutex->queue);
 +    if (atomic_fetch_dec(&mutex->locked) == 1) {
 +        /* No waiting qemu_co_mutex_lock().  Phew, that was easy!  */
 +        return;
 +    }
 +
 +    for (;;) {
 +        CoWaitRecord *to_wake = pop_waiter(mutex);
 +        unsigned our_handoff;
 +
 +        if (to_wake) {
 +            Coroutine *co = to_wake->co;
 +            aio_co_wake(co);
 +            break;
 +        }
 +
 +        /* Some concurrent lock() is in progress (we know this because
 +         * mutex->locked was >1) but it hasn't yet put itself on the wait
 +         * queue.  Pick a sequence number for the handoff protocol (not 0).
 +         */
 +        if (++mutex->sequence == 0) {
 +            mutex->sequence = 1;
 +        }
 +
 +        our_handoff = mutex->sequence;
 +        atomic_mb_set(&mutex->handoff, our_handoff);
 +        if (!has_waiters(mutex)) {
 +            /* The concurrent lock has not added itself yet, so it
 +             * will be able to pick our handoff.
 +             */
 +            break;
 +        }
 +
 +        /* Try to do the handoff protocol ourselves; if somebody else has
 +         * already taken it, however, we're done and they're responsible.
 +         */
 +        if (atomic_cmpxchg(&mutex->handoff, our_handoff, 0) != our_handoff) {
 +            break;
 +        }
 +    }
      trace_qemu_co_mutex_unlock_return(mutex, self);
  }
 diff --git a/util/trace-events b/util/trace-events
 index XXXXXXX..XXXXXXX 100644
 --- a/util/trace-events
 +++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ qemu_coroutine_terminate(void *co) "self %p"
  # util/qemu-coroutine-lock.c
  qemu_co_queue_run_restart(void *co) "co %p"
 +qemu_co_mutex_lock_uncontended(void *mutex, void *self) "mutex %p self %p"
  qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p"
  qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p"
  qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p"
 --
 .9.3

-[Qemu-devel] [PULL 1/2] bitmaps.md: Convert to rST; move it into 'interop' dir
+[Qemu-devel] [PULL v2 20/24] coroutine-lock: add limited spinning to CoMutex
-From: Kashyap Chamarthy <kchamart@redhat.com>
+From: Paolo Bonzini <pbonzini@redhat.com>
-This is part of the on-going effort to convert QEMU upstream
+Running a very small critical section on pthread_mutex_t and CoMutex
-documentation syntax to reStructuredText (rST).
+shows that pthread_mutex_t is much faster because it doesn't actually
 go to sleep.  What happens is that the critical section is shorter
 than the latency of entering the kernel and thus FUTEX_WAIT always
 fails.  With CoMutex there is no such latency but you still want to
 avoid wait and wakeup.  So introduce it artificially.
-The conversion to rST was done using:
+This only works with one waiter; because CoMutex is fair, it will
 always have more waits and wakeups than a pthread_mutex_t.
-    $ pandoc -f markdown -t rst bitmaps.md -o bitmaps.rst
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Message-id: 20170213181244.16297-3-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  include/qemu/coroutine.h   |  5 +++++
  util/qemu-coroutine-lock.c | 51 ++++++++++++++++++++++++++++++++++++++++------
  util/qemu-coroutine.c      |  2 +-
 files changed, 51 insertions(+), 7 deletions(-)
-Then, make a couple of small syntactical adjustments.  While at it,
+diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
-reword a statement to avoid ambiguity.  Addressing the feedback from
+index XXXXXXX..XXXXXXX 100644
-this thread:
+--- a/include/qemu/coroutine.h
++++ b/include/qemu/coroutine.h
-    https://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg05428.html
+@@ -XXX,XX +XXX,XX @@ typedef struct CoMutex {
+      */
-Signed-off-by: Kashyap Chamarthy <kchamart@redhat.com>
+     unsigned locked;
-Reviewed-by: John Snow <jsnow@redhat.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
++    /* Context that is holding the lock.  Useful to avoid spinning
-Message-id: 20170717105205.32639-2-kchamart@redhat.com
++     * when two coroutines on the same AioContext try to get the lock. :)
-Signed-off-by: Jeff Cody <jcody@redhat.com>
++     */
----
++    AioContext *ctx;
- docs/devel/bitmaps.md    | 505 ------------------------------------------
++
- docs/interop/bitmaps.rst | 555 +++++++++++++++++++++++++++++++++++++++++++++++
+     /* A queue of waiters.  Elements are added atomically in front of
-files changed, 555 insertions(+), 505 deletions(-)
+      * from_push.  to_pop is only populated, and popped from, by whoever
- delete mode 100644 docs/devel/bitmaps.md
+      * is in charge of the next wakeup.  This can be an unlocker or,
- create mode 100644 docs/interop/bitmaps.rst
+diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
+index XXXXXXX..XXXXXXX 100644
-diff --git a/docs/devel/bitmaps.md b/docs/devel/bitmaps.md
+--- a/util/qemu-coroutine-lock.c
-deleted file mode 100644
++++ b/util/qemu-coroutine-lock.c
 index XXXXXXX..XXXXXXX
 --- a/docs/devel/bitmaps.md
 +++ /dev/null
 @@ -XXX,XX +XXX,XX @@
--<!--
+ #include "qemu-common.h"
--Copyright 2015 John Snow <jsnow@redhat.com> and Red Hat, Inc.
+ #include "qemu/coroutine.h"
--All rights reserved.
+ #include "qemu/coroutine_int.h"
--
++#include "qemu/processor.h"
--This file is licensed via The FreeBSD Documentation License, the full text of
+ #include "qemu/queue.h"
--which is included at the end of this document.
+ #include "block/aio.h"
---->
+ #include "trace.h"
--
+@@ -XXX,XX +XXX,XX @@ void qemu_co_mutex_init(CoMutex *mutex)
--# Dirty Bitmaps and Incremental Backup
+     memset(mutex, 0, sizeof(*mutex));
--
+ }
--* Dirty Bitmaps are objects that track which data needs to be backed up for the
--  next incremental backup.
+-static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
--
++static void coroutine_fn qemu_co_mutex_wake(CoMutex *mutex, Coroutine *co)
--* Dirty bitmaps can be created at any time and attached to any node
++{
--  (not just complete drives.)
++    /* Read co before co->ctx; pairs with smp_wmb() in
--
++     * qemu_coroutine_enter().
--## Dirty Bitmap Names
++     */
--
++    smp_read_barrier_depends();
--* A dirty bitmap's name is unique to the node, but bitmaps attached to different
++    mutex->ctx = co->ctx;
--  nodes can share the same name.
++    aio_co_wake(co);
--
++}
 -* Dirty bitmaps created for internal use by QEMU may be anonymous and have no
 -  name, but any user-created bitmaps may not be. There can be any number of
 -  anonymous bitmaps per node.
 -
 -* The name of a user-created bitmap must not be empty ("").
 -
 -## Bitmap Modes
 -
 -* A Bitmap can be "frozen," which means that it is currently in-use by a backup
 -  operation and cannot be deleted, renamed, written to, reset,
 -  etc.
 -
 -* The normal operating mode for a bitmap is "active."
 -
 -## Basic QMP Usage
 -
 -### Supported Commands ###
 -
 -* block-dirty-bitmap-add
 -* block-dirty-bitmap-remove
 -* block-dirty-bitmap-clear
 -
 -### Creation
 -
 -* To create a new bitmap, enabled, on the drive with id=drive0:
 -
 -```json
 -{ "execute": "block-dirty-bitmap-add",
 -  "arguments": {
 -    "node": "drive0",
 -    "name": "bitmap0"
 -  }
 -}
 -```
 -
 -* This bitmap will have a default granularity that matches the cluster size of
 -  its associated drive, if available, clamped to between [4KiB, 64KiB].
 -  The current default for qcow2 is 64KiB.
 -
 -* To create a new bitmap that tracks changes in 32KiB segments:
 -
 -```json
 -{ "execute": "block-dirty-bitmap-add",
 -  "arguments": {
 -    "node": "drive0",
 -    "name": "bitmap0",
 -    "granularity": 32768
 -  }
 -}
 -```
 -
 -### Deletion
 -
 -* Bitmaps that are frozen cannot be deleted.
 -
 -* Deleting the bitmap does not impact any other bitmaps attached to the same
 -  node, nor does it affect any backups already created from this node.
 -
 -* Because bitmaps are only unique to the node to which they are attached,
 -  you must specify the node/drive name here, too.
 -
 -```json
 -{ "execute": "block-dirty-bitmap-remove",
 -  "arguments": {
 -    "node": "drive0",
 -    "name": "bitmap0"
 -  }
 -}
 -```
 -
 -### Resetting
 -
 -* Resetting a bitmap will clear all information it holds.
 -
 -* An incremental backup created from an empty bitmap will copy no data,
 -  as if nothing has changed.
 -
 -```json
 -{ "execute": "block-dirty-bitmap-clear",
 -  "arguments": {
 -    "node": "drive0",
 -    "name": "bitmap0"
 -  }
 -}
 -```
 -
 -## Transactions
 -
 -### Justification
 -
 -Bitmaps can be safely modified when the VM is paused or halted by using
 -the basic QMP commands. For instance, you might perform the following actions:
 -
 -1. Boot the VM in a paused state.
 -2. Create a full drive backup of drive0.
 -3. Create a new bitmap attached to drive0.
 -4. Resume execution of the VM.
 -5. Incremental backups are ready to be created.
 -
 -At this point, the bitmap and drive backup would be correctly in sync,
 -and incremental backups made from this point forward would be correctly aligned
 -to the full drive backup.
 -
 -This is not particularly useful if we decide we want to start incremental
 -backups after the VM has been running for a while, for which we will need to
 -perform actions such as the following:
 -
 -1. Boot the VM and begin execution.
 -2. Using a single transaction, perform the following operations:
 -    * Create bitmap0.
 -    * Create a full drive backup of drive0.
 -3. Incremental backups are now ready to be created.
 -
 -### Supported Bitmap Transactions
 -
 -* block-dirty-bitmap-add
 -* block-dirty-bitmap-clear
 -
 -The usages are identical to their respective QMP commands, but see below
 -for examples.
 -
 -### Example: New Incremental Backup
 -
 -As outlined in the justification, perhaps we want to create a new incremental
 -backup chain attached to a drive.
 -
 -```json
 -{ "execute": "transaction",
 -  "arguments": {
 -    "actions": [
 -      {"type": "block-dirty-bitmap-add",
 -       "data": {"node": "drive0", "name": "bitmap0"} },
 -      {"type": "drive-backup",
 -       "data": {"device": "drive0", "target": "/path/to/full_backup.img",
 -                "sync": "full", "format": "qcow2"} }
 -    ]
 -  }
 -}
 -```
 -
 -### Example: New Incremental Backup Anchor Point
 -
 -Maybe we just want to create a new full backup with an existing bitmap and
 -want to reset the bitmap to track the new chain.
 -
 -```json
 -{ "execute": "transaction",
 -  "arguments": {
 -    "actions": [
 -      {"type": "block-dirty-bitmap-clear",
 -       "data": {"node": "drive0", "name": "bitmap0"} },
 -      {"type": "drive-backup",
 -       "data": {"device": "drive0", "target": "/path/to/new_full_backup.img",
 -                "sync": "full", "format": "qcow2"} }
 -    ]
 -  }
 -}
 -```
 -
 -## Incremental Backups
 -
 -The star of the show.
 -
 -**Nota Bene!** Only incremental backups of entire drives are supported for now.
 -So despite the fact that you can attach a bitmap to any arbitrary node, they are
 -only currently useful when attached to the root node. This is because
 -drive-backup only supports drives/devices instead of arbitrary nodes.
 -
 -### Example: First Incremental Backup
 -
 -1. Create a full backup and sync it to the dirty bitmap, as in the transactional
 -examples above; or with the VM offline, manually create a full copy and then
 -create a new bitmap before the VM begins execution.
 -
 -    * Let's assume the full backup is named 'full_backup.img'.
 -    * Let's assume the bitmap you created is 'bitmap0' attached to 'drive0'.
 -
 -2. Create a destination image for the incremental backup that utilizes the
 -full backup as a backing image.
 -
 -    * Let's assume it is named 'incremental.0.img'.
 -
 -    ```sh
 -    # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
 -    ```
 -
 -3. Issue the incremental backup command:
 -
 -    ```json
 -    { "execute": "drive-backup",
 -      "arguments": {
 -        "device": "drive0",
 -        "bitmap": "bitmap0",
 -        "target": "incremental.0.img",
 -        "format": "qcow2",
 -        "sync": "incremental",
 -        "mode": "existing"
 -      }
 -    }
 -    ```
 -
 -### Example: Second Incremental Backup
 -
 -1. Create a new destination image for the incremental backup that points to the
 -   previous one, e.g.: 'incremental.1.img'
 -
 -    ```sh
 -    # qemu-img create -f qcow2 incremental.1.img -b incremental.0.img -F qcow2
 -    ```
 -
 -2. Issue a new incremental backup command. The only difference here is that we
 -   have changed the target image below.
 -
 -    ```json
 -    { "execute": "drive-backup",
 -      "arguments": {
 -        "device": "drive0",
 -        "bitmap": "bitmap0",
 -        "target": "incremental.1.img",
 -        "format": "qcow2",
 -        "sync": "incremental",
 -        "mode": "existing"
 -      }
 -    }
 -    ```
 -
 -## Errors
 -
 -* In the event of an error that occurs after a backup job is successfully
 -  launched, either by a direct QMP command or a QMP transaction, the user
 -  will receive a BLOCK_JOB_COMPLETE event with a failure message, accompanied
 -  by a BLOCK_JOB_ERROR event.
 -
 -* In the case of an event being cancelled, the user will receive a
 -  BLOCK_JOB_CANCELLED event instead of a pair of COMPLETE and ERROR events.
 -
 -* In either case, the incremental backup data contained within the bitmap is
 -  safely rolled back, and the data within the bitmap is not lost. The image
 -  file created for the failed attempt can be safely deleted.
 -
 -* Once the underlying problem is fixed (e.g. more storage space is freed up),
 -  you can simply retry the incremental backup command with the same bitmap.
 -
 -### Example
 -
 -1. Create a target image:
 -
 -    ```sh
 -    # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
 -    ```
 -
 -2. Attempt to create an incremental backup via QMP:
 -
 -    ```json
 -    { "execute": "drive-backup",
 -      "arguments": {
 -        "device": "drive0",
 -        "bitmap": "bitmap0",
 -        "target": "incremental.0.img",
 -        "format": "qcow2",
 -        "sync": "incremental",
 -        "mode": "existing"
 -      }
 -    }
 -    ```
 -
 -3. Receive an event notifying us of failure:
 -
 -    ```json
 -    { "timestamp": { "seconds": 1424709442, "microseconds": 844524 },
 -      "data": { "speed": 0, "offset": 0, "len": 67108864,
 -                "error": "No space left on device",
 -                "device": "drive1", "type": "backup" },
 -      "event": "BLOCK_JOB_COMPLETED" }
 -    ```
 -
 -4. Delete the failed incremental, and re-create the image.
 -
 -    ```sh
 -    # rm incremental.0.img
 -    # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
 -    ```
 -
 -5. Retry the command after fixing the underlying problem,
 -   such as freeing up space on the backup volume:
 -
 -    ```json
 -    { "execute": "drive-backup",
 -      "arguments": {
 -        "device": "drive0",
 -        "bitmap": "bitmap0",
 -        "target": "incremental.0.img",
 -        "format": "qcow2",
 -        "sync": "incremental",
 -        "mode": "existing"
 -      }
 -    }
 -    ```
 -
 -6. Receive confirmation that the job completed successfully:
 -
 -    ```json
 -    { "timestamp": { "seconds": 1424709668, "microseconds": 526525 },
 -      "data": { "device": "drive1", "type": "backup",
 -                "speed": 0, "len": 67108864, "offset": 67108864},
 -      "event": "BLOCK_JOB_COMPLETED" }
 -    ```
 -
 -### Partial Transactional Failures
 -
 -* Sometimes, a transaction will succeed in launching and return success,
 -  but then later the backup jobs themselves may fail. It is possible that
 -  a management application may have to deal with a partial backup failure
 -  after a successful transaction.
 -
 -* If multiple backup jobs are specified in a single transaction, when one of
 -  them fails, it will not interact with the other backup jobs in any way.
 -
 -* The job(s) that succeeded will clear the dirty bitmap associated with the
 -  operation, but the job(s) that failed will not. It is not "safe" to delete
 -  any incremental backups that were created successfully in this scenario,
 -  even though others failed.
 -
 -#### Example
 -
 -* QMP example highlighting two backup jobs:
 -
 -    ```json
 -    { "execute": "transaction",
 -      "arguments": {
 -        "actions": [
 -          { "type": "drive-backup",
 -            "data": { "device": "drive0", "bitmap": "bitmap0",
 -                      "format": "qcow2", "mode": "existing",
 -                      "sync": "incremental", "target": "d0-incr-1.qcow2" } },
 -          { "type": "drive-backup",
 -            "data": { "device": "drive1", "bitmap": "bitmap1",
 -                      "format": "qcow2", "mode": "existing",
 -                      "sync": "incremental", "target": "d1-incr-1.qcow2" } },
 -        ]
 -      }
 -    }
 -    ```
 -
 -* QMP example response, highlighting one success and one failure:
 -    * Acknowledgement that the Transaction was accepted and jobs were launched:
 -        ```json
 -        { "return": {} }
 -        ```
 -
 -    * Later, QEMU sends notice that the first job was completed:
 -        ```json
 -        { "timestamp": { "seconds": 1447192343, "microseconds": 615698 },
 -          "data": { "device": "drive0", "type": "backup",
 -                     "speed": 0, "len": 67108864, "offset": 67108864 },
 -          "event": "BLOCK_JOB_COMPLETED"
 -        }
 -        ```
 -
 -    * Later yet, QEMU sends notice that the second job has failed:
 -        ```json
 -        { "timestamp": { "seconds": 1447192399, "microseconds": 683015 },
 -          "data": { "device": "drive1", "action": "report",
 -                    "operation": "read" },
 -          "event": "BLOCK_JOB_ERROR" }
 -        ```
 -
 -        ```json
 -        { "timestamp": { "seconds": 1447192399, "microseconds": 685853 },
 -          "data": { "speed": 0, "offset": 0, "len": 67108864,
 -                    "error": "Input/output error",
 -                    "device": "drive1", "type": "backup" },
 -          "event": "BLOCK_JOB_COMPLETED" }
 -
 -* In the above example, "d0-incr-1.qcow2" is valid and must be kept,
 -  but "d1-incr-1.qcow2" is invalid and should be deleted. If a VM-wide
 -  incremental backup of all drives at a point-in-time is to be made,
 -  new backups for both drives will need to be made, taking into account
 -  that a new incremental backup for drive0 needs to be based on top of
 -  "d0-incr-1.qcow2."
 -
 -### Grouped Completion Mode
 -
 -* While jobs launched by transactions normally complete or fail on their own,
 -  it is possible to instruct them to complete or fail together as a group.
 -
 -* QMP transactions take an optional properties structure that can affect
 -  the semantics of the transaction.
 -
 -* The "completion-mode" transaction property can be either "individual"
 -  which is the default, legacy behavior described above, or "grouped,"
 -  a new behavior detailed below.
 -
 -* Delayed Completion: In grouped completion mode, no jobs will report
 -  success until all jobs are ready to report success.
 -
 -* Grouped failure: If any job fails in grouped completion mode, all remaining
 -  jobs will be cancelled. Any incremental backups will restore their dirty
 -  bitmap objects as if no backup command was ever issued.
 -
 -    * Regardless of if QEMU reports a particular incremental backup job as
 -      CANCELLED or as an ERROR, the in-memory bitmap will be restored.
 -
 -#### Example
 -
 -* Here's the same example scenario from above with the new property:
 -
 -    ```json
 -    { "execute": "transaction",
 -      "arguments": {
 -        "actions": [
 -          { "type": "drive-backup",
 -            "data": { "device": "drive0", "bitmap": "bitmap0",
 -                      "format": "qcow2", "mode": "existing",
 -                      "sync": "incremental", "target": "d0-incr-1.qcow2" } },
 -          { "type": "drive-backup",
 -            "data": { "device": "drive1", "bitmap": "bitmap1",
 -                      "format": "qcow2", "mode": "existing",
 -                      "sync": "incremental", "target": "d1-incr-1.qcow2" } },
 -        ],
 -        "properties": {
 -          "completion-mode": "grouped"
 -        }
 -      }
 -    }
 -    ```
 -
 -* QMP example response, highlighting a failure for drive2:
 -    * Acknowledgement that the Transaction was accepted and jobs were launched:
 -        ```json
 -        { "return": {} }
 -        ```
 -
 -    * Later, QEMU sends notice that the second job has errored out,
 -      but that the first job was also cancelled:
 -        ```json
 -        { "timestamp": { "seconds": 1447193702, "microseconds": 632377 },
 -          "data": { "device": "drive1", "action": "report",
 -                    "operation": "read" },
 -          "event": "BLOCK_JOB_ERROR" }
 -        ```
 -
 -        ```json
 -        { "timestamp": { "seconds": 1447193702, "microseconds": 640074 },
 -          "data": { "speed": 0, "offset": 0, "len": 67108864,
 -                    "error": "Input/output error",
 -                    "device": "drive1", "type": "backup" },
 -          "event": "BLOCK_JOB_COMPLETED" }
 -        ```
 -
 -        ```json
 -        { "timestamp": { "seconds": 1447193702, "microseconds": 640163 },
 -          "data": { "device": "drive0", "type": "backup", "speed": 0,
 -                    "len": 67108864, "offset": 16777216 },
 -          "event": "BLOCK_JOB_CANCELLED" }
 -        ```
 -
 -<!--
 -The FreeBSD Documentation License
 -
 -Redistribution and use in source (Markdown) and 'compiled' forms (SGML, HTML,
 -PDF, PostScript, RTF and so forth) with or without modification, are permitted
 -provided that the following conditions are met:
 -
 -Redistributions of source code (Markdown) must retain the above copyright
 -notice, this list of conditions and the following disclaimer of this file
 -unmodified.
 -
 -Redistributions in compiled form (transformed to other DTDs, converted to PDF,
 -PostScript, RTF and other formats) must reproduce the above copyright notice,
 -this list of conditions and the following disclaimer in the documentation and/or
 -other materials provided with the distribution.
 -
 -THIS DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR  PURPOSE ARE
 -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS  BE LIABLE
 -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 -THIS DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 --->
 diff --git a/docs/interop/bitmaps.rst b/docs/interop/bitmaps.rst
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/docs/interop/bitmaps.rst
@@ -XXX,XX +XXX,XX @@
 +..
 +   Copyright 2015 John Snow <jsnow@redhat.com> and Red Hat, Inc.
 +   All rights reserved.
 +
-+   This file is licensed via The FreeBSD Documentation License, the full
++static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx,
-+   text of which is included at the end of this document.
++                                                     CoMutex *mutex)
-+
+ {
-+====================================
+     Coroutine *self = qemu_coroutine_self();
-+Dirty Bitmaps and Incremental Backup
+     CoWaitRecord w;
-+====================================
+@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
-+
+         if (co == self) {
-+-  Dirty Bitmaps are objects that track which data needs to be backed up
+             /* We got the lock ourselves!  */
-+   for the next incremental backup.
+             assert(to_wake == &w);
-+
++            mutex->ctx = ctx;
-+-  Dirty bitmaps can be created at any time and attached to any node
+             return;
-+   (not just complete drives).
+         }
-+
-+.. contents::
+-        aio_co_wake(co);
-+
++        qemu_co_mutex_wake(mutex, co);
-+Dirty Bitmap Names
+     }
-+------------------
-+
+     qemu_coroutine_yield();
-+-  A dirty bitmap's name is unique to the node, but bitmaps attached to
+@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
-+   different nodes can share the same name.
-+
+ void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
-+-  Dirty bitmaps created for internal use by QEMU may be anonymous and
+ {
-+   have no name, but any user-created bitmaps must have a name. There
++    AioContext *ctx = qemu_get_current_aio_context();
-+   can be any number of anonymous bitmaps per node.
+     Coroutine *self = qemu_coroutine_self();
-+
++    int waiters, i;
-+-  The name of a user-created bitmap must not be empty ("").
-+
+-    if (atomic_fetch_inc(&mutex->locked) == 0) {
-+Bitmap Modes
++    /* Running a very small critical section on pthread_mutex_t and CoMutex
-+------------
++     * shows that pthread_mutex_t is much faster because it doesn't actually
-+
++     * go to sleep.  What happens is that the critical section is shorter
-+-  A bitmap can be "frozen," which means that it is currently in-use by
++     * than the latency of entering the kernel and thus FUTEX_WAIT always
-+   a backup operation and cannot be deleted, renamed, written to, reset,
++     * fails.  With CoMutex there is no such latency but you still want to
-+   etc.
++     * avoid wait and wakeup.  So introduce it artificially.
-+
++     */
-+-  The normal operating mode for a bitmap is "active."
++    i = 0;
-+
++retry_fast_path:
-+Basic QMP Usage
++    waiters = atomic_cmpxchg(&mutex->locked, 0, 1);
-+---------------
++    if (waiters != 0) {
-+
++        while (waiters == 1 && ++i < 1000) {
-+Supported Commands
++            if (atomic_read(&mutex->ctx) == ctx) {
-+~~~~~~~~~~~~~~~~~~
++                break;
-+
++            }
-+- ``block-dirty-bitmap-add``
++            if (atomic_read(&mutex->locked) == 0) {
-+- ``block-dirty-bitmap-remove``
++                goto retry_fast_path;
-+- ``block-dirty-bitmap-clear``
++            }
-+
++            cpu_relax();
-+Creation
++        }
-+~~~~~~~~
++        waiters = atomic_fetch_inc(&mutex->locked);
 +
 +-  To create a new bitmap, enabled, on the drive with id=drive0:
 +
 +.. code:: json
 +
 +    { "execute": "block-dirty-bitmap-add",
 +      "arguments": {
 +        "node": "drive0",
 +        "name": "bitmap0"
 +      }
 +    }
 +
-+-  This bitmap will have a default granularity that matches the cluster
++    if (waiters == 0) {
-+   size of its associated drive, if available, clamped to between [4KiB,
+         /* Uncontended.  */
-+   64KiB]. The current default for qcow2 is 64KiB.
+         trace_qemu_co_mutex_lock_uncontended(mutex, self);
-+
++        mutex->ctx = ctx;
-+-  To create a new bitmap that tracks changes in 32KiB segments:
+     } else {
-+
+-        qemu_co_mutex_lock_slowpath(mutex);
-+.. code:: json
++        qemu_co_mutex_lock_slowpath(ctx, mutex);
-+
+     }
-+    { "execute": "block-dirty-bitmap-add",
+     mutex->holder = self;
-+      "arguments": {
+     self->locks_held++;
-+        "node": "drive0",
+@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
-+        "name": "bitmap0",
+     assert(mutex->holder == self);
-+        "granularity": 32768
+     assert(qemu_in_coroutine());
-+      }
-+    }
++    mutex->ctx = NULL;
-+
+     mutex->holder = NULL;
-+Deletion
+     self->locks_held--;
-+~~~~~~~~
+     if (atomic_fetch_dec(&mutex->locked) == 1) {
-+
+@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
-+-  Bitmaps that are frozen cannot be deleted.
+         unsigned our_handoff;
-+
-+-  Deleting the bitmap does not impact any other bitmaps attached to the
+         if (to_wake) {
-+   same node, nor does it affect any backups already created from this
+-            Coroutine *co = to_wake->co;
-+   node.
+-            aio_co_wake(co);
-+
++            qemu_co_mutex_wake(mutex, to_wake->co);
-+-  Because bitmaps are only unique to the node to which they are
+             break;
-+   attached, you must specify the node/drive name here, too.
+         }
-+
-+.. code:: json
+diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
-+
+index XXXXXXX..XXXXXXX 100644
-+    { "execute": "block-dirty-bitmap-remove",
+--- a/util/qemu-coroutine.c
-+      "arguments": {
++++ b/util/qemu-coroutine.c
-+        "node": "drive0",
+@@ -XXX,XX +XXX,XX @@ void qemu_coroutine_enter(Coroutine *co)
-+        "name": "bitmap0"
+     co->ctx = qemu_get_current_aio_context();
-+      }
-+    }
+     /* Store co->ctx before anything that stores co.  Matches
-+
+-     * barrier in aio_co_wake.
-+Resetting
++     * barrier in aio_co_wake and qemu_co_mutex_wake.
-+~~~~~~~~~
+      */
-+
+     smp_wmb();
-+-  Resetting a bitmap will clear all information it holds.
 +
 +-  An incremental backup created from an empty bitmap will copy no data,
 +   as if nothing has changed.
 +
 +.. code:: json
 +
 +    { "execute": "block-dirty-bitmap-clear",
 +      "arguments": {
 +        "node": "drive0",
 +        "name": "bitmap0"
 +      }
 +    }
 +
 +Transactions
 +------------
 +
 +Justification
 +~~~~~~~~~~~~~
 +
 +Bitmaps can be safely modified when the VM is paused or halted by using
 +the basic QMP commands. For instance, you might perform the following
 +actions:
 +
 +1. Boot the VM in a paused state.
 +2. Create a full drive backup of drive0.
 +3. Create a new bitmap attached to drive0.
 +4. Resume execution of the VM.
 +5. Incremental backups are ready to be created.
 +
 +At this point, the bitmap and drive backup would be correctly in sync,
 +and incremental backups made from this point forward would be correctly
 +aligned to the full drive backup.
 +
 +This is not particularly useful if we decide we want to start
 +incremental backups after the VM has been running for a while, for which
 +we will need to perform actions such as the following:
 +
 +1. Boot the VM and begin execution.
 +2. Using a single transaction, perform the following operations:
 +
 +   -  Create ``bitmap0``.
 +   -  Create a full drive backup of ``drive0``.
 +
 +3. Incremental backups are now ready to be created.
 +
 +Supported Bitmap Transactions
 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 +
 +-  ``block-dirty-bitmap-add``
 +-  ``block-dirty-bitmap-clear``
 +
 +The usages are identical to their respective QMP commands, but see below
 +for examples.
 +
 +Example: New Incremental Backup
 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 +
 +As outlined in the justification, perhaps we want to create a new
 +incremental backup chain attached to a drive.
 +
 +.. code:: json
 +
 +    { "execute": "transaction",
 +      "arguments": {
 +        "actions": [
 +          {"type": "block-dirty-bitmap-add",
 +           "data": {"node": "drive0", "name": "bitmap0"} },
 +          {"type": "drive-backup",
 +           "data": {"device": "drive0", "target": "/path/to/full_backup.img",
 +                    "sync": "full", "format": "qcow2"} }
 +        ]
 +      }
 +    }
 +
 +Example: New Incremental Backup Anchor Point
 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 +
 +Maybe we just want to create a new full backup with an existing bitmap
 +and want to reset the bitmap to track the new chain.
 +
 +.. code:: json
 +
 +    { "execute": "transaction",
 +      "arguments": {
 +        "actions": [
 +          {"type": "block-dirty-bitmap-clear",
 +           "data": {"node": "drive0", "name": "bitmap0"} },
 +          {"type": "drive-backup",
 +           "data": {"device": "drive0", "target": "/path/to/new_full_backup.img",
 +                    "sync": "full", "format": "qcow2"} }
 +        ]
 +      }
 +    }
 +
 +Incremental Backups
 +-------------------
 +
 +The star of the show.
 +
 +**Nota Bene!** Only incremental backups of entire drives are supported
 +for now. So despite the fact that you can attach a bitmap to any
 +arbitrary node, they are only currently useful when attached to the root
 +node. This is because drive-backup only supports drives/devices instead
 +of arbitrary nodes.
 +
 +Example: First Incremental Backup
 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 +
 +1. Create a full backup and sync it to the dirty bitmap, as in the
 +   transactional examples above; or with the VM offline, manually create
 +   a full copy and then create a new bitmap before the VM begins
 +   execution.
 +
 +   -  Let's assume the full backup is named ``full_backup.img``.
 +   -  Let's assume the bitmap you created is ``bitmap0`` attached to
 +      ``drive0``.
 +
 +2. Create a destination image for the incremental backup that utilizes
 +   the full backup as a backing image.
 +
 +   -  Let's assume the new incremental image is named
 +      ``incremental.0.img``.
 +
 +   .. code:: bash
 +
 +       $ qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
 +
 +3. Issue the incremental backup command:
 +
 +   .. code:: json
 +
 +       { "execute": "drive-backup",
 +         "arguments": {
 +           "device": "drive0",
 +           "bitmap": "bitmap0",
 +           "target": "incremental.0.img",
 +           "format": "qcow2",
 +           "sync": "incremental",
 +           "mode": "existing"
 +         }
 +       }
 +
 +Example: Second Incremental Backup
 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 +
 +1. Create a new destination image for the incremental backup that points
 +   to the previous one, e.g.: ``incremental.1.img``
 +
 +   .. code:: bash
 +
 +       $ qemu-img create -f qcow2 incremental.1.img -b incremental.0.img -F qcow2
 +
 +2. Issue a new incremental backup command. The only difference here is
 +   that we have changed the target image below.
 +
 +   .. code:: json
 +
 +       { "execute": "drive-backup",
 +         "arguments": {
 +           "device": "drive0",
 +           "bitmap": "bitmap0",
 +           "target": "incremental.1.img",
 +           "format": "qcow2",
 +           "sync": "incremental",
 +           "mode": "existing"
 +         }
 +       }
 +
 +Errors
 +------
 +
 +-  In the event of an error that occurs after a backup job is
 +   successfully launched, either by a direct QMP command or a QMP
 +   transaction, the user will receive a ``BLOCK_JOB_COMPLETED`` event with
 +   a failure message, accompanied by a ``BLOCK_JOB_ERROR`` event.
 +
 +-  In the case of a job being cancelled, the user will receive a
 +   ``BLOCK_JOB_CANCELLED`` event instead of a pair of COMPLETE and ERROR
 +   events.
 +
 +-  In either case, the incremental backup data contained within the
 +   bitmap is safely rolled back, and the data within the bitmap is not
 +   lost. The image file created for the failed attempt can be safely
 +   deleted.
 +
 +-  Once the underlying problem is fixed (e.g. more storage space is
 +   freed up), you can simply retry the incremental backup command with
 +   the same bitmap.
 +
 +Example
 +~~~~~~~
 +
 +1. Create a target image:
 +
 +   .. code:: bash
 +
 +       $ qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
 +
 +2. Attempt to create an incremental backup via QMP:
 +
 +   .. code:: json
 +
 +       { "execute": "drive-backup",
 +         "arguments": {
 +           "device": "drive0",
 +           "bitmap": "bitmap0",
 +           "target": "incremental.0.img",
 +           "format": "qcow2",
 +           "sync": "incremental",
 +           "mode": "existing"
 +         }
 +       }
 +
 +3. Receive an event notifying us of failure:
 +
 +   .. code:: json
 +
 +       { "timestamp": { "seconds": 1424709442, "microseconds": 844524 },
 +         "data": { "speed": 0, "offset": 0, "len": 67108864,
 +                   "error": "No space left on device",
 +                   "device": "drive0", "type": "backup" },
 +         "event": "BLOCK_JOB_COMPLETED" }
 +
 +4. Delete the failed incremental, and re-create the image.
 +
 +   .. code:: bash
 +
 +       $ rm incremental.0.img
 +       $ qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
 +
 +5. Retry the command after fixing the underlying problem, such as
 +   freeing up space on the backup volume:
 +
 +   .. code:: json
 +
 +       { "execute": "drive-backup",
 +         "arguments": {
 +           "device": "drive0",
 +           "bitmap": "bitmap0",
 +           "target": "incremental.0.img",
 +           "format": "qcow2",
 +           "sync": "incremental",
 +           "mode": "existing"
 +         }
 +       }
 +
 +6. Receive confirmation that the job completed successfully:
 +
 +   .. code:: json
 +
 +       { "timestamp": { "seconds": 1424709668, "microseconds": 526525 },
 +         "data": { "device": "drive0", "type": "backup",
 +                   "speed": 0, "len": 67108864, "offset": 67108864},
 +         "event": "BLOCK_JOB_COMPLETED" }
 +
 +Partial Transactional Failures
 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 +
 +-  Sometimes, a transaction will succeed in launching and return
 +   success, but then later the backup jobs themselves may fail. It is
 +   possible that a management application may have to deal with a
 +   partial backup failure after a successful transaction.
 +
 +-  If multiple backup jobs are specified in a single transaction, when
 +   one of them fails, it will not interact with the other backup jobs in
 +   any way.
 +
 +-  The job(s) that succeeded will clear the dirty bitmap associated with
 +   the operation, but the job(s) that failed will not. It is not "safe"
 +   to delete any incremental backups that were created successfully in
 +   this scenario, even though others failed.
 +
 +Example
 +^^^^^^^
 +
 +-  QMP example highlighting two backup jobs:
 +
 +   .. code:: json
 +
 +       { "execute": "transaction",
 +         "arguments": {
 +           "actions": [
 +             { "type": "drive-backup",
 +               "data": { "device": "drive0", "bitmap": "bitmap0",
 +                         "format": "qcow2", "mode": "existing",
 +                         "sync": "incremental", "target": "d0-incr-1.qcow2" } },
 +             { "type": "drive-backup",
 +               "data": { "device": "drive1", "bitmap": "bitmap1",
 +                         "format": "qcow2", "mode": "existing",
 +                         "sync": "incremental", "target": "d1-incr-1.qcow2" } }
 +           ]
 +         }
 +       }
 +
 +-  QMP example response, highlighting one success and one failure:
 +
 +   -  Acknowledgement that the Transaction was accepted and jobs were
 +      launched:
 +
 +      .. code:: json
 +
 +          { "return": {} }
 +
 +   -  Later, QEMU sends notice that the first job was completed:
 +
 +      .. code:: json
 +
 +          { "timestamp": { "seconds": 1447192343, "microseconds": 615698 },
 +            "data": { "device": "drive0", "type": "backup",
 +                       "speed": 0, "len": 67108864, "offset": 67108864 },
 +            "event": "BLOCK_JOB_COMPLETED"
 +          }
 +
 +   -  Later yet, QEMU sends notice that the second job has failed:
 +
 +      .. code:: json
 +
 +          { "timestamp": { "seconds": 1447192399, "microseconds": 683015 },
 +            "data": { "device": "drive1", "action": "report",
 +                      "operation": "read" },
 +            "event": "BLOCK_JOB_ERROR" }
 +
 +      .. code:: json
 +
 +          { "timestamp": { "seconds": 1447192399, "microseconds":
 +          685853 }, "data": { "speed": 0, "offset": 0, "len": 67108864,
 +          "error": "Input/output error", "device": "drive1", "type":
 +          "backup" }, "event": "BLOCK_JOB_COMPLETED" }
 +
 +-  In the above example, ``d0-incr-1.qcow2`` is valid and must be kept,
 +   but ``d1-incr-1.qcow2`` is invalid and should be deleted. If a VM-wide
 +   incremental backup of all drives at a point-in-time is to be made,
 +   new backups for both drives will need to be made, taking into account
 +   that a new incremental backup for drive0 needs to be based on top of
 +   ``d0-incr-1.qcow2``.
 +
 +Grouped Completion Mode
 +~~~~~~~~~~~~~~~~~~~~~~~
 +
 +-  While jobs launched by transactions normally complete or fail on
 +   their own, it is possible to instruct them to complete or fail
 +   together as a group.
 +
 +-  QMP transactions take an optional properties structure that can
 +   affect the semantics of the transaction.
 +
 +-  The "completion-mode" transaction property can be either "individual"
 +   which is the default, legacy behavior described above, or "grouped,"
 +   a new behavior detailed below.
 +
 +-  Delayed Completion: In grouped completion mode, no jobs will report
 +   success until all jobs are ready to report success.
 +
 +-  Grouped failure: If any job fails in grouped completion mode, all
 +   remaining jobs will be cancelled. Any incremental backups will
 +   restore their dirty bitmap objects as if no backup command was ever
 +   issued.
 +
 +   -  Regardless of whether QEMU reports a particular incremental backup
 +      job as CANCELLED or as an ERROR, the in-memory bitmap will be
 +      restored.
 +
 +Example
 +^^^^^^^
 +
 +-  Here's the same example scenario from above with the new property:
 +
 +   .. code:: json
 +
 +       { "execute": "transaction",
 +         "arguments": {
 +           "actions": [
 +             { "type": "drive-backup",
 +               "data": { "device": "drive0", "bitmap": "bitmap0",
 +                         "format": "qcow2", "mode": "existing",
 +                         "sync": "incremental", "target": "d0-incr-1.qcow2" } },
 +             { "type": "drive-backup",
 +               "data": { "device": "drive1", "bitmap": "bitmap1",
 +                         "format": "qcow2", "mode": "existing",
 +                         "sync": "incremental", "target": "d1-incr-1.qcow2" } }
 +           ],
 +           "properties": {
 +             "completion-mode": "grouped"
 +           }
 +         }
 +       }
 +
 +-  QMP example response, highlighting a failure for ``drive1``:
 +
 +   -  Acknowledgement that the Transaction was accepted and jobs were
 +      launched:
 +
 +      .. code:: json
 +
 +          { "return": {} }
 +
 +   -  Later, QEMU sends notice that the second job has errored out, but
 +      that the first job was also cancelled:
 +
 +      .. code:: json
 +
 +          { "timestamp": { "seconds": 1447193702, "microseconds": 632377 },
 +            "data": { "device": "drive1", "action": "report",
 +                      "operation": "read" },
 +            "event": "BLOCK_JOB_ERROR" }
 +
 +      .. code:: json
 +
 +          { "timestamp": { "seconds": 1447193702, "microseconds": 640074 },
 +            "data": { "speed": 0, "offset": 0, "len": 67108864,
 +                      "error": "Input/output error",
 +                      "device": "drive1", "type": "backup" },
 +            "event": "BLOCK_JOB_COMPLETED" }
 +
 +      .. code:: json
 +
 +          { "timestamp": { "seconds": 1447193702, "microseconds": 640163 },
 +            "data": { "device": "drive0", "type": "backup", "speed": 0,
 +                      "len": 67108864, "offset": 16777216 },
 +            "event": "BLOCK_JOB_CANCELLED" }
 +
 +.. raw:: html
 +
 +   <!--
 +   The FreeBSD Documentation License
 +
 +   Redistribution and use in source (Markdown) and 'compiled' forms (SGML, HTML,
 +   PDF, PostScript, RTF and so forth) with or without modification, are permitted
 +   provided that the following conditions are met:
 +
 +   Redistributions of source code (Markdown) must retain the above copyright
 +   notice, this list of conditions and the following disclaimer of this file
 +   unmodified.
 +
 +   Redistributions in compiled form (transformed to other DTDs, converted to PDF,
 +   PostScript, RTF and other formats) must reproduce the above copyright notice,
 +   this list of conditions and the following disclaimer in the documentation and/or
 +   other materials provided with the distribution.
 +
 +   THIS DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 +   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 +   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR  PURPOSE ARE
 +   DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS  BE LIABLE
 +   FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 +   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 +   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 +   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 +   OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 +   THIS DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 +   -->
 --
-.9.4
+.9.3

-New patch
+[Qemu-devel] [PULL v2 21/24] test-aio-multithread: add performance comparison with thread-based mutexes
+From: Paolo Bonzini <pbonzini@redhat.com>
 Add two implementations of the same benchmark as the previous patch,
 but using pthreads.  One uses a normal QemuMutex, the other is Linux
 only and implements a fair mutex based on MCS locks and futexes.
 This shows that the slower performance of the 5-thread case is due to
 the fairness of CoMutex, rather than to coroutines.  If fairness does
 not matter, as is the case with two threads, CoMutex can actually be
 faster than pthreads.
 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Message-id: 20170213181244.16297-4-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  tests/test-aio-multithread.c | 164 +++++++++++++++++++++++++++++++++++++++++++
 file changed, 164 insertions(+)
 diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tests/test-aio-multithread.c
 +++ b/tests/test-aio-multithread.c
@@ -XXX,XX +XXX,XX @@ static void test_multi_co_mutex_2_30(void)
      test_multi_co_mutex(2, 30);
  }
 +/* Same test with fair mutexes, for performance comparison.  */
 +
 +#ifdef CONFIG_LINUX
 +#include "qemu/futex.h"
 +
 +/* The nodes for the mutex reside in this structure (on which we try to avoid
 + * false sharing).  The head of the mutex is in the "mutex_head" variable.
 + */
 +static struct {
 +    int next, locked;
 +    int padding[14];
 +} nodes[NUM_CONTEXTS] __attribute__((__aligned__(64)));
 +
 +static int mutex_head = -1;
 +
 +static void mcs_mutex_lock(void)
 +{
 +    int prev;
 +
 +    nodes[id].next = -1;
 +    nodes[id].locked = 1;
 +    prev = atomic_xchg(&mutex_head, id);
 +    if (prev != -1) {
 +        atomic_set(&nodes[prev].next, id);
 +        qemu_futex_wait(&nodes[id].locked, 1);
 +    }
 +}
 +
 +static void mcs_mutex_unlock(void)
 +{
 +    int next;
 +    if (nodes[id].next == -1) {
 +        if (atomic_read(&mutex_head) == id &&
 +            atomic_cmpxchg(&mutex_head, id, -1) == id) {
 +            /* Last item in the list, exit.  */
 +            return;
 +        }
 +        while (atomic_read(&nodes[id].next) == -1) {
 +            /* mcs_mutex_lock did the xchg, but has not updated
 +             * nodes[prev].next yet.
 +             */
 +        }
 +    }
 +
 +    /* Wake up the next in line.  */
 +    next = nodes[id].next;
 +    nodes[next].locked = 0;
 +    qemu_futex_wake(&nodes[next].locked, 1);
 +}
 +
 +static void test_multi_fair_mutex_entry(void *opaque)
 +{
 +    while (!atomic_mb_read(&now_stopping)) {
 +        mcs_mutex_lock();
 +        counter++;
 +        mcs_mutex_unlock();
 +        atomic_inc(&atomic_counter);
 +    }
 +    atomic_dec(&running);
 +}
 +
 +static void test_multi_fair_mutex(int threads, int seconds)
 +{
 +    int i;
 +
 +    assert(mutex_head == -1);
 +    counter = 0;
 +    atomic_counter = 0;
 +    now_stopping = false;
 +
 +    create_aio_contexts();
 +    assert(threads <= NUM_CONTEXTS);
 +    running = threads;
 +    for (i = 0; i < threads; i++) {
 +        Coroutine *co1 = qemu_coroutine_create(test_multi_fair_mutex_entry, NULL);
 +        aio_co_schedule(ctx[i], co1);
 +    }
 +
 +    g_usleep(seconds * 1000000);
 +
 +    atomic_mb_set(&now_stopping, true);
 +    while (running > 0) {
 +        g_usleep(100000);
 +    }
 +
 +    join_aio_contexts();
 +    g_test_message("%d iterations/second\n", counter / seconds);
 +    g_assert_cmpint(counter, ==, atomic_counter);
 +}
 +
 +static void test_multi_fair_mutex_1(void)
 +{
 +    test_multi_fair_mutex(NUM_CONTEXTS, 1);
 +}
 +
 +static void test_multi_fair_mutex_10(void)
 +{
 +    test_multi_fair_mutex(NUM_CONTEXTS, 10);
 +}
 +#endif
 +
 +/* Same test with pthread mutexes, for performance comparison and
 + * portability.  */
 +
 +static QemuMutex mutex;
 +
 +static void test_multi_mutex_entry(void *opaque)
 +{
 +    while (!atomic_mb_read(&now_stopping)) {
 +        qemu_mutex_lock(&mutex);
 +        counter++;
 +        qemu_mutex_unlock(&mutex);
 +        atomic_inc(&atomic_counter);
 +    }
 +    atomic_dec(&running);
 +}
 +
 +static void test_multi_mutex(int threads, int seconds)
 +{
 +    int i;
 +
 +    qemu_mutex_init(&mutex);
 +    counter = 0;
 +    atomic_counter = 0;
 +    now_stopping = false;
 +
 +    create_aio_contexts();
 +    assert(threads <= NUM_CONTEXTS);
 +    running = threads;
 +    for (i = 0; i < threads; i++) {
 +        Coroutine *co1 = qemu_coroutine_create(test_multi_mutex_entry, NULL);
 +        aio_co_schedule(ctx[i], co1);
 +    }
 +
 +    g_usleep(seconds * 1000000);
 +
 +    atomic_mb_set(&now_stopping, true);
 +    while (running > 0) {
 +        g_usleep(100000);
 +    }
 +
 +    join_aio_contexts();
 +    g_test_message("%d iterations/second\n", counter / seconds);
 +    g_assert_cmpint(counter, ==, atomic_counter);
 +}
 +
 +static void test_multi_mutex_1(void)
 +{
 +    test_multi_mutex(NUM_CONTEXTS, 1);
 +}
 +
 +static void test_multi_mutex_10(void)
 +{
 +    test_multi_mutex(NUM_CONTEXTS, 10);
 +}
 +
  /* End of tests.  */
  int main(int argc, char **argv)
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
          g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1);
          g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_1);
          g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_3);
 +#ifdef CONFIG_LINUX
 +        g_test_add_func("/aio/multi/mutex/mcs", test_multi_fair_mutex_1);
 +#endif
 +        g_test_add_func("/aio/multi/mutex/pthread", test_multi_mutex_1);
      } else {
          g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10);
          g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_10);
          g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_30);
 +#ifdef CONFIG_LINUX
 +        g_test_add_func("/aio/multi/mutex/mcs", test_multi_fair_mutex_10);
 +#endif
 +        g_test_add_func("/aio/multi/mutex/pthread", test_multi_mutex_10);
      }
      return g_test_run();
  }
 --
 .9.3

-[Qemu-devel] [PULL 2/2] live-block-ops.txt: Rename, rewrite, and improve it
+[Qemu-devel] [PULL v2 22/24] coroutine-lock: place CoMutex before CoQueue in header
-From: Kashyap Chamarthy <kchamart@redhat.com>
+From: Paolo Bonzini <pbonzini@redhat.com>
-This patch documents (including their QMP invocations) all the four
+This will avoid forward references in the next patch.  It is also
-major kinds of live block operations:
+more logical because CoQueue is not anymore the basic primitive.
-  - `block-stream`
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-  - `block-commit`
+Reviewed-by: Fam Zheng <famz@redhat.com>
-  - `drive-mirror` (& `blockdev-mirror`)
+Message-id: 20170213181244.16297-5-pbonzini@redhat.com
-  - `drive-backup` (& `blockdev-backup`)
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  include/qemu/coroutine.h | 89 ++++++++++++++++++++++++------------------------
 file changed, 44 insertions(+), 45 deletions(-)
-Things considered while writing this document:
+diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
+index XXXXXXX..XXXXXXX 100644
-  - Use reStructuredText as markup language (with the goal of generating
+--- a/include/qemu/coroutine.h
-    the HTML output using the Sphinx Documentation Generator).  It is
++++ b/include/qemu/coroutine.h
-    gentler on the eye, and can be trivially converted to different
+@@ -XXX,XX +XXX,XX @@ bool qemu_in_coroutine(void);
-    formats.  (Another reason: upstream QEMU is considering to switch to
+  */
-    Sphinx, which uses reStructuredText as its markup language.)
+ bool qemu_coroutine_entered(Coroutine *co);
-  - Raw QMP JSON output vs. 'qmp-shell'.  I debated with myself whether
+-
-    to only show raw QMP JSON output (as that is the canonical
+-/**
-    representation), or use 'qmp-shell', which takes key-value pairs.  I
+- * CoQueues are a mechanism to queue coroutines in order to continue executing
-    settled on the approach of: for the first occurrence of a command,
+- * them later. They provide the fundamental primitives on which coroutine locks
-    use raw JSON; for subsequent occurrences, use 'qmp-shell', with an
+- * are built.
-    occasional exception.
+- */
+-typedef struct CoQueue {
-  - Usage of `-blockdev` command-line.
+-    QSIMPLEQ_HEAD(, Coroutine) entries;
+-} CoQueue;
-  - Usage of 'node-name' vs. file path to refer to disks.  While we have
+-
-    `blockdev-{mirror, backup}` as 'node-name'-alternatives for
+-/**
-    `drive-{mirror, backup}`, the `block-commit` command still operates
+- * Initialise a CoQueue. This must be called before any other operation is used
-    on file names for parameters 'base' and 'top'.  So I added a caveat
+- * on the CoQueue.
-    at the beginning to that effect.
+- */
+-void qemu_co_queue_init(CoQueue *queue);
-    Refer this related thread that I started (where I learnt
+-
-    `block-stream` was recently reworked to accept 'node-name' for 'top'
+-/**
-    and 'base' parameters):
+- * Adds the current coroutine to the CoQueue and transfers control to the
-    https://lists.nongnu.org/archive/html/qemu-devel/2017-05/msg06466.html
+- * caller of the coroutine.
-    "[RFC] Making 'block-stream', and 'block-commit' accept node-name"
+- */
+-void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
-All commands showed in this document were tested while documenting.
+-
+-/**
-Thanks: Eric Blake for the section: "A note on points-in-time vs file
+- * Restarts the next coroutine in the CoQueue and removes it from the queue.
-names".  This useful bit was originally articulated by Eric in his
+- *
-KVMForum 2015 presentation, so I included that specific bit in this
+- * Returns true if a coroutine was restarted, false if the queue is empty.
-document.
+- */
+-bool coroutine_fn qemu_co_queue_next(CoQueue *queue);
-Signed-off-by: Kashyap Chamarthy <kchamart@redhat.com>
+-
-Reviewed-by: Eric Blake <eblake@redhat.com>
+-/**
-Message-id: 20170717105205.32639-3-kchamart@redhat.com
+- * Restarts all coroutines in the CoQueue and leaves the queue empty.
-Signed-off-by: Jeff Cody <jcody@redhat.com>
+- */
----
+-void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue);
- docs/interop/live-block-operations.rst | 1088 ++++++++++++++++++++++++++++++++
+-
- docs/live-block-ops.txt                |   72 ---
+-/**
-files changed, 1088 insertions(+), 72 deletions(-)
+- * Enter the next coroutine in the queue
- create mode 100644 docs/interop/live-block-operations.rst
+- */
- delete mode 100644 docs/live-block-ops.txt
+-bool qemu_co_enter_next(CoQueue *queue);
+-
-diff --git a/docs/interop/live-block-operations.rst b/docs/interop/live-block-operations.rst
+-/**
-new file mode 100644
+- * Checks if the CoQueue is empty.
-index XXXXXXX..XXXXXXX
+- */
---- /dev/null
+-bool qemu_co_queue_empty(CoQueue *queue);
-+++ b/docs/interop/live-block-operations.rst
+-
-@@ -XXX,XX +XXX,XX @@
+-
-+..
+ /**
-+    Copyright (C) 2017 Red Hat Inc.
+  * Provides a mutex that can be used to synchronise coroutines
   */
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex);
   */
  void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex);
 +
-+    This work is licensed under the terms of the GNU GPL, version 2 or
++/**
-+    later.  See the COPYING file in the top-level directory.
++ * CoQueues are a mechanism to queue coroutines in order to continue executing
 + * them later.
 + */
 +typedef struct CoQueue {
 +    QSIMPLEQ_HEAD(, Coroutine) entries;
 +} CoQueue;
 +
-+============================
++/**
-+Live Block Device Operations
++ * Initialise a CoQueue. This must be called before any other operation is used
-+============================
++ * on the CoQueue.
 + */
 +void qemu_co_queue_init(CoQueue *queue);
 +
-+QEMU Block Layer currently (as of QEMU 2.9) supports four major kinds of
++/**
-+live block device jobs -- stream, commit, mirror, and backup.  These can
++ * Adds the current coroutine to the CoQueue and transfers control to the
-+be used to manipulate disk image chains to accomplish certain tasks,
++ * caller of the coroutine.
-+namely: live copy data from backing files into overlays; shorten long
++ */
-+disk image chains by merging data from overlays into backing files; live
++void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
 +synchronize data from a disk image chain (including current active disk)
 +to another target image; and point-in-time (and incremental) backups of
 +a block device.  Below is a description of the said block (QMP)
 +primitives, and some (non-exhaustive list of) examples to illustrate
 +their use.
 +
-+.. note::
++/**
-+    The file ``qapi/block-core.json`` in the QEMU source tree has the
++ * Restarts the next coroutine in the CoQueue and removes it from the queue.
-+    canonical QEMU API (QAPI) schema documentation for the QMP
++ *
-+    primitives discussed here.
++ * Returns true if a coroutine was restarted, false if the queue is empty.
 + */
 +bool coroutine_fn qemu_co_queue_next(CoQueue *queue);
 +
-+.. todo (kashyapc):: Remove the ".. contents::" directive when Sphinx is
++/**
-+                     integrated.
++ * Restarts all coroutines in the CoQueue and leaves the queue empty.
 + */
 +void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue);
 +
-+.. contents::
++/**
 + * Enter the next coroutine in the queue
 + */
 +bool qemu_co_enter_next(CoQueue *queue);
 +
-+Disk image backing chain notation
++/**
-+---------------------------------
++ * Checks if the CoQueue is empty.
-+
++ */
-+A simple disk image chain.  (This can be created live using QMP
++bool qemu_co_queue_empty(CoQueue *queue);
 +``blockdev-snapshot-sync``, or offline via ``qemu-img``)::
 +
 +                   (Live QEMU)
 +                        |
 +                        .
 +                        V
 +
 +            [A] <----- [B]
 +
 +    (backing file)    (overlay)
 +
 +The arrow can be read as: Image [A] is the backing file of disk image
 +[B].  And live QEMU is currently writing to image [B], consequently, it
 +is also referred to as the "active layer".
 +
 +There are two kinds of terminology that are common when referring to
 +files in a disk image backing chain:
 +
 +(1) Directional: 'base' and 'top'.  Given the simple disk image chain
 +    above, image [A] can be referred to as 'base', and image [B] as
 +    'top'.  (This terminology can be seen in the QAPI schema file,
 +    block-core.json.)
 +
 +(2) Relational: 'backing file' and 'overlay'.  Again, taking the same
 +    simple disk image chain from the above, disk image [A] is referred
 +    to as the backing file, and image [B] as overlay.
 +
 +   Throughout this document, we will use the relational terminology.
 +
 +.. important::
 +    The overlay files can generally be any format that supports a
 +    backing file, although QCOW2 is the preferred format and the one
 +    used in this document.
 +
 +
-+Brief overview of live block QMP primitives
+ typedef struct CoRwlock {
-+-------------------------------------------
+     bool writer;
-+
+     int reader;
 +The following are the four different kinds of live block operations that
 +QEMU block layer supports.
 +
 +(1) ``block-stream``: Live copy of data from backing files into overlay
 +    files.
 +
 +    .. note:: Once the 'stream' operation has finished, three things to
 +              note:
 +
 +                (a) QEMU rewrites the backing chain to remove
 +                    reference to the now-streamed and redundant backing
 +                    file;
 +
 +                (b) the streamed file *itself* won't be removed by QEMU,
 +                    and must be explicitly discarded by the user;
 +
 +                (c) the streamed file remains valid -- i.e. further
 +                    overlays can be created based on it.  Refer to the
 +                    ``block-stream`` section further below for more
 +                    details.
 +
 +(2) ``block-commit``: Live merge of data from overlay files into backing
 +    files (with the optional goal of removing the overlay file from the
 +    chain).  Since QEMU 2.0, this includes "active ``block-commit``"
 +    (i.e. merge the current active layer into the base image).
 +
 +    .. note:: Once the 'commit' operation has finished, there are three
 +              things to note here as well:
 +
 +                (a) QEMU rewrites the backing chain to remove reference
 +                    to now-redundant overlay images that have been
 +                    committed into a backing file;
 +
 +                (b) the committed file *itself* won't be removed by QEMU
 +                    -- it ought to be manually removed;
 +
 +                (c) however, unlike in the case of ``block-stream``, the
 +                    intermediate images will be rendered invalid -- i.e.
 +                    no further overlays can be created based on
 +                    them.  Refer to the ``block-commit`` section further
 +                    below for more details.
 +
 +(3) ``drive-mirror`` (and ``blockdev-mirror``): Synchronize a running
 +    disk to another image.
 +
 +(4) ``drive-backup`` (and ``blockdev-backup``): Point-in-time (live) copy
 +    of a block device to a destination.
 +
 +
 +.. _`Interacting with a QEMU instance`:
 +
 +Interacting with a QEMU instance
 +--------------------------------
 +
 +To show some example invocations of command-line, we will use the
 +following invocation of QEMU, with a QMP server running over UNIX
 +socket::
 +
 +    $ ./x86_64-softmmu/qemu-system-x86_64 -display none -nodefconfig \
 +        -M q35 -nodefaults -m 512 \
 +        -blockdev node-name=node-A,driver=qcow2,file.driver=file,file.node-name=file,file.filename=./a.qcow2 \
 +        -device virtio-blk,drive=node-A,id=virtio0 \
 +        -monitor stdio -qmp unix:/tmp/qmp-sock,server,nowait
 +
 +The ``-blockdev`` command-line option, used above, is available from
 +QEMU 2.9 onwards.  In the above invocation, notice the ``node-name``
 +parameter that is used to refer to the disk image a.qcow2 ('node-A') --
 +this is a cleaner way to refer to a disk image (as opposed to referring
 +to it by spelling out file paths).  So, we will continue to designate a
 +``node-name`` to each further disk image created (either via
 +``blockdev-snapshot-sync``, or ``blockdev-add``) as part of the disk
 +image chain, and continue to refer to the disks using their
 +``node-name`` (where possible, because ``block-commit`` does not yet, as
 +of QEMU 2.9, accept ``node-name`` parameter) when performing various
 +block operations.
 +
 +To interact with the QEMU instance launched above, we will use the
 +``qmp-shell`` utility (located at: ``qemu/scripts/qmp``, as part of the
 +QEMU source directory), which takes key-value pairs for QMP commands.
 +Invoke it as below (which will also print out the complete raw JSON
 +syntax for reference -- examples in the following sections)::
 +
 +    $ ./qmp-shell -v -p /tmp/qmp-sock
 +    (QEMU)
 +
 +.. note::
 +    In the event we have to repeat a certain QMP command, we will: for
 +    the first occurrence of it, show the ``qmp-shell`` invocation, *and*
 +    the corresponding raw JSON QMP syntax; but for subsequent
 +    invocations, present just the ``qmp-shell`` syntax, and omit the
 +    equivalent JSON output.
 +
 +
 +Example disk image chain
 +------------------------
 +
 +We will use the below disk image chain (and occasionally spelling it
 +out where appropriate) when discussing various primitives::
 +
 +    [A] <-- [B] <-- [C] <-- [D]
 +
 +Where [A] is the original base image; [B] and [C] are intermediate
 +overlay images; image [D] is the active layer -- i.e. live QEMU is
 +writing to it.  (The rule of thumb is: live QEMU will always be pointing
 +to the rightmost image in a disk image chain.)
 +
 +The above image chain can be created by invoking
 +``blockdev-snapshot-sync`` commands as follows (which shows the
 +creation of overlay image [B]) using the ``qmp-shell`` (our invocation
 +also prints the raw JSON invocation of it)::
 +
 +    (QEMU) blockdev-snapshot-sync node-name=node-A snapshot-file=b.qcow2 snapshot-node-name=node-B format=qcow2
 +    {
 +        "execute": "blockdev-snapshot-sync",
 +        "arguments": {
 +            "node-name": "node-A",
 +            "snapshot-file": "b.qcow2",
 +            "format": "qcow2",
 +            "snapshot-node-name": "node-B"
 +        }
 +    }
 +
 +Here, "node-A" is the name QEMU internally uses to refer to the base
 +image [A] -- it is the backing file, based on which the overlay image,
 +[B], is created.
 +
 +To create the rest of the overlay images, [C], and [D] (omitting the raw
 +JSON output for brevity)::
 +
 +    (QEMU) blockdev-snapshot-sync node-name=node-B snapshot-file=c.qcow2 snapshot-node-name=node-C format=qcow2
 +    (QEMU) blockdev-snapshot-sync node-name=node-C snapshot-file=d.qcow2 snapshot-node-name=node-D format=qcow2
 +
 +
 +A note on points-in-time vs file names
 +--------------------------------------
 +
 +In our disk image chain::
 +
 +    [A] <-- [B] <-- [C] <-- [D]
 +
 +We have *three* points in time and an active layer:
 +
 +- Point 1: Guest state when [B] was created is contained in file [A]
 +- Point 2: Guest state when [C] was created is contained in [A] + [B]
 +- Point 3: Guest state when [D] was created is contained in
 +  [A] + [B] + [C]
 +- Active layer: Current guest state is contained in [A] + [B] + [C] +
 +  [D]
 +
 +Therefore, be careful with naming choices:
 +
 +- Naming a file after the time it is created is misleading -- the
 +  guest data for that point in time is *not* contained in that file
 +  (as explained earlier)
 +- Rather, think of files as a *delta* from the backing file
 +
 +
 +Live block streaming --- ``block-stream``
 +-----------------------------------------
 +
 +The ``block-stream`` command allows you to do a live copy of data from
 +backing files into overlay images.
 +
 +Given our original example disk image chain from earlier::
 +
 +    [A] <-- [B] <-- [C] <-- [D]
 +
 +The disk image chain can be shortened in one of the following different
 +ways (not an exhaustive list).
 +
 +.. _`Case-1`:
 +
 +(1) Merge everything into the active layer: I.e. copy all contents from
 +    the base image, [A], and overlay images, [B] and [C], into [D],
 +    *while* the guest is running.  The resulting chain will be a
 +    standalone image, [D] -- with contents from [A], [B] and [C] merged
 +    into it (where live QEMU writes go to)::
 +
 +        [D]
 +
 +.. _`Case-2`:
 +
 +(2) Taking the same example disk image chain mentioned earlier, merge
 +    only images [B] and [C] into [D], the active layer.  The result will
 +    be that the contents of images [B] and [C] are copied into [D], and
 +    the backing file pointer of image [D] is adjusted to point to image
 +    [A].  The resulting chain will be::
 +
 +        [A] <-- [D]
 +
 +.. _`Case-3`:
 +
 +(3) Intermediate streaming (available since QEMU 2.8): Starting afresh
 +    with the original example disk image chain, with a total of four
 +    images, it is possible to copy contents from image [B] into image
 +    [C].  Once the copy is finished, image [B] can now be (optionally)
 +    discarded; and the backing file pointer of image [C] will be
 +    adjusted to point to [A].  I.e. after performing "intermediate
 +    streaming" of [B] into [C], the resulting image chain will be (where
 +    live QEMU is writing to [D])::
 +
 +        [A] <-- [C] <-- [D]
 +
 +
 +QMP invocation for ``block-stream``
 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 +
 +For `Case-1`_, to merge contents of all the backing files into the
 +active layer, where 'node-D' is the current active image (by default
 +``block-stream`` will flatten the entire chain); ``qmp-shell`` (and its
 +corresponding JSON output)::
 +
 +    (QEMU) block-stream device=node-D job-id=job0
 +    {
 +        "execute": "block-stream",
 +        "arguments": {
 +            "device": "node-D",
 +            "job-id": "job0"
 +        }
 +    }
 +
 +For `Case-2`_, merge contents of the images [B] and [C] into [D], where
 +image [D] ends up referring to image [A] as its backing file::
 +
 +    (QEMU) block-stream device=node-D base-node=node-A job-id=job0
 +
 +And for `Case-3`_, of "intermediate streaming", merge contents of
 +image [B] into [C], where [C] ends up referring to [A] as its backing
 +image::
 +
 +    (QEMU) block-stream device=node-C base-node=node-A job-id=job0
 +
 +Progress of a ``block-stream`` operation can be monitored via the QMP
 +command::
 +
 +    (QEMU) query-block-jobs
 +    {
 +        "execute": "query-block-jobs",
 +        "arguments": {}
 +    }
 +
 +
 +Once the ``block-stream`` operation has completed, QEMU will emit an
 +event, ``BLOCK_JOB_COMPLETED``.  The intermediate overlays remain valid,
 +and can now be (optionally) discarded, or retained to create further
 +overlays based on them.  Finally, the ``block-stream`` jobs can be
 +restarted at any time.
 +
 +
 +Live block commit --- ``block-commit``
 +--------------------------------------
 +
 +The ``block-commit`` command lets you merge live data from overlay
 +images into backing file(s).  Since QEMU 2.0, this includes "live active
 +commit" (i.e. it is possible to merge the "active layer", the right-most
 +image in a disk image chain where live QEMU will be writing to, into the
 +base image).  This is analogous to ``block-stream``, but in the opposite
 +direction.
 +
 +Again, starting afresh with our example disk image chain, where live
 +QEMU is writing to the right-most image in the chain, [D]::
 +
 +    [A] <-- [B] <-- [C] <-- [D]
 +
 +The disk image chain can be shortened in one of the following ways:
 +
 +.. _`block-commit_Case-1`:
 +
 +(1) Commit content from only image [B] into image [A].  The resulting
 +    chain is the following, where image [C] is adjusted to point at [A]
 +    as its new backing file::
 +
 +        [A] <-- [C] <-- [D]
 +
 +(2) Commit content from images [B] and [C] into image [A].  The
 +    resulting chain, where image [D] is adjusted to point to image [A]
 +    as its new backing file::
 +
 +        [A] <-- [D]
 +
 +.. _`block-commit_Case-3`:
 +
 +(3) Commit content from images [B], [C], and the active layer [D] into
 +    image [A].  The resulting chain (in this case, a consolidated single
 +    image)::
 +
 +        [A]
 +
 +(4) Commit content from only image [C] into image [B].  The
 +    resulting chain::
 +
 +        [A] <-- [B] <-- [D]
 +
 +(5) Commit content from image [C] and the active layer [D] into image
 +    [B].  The resulting chain::
 +
 +        [A] <-- [B]
 +
 +
 +QMP invocation for ``block-commit``
 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 +
 +For :ref:`Case-1 <block-commit_Case-1>`, to merge contents only from
 +image [B] into image [A], the invocation is as follows::
 +
 +    (QEMU) block-commit device=node-D base=a.qcow2 top=b.qcow2 job-id=job0
 +    {
 +        "execute": "block-commit",
 +        "arguments": {
 +            "device": "node-D",
 +            "job-id": "job0",
 +            "top": "b.qcow2",
 +            "base": "a.qcow2"
 +        }
 +    }
 +
 +Once the above ``block-commit`` operation has completed, a
 +``BLOCK_JOB_COMPLETED`` event will be issued, and no further action is
 +required.  As the end result, the backing file of image [C] is adjusted
 +to point to image [A], and the original 4-image chain will end up being
 +transformed to::
 +
 +    [A] <-- [C] <-- [D]
 +
 +.. note::
 +    The intermediate image [B] is invalid (as in: no further overlays
 +    based on it can be created).
 +
 +    Reasoning: An intermediate image after a 'stream' operation still
 +    represents that old point-in-time, and may be valid in that context.
 +    However, an intermediate image after a 'commit' operation no longer
 +    represents any point-in-time, and is invalid in any context.
 +
 +
 +However, :ref:`Case-3 <block-commit_Case-3>` (also called: "active
 +``block-commit``") is a *two-phase* operation: In the first phase, the
 +content from the active overlay, along with the intermediate overlays,
 +is copied into the backing file (also called the base image).  In the
 +second phase, adjust the said backing file as the current active image
 +-- possible via issuing the command ``block-job-complete``.  Optionally,
 +the ``block-commit`` operation can be cancelled by issuing the command
 +``block-job-cancel``, but be careful when doing this.
 +
 +Once the ``block-commit`` operation has completed, the event
 +``BLOCK_JOB_READY`` will be emitted, signalling that the synchronization
 +has finished.  Now the job can be gracefully completed by issuing the
 +command ``block-job-complete`` -- until such a command is issued, the
 +'commit' operation remains active.
 +
 +The following is the flow for :ref:`Case-3 <block-commit_Case-3>` to
 +convert a disk image chain such as this::
 +
 +    [A] <-- [B] <-- [C] <-- [D]
 +
 +Into::
 +
 +    [A]
 +
 +Where content from all the subsequent overlays, [B], and [C], including
 +the active layer, [D] (which is where live QEMU is performing all its
 +current writes), is committed back to [A].
 +
 +Start the "active ``block-commit``" operation::
 +
 +    (QEMU) block-commit device=node-D base=a.qcow2 top=d.qcow2 job-id=job0
 +    {
 +        "execute": "block-commit",
 +        "arguments": {
 +            "device": "node-D",
 +            "job-id": "job0",
 +            "top": "d.qcow2",
 +            "base": "a.qcow2"
 +        }
 +    }
 +
 +
 +Once the synchronization has completed, the event ``BLOCK_JOB_READY`` will
 +be emitted.
 +
 +Then, optionally query for the status of the active block operations.
 +We can see the 'commit' job is now ready to be completed, as indicated
 +by the line *"ready": true*::
 +
 +    (QEMU) query-block-jobs
 +    {
 +        "execute": "query-block-jobs",
 +        "arguments": {}
 +    }
 +    {
 +        "return": [
 +            {
 +                "busy": false,
 +                "type": "commit",
 +                "len": 1376256,
 +                "paused": false,
 +                "ready": true,
 +                "io-status": "ok",
 +                "offset": 1376256,
 +                "device": "job0",
 +                "speed": 0
 +            }
 +        ]
 +    }
 +
 +Gracefully complete the 'commit' block device job::
 +
 +    (QEMU) block-job-complete device=job0
 +    {
 +        "execute": "block-job-complete",
 +        "arguments": {
 +            "device": "job0"
 +        }
 +    }
 +    {
 +        "return": {}
 +    }
 +
 +Finally, once the above job is completed, an event
 +``BLOCK_JOB_COMPLETED`` will be emitted.
 +
 +.. note::
 +    The invocation for the rest of the cases (2, 4, and 5), discussed in
 +    the previous section, is omitted for brevity.
 +
 +
 +Live disk synchronization --- ``drive-mirror`` and ``blockdev-mirror``
 +----------------------------------------------------------------------
 +
 +Synchronize a running disk image chain (all or part of it) to a target
 +image.
 +
 +Again, given our familiar disk image chain::
 +
 +    [A] <-- [B] <-- [C] <-- [D]
 +
 +The ``drive-mirror`` command (and its newer equivalent
 +``blockdev-mirror``) allows you to copy data from the entire chain into
 +a single target image (which can be located on a different host).
 +
 +Once a 'mirror' job has started, there are two possible actions while a
 +``drive-mirror`` job is active:
 +
 +(1) Issuing the command ``block-job-cancel`` after the job emits the
 +    event ``BLOCK_JOB_READY``: will (after completing synchronization of
 +    the content from the disk image chain to the target image, [E])
 +    create a point-in-time (which is at the time of *triggering* the
 +    cancel command) copy, contained in image [E], of the entire disk
 +    image chain (or only the top-most image, depending on the ``sync``
 +    mode).
 +
 +(2) Issuing the command ``block-job-complete`` after the job emits the
 +    event ``BLOCK_JOB_READY``: will, after completing synchronization of
 +    the content, adjust the guest device (i.e. live QEMU) to point to
 +    the target image, causing all the new writes from this point on to
 +    happen there.  One use case for this is live storage migration.
 +
 +About synchronization modes: The synchronization mode determines
 +*which* part of the disk image chain will be copied to the target.
 +Currently, there are four different kinds:
 +
 +(1) ``full`` -- Synchronize the content of entire disk image chain to
 +    the target
 +
 +(2) ``top`` -- Synchronize only the contents of the top-most disk image
 +    in the chain to the target
 +
 +(3) ``none`` -- Synchronize only the new writes from this point on.
 +
 +    .. note:: In the case of ``drive-backup`` (or ``blockdev-backup``),
 +              the behavior of ``none`` synchronization mode is different.
 +              Normally, a ``backup`` job consists of two parts: Anything
 +              that is overwritten by the guest is first copied out to
 +              the backup, and in the background the whole image is
 +              copied from start to end. With ``sync=none``, it's only
 +              the first part.
 +
 +(4) ``incremental`` -- Synchronize content that is described by the
 +    dirty bitmap
 +
 +.. note::
 +    Refer to the :doc:`bitmaps` document in the QEMU source
 +    tree to learn about the detailed workings of the ``incremental``
 +    synchronization mode.
 +
 +
 +QMP invocation for ``drive-mirror``
 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 +
 +To copy the contents of the entire disk image chain, from [A] all the
 +way to [D], to a new target (``drive-mirror`` will create the destination
 +file, if it doesn't already exist), call it [E]::
 +
 +    (QEMU) drive-mirror device=node-D target=e.qcow2 sync=full job-id=job0
 +    {
 +        "execute": "drive-mirror",
 +        "arguments": {
 +            "device": "node-D",
 +            "job-id": "job0",
 +            "target": "e.qcow2",
 +            "sync": "full"
 +        }
 +    }
 +
 +The ``"sync": "full"``, from the above, means: copy the *entire* chain
 +to the destination.
 +
 +Following the above, querying for active block jobs will show that a
 +'mirror' job is "ready" to be completed (and QEMU will also emit an
 +event, ``BLOCK_JOB_READY``)::
 +
 +    (QEMU) query-block-jobs
 +    {
 +        "execute": "query-block-jobs",
 +        "arguments": {}
 +    }
 +    {
 +        "return": [
 +            {
 +                "busy": false,
 +                "type": "mirror",
 +                "len": 21757952,
 +                "paused": false,
 +                "ready": true,
 +                "io-status": "ok",
 +                "offset": 21757952,
 +                "device": "job0",
 +                "speed": 0
 +            }
 +        ]
 +    }
 +
 +And, as noted in the previous section, there are two possible actions
 +at this point:
 +
 +(a) Create a point-in-time snapshot by ending the synchronization.  The
 +    point-in-time is at the time of *ending* the sync.  (The result of
 +    the following being: the target image, [E], will be populated with
 +    content from the entire chain, [A] to [D])::
 +
 +        (QEMU) block-job-cancel device=job0
 +        {
 +            "execute": "block-job-cancel",
 +            "arguments": {
 +                "device": "job0"
 +            }
 +        }
 +
 +(b) Or, complete the operation and pivot the live QEMU to the target
 +    copy::
 +
 +        (QEMU) block-job-complete device=job0
 +
 +In either of the above cases, if you once again run the
 +``query-block-jobs`` command, there should not be any active block
 +operation.
 +
 +Comparing 'commit' and 'mirror': In both cases, the overlay images
 +can be discarded.  However, with 'commit', the *existing* base image
 +will be modified (by updating it with contents from overlays); while in
 +the case of 'mirror', a *new* target image is populated with the data
 +from the disk image chain.
 +
 +
 +QMP invocation for live storage migration with ``drive-mirror`` + NBD
 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 +
 +Live storage migration (without shared storage setup) is one of the most
 +common use-cases that takes advantage of the ``drive-mirror`` primitive
 +and QEMU's built-in Network Block Device (NBD) server.  Here's a quick
 +walk-through of this setup.
 +
 +Given the disk image chain::
 +
 +    [A] <-- [B] <-- [C] <-- [D]
 +
 +Instead of copying content from the entire chain, synchronize *only* the
 +contents of the *top*-most disk image (i.e. the active layer), [D], to a
 +target, say, [TargetDisk].
 +
 +.. important::
 +    The destination host must already have the contents of the backing
 +    chain, involving images [A], [B], and [C], visible via other means
 +    -- whether by ``cp``, ``rsync``, or by some storage array-specific
 +    command.
 +
 +Sometimes, this is also referred to as "shallow copy" -- because only
 +the "active layer", and not the rest of the image chain, is copied to
 +the destination.
 +
 +.. note::
 +    In this example, for the sake of simplicity, we'll be using the same
 +    ``localhost`` as both source and destination.
 +
 +As noted earlier, on the destination host the contents of the backing
 +chain -- from images [A] to [C] -- are already expected to exist in some
 +form (e.g. in a file called, ``Contents-of-A-B-C.qcow2``).  Now, on the
 +destination host, let's create a target overlay image (with the image
 +``Contents-of-A-B-C.qcow2`` as its backing file), to which the contents
 +of image [D] (from the source QEMU) will be mirrored::
 +
 +    $ qemu-img create -f qcow2 -b ./Contents-of-A-B-C.qcow2 \
 +        -F qcow2 ./target-disk.qcow2
 +
 +And start the destination QEMU (we already have the source QEMU running
 +-- discussed in the section: `Interacting with a QEMU instance`_)
 +instance, with the following invocation.  (As noted earlier, for
 +simplicity's sake, the destination QEMU is started on the same host, but
 +it could be located elsewhere)::
 +
 +    $ ./x86_64-softmmu/qemu-system-x86_64 -display none -nodefconfig \
 +        -M q35 -nodefaults -m 512 \
 +        -blockdev node-name=node-TargetDisk,driver=qcow2,file.driver=file,file.node-name=file,file.filename=./target-disk.qcow2 \
 +        -device virtio-blk,drive=node-TargetDisk,id=virtio0 \
 +        -S -monitor stdio -qmp unix:./qmp-sock2,server,nowait \
 +        -incoming tcp:localhost:6666
 +
 +Given the disk image chain on source QEMU::
 +
 +    [A] <-- [B] <-- [C] <-- [D]
 +
 +On the destination host, it is expected that the contents of the chain
 +``[A] <-- [B] <-- [C]`` are *already* present, and therefore copy *only*
 +the content of image [D].
 +
 +(1) [On *destination* QEMU] As part of the first step, start the
 +    built-in NBD server on a given host (local host, represented by
 +    ``::``) and port::
 +
 +        (QEMU) nbd-server-start addr={"type":"inet","data":{"host":"::","port":"49153"}}
 +        {
 +            "execute": "nbd-server-start",
 +            "arguments": {
 +                "addr": {
 +                    "data": {
 +                        "host": "::",
 +                        "port": "49153"
 +                    },
 +                    "type": "inet"
 +                }
 +            }
 +        }
 +
 +(2) [On *destination* QEMU] And export the destination disk image using
 +    QEMU's built-in NBD server::
 +
 +        (QEMU) nbd-server-add device=node-TargetDisk writable=true
 +        {
 +            "execute": "nbd-server-add",
 +            "arguments": {
 +                "device": "node-TargetDisk",
 +                "writable": true
 +            }
 +        }
 +
 +(3) [On *source* QEMU] Then, invoke ``drive-mirror`` with
 +    ``mode=existing`` (meaning: synchronize to a pre-created, therefore
 +    'existing', file on the target host), and with the synchronization
 +    mode as 'top' (``"sync": "top"``)::
 +
 +        (QEMU) drive-mirror device=node-D target=nbd:localhost:49153:exportname=node-TargetDisk sync=top mode=existing job-id=job0
 +        {
 +            "execute": "drive-mirror",
 +            "arguments": {
 +                "device": "node-D",
 +                "mode": "existing",
 +                "job-id": "job0",
 +                "target": "nbd:localhost:49153:exportname=node-TargetDisk",
 +                "sync": "top"
 +            }
 +        }
 +
 +(4) [On *source* QEMU] Once ``drive-mirror`` copies the entire data, and the
 +    event ``BLOCK_JOB_READY`` is emitted, issue ``block-job-cancel`` to
 +    gracefully end the synchronization, from source QEMU::
 +
 +        (QEMU) block-job-cancel device=job0
 +        {
 +            "execute": "block-job-cancel",
 +            "arguments": {
 +                "device": "job0"
 +            }
 +        }
 +
 +(5) [On *destination* QEMU] Then, stop the NBD server::
 +
 +        (QEMU) nbd-server-stop
 +        {
 +            "execute": "nbd-server-stop",
 +            "arguments": {}
 +        }
 +
 +(6) [On *destination* QEMU] Finally, resume the guest vCPUs by issuing the
 +    QMP command ``cont``::
 +
 +        (QEMU) cont
 +        {
 +            "execute": "cont",
 +            "arguments": {}
 +        }
 +
 +.. note::
 +    Higher-level libraries (e.g. libvirt) automate the entire above
 +    process (although note that libvirt does not allow same-host
 +    migrations to localhost for other reasons).
 +
 +
 +Notes on ``blockdev-mirror``
 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 +
 +The ``blockdev-mirror`` command is equivalent in core functionality to
 +``drive-mirror``, except that it operates at node-level in a Block
 +Driver State (BDS) graph.
 +
 +Also: for ``blockdev-mirror``, the 'target' image needs to be explicitly
 +created (using ``qemu-img``) and attached to live QEMU via
 +``blockdev-add``, which assigns a name to the to-be created target node.
 +
 +E.g. the sequence of actions to create a point-in-time backup of an
 +entire disk image chain, to a target, using ``blockdev-mirror`` would be:
 +
 +(0) Create the QCOW2 overlays, to arrive at a backing chain of desired
 +    depth
 +
 +(1) Create the target image (using ``qemu-img``), say, ``e.qcow2``
 +
 +(2) Attach the above created file (``e.qcow2``), run-time, using
 +    ``blockdev-add`` to QEMU
 +
 +(3) Perform ``blockdev-mirror`` (use ``"sync": "full"`` to copy the
 +    entire chain to the target).  And notice the event
 +    ``BLOCK_JOB_READY``
 +
 +(4) Optionally, query for active block jobs, there should be a 'mirror'
 +    job ready to be completed
 +
 +(5) Gracefully complete the 'mirror' block device job, and notice the
 +    event ``BLOCK_JOB_COMPLETED``
 +
 +(6) Shutdown the guest by issuing the QMP ``quit`` command so that
 +    caches are flushed
 +
 +(7) Then, finally, compare the contents of the disk image chain, and
 +    the target copy with ``qemu-img compare``.  You should notice:
 +    "Images are identical"
 +
 +
 +QMP invocation for ``blockdev-mirror``
 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 +
 +Given the disk image chain::
 +
 +    [A] <-- [B] <-- [C] <-- [D]
 +
 +To copy the contents of the entire disk image chain, from [A] all the
 +way to [D], to a new target, call it [E].  The following is the flow.
 +
 +Create the overlay images, [B], [C], and [D]::
 +
 +    (QEMU) blockdev-snapshot-sync node-name=node-A snapshot-file=b.qcow2 snapshot-node-name=node-B format=qcow2
 +    (QEMU) blockdev-snapshot-sync node-name=node-B snapshot-file=c.qcow2 snapshot-node-name=node-C format=qcow2
 +    (QEMU) blockdev-snapshot-sync node-name=node-C snapshot-file=d.qcow2 snapshot-node-name=node-D format=qcow2
 +
 +Create the target image, [E]::
 +
 +    $ qemu-img create -f qcow2 e.qcow2 39M
 +
 +Add the above created target image to QEMU, via ``blockdev-add``::
 +
 +    (QEMU) blockdev-add driver=qcow2 node-name=node-E file={"driver":"file","filename":"e.qcow2"}
 +    {
 +        "execute": "blockdev-add",
 +        "arguments": {
 +            "node-name": "node-E",
 +            "driver": "qcow2",
 +            "file": {
 +                "driver": "file",
 +                "filename": "e.qcow2"
 +            }
 +        }
 +    }
 +
 +Perform ``blockdev-mirror``, and notice the event ``BLOCK_JOB_READY``::
 +
 +    (QEMU) blockdev-mirror device=node-D target=node-E sync=full job-id=job0
 +    {
 +        "execute": "blockdev-mirror",
 +        "arguments": {
 +            "device": "node-D",
 +            "job-id": "job0",
 +            "target": "node-E",
 +            "sync": "full"
 +        }
 +    }
 +
 +Query for active block jobs, there should be a 'mirror' job ready::
 +
 +    (QEMU) query-block-jobs
 +    {
 +        "execute": "query-block-jobs",
 +        "arguments": {}
 +    }
 +    {
 +        "return": [
 +            {
 +                "busy": false,
 +                "type": "mirror",
 +                "len": 21561344,
 +                "paused": false,
 +                "ready": true,
 +                "io-status": "ok",
 +                "offset": 21561344,
 +                "device": "job0",
 +                "speed": 0
 +            }
 +        ]
 +    }
 +
 +Gracefully complete the block device job operation, and notice the
 +event ``BLOCK_JOB_COMPLETED``::
 +
 +    (QEMU) block-job-complete device=job0
 +    {
 +        "execute": "block-job-complete",
 +        "arguments": {
 +            "device": "job0"
 +        }
 +    }
 +    {
 +        "return": {}
 +    }
 +
 +Shutdown the guest, by issuing the ``quit`` QMP command::
 +
 +    (QEMU) quit
 +    {
 +        "execute": "quit",
 +        "arguments": {}
 +    }
 +
 +
 +Live disk backup --- ``drive-backup`` and ``blockdev-backup``
 +-------------------------------------------------------------
 +
 +The ``drive-backup`` command (and its newer equivalent
 +``blockdev-backup``) allows you to create a point-in-time snapshot.
 +
 +In this case, the point-in-time is when you *start* the ``drive-backup``
 +(or its newer equivalent ``blockdev-backup``) command.
 +
 +
 +QMP invocation for ``drive-backup``
 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 +
 +Yet again, starting afresh with our example disk image chain::
 +
 +    [A] <-- [B] <-- [C] <-- [D]
 +
 +To create a target image [E], with content populated from image [A] to
 +[D], from the above chain, the following is the syntax.  (If the target
 +image does not exist, ``drive-backup`` will create it)::
 +
 +    (QEMU) drive-backup device=node-D sync=full target=e.qcow2 job-id=job0
 +    {
 +        "execute": "drive-backup",
 +        "arguments": {
 +            "device": "node-D",
 +            "job-id": "job0",
 +            "sync": "full",
 +            "target": "e.qcow2"
 +        }
 +    }
 +
 +Once the above ``drive-backup`` has completed, a ``BLOCK_JOB_COMPLETED`` event
 +will be issued, indicating the live block device job operation has
 +completed, and no further action is required.
 +
 +
 +Notes on ``blockdev-backup``
 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 +
 +The ``blockdev-backup`` command is equivalent in functionality to
 +``drive-backup``, except that it operates at node-level in a Block Driver
 +State (BDS) graph.
 +
 +E.g. the sequence of actions to create a point-in-time backup
 +of an entire disk image chain, to a target, using ``blockdev-backup``
 +would be:
 +
 +(0) Create the QCOW2 overlays, to arrive at a backing chain of desired
 +    depth
 +
 +(1) Create the target image (using ``qemu-img``), say, ``e.qcow2``
 +
 +(2) Attach the above created file (``e.qcow2``), run-time, using
 +    ``blockdev-add`` to QEMU
 +
 +(3) Perform ``blockdev-backup`` (use ``"sync": "full"`` to copy the
 +    entire chain to the target).  And notice the event
 +    ``BLOCK_JOB_COMPLETED``
 +
 +(4) Shutdown the guest, by issuing the QMP ``quit`` command, so that
 +    caches are flushed
 +
 +(5) Then, finally, compare the contents of the disk image chain, and
 +    the target copy with ``qemu-img compare``.  You should notice:
 +    "Images are identical"
 +
 +The following section shows an example QMP invocation for
 +``blockdev-backup``.
 +
 +QMP invocation for ``blockdev-backup``
 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 +
 +Given a disk image chain of depth 1 where image [B] is the active
 +overlay (live QEMU is writing to it)::
 +
 +    [A] <-- [B]
 +
 +The following is the procedure to copy the content from the entire chain
 +to a target image (say, [E]), which has the full content from [A] and
 +[B].
 +
 +Create the overlay [B]::
 +
 +    (QEMU) blockdev-snapshot-sync node-name=node-A snapshot-file=b.qcow2 snapshot-node-name=node-B format=qcow2
 +    {
 +        "execute": "blockdev-snapshot-sync",
 +        "arguments": {
 +            "node-name": "node-A",
 +            "snapshot-file": "b.qcow2",
 +            "format": "qcow2",
 +            "snapshot-node-name": "node-B"
 +        }
 +    }
 +
 +
 +Create a target image that will contain the copy::
 +
 +    $ qemu-img create -f qcow2 e.qcow2 39M
 +
 +Then add it to QEMU via ``blockdev-add``::
 +
 +    (QEMU) blockdev-add driver=qcow2 node-name=node-E file={"driver":"file","filename":"e.qcow2"}
 +    {
 +        "execute": "blockdev-add",
 +        "arguments": {
 +            "node-name": "node-E",
 +            "driver": "qcow2",
 +            "file": {
 +                "driver": "file",
 +                "filename": "e.qcow2"
 +            }
 +        }
 +    }
 +
 +Then invoke ``blockdev-backup`` to copy the contents from the entire
 +image chain, consisting of images [A] and [B] to the target image
 +'e.qcow2'::
 +
 +    (QEMU) blockdev-backup device=node-B target=node-E sync=full job-id=job0
 +    {
 +        "execute": "blockdev-backup",
 +        "arguments": {
 +            "device": "node-B",
 +            "job-id": "job0",
 +            "target": "node-E",
 +            "sync": "full"
 +        }
 +    }
 +
 +Once the above 'backup' operation has completed, the event,
 +``BLOCK_JOB_COMPLETED`` will be emitted, signalling successful
 +completion.
 +
 +Next, query for any active block device jobs (there should be none)::
 +
 +    (QEMU) query-block-jobs
 +    {
 +        "execute": "query-block-jobs",
 +        "arguments": {}
 +    }
 +
 +Shutdown the guest::
 +
 +    (QEMU) quit
 +    {
 +        "execute": "quit",
 +        "arguments": {}
 +    }
 +    {
 +        "return": {}
 +    }
 +
 +.. note::
 +    The above step is really important; if forgotten, an error, "Failed
 +    to get shared "write" lock on e.qcow2", will be thrown when you do
 +    ``qemu-img compare`` to verify the integrity of the disk image
 +    with the backup content.
 +
 +
 +The end result will be the image 'e.qcow2' containing a
 +point-in-time backup of the disk image chain -- i.e. contents from
 +images [A] and [B] at the time the ``blockdev-backup`` command was
 +initiated.
 +
 +One way to confirm that the backup disk image contains content
 +identical to that of the disk image chain is to compare the backup with
 +the contents of the chain; you should see "Images are identical".  (NB:
 +this is assuming QEMU was launched with ``-S`` option, which will not
 +start the CPUs at guest boot up)::
 +
 +    $ qemu-img compare b.qcow2 e.qcow2
 +    Warning: Image size mismatch!
 +    Images are identical.
 +
 +NOTE: The "Warning: Image size mismatch!" is expected, as we created the
 +target image (e.qcow2) with 39M size.
 diff --git a/docs/live-block-ops.txt b/docs/live-block-ops.txt
 deleted file mode 100644
 index XXXXXXX..XXXXXXX
 --- a/docs/live-block-ops.txt
 +++ /dev/null
@@ -XXX,XX +XXX,XX @@
 -LIVE BLOCK OPERATIONS
 -=====================
 -
 -High level description of live block operations. Note these are not
 -supported for use with the raw format at the moment.
 -
 -Note also that this document is incomplete and it currently only
 -covers the 'stream' operation. Other operations supported by QEMU such
 -as 'commit', 'mirror' and 'backup' are not described here yet. Please
 -refer to the qapi/block-core.json file for an overview of those.
 -
 -Snapshot live merge
 -===================
 -
 -Given a snapshot chain, described in this document in the following
 -format:
 -
 -[A] <- [B] <- [C] <- [D] <- [E]
 -
 -Where the rightmost object ([E] in the example) described is the current
 -image which the guest OS has write access to. To the left of it is its base
 -image, and so on accordingly until the leftmost image, which has no
 -base.
 -
 -The snapshot live merge operation transforms such a chain into a
 -smaller one with fewer elements, such as this transformation relative
 -to the first example:
 -
 -[A] <- [E]
 -
 -Data is copied in the right direction with destination being the
 -rightmost image, but any other intermediate image can be specified
 -instead. In this example data is copied from [C] into [D], so [D] can
 -be backed by [B]:
 -
 -[A] <- [B] <- [D] <- [E]
 -
 -The operation is implemented in QEMU through image streaming facilities.
 -
 -The basic idea is to execute 'block_stream virtio0' while the guest is
 -running. Progress can be monitored using 'info block-jobs'. When the
 -streaming operation completes it raises a QMP event. 'block_stream'
 -copies data from the backing file(s) into the active image. When finished,
 -it adjusts the backing file pointer.
 -
 -The 'base' parameter specifies an image which data need not be
 -streamed from. This image will be used as the backing file for the
 -destination image when the operation is finished.
 -
 -In the first example above, the command would be:
 -
 -(qemu) block_stream virtio0 file-A.img
 -
 -In order to specify a destination image different from the active
 -(rightmost) one we can use its node name instead.
 -
 -In the second example above, the command would be:
 -
 -(qemu) block_stream node-D file-B.img
 -
 -Live block copy
 -===============
 -
 -To copy an in use image to another destination in the filesystem, one
 -should create a live snapshot in the desired destination, then stream
 -into that image. Example:
 -
 -(qemu) snapshot_blkdev ide0-hd0 /new-path/disk.img qcow2
 -
 -(qemu) block_stream ide0-hd0
 -
 -
 --
-.9.4
+.9.3

-New patch
+[Qemu-devel] [PULL v2 23/24] coroutine-lock: add mutex argument to CoQueue APIs
+From: Paolo Bonzini <pbonzini@redhat.com>
 All that CoQueue needs in order to become thread-safe is help
 from an external mutex.  Add this to the API.
 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Message-id: 20170213181244.16297-6-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  include/qemu/coroutine.h   |  8 +++++---
  block/backup.c             |  2 +-
  block/io.c                 |  4 ++--
  block/nbd-client.c         |  2 +-
  block/qcow2-cluster.c      |  4 +---
  block/sheepdog.c           |  2 +-
  block/throttle-groups.c    |  2 +-
  hw/9pfs/9p.c               |  2 +-
  util/qemu-coroutine-lock.c | 24 +++++++++++++++++++++---
 files changed, 34 insertions(+), 16 deletions(-)
 diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/coroutine.h
 +++ b/include/qemu/coroutine.h
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex);
  /**
   * CoQueues are a mechanism to queue coroutines in order to continue executing
 - * them later.
 + * them later.  They are similar to condition variables, but they need help
 + * from an external mutex in order to maintain thread-safety.
   */
  typedef struct CoQueue {
      QSIMPLEQ_HEAD(, Coroutine) entries;
@@ -XXX,XX +XXX,XX @@ void qemu_co_queue_init(CoQueue *queue);
  /**
   * Adds the current coroutine to the CoQueue and transfers control to the
 - * caller of the coroutine.
 + * caller of the coroutine.  The mutex is unlocked during the wait and
 + * locked again afterwards.
   */
 -void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
 +void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex);
  /**
   * Restarts the next coroutine in the CoQueue and removes it from the queue.
 diff --git a/block/backup.c b/block/backup.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/backup.c
 +++ b/block/backup.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
          retry = false;
          QLIST_FOREACH(req, &job->inflight_reqs, list) {
              if (end > req->start && start < req->end) {
 -                qemu_co_queue_wait(&req->wait_queue);
 +                qemu_co_queue_wait(&req->wait_queue, NULL);
                  retry = true;
                  break;
              }
 diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/io.c
 +++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
                   * (instead of producing a deadlock in the former case). */
                  if (!req->waiting_for) {
                      self->waiting_for = req;
 -                    qemu_co_queue_wait(&req->wait_queue);
 +                    qemu_co_queue_wait(&req->wait_queue, NULL);
                      self->waiting_for = NULL;
                      retry = true;
                      waited = true;
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
      /* Wait until any previous flushes are completed */
      while (bs->active_flush_req) {
 -        qemu_co_queue_wait(&bs->flush_queue);
 +        qemu_co_queue_wait(&bs->flush_queue, NULL);
      }
      bs->active_flush_req = true;
 diff --git a/block/nbd-client.c b/block/nbd-client.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/nbd-client.c
 +++ b/block/nbd-client.c
@@ -XXX,XX +XXX,XX @@ static void nbd_coroutine_start(NBDClientSession *s,
      /* Poor man semaphore.  The free_sema is locked when no other request
       * can be accepted, and unlocked after receiving one reply.  */
      if (s->in_flight == MAX_NBD_REQUESTS) {
 -        qemu_co_queue_wait(&s->free_sema);
 +        qemu_co_queue_wait(&s->free_sema, NULL);
          assert(s->in_flight < MAX_NBD_REQUESTS);
      }
      s->in_flight++;
 diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow2-cluster.c
 +++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
              if (bytes == 0) {
                  /* Wait for the dependency to complete. We need to recheck
                   * the free/allocated clusters when we continue. */
 -                qemu_co_mutex_unlock(&s->lock);
 -                qemu_co_queue_wait(&old_alloc->dependent_requests);
 -                qemu_co_mutex_lock(&s->lock);
 +                qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
                  return -EAGAIN;
              }
          }
 diff --git a/block/sheepdog.c b/block/sheepdog.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/sheepdog.c
 +++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *acb)
  retry:
      QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
          if (AIOCBOverlapping(acb, cb)) {
 -            qemu_co_queue_wait(&s->overlapping_queue);
 +            qemu_co_queue_wait(&s->overlapping_queue, NULL);
              goto retry;
          }
      }
 diff --git a/block/throttle-groups.c b/block/throttle-groups.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/throttle-groups.c
 +++ b/block/throttle-groups.c
@@ -XXX,XX +XXX,XX @@ void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk,
      if (must_wait || blkp->pending_reqs[is_write]) {
          blkp->pending_reqs[is_write]++;
          qemu_mutex_unlock(&tg->lock);
 -        qemu_co_queue_wait(&blkp->throttled_reqs[is_write]);
 +        qemu_co_queue_wait(&blkp->throttled_reqs[is_write], NULL);
          qemu_mutex_lock(&tg->lock);
          blkp->pending_reqs[is_write]--;
      }
 diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/9pfs/9p.c
 +++ b/hw/9pfs/9p.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn v9fs_flush(void *opaque)
          /*
           * Wait for pdu to complete.
           */
 -        qemu_co_queue_wait(&cancel_pdu->complete);
 +        qemu_co_queue_wait(&cancel_pdu->complete, NULL);
          cancel_pdu->cancelled = 0;
          pdu_free(cancel_pdu);
      }
 diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/qemu-coroutine-lock.c
 +++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@ void qemu_co_queue_init(CoQueue *queue)
      QSIMPLEQ_INIT(&queue->entries);
  }
 -void coroutine_fn qemu_co_queue_wait(CoQueue *queue)
 +void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex)
  {
      Coroutine *self = qemu_coroutine_self();
      QSIMPLEQ_INSERT_TAIL(&queue->entries, self, co_queue_next);
 +
 +    if (mutex) {
 +        qemu_co_mutex_unlock(mutex);
 +    }
 +
 +    /* There is no race condition here.  Other threads will call
 +     * aio_co_schedule on our AioContext, which can reenter this
 +     * coroutine but only after this yield and after the main loop
 +     * has gone through the next iteration.
 +     */
      qemu_coroutine_yield();
      assert(qemu_in_coroutine());
 +
 +    /* TODO: OSv implements wait morphing here, where the wakeup
 +     * primitive automatically places the woken coroutine on the
 +     * mutex's queue.  This avoids the thundering herd effect.
 +     */
 +    if (mutex) {
 +        qemu_co_mutex_lock(mutex);
 +    }
  }
  /**
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_rdlock(CoRwlock *lock)
      Coroutine *self = qemu_coroutine_self();
      while (lock->writer) {
 -        qemu_co_queue_wait(&lock->queue);
 +        qemu_co_queue_wait(&lock->queue, NULL);
      }
      lock->reader++;
      self->locks_held++;
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_wrlock(CoRwlock *lock)
      Coroutine *self = qemu_coroutine_self();
      while (lock->writer || lock->reader) {
 -        qemu_co_queue_wait(&lock->queue);
 +        qemu_co_queue_wait(&lock->queue, NULL);
      }
      lock->writer = true;
      self->locks_held++;
 --
 .9.3

-New patch
+[Qemu-devel] [PULL v2 24/24] coroutine-lock: make CoRwlock thread-safe and fair
+From: Paolo Bonzini <pbonzini@redhat.com>
+This adds a CoMutex around the existing CoQueue.  Because the write-side
+can just take CoMutex, the old "writer" field is not necessary anymore.
+Instead of removing it altogether, count the number of pending writers
+during a read-side critical section and forbid further readers from
+entering.
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
+Message-id: 20170213181244.16297-7-pbonzini@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ include/qemu/coroutine.h   |  3 ++-
+ util/qemu-coroutine-lock.c | 35 ++++++++++++++++++++++++-----------
+files changed, 26 insertions(+), 12 deletions(-)
+diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/qemu/coroutine.h
++++ b/include/qemu/coroutine.h
+@@ -XXX,XX +XXX,XX @@ bool qemu_co_queue_empty(CoQueue *queue);
+ typedef struct CoRwlock {
+-    bool writer;
++    int pending_writer;
+     int reader;
++    CoMutex mutex;
+     CoQueue queue;
+ } CoRwlock;
+diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/qemu-coroutine-lock.c
++++ b/util/qemu-coroutine-lock.c
+@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_init(CoRwlock *lock)
+ {
+     memset(lock, 0, sizeof(*lock));
+     qemu_co_queue_init(&lock->queue);
++    qemu_co_mutex_init(&lock->mutex);
+ }
+ void qemu_co_rwlock_rdlock(CoRwlock *lock)
+ {
+     Coroutine *self = qemu_coroutine_self();
+-    while (lock->writer) {
+-        qemu_co_queue_wait(&lock->queue, NULL);
++    qemu_co_mutex_lock(&lock->mutex);
++    /* For fairness, wait if a writer is in line.  */
++    while (lock->pending_writer) {
++        qemu_co_queue_wait(&lock->queue, &lock->mutex);
+     }
+     lock->reader++;
++    qemu_co_mutex_unlock(&lock->mutex);
++
++    /* The rest of the read-side critical section is run without the mutex.  */
+     self->locks_held++;
+ }
+@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_unlock(CoRwlock *lock)
+     Coroutine *self = qemu_coroutine_self();
+     assert(qemu_in_coroutine());
+-    if (lock->writer) {
+-        lock->writer = false;
++    if (!lock->reader) {
++        /* The critical section started in qemu_co_rwlock_wrlock.  */
+         qemu_co_queue_restart_all(&lock->queue);
+     } else {
++        self->locks_held--;
++
++        qemu_co_mutex_lock(&lock->mutex);
+         lock->reader--;
+         assert(lock->reader >= 0);
+         /* Wakeup only one waiting writer */
+@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_unlock(CoRwlock *lock)
+             qemu_co_queue_next(&lock->queue);
+         }
+     }
+-    self->locks_held--;
++    qemu_co_mutex_unlock(&lock->mutex);
+ }
+ void qemu_co_rwlock_wrlock(CoRwlock *lock)
+ {
+-    Coroutine *self = qemu_coroutine_self();
+-
+-    while (lock->writer || lock->reader) {
+-        qemu_co_queue_wait(&lock->queue, NULL);
++    qemu_co_mutex_lock(&lock->mutex);
++    lock->pending_writer++;
++    while (lock->reader) {
++        qemu_co_queue_wait(&lock->queue, &lock->mutex);
+     }
+-    lock->writer = true;
+-    self->locks_held++;
++    lock->pending_writer--;
++
++    /* The rest of the write-side critical section is run with
++     * the mutex taken, so that lock->reader remains zero.
++     * There is no need to update self->locks_held.
++     */
+ }
+--
+.9.3

The following changes since commit ca4e667dbf431d4a2a5a619cde79d30dd2ac3eb2:

Merge remote-tracking branch 'remotes/kraxel/tags/usb-20170717-pull-request' into staging (2017-07-17 17:54:17 +0100)

are available in the git repository at:

git://github.com/codyprime/qemu-kvm-jtc.git tags/block-pull-request

for you to fetch changes up to 8508eee740c78d1465e25dad7c3e06137485dfbc:

live-block-ops.txt: Rename, rewrite, and improve it (2017-07-18 00:11:01 -0400)

----------------------------------------------------------------
Block patches (documentation)
----------------------------------------------------------------

Kashyap Chamarthy (2):
  bitmaps.md: Convert to rST; move it into 'interop' dir
  live-block-ops.txt: Rename, rewrite, and improve it

docs/devel/bitmaps.md                  |  505 ---------------
 docs/interop/bitmaps.rst               |  555 ++++++++++++++++
 docs/interop/live-block-operations.rst | 1088 ++++++++++++++++++++++++++++++++
 docs/live-block-ops.txt                |   72 ---
 4 files changed, 1643 insertions(+), 577 deletions(-)
 delete mode 100644 docs/devel/bitmaps.md
 create mode 100644 docs/interop/bitmaps.rst
 create mode 100644 docs/interop/live-block-operations.rst
 delete mode 100644 docs/live-block-ops.txt

-- 
2.9.4

From: Kashyap Chamarthy <kchamart@redhat.com>

This is part of the on-going effort to convert QEMU upstream
documentation syntax to reStructuredText (rST).

The conversion to rST was done using:

$ pandoc -f markdown -t rst bitmaps.md -o bitmaps.rst

Then, make a couple of small syntactical adjustments.  While at it,
reword a statement to avoid ambiguity.  Addressing the feedback from
this thread:

https://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg05428.html

Signed-off-by: Kashyap Chamarthy <kchamart@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20170717105205.32639-2-kchamart@redhat.com
Signed-off-by: Jeff Cody <jcody@redhat.com>
---
 docs/devel/bitmaps.md    | 505 ------------------------------------------
 docs/interop/bitmaps.rst | 555 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 555 insertions(+), 505 deletions(-)
 delete mode 100644 docs/devel/bitmaps.md
 create mode 100644 docs/interop/bitmaps.rst

diff --git a/docs/devel/bitmaps.md b/docs/devel/bitmaps.md
deleted file mode 100644
index XXXXXXX..XXXXXXX
--- a/docs/devel/bitmaps.md
+++ /dev/null
@@ -XXX,XX +XXX,XX @@
-
-
-# Dirty Bitmaps and Incremental Backup
-
-* Dirty Bitmaps are objects that track which data needs to be backed up for the
-  next incremental backup.
-
-* Dirty bitmaps can be created at any time and attached to any node
-  (not just complete drives.)
-
-## Dirty Bitmap Names
-
-* A dirty bitmap's name is unique to the node, but bitmaps attached to different
-  nodes can share the same name.
-
-* Dirty bitmaps created for internal use by QEMU may be anonymous and have no
-  name, but any user-created bitmaps may not be. There can be any number of
-  anonymous bitmaps per node.
-
-* The name of a user-created bitmap must not be empty ("").
-
-## Bitmap Modes
-
-* A Bitmap can be "frozen," which means that it is currently in-use by a backup
-  operation and cannot be deleted, renamed, written to, reset,
-  etc.
-
-* The normal operating mode for a bitmap is "active."
-
-## Basic QMP Usage
-
-### Supported Commands ###
-
-* block-dirty-bitmap-add
-* block-dirty-bitmap-remove
-* block-dirty-bitmap-clear
-
-### Creation
-
-* To create a new bitmap, enabled, on the drive with id=drive0:
-
-```json
-{ "execute": "block-dirty-bitmap-add",
-  "arguments": {
-    "node": "drive0",
-    "name": "bitmap0"
-  }
-}
-```
-
-* This bitmap will have a default granularity that matches the cluster size of
-  its associated drive, if available, clamped to between [4KiB, 64KiB].
-  The current default for qcow2 is 64KiB.
-
-* To create a new bitmap that tracks changes in 32KiB segments:
-
-```json
-{ "execute": "block-dirty-bitmap-add",
-  "arguments": {
-    "node": "drive0",
-    "name": "bitmap0",
-    "granularity": 32768
-  }
-}
-```
-
-### Deletion
-
-* Bitmaps that are frozen cannot be deleted.
-
-* Deleting the bitmap does not impact any other bitmaps attached to the same
-  node, nor does it affect any backups already created from this node.
-
-* Because bitmaps are only unique to the node to which they are attached,
-  you must specify the node/drive name here, too.
-
-```json
-{ "execute": "block-dirty-bitmap-remove",
-  "arguments": {
-    "node": "drive0",
-    "name": "bitmap0"
-  }
-}
-```
-
-### Resetting
-
-* Resetting a bitmap will clear all information it holds.
-
-* An incremental backup created from an empty bitmap will copy no data,
-  as if nothing has changed.
-
-```json
-{ "execute": "block-dirty-bitmap-clear",
-  "arguments": {
-    "node": "drive0",
-    "name": "bitmap0"
-  }
-}
-```
-
-## Transactions
-
-### Justification
-
-Bitmaps can be safely modified when the VM is paused or halted by using
-the basic QMP commands. For instance, you might perform the following actions:
-
-1. Boot the VM in a paused state.
-2. Create a full drive backup of drive0.
-3. Create a new bitmap attached to drive0.
-4. Resume execution of the VM.
-5. Incremental backups are ready to be created.
-
-At this point, the bitmap and drive backup would be correctly in sync,
-and incremental backups made from this point forward would be correctly aligned
-to the full drive backup.
-
-This is not particularly useful if we decide we want to start incremental
-backups after the VM has been running for a while, for which we will need to
-perform actions such as the following:
-
-1. Boot the VM and begin execution.
-2. Using a single transaction, perform the following operations:
-    * Create bitmap0.
-    * Create a full drive backup of drive0.
-3. Incremental backups are now ready to be created.
-
-### Supported Bitmap Transactions
-
-* block-dirty-bitmap-add
-* block-dirty-bitmap-clear
-
-The usages are identical to their respective QMP commands, but see below
-for examples.
-
-### Example: New Incremental Backup
-
-As outlined in the justification, perhaps we want to create a new incremental
-backup chain attached to a drive.
-
-```json
-{ "execute": "transaction",
-  "arguments": {
-    "actions": [
-      {"type": "block-dirty-bitmap-add",
-       "data": {"node": "drive0", "name": "bitmap0"} },
-      {"type": "drive-backup",
-       "data": {"device": "drive0", "target": "/path/to/full_backup.img",
-                "sync": "full", "format": "qcow2"} }
-    ]
-  }
-}
-```
-
-### Example: New Incremental Backup Anchor Point
-
-Maybe we just want to create a new full backup with an existing bitmap and
-want to reset the bitmap to track the new chain.
-
-```json
-{ "execute": "transaction",
-  "arguments": {
-    "actions": [
-      {"type": "block-dirty-bitmap-clear",
-       "data": {"node": "drive0", "name": "bitmap0"} },
-      {"type": "drive-backup",
-       "data": {"device": "drive0", "target": "/path/to/new_full_backup.img",
-                "sync": "full", "format": "qcow2"} }
-    ]
-  }
-}
-```
-
-## Incremental Backups
-
-The star of the show.
-
-**Nota Bene!** Only incremental backups of entire drives are supported for now.
-So despite the fact that you can attach a bitmap to any arbitrary node, they are
-only currently useful when attached to the root node. This is because
-drive-backup only supports drives/devices instead of arbitrary nodes.
-
-### Example: First Incremental Backup
-
-1. Create a full backup and sync it to the dirty bitmap, as in the transactional
-examples above; or with the VM offline, manually create a full copy and then
-create a new bitmap before the VM begins execution.
-
-    * Let's assume the full backup is named 'full_backup.img'.
-    * Let's assume the bitmap you created is 'bitmap0' attached to 'drive0'.
-
-2. Create a destination image for the incremental backup that utilizes the
-full backup as a backing image.
-
-    * Let's assume it is named 'incremental.0.img'.
-
-    ```sh
-    # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
-    ```
-
-3. Issue the incremental backup command:
-
-    ```json
-    { "execute": "drive-backup",
-      "arguments": {
-        "device": "drive0",
-        "bitmap": "bitmap0",
-        "target": "incremental.0.img",
-        "format": "qcow2",
-        "sync": "incremental",
-        "mode": "existing"
-      }
-    }
-    ```
-
-### Example: Second Incremental Backup
-
-1. Create a new destination image for the incremental backup that points to the
-   previous one, e.g.: 'incremental.1.img'
-
-    ```sh
-    # qemu-img create -f qcow2 incremental.1.img -b incremental.0.img -F qcow2
-    ```
-
-2. Issue a new incremental backup command. The only difference here is that we
-   have changed the target image below.
-
-    ```json
-    { "execute": "drive-backup",
-      "arguments": {
-        "device": "drive0",
-        "bitmap": "bitmap0",
-        "target": "incremental.1.img",
-        "format": "qcow2",
-        "sync": "incremental",
-        "mode": "existing"
-      }
-    }
-    ```
-
-## Errors
-
-* In the event of an error that occurs after a backup job is successfully
-  launched, either by a direct QMP command or a QMP transaction, the user
-  will receive a BLOCK_JOB_COMPLETE event with a failure message, accompanied
-  by a BLOCK_JOB_ERROR event.
-
-* In the case of an event being cancelled, the user will receive a
-  BLOCK_JOB_CANCELLED event instead of a pair of COMPLETE and ERROR events.
-
-* In either case, the incremental backup data contained within the bitmap is
-  safely rolled back, and the data within the bitmap is not lost. The image
-  file created for the failed attempt can be safely deleted.
-
-* Once the underlying problem is fixed (e.g. more storage space is freed up),
-  you can simply retry the incremental backup command with the same bitmap.
-
-### Example
-
-1. Create a target image:
-
-    ```sh
-    # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
-    ```
-
-2. Attempt to create an incremental backup via QMP:
-
-    ```json
-    { "execute": "drive-backup",
-      "arguments": {
-        "device": "drive0",
-        "bitmap": "bitmap0",
-        "target": "incremental.0.img",
-        "format": "qcow2",
-        "sync": "incremental",
-        "mode": "existing"
-      }
-    }
-    ```
-
-3. Receive an event notifying us of failure:
-
-    ```json
-    { "timestamp": { "seconds": 1424709442, "microseconds": 844524 },
-      "data": { "speed": 0, "offset": 0, "len": 67108864,
-                "error": "No space left on device",
-                "device": "drive1", "type": "backup" },
-      "event": "BLOCK_JOB_COMPLETED" }
-    ```
-
-4. Delete the failed incremental, and re-create the image.
-
-    ```sh
-    # rm incremental.0.img
-    # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
-    ```
-
-5. Retry the command after fixing the underlying problem,
-   such as freeing up space on the backup volume:
-
-    ```json
-    { "execute": "drive-backup",
-      "arguments": {
-        "device": "drive0",
-        "bitmap": "bitmap0",
-        "target": "incremental.0.img",
-        "format": "qcow2",
-        "sync": "incremental",
-        "mode": "existing"
-      }
-    }
-    ```
-
-6. Receive confirmation that the job completed successfully:
-
-    ```json
-    { "timestamp": { "seconds": 1424709668, "microseconds": 526525 },
-      "data": { "device": "drive1", "type": "backup",
-                "speed": 0, "len": 67108864, "offset": 67108864},
-      "event": "BLOCK_JOB_COMPLETED" }
-    ```
-
-### Partial Transactional Failures
-
-* Sometimes, a transaction will succeed in launching and return success,
-  but then later the backup jobs themselves may fail. It is possible that
-  a management application may have to deal with a partial backup failure
-  after a successful transaction.
-
-* If multiple backup jobs are specified in a single transaction, when one of
-  them fails, it will not interact with the other backup jobs in any way.
-
-* The job(s) that succeeded will clear the dirty bitmap associated with the
-  operation, but the job(s) that failed will not. It is not "safe" to delete
-  any incremental backups that were created successfully in this scenario,
-  even though others failed.
-
-#### Example
-
-* QMP example highlighting two backup jobs:
-
-    ```json
-    { "execute": "transaction",
-      "arguments": {
-        "actions": [
-          { "type": "drive-backup",
-            "data": { "device": "drive0", "bitmap": "bitmap0",
-                      "format": "qcow2", "mode": "existing",
-                      "sync": "incremental", "target": "d0-incr-1.qcow2" } },
-          { "type": "drive-backup",
-            "data": { "device": "drive1", "bitmap": "bitmap1",
-                      "format": "qcow2", "mode": "existing",
-                      "sync": "incremental", "target": "d1-incr-1.qcow2" } },
-        ]
-      }
-    }
-    ```
-
-* QMP example response, highlighting one success and one failure:
-    * Acknowledgement that the Transaction was accepted and jobs were launched:
-        ```json
-        { "return": {} }
-        ```
-
-    * Later, QEMU sends notice that the first job was completed:
-        ```json
-        { "timestamp": { "seconds": 1447192343, "microseconds": 615698 },
-          "data": { "device": "drive0", "type": "backup",
-                     "speed": 0, "len": 67108864, "offset": 67108864 },
-          "event": "BLOCK_JOB_COMPLETED"
-        }
-        ```
-
-    * Later yet, QEMU sends notice that the second job has failed:
-        ```json
-        { "timestamp": { "seconds": 1447192399, "microseconds": 683015 },
-          "data": { "device": "drive1", "action": "report",
-                    "operation": "read" },
-          "event": "BLOCK_JOB_ERROR" }
-        ```
-
-        ```json
-        { "timestamp": { "seconds": 1447192399, "microseconds": 685853 },
-          "data": { "speed": 0, "offset": 0, "len": 67108864,
-                    "error": "Input/output error",
-                    "device": "drive1", "type": "backup" },
-          "event": "BLOCK_JOB_COMPLETED" }
-
-* In the above example, "d0-incr-1.qcow2" is valid and must be kept,
-  but "d1-incr-1.qcow2" is invalid and should be deleted. If a VM-wide
-  incremental backup of all drives at a point-in-time is to be made,
-  new backups for both drives will need to be made, taking into account
-  that a new incremental backup for drive0 needs to be based on top of
-  "d0-incr-1.qcow2."
-
-### Grouped Completion Mode
-
-* While jobs launched by transactions normally complete or fail on their own,
-  it is possible to instruct them to complete or fail together as a group.
-
-* QMP transactions take an optional properties structure that can affect
-  the semantics of the transaction.
-
-* The "completion-mode" transaction property can be either "individual"
-  which is the default, legacy behavior described above, or "grouped,"
-  a new behavior detailed below.
-
-* Delayed Completion: In grouped completion mode, no jobs will report
-  success until all jobs are ready to report success.
-
-* Grouped failure: If any job fails in grouped completion mode, all remaining
-  jobs will be cancelled. Any incremental backups will restore their dirty
-  bitmap objects as if no backup command was ever issued.
-
-    * Regardless of if QEMU reports a particular incremental backup job as
-      CANCELLED or as an ERROR, the in-memory bitmap will be restored.
-
-#### Example
-
-* Here's the same example scenario from above with the new property:
-
-    ```json
-    { "execute": "transaction",
-      "arguments": {
-        "actions": [
-          { "type": "drive-backup",
-            "data": { "device": "drive0", "bitmap": "bitmap0",
-                      "format": "qcow2", "mode": "existing",
-                      "sync": "incremental", "target": "d0-incr-1.qcow2" } },
-          { "type": "drive-backup",
-            "data": { "device": "drive1", "bitmap": "bitmap1",
-                      "format": "qcow2", "mode": "existing",
-                      "sync": "incremental", "target": "d1-incr-1.qcow2" } },
-        ],
-        "properties": {
-          "completion-mode": "grouped"
-        }
-      }
-    }
-    ```
-
-* QMP example response, highlighting a failure for drive2:
-    * Acknowledgement that the Transaction was accepted and jobs were launched:
-        ```json
-        { "return": {} }
-        ```
-
-    * Later, QEMU sends notice that the second job has errored out,
-      but that the first job was also cancelled:
-        ```json
-        { "timestamp": { "seconds": 1447193702, "microseconds": 632377 },
-          "data": { "device": "drive1", "action": "report",
-                    "operation": "read" },
-          "event": "BLOCK_JOB_ERROR" }
-        ```
-
-        ```json
-        { "timestamp": { "seconds": 1447193702, "microseconds": 640074 },
-          "data": { "speed": 0, "offset": 0, "len": 67108864,
-                    "error": "Input/output error",
-                    "device": "drive1", "type": "backup" },
-          "event": "BLOCK_JOB_COMPLETED" }
-        ```
-
-        ```json
-        { "timestamp": { "seconds": 1447193702, "microseconds": 640163 },
-          "data": { "device": "drive0", "type": "backup", "speed": 0,
-                    "len": 67108864, "offset": 16777216 },
-          "event": "BLOCK_JOB_CANCELLED" }
-        ```
-
-
diff --git a/docs/interop/bitmaps.rst b/docs/interop/bitmaps.rst
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/docs/interop/bitmaps.rst
@@ -XXX,XX +XXX,XX @@
+..
+   Copyright 2015 John Snow <jsnow@redhat.com> and Red Hat, Inc.
+   All rights reserved.
+
+   This file is licensed via The FreeBSD Documentation License, the full
+   text of which is included at the end of this document.
+
+====================================
+Dirty Bitmaps and Incremental Backup
+====================================
+
+-  Dirty Bitmaps are objects that track which data needs to be backed up
+   for the next incremental backup.
+
+-  Dirty bitmaps can be created at any time and attached to any node
+   (not just complete drives).
+
+.. contents::
+
+Dirty Bitmap Names
+------------------
+
+-  A dirty bitmap's name is unique to the node, but bitmaps attached to
+   different nodes can share the same name.
+
+-  Dirty bitmaps created for internal use by QEMU may be anonymous and
+   have no name, but any user-created bitmaps must have a name. There
+   can be any number of anonymous bitmaps per node.
+
+-  The name of a user-created bitmap must not be empty ("").
+
+Bitmap Modes
+------------
+
+-  A bitmap can be "frozen," which means that it is currently in-use by
+   a backup operation and cannot be deleted, renamed, written to, reset,
+   etc.
+
+-  The normal operating mode for a bitmap is "active."
+
+Basic QMP Usage
+---------------
+
+Supported Commands
+~~~~~~~~~~~~~~~~~~
+
+- ``block-dirty-bitmap-add``
+- ``block-dirty-bitmap-remove``
+- ``block-dirty-bitmap-clear``
+
+Creation
+~~~~~~~~
+
+-  To create a new bitmap, enabled, on the drive with id=drive0:
+
+.. code:: json
+
+    { "execute": "block-dirty-bitmap-add",
+      "arguments": {
+        "node": "drive0",
+        "name": "bitmap0"
+      }
+    }
+
+-  This bitmap will have a default granularity that matches the cluster
+   size of its associated drive, if available, clamped to between [4KiB,
+   64KiB]. The current default for qcow2 is 64KiB.
+
+-  To create a new bitmap that tracks changes in 32KiB segments:
+
+.. code:: json
+
+    { "execute": "block-dirty-bitmap-add",
+      "arguments": {
+        "node": "drive0",
+        "name": "bitmap0",
+        "granularity": 32768
+      }
+    }
+
+Deletion
+~~~~~~~~
+
+-  Bitmaps that are frozen cannot be deleted.
+
+-  Deleting the bitmap does not impact any other bitmaps attached to the
+   same node, nor does it affect any backups already created from this
+   node.
+
+-  Because bitmaps are only unique to the node to which they are
+   attached, you must specify the node/drive name here, too.
+
+.. code:: json
+
+    { "execute": "block-dirty-bitmap-remove",
+      "arguments": {
+        "node": "drive0",
+        "name": "bitmap0"
+      }
+    }
+
+Resetting
+~~~~~~~~~
+
+-  Resetting a bitmap will clear all information it holds.
+
+-  An incremental backup created from an empty bitmap will copy no data,
+   as if nothing has changed.
+
+.. code:: json
+
+    { "execute": "block-dirty-bitmap-clear",
+      "arguments": {
+        "node": "drive0",
+        "name": "bitmap0"
+      }
+    }
+
+Transactions
+------------
+
+Justification
+~~~~~~~~~~~~~
+
+Bitmaps can be safely modified when the VM is paused or halted by using
+the basic QMP commands. For instance, you might perform the following
+actions:
+
+1. Boot the VM in a paused state.
+2. Create a full drive backup of drive0.
+3. Create a new bitmap attached to drive0.
+4. Resume execution of the VM.
+5. Incremental backups are ready to be created.
+
+At this point, the bitmap and drive backup would be correctly in sync,
+and incremental backups made from this point forward would be correctly
+aligned to the full drive backup.
+
+This is not particularly useful if we decide we want to start
+incremental backups after the VM has been running for a while, for which
+we will need to perform actions such as the following:
+
+1. Boot the VM and begin execution.
+2. Using a single transaction, perform the following operations:
+
+   -  Create ``bitmap0``.
+   -  Create a full drive backup of ``drive0``.
+
+3. Incremental backups are now ready to be created.
+
+Supported Bitmap Transactions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+-  ``block-dirty-bitmap-add``
+-  ``block-dirty-bitmap-clear``
+
+The usages are identical to their respective QMP commands, but see below
+for examples.
+
+Example: New Incremental Backup
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As outlined in the justification, perhaps we want to create a new
+incremental backup chain attached to a drive.
+
+.. code:: json
+
+    { "execute": "transaction",
+      "arguments": {
+        "actions": [
+          {"type": "block-dirty-bitmap-add",
+           "data": {"node": "drive0", "name": "bitmap0"} },
+          {"type": "drive-backup",
+           "data": {"device": "drive0", "target": "/path/to/full_backup.img",
+                    "sync": "full", "format": "qcow2"} }
+        ]
+      }
+    }
+
+Example: New Incremental Backup Anchor Point
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Maybe we just want to create a new full backup with an existing bitmap
+and want to reset the bitmap to track the new chain.
+
+.. code:: json
+
+    { "execute": "transaction",
+      "arguments": {
+        "actions": [
+          {"type": "block-dirty-bitmap-clear",
+           "data": {"node": "drive0", "name": "bitmap0"} },
+          {"type": "drive-backup",
+           "data": {"device": "drive0", "target": "/path/to/new_full_backup.img",
+                    "sync": "full", "format": "qcow2"} }
+        ]
+      }
+    }
+
+Incremental Backups
+-------------------
+
+The star of the show.
+
+**Nota Bene!** Only incremental backups of entire drives are supported
+for now. So despite the fact that you can attach a bitmap to any
+arbitrary node, they are only currently useful when attached to the root
+node. This is because drive-backup only supports drives/devices instead
+of arbitrary nodes.
+
+Example: First Incremental Backup
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+1. Create a full backup and sync it to the dirty bitmap, as in the
+   transactional examples above; or with the VM offline, manually create
+   a full copy and then create a new bitmap before the VM begins
+   execution.
+
+   -  Let's assume the full backup is named ``full_backup.img``.
+   -  Let's assume the bitmap you created is ``bitmap0`` attached to
+      ``drive0``.
+
+2. Create a destination image for the incremental backup that utilizes
+   the full backup as a backing image.
+
+   -  Let's assume the new incremental image is named
+      ``incremental.0.img``.
+
+   .. code:: bash
+
+       $ qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
+
+3. Issue the incremental backup command:
+
+   .. code:: json
+
+       { "execute": "drive-backup",
+         "arguments": {
+           "device": "drive0",
+           "bitmap": "bitmap0",
+           "target": "incremental.0.img",
+           "format": "qcow2",
+           "sync": "incremental",
+           "mode": "existing"
+         }
+       }
+
+Example: Second Incremental Backup
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+1. Create a new destination image for the incremental backup that points
+   to the previous one, e.g.: ``incremental.1.img``
+
+   .. code:: bash
+
+       $ qemu-img create -f qcow2 incremental.1.img -b incremental.0.img -F qcow2
+
+2. Issue a new incremental backup command. The only difference here is
+   that we have changed the target image below.
+
+   .. code:: json
+
+       { "execute": "drive-backup",
+         "arguments": {
+           "device": "drive0",
+           "bitmap": "bitmap0",
+           "target": "incremental.1.img",
+           "format": "qcow2",
+           "sync": "incremental",
+           "mode": "existing"
+         }
+       }
+
+Errors
+------
+
+-  In the event of an error that occurs after a backup job is
+   successfully launched, either by a direct QMP command or a QMP
+   transaction, the user will receive a ``BLOCK_JOB_COMPLETED`` event with
+   a failure message, accompanied by a ``BLOCK_JOB_ERROR`` event.
+
+-  In the case of a job being cancelled, the user will receive a
+   ``BLOCK_JOB_CANCELLED`` event instead of a pair of COMPLETE and ERROR
+   events.
+
+-  In either case, the incremental backup data contained within the
+   bitmap is safely rolled back, and the data within the bitmap is not
+   lost. The image file created for the failed attempt can be safely
+   deleted.
+
+-  Once the underlying problem is fixed (e.g. more storage space is
+   freed up), you can simply retry the incremental backup command with
+   the same bitmap.
+
+Example
+~~~~~~~
+
+1. Create a target image:
+
+   .. code:: bash
+
+       $ qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
+
+2. Attempt to create an incremental backup via QMP:
+
+   .. code:: json
+
+       { "execute": "drive-backup",
+         "arguments": {
+           "device": "drive0",
+           "bitmap": "bitmap0",
+           "target": "incremental.0.img",
+           "format": "qcow2",
+           "sync": "incremental",
+           "mode": "existing"
+         }
+       }
+
+3. Receive an event notifying us of failure:
+
+   .. code:: json
+
+       { "timestamp": { "seconds": 1424709442, "microseconds": 844524 },
+         "data": { "speed": 0, "offset": 0, "len": 67108864,
+                   "error": "No space left on device",
+                   "device": "drive0", "type": "backup" },
+         "event": "BLOCK_JOB_COMPLETED" }
+
+4. Delete the failed incremental, and re-create the image.
+
+   .. code:: bash
+
+       $ rm incremental.0.img
+       $ qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
+
+5. Retry the command after fixing the underlying problem, such as
+   freeing up space on the backup volume:
+
+   .. code:: json
+
+       { "execute": "drive-backup",
+         "arguments": {
+           "device": "drive0",
+           "bitmap": "bitmap0",
+           "target": "incremental.0.img",
+           "format": "qcow2",
+           "sync": "incremental",
+           "mode": "existing"
+         }
+       }
+
+6. Receive confirmation that the job completed successfully:
+
+   .. code:: json
+
+       { "timestamp": { "seconds": 1424709668, "microseconds": 526525 },
+         "data": { "device": "drive0", "type": "backup",
+                   "speed": 0, "len": 67108864, "offset": 67108864},
+         "event": "BLOCK_JOB_COMPLETED" }
+
+Partial Transactional Failures
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+-  Sometimes, a transaction will succeed in launching and return
+   success, but then later the backup jobs themselves may fail. It is
+   possible that a management application may have to deal with a
+   partial backup failure after a successful transaction.
+
+-  If multiple backup jobs are specified in a single transaction, when
+   one of them fails, it will not interact with the other backup jobs in
+   any way.
+
+-  The job(s) that succeeded will clear the dirty bitmap associated with
+   the operation, but the job(s) that failed will not. It is not "safe"
+   to delete any incremental backups that were created successfully in
+   this scenario, even though others failed.
+
+Example
+^^^^^^^
+
+-  QMP example highlighting two backup jobs:
+
+   .. code:: json
+
+       { "execute": "transaction",
+         "arguments": {
+           "actions": [
+             { "type": "drive-backup",
+               "data": { "device": "drive0", "bitmap": "bitmap0",
+                         "format": "qcow2", "mode": "existing",
+                         "sync": "incremental", "target": "d0-incr-1.qcow2" } },
+             { "type": "drive-backup",
+               "data": { "device": "drive1", "bitmap": "bitmap1",
+                         "format": "qcow2", "mode": "existing",
+                         "sync": "incremental", "target": "d1-incr-1.qcow2" } }
+           ]
+         }
+       }
+
+-  QMP example response, highlighting one success and one failure:
+
+   -  Acknowledgement that the Transaction was accepted and jobs were
+      launched:
+
+      .. code:: json
+
+          { "return": {} }
+
+   -  Later, QEMU sends notice that the first job was completed:
+
+      .. code:: json
+
+          { "timestamp": { "seconds": 1447192343, "microseconds": 615698 },
+            "data": { "device": "drive0", "type": "backup",
+                       "speed": 0, "len": 67108864, "offset": 67108864 },
+            "event": "BLOCK_JOB_COMPLETED"
+          }
+
+   -  Later yet, QEMU sends notice that the second job has failed:
+
+      .. code:: json
+
+          { "timestamp": { "seconds": 1447192399, "microseconds": 683015 },
+            "data": { "device": "drive1", "action": "report",
+                      "operation": "read" },
+            "event": "BLOCK_JOB_ERROR" }
+
+      .. code:: json
+
+          { "timestamp": { "seconds": 1447192399, "microseconds":
+          685853 }, "data": { "speed": 0, "offset": 0, "len": 67108864,
+          "error": "Input/output error", "device": "drive1", "type":
+          "backup" }, "event": "BLOCK_JOB_COMPLETED" }
+
+-  In the above example, ``d0-incr-1.qcow2`` is valid and must be kept,
+   but ``d1-incr-1.qcow2`` is invalid and should be deleted. If a VM-wide
+   incremental backup of all drives at a point-in-time is to be made,
+   new backups for both drives will need to be made, taking into account
+   that a new incremental backup for drive0 needs to be based on top of
+   ``d0-incr-1.qcow2``.
+
+Grouped Completion Mode
+~~~~~~~~~~~~~~~~~~~~~~~
+
+-  While jobs launched by transactions normally complete or fail on
+   their own, it is possible to instruct them to complete or fail
+   together as a group.
+
+-  QMP transactions take an optional properties structure that can
+   affect the semantics of the transaction.
+
+-  The "completion-mode" transaction property can be either "individual"
+   which is the default, legacy behavior described above, or "grouped,"
+   a new behavior detailed below.
+
+-  Delayed Completion: In grouped completion mode, no jobs will report
+   success until all jobs are ready to report success.
+
+-  Grouped failure: If any job fails in grouped completion mode, all
+   remaining jobs will be cancelled. Any incremental backups will
+   restore their dirty bitmap objects as if no backup command was ever
+   issued.
+
+   -  Regardless of whether QEMU reports a particular incremental
+      backup job as CANCELLED or as an ERROR, the in-memory bitmap will
+      be restored.
+
+Example
+^^^^^^^
+
+-  Here's the same example scenario from above with the new property:
+
+   .. code:: json
+
+       { "execute": "transaction",
+         "arguments": {
+           "actions": [
+             { "type": "drive-backup",
+               "data": { "device": "drive0", "bitmap": "bitmap0",
+                         "format": "qcow2", "mode": "existing",
+                         "sync": "incremental", "target": "d0-incr-1.qcow2" } },
+             { "type": "drive-backup",
+               "data": { "device": "drive1", "bitmap": "bitmap1",
+                         "format": "qcow2", "mode": "existing",
+                         "sync": "incremental", "target": "d1-incr-1.qcow2" } }
+           ],
+           "properties": {
+             "completion-mode": "grouped"
+           }
+         }
+       }
+
+-  QMP example response, highlighting a failure for ``drive1``:
+
+   -  Acknowledgement that the Transaction was accepted and jobs were
+      launched:
+
+      .. code:: json
+
+          { "return": {} }
+
+   -  Later, QEMU sends notice that the second job has errored out, but
+      that the first job was also cancelled:
+
+      .. code:: json
+
+          { "timestamp": { "seconds": 1447193702, "microseconds": 632377 },
+            "data": { "device": "drive1", "action": "report",
+                      "operation": "read" },
+            "event": "BLOCK_JOB_ERROR" }
+
+      .. code:: json
+
+          { "timestamp": { "seconds": 1447193702, "microseconds": 640074 },
+            "data": { "speed": 0, "offset": 0, "len": 67108864,
+                      "error": "Input/output error",
+                      "device": "drive1", "type": "backup" },
+            "event": "BLOCK_JOB_COMPLETED" }
+
+      .. code:: json
+
+          { "timestamp": { "seconds": 1447193702, "microseconds": 640163 },
+            "data": { "device": "drive0", "type": "backup", "speed": 0,
+                      "len": 67108864, "offset": 16777216 },
+            "event": "BLOCK_JOB_CANCELLED" }
+
+.. raw:: html
+
+   
-- 
2.9.4

From: Kashyap Chamarthy <kchamart@redhat.com>

This patch documents (including their QMP invocations) all the four
major kinds of live block operations:

- `block-stream`
  - `block-commit`
  - `drive-mirror` (& `blockdev-mirror`)
  - `drive-backup` (& `blockdev-backup`)

Things considered while writing this document:

- Use reStructuredText as markup language (with the goal of generating
    the HTML output using the Sphinx Documentation Generator).  It is
    gentler on the eye, and can be trivially converted to different
    formats.  (Another reason: upstream QEMU is considering to switch to
    Sphinx, which uses reStructuredText as its markup language.)

- Raw QMP JSON output vs. 'qmp-shell'.  I debated with myself whether
    to only show raw QMP JSON output (as that is the canonical
    representation), or use 'qmp-shell', which takes key-value pairs.  I
    settled on the approach of: for the first occurrence of a command,
    use raw JSON; for subsequent occurrences, use 'qmp-shell', with an
    occasional exception.

- Usage of `-blockdev` command-line.

- Usage of 'node-name' vs. file path to refer to disks.  While we have
    `blockdev-{mirror, backup}` as 'node-name'-alternatives for
    `drive-{mirror, backup}`, the `block-commit` command still operates
    on file names for parameters 'base' and 'top'.  So I added a caveat
    at the beginning to that effect.

Refer to this related thread that I started (where I learnt
    `block-stream` was recently reworked to accept 'node-name' for 'top'
    and 'base' parameters):
    https://lists.nongnu.org/archive/html/qemu-devel/2017-05/msg06466.html
    "[RFC] Making 'block-stream', and 'block-commit' accept node-name"

All commands shown in this document were tested while documenting.

Thanks: Eric Blake for the section: "A note on points-in-time vs file
names".  This useful bit was originally articulated by Eric in his
KVMForum 2015 presentation, so I included that specific bit in this
document.

Signed-off-by: Kashyap Chamarthy <kchamart@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20170717105205.32639-3-kchamart@redhat.com
Signed-off-by: Jeff Cody <jcody@redhat.com>
---
 docs/interop/live-block-operations.rst | 1088 ++++++++++++++++++++++++++++++++
 docs/live-block-ops.txt                |   72 ---
 2 files changed, 1088 insertions(+), 72 deletions(-)
 create mode 100644 docs/interop/live-block-operations.rst
 delete mode 100644 docs/live-block-ops.txt

diff --git a/docs/interop/live-block-operations.rst b/docs/interop/live-block-operations.rst
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/docs/interop/live-block-operations.rst
@@ -XXX,XX +XXX,XX @@
+..
+    Copyright (C) 2017 Red Hat Inc.
+
+    This work is licensed under the terms of the GNU GPL, version 2 or
+    later.  See the COPYING file in the top-level directory.
+
+============================
+Live Block Device Operations
+============================
+
+QEMU Block Layer currently (as of QEMU 2.9) supports four major kinds of
+live block device jobs -- stream, commit, mirror, and backup.  These can
+be used to manipulate disk image chains to accomplish certain tasks,
+namely: live copy data from backing files into overlays; shorten long
+disk image chains by merging data from overlays into backing files; live
+synchronize data from a disk image chain (including current active disk)
+to another target image; and point-in-time (and incremental) backups of
+a block device.  Below is a description of the said block (QMP)
+primitives, and some (non-exhaustive list of) examples to illustrate
+their use.
+
+.. note::
+    The file ``qapi/block-core.json`` in the QEMU source tree has the
+    canonical QEMU API (QAPI) schema documentation for the QMP
+    primitives discussed here.
+
+.. todo (kashyapc):: Remove the ".. contents::" directive when Sphinx is
+                     integrated.
+
+.. contents::
+
+Disk image backing chain notation
+---------------------------------
+
+A simple disk image chain.  (This can be created live using QMP
+``blockdev-snapshot-sync``, or offline via ``qemu-img``)::
+
+                   (Live QEMU)
+                        |
+                        .
+                        V
+
+            [A] <----- [B]
+
+    (backing file)    (overlay)
+
+The arrow can be read as: Image [A] is the backing file of disk image
+[B].  And live QEMU is currently writing to image [B], consequently, it
+is also referred to as the "active layer".
+
+There are two kinds of terminology that are common when referring to
+files in a disk image backing chain:
+
+(1) Directional: 'base' and 'top'.  Given the simple disk image chain
+    above, image [A] can be referred to as 'base', and image [B] as
+    'top'.  (This terminology can be seen in the QAPI schema file,
+    block-core.json.)
+
+(2) Relational: 'backing file' and 'overlay'.  Again, taking the same
+    simple disk image chain from the above, disk image [A] is referred
+    to as the backing file, and image [B] as overlay.
+
+   Throughout this document, we will use the relational terminology.
+
+.. important::
+    The overlay files can generally be any format that supports a
+    backing file, although QCOW2 is the preferred format and the one
+    used in this document.
+
+
+Brief overview of live block QMP primitives
+-------------------------------------------
+
+The following are the four different kinds of live block operations that
+QEMU block layer supports.
+
+(1) ``block-stream``: Live copy of data from backing files into overlay
+    files.
+
+    .. note:: Once the 'stream' operation has finished, three things to
+              note:
+
+                (a) QEMU rewrites the backing chain to remove
+                    reference to the now-streamed and redundant backing
+                    file;
+
+                (b) the streamed file *itself* won't be removed by QEMU,
+                    and must be explicitly discarded by the user;
+
+                (c) the streamed file remains valid -- i.e. further
+                    overlays can be created based on it.  Refer to the
+                    ``block-stream`` section further below for more
+                    details.
+
+(2) ``block-commit``: Live merge of data from overlay files into backing
+    files (with the optional goal of removing the overlay file from the
+    chain).  Since QEMU 2.0, this includes "active ``block-commit``"
+    (i.e. merge the current active layer into the base image).
+
+    .. note:: Once the 'commit' operation has finished, there are three
+              things to note here as well:
+
+                (a) QEMU rewrites the backing chain to remove reference
+                    to now-redundant overlay images that have been
+                    committed into a backing file;
+
+                (b) the committed file *itself* won't be removed by QEMU
+                    -- it ought to be manually removed;
+
+                (c) however, unlike in the case of ``block-stream``, the
+                    intermediate images will be rendered invalid -- i.e.
+                    no further overlays can be created based on
+                    them.  Refer to the ``block-commit`` section further
+                    below for more details.
+
+(3) ``drive-mirror`` (and ``blockdev-mirror``): Synchronize a running
+    disk to another image.
+
+(4) ``drive-backup`` (and ``blockdev-backup``): Point-in-time (live) copy
+    of a block device to a destination.
+
+
+.. _`Interacting with a QEMU instance`:
+
+Interacting with a QEMU instance
+--------------------------------
+
+To show some example invocations of command-line, we will use the
+following invocation of QEMU, with a QMP server running over UNIX
+socket::
+
+    $ ./x86_64-softmmu/qemu-system-x86_64 -display none -nodefconfig \
+        -M q35 -nodefaults -m 512 \
+        -blockdev node-name=node-A,driver=qcow2,file.driver=file,file.node-name=file,file.filename=./a.qcow2 \
+        -device virtio-blk,drive=node-A,id=virtio0 \
+        -monitor stdio -qmp unix:/tmp/qmp-sock,server,nowait
+
+The ``-blockdev`` command-line option, used above, is available from
+QEMU 2.9 onwards.  In the above invocation, notice the ``node-name``
+parameter that is used to refer to the disk image a.qcow2 ('node-A') --
+this is a cleaner way to refer to a disk image (as opposed to referring
+to it by spelling out file paths).  So, we will continue to designate a
+``node-name`` to each further disk image created (either via
+``blockdev-snapshot-sync``, or ``blockdev-add``) as part of the disk
+image chain, and continue to refer to the disks using their
+``node-name`` (where possible, because ``block-commit`` does not yet, as
+of QEMU 2.9, accept ``node-name`` parameter) when performing various
+block operations.
+
+To interact with the QEMU instance launched above, we will use the
+``qmp-shell`` utility (located at: ``qemu/scripts/qmp``, as part of the
+QEMU source directory), which takes key-value pairs for QMP commands.
+Invoke it as below (which will also print out the complete raw JSON
+syntax for reference -- examples in the following sections)::
+
+    $ ./qmp-shell -v -p /tmp/qmp-sock
+    (QEMU)
+
+.. note::
+    In the event we have to repeat a certain QMP command, we will: for
+    the first occurrence of it, show the ``qmp-shell`` invocation, *and*
+    the corresponding raw JSON QMP syntax; but for subsequent
+    invocations, present just the ``qmp-shell`` syntax, and omit the
+    equivalent JSON output.
+
+
+Example disk image chain
+------------------------
+
+We will use the below disk image chain (and occasionally spell it
+out where appropriate) when discussing various primitives::
+
+    [A] <-- [B] <-- [C] <-- [D]
+
+Where [A] is the original base image; [B] and [C] are intermediate
+overlay images; image [D] is the active layer -- i.e. live QEMU is
+writing to it.  (The rule of thumb is: live QEMU will always be pointing
+to the rightmost image in a disk image chain.)
+
+The above image chain can be created by invoking
+``blockdev-snapshot-sync`` commands as follows (which shows the
+creation of overlay image [B]) using the ``qmp-shell`` (our invocation
+also prints the raw JSON invocation of it)::
+
+    (QEMU) blockdev-snapshot-sync node-name=node-A snapshot-file=b.qcow2 snapshot-node-name=node-B format=qcow2
+    {
+        "execute": "blockdev-snapshot-sync",
+        "arguments": {
+            "node-name": "node-A",
+            "snapshot-file": "b.qcow2",
+            "format": "qcow2",
+            "snapshot-node-name": "node-B"
+        }
+    }
+
+Here, "node-A" is the name QEMU internally uses to refer to the base
+image [A] -- it is the backing file, based on which the overlay image,
+[B], is created.
+
+To create the rest of the overlay images, [C], and [D] (omitting the raw
+JSON output for brevity)::
+
+    (QEMU) blockdev-snapshot-sync node-name=node-B snapshot-file=c.qcow2 snapshot-node-name=node-C format=qcow2
+    (QEMU) blockdev-snapshot-sync node-name=node-C snapshot-file=d.qcow2 snapshot-node-name=node-D format=qcow2
+
+
+A note on points-in-time vs file names
+--------------------------------------
+
+In our disk image chain::
+
+    [A] <-- [B] <-- [C] <-- [D]
+
+We have *three* points in time and an active layer:
+
+- Point 1: Guest state when [B] was created is contained in file [A]
+- Point 2: Guest state when [C] was created is contained in [A] + [B]
+- Point 3: Guest state when [D] was created is contained in
+  [A] + [B] + [C]
+- Active layer: Current guest state is contained in [A] + [B] + [C] +
+  [D]
+
+Therefore, be careful with naming choices:
+
+- Naming a file after the time it is created is misleading -- the
+  guest data for that point in time is *not* contained in that file
+  (as explained earlier)
+- Rather, think of files as a *delta* from the backing file
+
+
+Live block streaming --- ``block-stream``
+-----------------------------------------
+
+The ``block-stream`` command allows you to do a live copy of data from
+backing files into overlay images.
+
+Given our original example disk image chain from earlier::
+
+    [A] <-- [B] <-- [C] <-- [D]
+
+The disk image chain can be shortened in one of the following different
+ways (not an exhaustive list).
+
+.. _`Case-1`:
+
+(1) Merge everything into the active layer: I.e. copy all contents from
+    the base image, [A], and overlay images, [B] and [C], into [D],
+    *while* the guest is running.  The resulting chain will be a
+    standalone image, [D] -- with contents from [A], [B] and [C] merged
+    into it (where live QEMU writes go to)::
+
+        [D]
+
+.. _`Case-2`:
+
+(2) Taking the same example disk image chain mentioned earlier, merge
+    only images [B] and [C] into [D], the active layer.  As a result,
+    the contents of images [B] and [C] will be copied into [D], and the
+    backing file pointer of image [D] will be adjusted to point to image
+    [A].  The resulting chain will be::
+
+        [A] <-- [D]
+
+.. _`Case-3`:
+
+(3) Intermediate streaming (available since QEMU 2.8): Starting afresh
+    with the original example disk image chain, with a total of four
+    images, it is possible to copy contents from image [B] into image
+    [C].  Once the copy is finished, image [B] can now be (optionally)
+    discarded; and the backing file pointer of image [C] will be
+    adjusted to point to [A].  I.e. after performing "intermediate
+    streaming" of [B] into [C], the resulting image chain will be (where
+    live QEMU is writing to [D])::
+
+        [A] <-- [C] <-- [D]
+
+
+QMP invocation for ``block-stream``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+For `Case-1`_, to merge contents of all the backing files into the
+active layer, where 'node-D' is the current active image (by default
+``block-stream`` will flatten the entire chain); ``qmp-shell`` (and its
+corresponding JSON output)::
+
+    (QEMU) block-stream device=node-D job-id=job0
+    {
+        "execute": "block-stream",
+        "arguments": {
+            "device": "node-D",
+            "job-id": "job0"
+        }
+    }
+
+For `Case-2`_, merge contents of the images [B] and [C] into [D], where
+image [D] ends up referring to image [A] as its backing file::
+
+    (QEMU) block-stream device=node-D base-node=node-A job-id=job0
+
+And for `Case-3`_, of "intermediate streaming", merge contents of
+image [B] into [C], where [C] ends up referring to [A] as its backing
+image::
+
+    (QEMU) block-stream device=node-C base-node=node-A job-id=job0
+
+Progress of a ``block-stream`` operation can be monitored via the QMP
+command::
+
+    (QEMU) query-block-jobs
+    {
+        "execute": "query-block-jobs",
+        "arguments": {}
+    }
+
+
+Once the ``block-stream`` operation has completed, QEMU will emit an
+event, ``BLOCK_JOB_COMPLETED``.  The intermediate overlays remain valid,
+and can now be (optionally) discarded, or retained to create further
+overlays based on them.  Finally, the ``block-stream`` jobs can be
+restarted at any time.
+
+
+Live block commit --- ``block-commit``
+--------------------------------------
+
+The ``block-commit`` command lets you merge live data from overlay
+images into backing file(s).  Since QEMU 2.0, this includes "live active
+commit" (i.e. it is possible to merge the "active layer", the right-most
+image in a disk image chain where live QEMU will be writing to, into the
+base image).  This is analogous to ``block-stream``, but in the opposite
+direction.
+
+Again, starting afresh with our example disk image chain, where live
+QEMU is writing to the right-most image in the chain, [D]::
+
+    [A] <-- [B] <-- [C] <-- [D]
+
+The disk image chain can be shortened in one of the following ways:
+
+.. _`block-commit_Case-1`:
+
+(1) Commit content from only image [B] into image [A].  The resulting
+    chain is the following, where image [C] is adjusted to point at [A]
+    as its new backing file::
+
+        [A] <-- [C] <-- [D]
+
+(2) Commit content from images [B] and [C] into image [A].  The
+    resulting chain, where image [D] is adjusted to point to image [A]
+    as its new backing file::
+
+        [A] <-- [D]
+
+.. _`block-commit_Case-3`:
+
+(3) Commit content from images [B], [C], and the active layer [D] into
+    image [A].  The resulting chain (in this case, a consolidated single
+    image)::
+
+        [A]
+
+(4) Commit content from only image [C] into image [B].  The
+    resulting chain::
+
+	[A] <-- [B] <-- [D]
+
+(5) Commit content from image [C] and the active layer [D] into image
+    [B].  The resulting chain::
+
+	[A] <-- [B]
+
+
+QMP invocation for ``block-commit``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+For :ref:`Case-1 <block-commit_Case-1>`, to merge contents only from
+image [B] into image [A], the invocation is as follows::
+
+    (QEMU) block-commit device=node-D base=a.qcow2 top=b.qcow2 job-id=job0
+    {
+        "execute": "block-commit",
+        "arguments": {
+            "device": "node-D",
+            "job-id": "job0",
+            "top": "b.qcow2",
+            "base": "a.qcow2"
+        }
+    }
+
+Once the above ``block-commit`` operation has completed, a
+``BLOCK_JOB_COMPLETED`` event will be issued, and no further action is
+required.  As the end result, the backing file of image [C] is adjusted
+to point to image [A], and the original 4-image chain will end up being
+transformed to::
+
+    [A] <-- [C] <-- [D]
+
+.. note::
+    The intermediate image [B] is invalid (as in: no further
+    overlays based on it can be created).
+
+    Reasoning: An intermediate image after a 'stream' operation still
+    represents that old point-in-time, and may be valid in that context.
+    However, an intermediate image after a 'commit' operation no longer
+    represents any point-in-time, and is invalid in any context.
+
+
+However, :ref:`Case-3 <block-commit_Case-3>` (also called: "active
+``block-commit``") is a *two-phase* operation: In the first phase, the
+content from the active overlay, along with the intermediate overlays,
+is copied into the backing file (also called the base image).  In the
+second phase, adjust the said backing file as the current active image
+-- possible via issuing the command ``block-job-complete``.  Optionally,
+the ``block-commit`` operation can be cancelled by issuing the command
+``block-job-cancel``, but be careful when doing this.
+
+Once the ``block-commit`` operation has completed, the event
+``BLOCK_JOB_READY`` will be emitted, signalling that the synchronization
+has finished.  Now the job can be gracefully completed by issuing the
+command ``block-job-complete`` -- until such a command is issued, the
+'commit' operation remains active.
+
+The following is the flow for :ref:`Case-3 <block-commit_Case-3>` to
+convert a disk image chain such as this::
+
+    [A] <-- [B] <-- [C] <-- [D]
+
+Into::
+
+    [A]
+
+Where content from all the subsequent overlays, [B], and [C], including
+the active layer, [D], is committed back to [A] (which is where live
+QEMU is performing all its current writes).
+
+Start the "active ``block-commit``" operation::
+
+    (QEMU) block-commit device=node-D base=a.qcow2 top=d.qcow2 job-id=job0
+    {
+        "execute": "block-commit",
+        "arguments": {
+            "device": "node-D",
+            "job-id": "job0",
+            "top": "d.qcow2",
+            "base": "a.qcow2"
+        }
+    }
+
+
+Once the synchronization has completed, the event ``BLOCK_JOB_READY`` will
+be emitted.
+
+Then, optionally query for the status of the active block operations.
+We can see the 'commit' job is now ready to be completed, as indicated
+by the line *"ready": true*::
+
+    (QEMU) query-block-jobs
+    {
+        "execute": "query-block-jobs",
+        "arguments": {}
+    }
+    {
+        "return": [
+            {
+                "busy": false,
+                "type": "commit",
+                "len": 1376256,
+                "paused": false,
+                "ready": true,
+                "io-status": "ok",
+                "offset": 1376256,
+                "device": "job0",
+                "speed": 0
+            }
+        ]
+    }
+
+Gracefully complete the 'commit' block device job::
+
+    (QEMU) block-job-complete device=job0
+    {
+        "execute": "block-job-complete",
+        "arguments": {
+            "device": "job0"
+        }
+    }
+    {
+        "return": {}
+    }
+
+Finally, once the above job is completed, an event
+``BLOCK_JOB_COMPLETED`` will be emitted.
+
+.. note::
+    The invocation for rest of the cases (2, 4, and 5), discussed in the
+    previous section, is omitted for brevity.
+
+
+Live disk synchronization --- ``drive-mirror`` and ``blockdev-mirror``
+----------------------------------------------------------------------
+
+Synchronize a running disk image chain (all or part of it) to a target
+image.
+
+Again, given our familiar disk image chain::
+
+    [A] <-- [B] <-- [C] <-- [D]
+
+The ``drive-mirror`` (and its newer equivalent ``blockdev-mirror``) allows
+you to copy data from the entire chain into a single target image (which
+can be located on a different host).
+
+Once a 'mirror' job has started, there are two possible actions while a
+``drive-mirror`` job is active:
+
+(1) Issuing the command ``block-job-cancel`` after it emits the event
+    ``BLOCK_JOB_READY``: will (after completing synchronization of
+    the content from the disk image chain to the target image, [E])
+    create a point-in-time (which is at the time of *triggering* the
+    cancel command) copy, contained in image [E], of the entire disk
+    image chain (or only the top-most image, depending on the ``sync``
+    mode).
+
+(2) Issuing the command ``block-job-complete`` after it emits the event
+    ``BLOCK_JOB_READY``: will, after completing synchronization of
+    the content, adjust the guest device (i.e. live QEMU) to point to
+    the target image, causing all the new writes from this point on
+    to happen there.  One use case for this is live storage migration.
+
+About synchronization modes: The synchronization mode determines
+*which* part of the disk image chain will be copied to the target.
+Currently, there are four different kinds:
+
+(1) ``full`` -- Synchronize the content of entire disk image chain to
+    the target
+
+(2) ``top`` -- Synchronize only the contents of the top-most disk image
+    in the chain to the target
+
+(3) ``none`` -- Synchronize only the new writes from this point on.
+
+    .. note:: In the case of ``drive-backup`` (or ``blockdev-backup``),
+              the behavior of ``none`` synchronization mode is different.
+              Normally, a ``backup`` job consists of two parts: Anything
+              that is overwritten by the guest is first copied out to
+              the backup, and in the background the whole image is
+              copied from start to end. With ``sync=none``, it's only
+              the first part.
+
+(4) ``incremental`` -- Synchronize content that is described by the
+    dirty bitmap
+
+.. note::
+    Refer to the :doc:`bitmaps` document in the QEMU source
+    tree to learn about the detailed workings of the ``incremental``
+    synchronization mode.
+
+
+QMP invocation for ``drive-mirror``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To copy the contents of the entire disk image chain, from [A] all the
+way to [D], to a new target (``drive-mirror`` will create the destination
+file, if it doesn't already exist), call it [E]::
+
+    (QEMU) drive-mirror device=node-D target=e.qcow2 sync=full job-id=job0
+    {
+        "execute": "drive-mirror",
+        "arguments": {
+            "device": "node-D",
+            "job-id": "job0",
+            "target": "e.qcow2",
+            "sync": "full"
+        }
+    }
+
+The ``"sync": "full"``, from the above, means: copy the *entire* chain
+to the destination.
+
+Following the above, querying for active block jobs will show that a
+'mirror' job is "ready" to be completed (and QEMU will also emit an
+event, ``BLOCK_JOB_READY``)::
+
+    (QEMU) query-block-jobs
+    {
+        "execute": "query-block-jobs",
+        "arguments": {}
+    }
+    {
+        "return": [
+            {
+                "busy": false,
+                "type": "mirror",
+                "len": 21757952,
+                "paused": false,
+                "ready": true,
+                "io-status": "ok",
+                "offset": 21757952,
+                "device": "job0",
+                "speed": 0
+            }
+        ]
+    }
+
+And, as noted in the previous section, there are two possible actions
+at this point:
+
+(a) Create a point-in-time snapshot by ending the synchronization.  The
+    point-in-time is at the time of *ending* the sync.  (The result of
+    the following being: the target image, [E], will be populated with
+    content from the entire chain, [A] to [D])::
+
+        (QEMU) block-job-cancel device=job0
+        {
+            "execute": "block-job-cancel",
+            "arguments": {
+                "device": "job0"
+            }
+        }
+
+(b) Or, complete the operation and pivot the live QEMU to the target
+    copy::
+
+        (QEMU) block-job-complete device=job0
+
+In either of the above cases, if you once again run the
+``query-block-jobs`` command, there should not be any active block
+operation.
+
+Comparing 'commit' and 'mirror': In both the cases, the overlay images
+can be discarded.  However, with 'commit', the *existing* base image
+will be modified (by updating it with contents from overlays); while in
+the case of 'mirror', a *new* target image is populated with the data
+from the disk image chain.
+
+
+QMP invocation for live storage migration with ``drive-mirror`` + NBD
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Live storage migration (without shared storage setup) is one of the most
+common use-cases that takes advantage of the ``drive-mirror`` primitive
+and QEMU's built-in Network Block Device (NBD) server.  Here's a quick
+walk-through of this setup.
+
+Given the disk image chain::
+
+    [A] <-- [B] <-- [C] <-- [D]
+
+Instead of copying content from the entire chain, synchronize *only* the
+contents of the *top*-most disk image (i.e. the active layer), [D], to a
+target, say, [TargetDisk].
+
+.. important::
+    The destination host must already have the contents of the backing
+    chain, involving images [A], [B], and [C], visible via other means
+    -- whether by ``cp``, ``rsync``, or by some storage array-specific
+    command.
+
+Sometimes, this is also referred to as "shallow copy" -- because only
+the "active layer", and not the rest of the image chain, is copied to
+the destination.
+
+.. note::
+    In this example, for the sake of simplicity, we'll be using the same
+    ``localhost`` as both source and destination.
+
+As noted earlier, on the destination host the contents of the backing
+chain -- from images [A] to [C] -- are already expected to exist in some
+form (e.g. in a file called, ``Contents-of-A-B-C.qcow2``).  Now, on the
+destination host, let's create a target overlay image (with the image
+``Contents-of-A-B-C.qcow2`` as its backing file), to which the contents
+of image [D] (from the source QEMU) will be mirrored to::
+
+    $ qemu-img create -f qcow2 -b ./Contents-of-A-B-C.qcow2 \
+        -F qcow2 ./target-disk.qcow2
+
+And start the destination QEMU (we already have the source QEMU running
+-- discussed in the section: `Interacting with a QEMU instance`_)
+instance, with the following invocation.  (As noted earlier, for
+simplicity's sake, the destination QEMU is started on the same host, but
+it could be located elsewhere)::
+
+    $ ./x86_64-softmmu/qemu-system-x86_64 -display none -nodefconfig \
+        -M q35 -nodefaults -m 512 \
+        -blockdev node-name=node-TargetDisk,driver=qcow2,file.driver=file,file.node-name=file,file.filename=./target-disk.qcow2 \
+        -device virtio-blk,drive=node-TargetDisk,id=virtio0 \
+        -S -monitor stdio -qmp unix:./qmp-sock2,server,nowait \
+        -incoming tcp:localhost:6666
+
+Given the disk image chain on source QEMU::
+
+    [A] <-- [B] <-- [C] <-- [D]
+
+On the destination host, it is expected that the contents of the chain
+``[A] <-- [B] <-- [C]`` are *already* present, and therefore copy *only*
+the content of image [D].
+
+(1) [On *destination* QEMU] As part of the first step, start the
+    built-in NBD server on a given host (local host, represented by
+    ``::``) and port::
+
+        (QEMU) nbd-server-start addr={"type":"inet","data":{"host":"::","port":"49153"}}
+        {
+            "execute": "nbd-server-start",
+            "arguments": {
+                "addr": {
+                    "data": {
+                        "host": "::",
+                        "port": "49153"
+                    },
+                    "type": "inet"
+                }
+            }
+        }
+
+(2) [On *destination* QEMU] And export the destination disk image using
+    QEMU's built-in NBD server::
+
+        (QEMU) nbd-server-add device=node-TargetDisk writable=true
+        {
+            "execute": "nbd-server-add",
+            "arguments": {
+                "device": "node-TargetDisk",
+                "writable": true
+            }
+        }
+
+(3) [On *source* QEMU] Then, invoke ``drive-mirror`` with
+    ``mode=existing`` (meaning: synchronize to a pre-created, therefore
+    'existing', file on the target host), and with the synchronization
+    mode as 'top' (``"sync": "top"``)::
+
+        (QEMU) drive-mirror device=node-D target=nbd:localhost:49153:exportname=node-TargetDisk sync=top mode=existing job-id=job0
+        {
+            "execute": "drive-mirror",
+            "arguments": {
+                "device": "node-D",
+                "mode": "existing",
+                "job-id": "job0",
+                "target": "nbd:localhost:49153:exportname=node-TargetDisk",
+                "sync": "top"
+            }
+        }
+
+(4) [On *source* QEMU] Once ``drive-mirror`` copies the entire data, and the
+    event ``BLOCK_JOB_READY`` is emitted, issue ``block-job-cancel`` to
+    gracefully end the synchronization, from source QEMU::
+
+        (QEMU) block-job-cancel device=job0
+        {
+            "execute": "block-job-cancel",
+            "arguments": {
+                "device": "job0"
+            }
+        }
+
+(5) [On *destination* QEMU] Then, stop the NBD server::
+
+        (QEMU) nbd-server-stop
+        {
+            "execute": "nbd-server-stop",
+            "arguments": {}
+        }
+
+(6) [On *destination* QEMU] Finally, resume the guest vCPUs by issuing the
+    QMP command ``cont``::
+
+        (QEMU) cont
+        {
+            "execute": "cont",
+            "arguments": {}
+        }
+
+.. note::
+    Higher-level libraries (e.g. libvirt) automate the entire above
+    process (although note that libvirt does not allow same-host
+    migrations to localhost for other reasons).
+
+
+Notes on ``blockdev-mirror``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``blockdev-mirror`` command is equivalent in core functionality to
+``drive-mirror``, except that it operates at node-level in a BDS graph.
+
+Also: for ``blockdev-mirror``, the 'target' image needs to be explicitly
+created (using ``qemu-img``) and attached to live QEMU via
+``blockdev-add``, which assigns a name to the to-be created target node.
+
+E.g. the sequence of actions to create a point-in-time backup of an
+entire disk image chain, to a target, using ``blockdev-mirror`` would be:
+
+(0) Create the QCOW2 overlays, to arrive at a backing chain of desired
+    depth
+
+(1) Create the target image (using ``qemu-img``), say, ``e.qcow2``
+
+(2) Attach the above created file (``e.qcow2``), run-time, using
+    ``blockdev-add`` to QEMU
+
+(3) Perform ``blockdev-mirror`` (use ``"sync": "full"`` to copy the
+    entire chain to the target).  And notice the event
+    ``BLOCK_JOB_READY``
+
+(4) Optionally, query for active block jobs, there should be a 'mirror'
+    job ready to be completed
+
+(5) Gracefully complete the 'mirror' block device job, and notice the
+    event ``BLOCK_JOB_COMPLETED``
+
+(6) Shutdown the guest by issuing the QMP ``quit`` command so that
+    caches are flushed
+
+(7) Then, finally, compare the contents of the disk image chain, and
+    the target copy with ``qemu-img compare``.  You should notice:
+    "Images are identical"
+
+
+QMP invocation for ``blockdev-mirror``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Given the disk image chain::
+
+    [A] <-- [B] <-- [C] <-- [D]
+
+To copy the contents of the entire disk image chain, from [A] all the
+way to [D], to a new target, call it [E].  The following is the flow.
+
+Create the overlay images, [B], [C], and [D]::
+
+    (QEMU) blockdev-snapshot-sync node-name=node-A snapshot-file=b.qcow2 snapshot-node-name=node-B format=qcow2
+    (QEMU) blockdev-snapshot-sync node-name=node-B snapshot-file=c.qcow2 snapshot-node-name=node-C format=qcow2
+    (QEMU) blockdev-snapshot-sync node-name=node-C snapshot-file=d.qcow2 snapshot-node-name=node-D format=qcow2
+
+Create the target image, [E]::
+
+    $ qemu-img create -f qcow2 e.qcow2 39M
+
+Add the above created target image to QEMU, via ``blockdev-add``::
+
+    (QEMU) blockdev-add driver=qcow2 node-name=node-E file={"driver":"file","filename":"e.qcow2"}
+    {
+        "execute": "blockdev-add",
+        "arguments": {
+            "node-name": "node-E",
+            "driver": "qcow2",
+            "file": {
+                "driver": "file",
+                "filename": "e.qcow2"
+            }
+        }
+    }
+
+Perform ``blockdev-mirror``, and notice the event ``BLOCK_JOB_READY``::
+
+    (QEMU) blockdev-mirror device=node-D target=node-E sync=full job-id=job0
+    {
+        "execute": "blockdev-mirror",
+        "arguments": {
+            "device": "node-D",
+            "job-id": "job0",
+            "target": "node-E",
+            "sync": "full"
+        }
+    }
+
+Query for active block jobs, there should be a 'mirror' job ready::
+
+    (QEMU) query-block-jobs
+    {
+        "execute": "query-block-jobs",
+        "arguments": {}
+    }
+    {
+        "return": [
+            {
+                "busy": false,
+                "type": "mirror",
+                "len": 21561344,
+                "paused": false,
+                "ready": true,
+                "io-status": "ok",
+                "offset": 21561344,
+                "device": "job0",
+                "speed": 0
+            }
+        ]
+    }
+
+Gracefully complete the block device job operation, and notice the
+event ``BLOCK_JOB_COMPLETED``::
+
+    (QEMU) block-job-complete device=job0
+    {
+        "execute": "block-job-complete",
+        "arguments": {
+            "device": "job0"
+        }
+    }
+    {
+        "return": {}
+    }
+
+Shutdown the guest, by issuing the ``quit`` QMP command::
+
+    (QEMU) quit
+    {
+        "execute": "quit",
+        "arguments": {}
+    }
+
+
+Live disk backup --- ``drive-backup`` and ``blockdev-backup``
+-------------------------------------------------------------
+
+The ``drive-backup`` (and its newer equivalent ``blockdev-backup``) allows
+you to create a point-in-time snapshot.
+
+In this case, the point-in-time is when you *start* the ``drive-backup``
+(or its newer equivalent ``blockdev-backup``) command.
+
+
+QMP invocation for ``drive-backup``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Yet again, starting afresh with our example disk image chain::
+
+    [A] <-- [B] <-- [C] <-- [D]
+
+To create a target image [E], with content populated from image [A] to
+[D], from the above chain, the following is the syntax.  (If the target
+image does not exist, ``drive-backup`` will create it)::
+
+    (QEMU) drive-backup device=node-D sync=full target=e.qcow2 job-id=job0
+    {
+        "execute": "drive-backup",
+        "arguments": {
+            "device": "node-D",
+            "job-id": "job0",
+            "sync": "full",
+            "target": "e.qcow2"
+        }
+    }
+
+Once the above ``drive-backup`` has completed, a ``BLOCK_JOB_COMPLETED`` event
+will be issued, indicating the live block device job operation has
+completed, and no further action is required.
+
+
+Notes on ``blockdev-backup``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``blockdev-backup`` command is equivalent in functionality to
+``drive-backup``, except that it operates at node-level in a Block Driver
+State (BDS) graph.
+
+E.g. the sequence of actions to create a point-in-time backup
+of an entire disk image chain, to a target, using ``blockdev-backup``
+would be:
+
+(0) Create the QCOW2 overlays, to arrive at a backing chain of desired
+    depth
+
+(1) Create the target image (using ``qemu-img``), say, ``e.qcow2``
+
+(2) Attach the above created file (``e.qcow2``), run-time, using
+    ``blockdev-add`` to QEMU
+
+(3) Perform ``blockdev-backup`` (use ``"sync": "full"`` to copy the
+    entire chain to the target).  And notice the event
+    ``BLOCK_JOB_COMPLETED``
+
+(4) Shutdown the guest, by issuing the QMP ``quit`` command, so that
+    caches are flushed
+
+(5) Then, finally, compare the contents of the disk image chain, and
+    the target copy with ``qemu-img compare``.  You should notice:
+    "Images are identical"
+
+The following section shows an example QMP invocation for
+``blockdev-backup``.
+
+QMP invocation for ``blockdev-backup``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Given a disk image chain of depth 1 where image [B] is the active
+overlay (live QEMU is writing to it)::
+
+    [A] <-- [B]
+
+The following is the procedure to copy the content from the entire chain
+to a target image (say, [E]), which has the full content from [A] and
+[B].
+
+Create the overlay [B]::
+
+    (QEMU) blockdev-snapshot-sync node-name=node-A snapshot-file=b.qcow2 snapshot-node-name=node-B format=qcow2
+    {
+        "execute": "blockdev-snapshot-sync",
+        "arguments": {
+            "node-name": "node-A",
+            "snapshot-file": "b.qcow2",
+            "format": "qcow2",
+            "snapshot-node-name": "node-B"
+        }
+    }
+
+
+Create a target image that will contain the copy::
+
+    $ qemu-img create -f qcow2 e.qcow2 39M
+
+Then add it to QEMU via ``blockdev-add``::
+
+    (QEMU) blockdev-add driver=qcow2 node-name=node-E file={"driver":"file","filename":"e.qcow2"}
+    {
+        "execute": "blockdev-add",
+        "arguments": {
+            "node-name": "node-E",
+            "driver": "qcow2",
+            "file": {
+                "driver": "file",
+                "filename": "e.qcow2"
+            }
+        }
+    }
+
+Then invoke ``blockdev-backup`` to copy the contents from the entire
+image chain, consisting of images [A] and [B] to the target image
+'e.qcow2'::
+
+    (QEMU) blockdev-backup device=node-B target=node-E sync=full job-id=job0
+    {
+        "execute": "blockdev-backup",
+        "arguments": {
+            "device": "node-B",
+            "job-id": "job0",
+            "target": "node-E",
+            "sync": "full"
+        }
+    }
+
+Once the above 'backup' operation has completed, the event,
+``BLOCK_JOB_COMPLETED`` will be emitted, signalling successful
+completion.
+
+Next, query for any active block device jobs (there should be none)::
+
+    (QEMU) query-block-jobs
+    {
+        "execute": "query-block-jobs",
+        "arguments": {}
+    }
+
+Shutdown the guest::
+
+    (QEMU) quit
+    {
+        "execute": "quit",
+        "arguments": {}
+    }
+    {
+        "return": {}
+    }
+
+.. note::
+    The above step is really important; if forgotten, an error, "Failed
+    to get shared "write" lock on e.qcow2", will be thrown when you do
+    ``qemu-img compare`` to verify the integrity of the disk image
+    with the backup content.
+
+
+The end result will be the image 'e.qcow2' containing a
+point-in-time backup of the disk image chain -- i.e. contents from
+images [A] and [B] at the time the ``blockdev-backup`` command was
+initiated.
+
+One way to confirm that the backup disk image contains content identical
+to the disk image chain is to compare the backup with the contents of
+the chain; you should see "Images are identical".  (NB: this is assuming
+QEMU was launched with ``-S`` option, which will not start the CPUs at
+guest boot up)::
+
+    $ qemu-img compare b.qcow2 e.qcow2
+    Warning: Image size mismatch!
+    Images are identical.
+
+NOTE: The "Warning: Image size mismatch!" is expected, as we created the
+target image (e.qcow2) with 39M size.
diff --git a/docs/live-block-ops.txt b/docs/live-block-ops.txt
deleted file mode 100644
index XXXXXXX..XXXXXXX
--- a/docs/live-block-ops.txt
+++ /dev/null
@@ -XXX,XX +XXX,XX @@
-LIVE BLOCK OPERATIONS
-=====================
-
-High level description of live block operations. Note these are not
-supported for use with the raw format at the moment.
-
-Note also that this document is incomplete and it currently only
-covers the 'stream' operation. Other operations supported by QEMU such
-as 'commit', 'mirror' and 'backup' are not described here yet. Please
-refer to the qapi/block-core.json file for an overview of those.
-
-Snapshot live merge
-===================
-
-Given a snapshot chain, described in this document in the following
-format:
-
-[A] <- [B] <- [C] <- [D] <- [E]
-
-Where the rightmost object ([E] in the example) described is the current
-image which the guest OS has write access to. To the left of it is its base
-image, and so on accordingly until the leftmost image, which has no
-base.
-
-The snapshot live merge operation transforms such a chain into a
-smaller one with fewer elements, such as this transformation relative
-to the first example:
-
-[A] <- [E]
-
-Data is copied in the right direction with destination being the
-rightmost image, but any other intermediate image can be specified
-instead. In this example data is copied from [C] into [D], so [D] can
-be backed by [B]:
-
-[A] <- [B] <- [D] <- [E]
-
-The operation is implemented in QEMU through image streaming facilities.
-
-The basic idea is to execute 'block_stream virtio0' while the guest is
-running. Progress can be monitored using 'info block-jobs'. When the
-streaming operation completes it raises a QMP event. 'block_stream'
-copies data from the backing file(s) into the active image. When finished,
-it adjusts the backing file pointer.
-
-The 'base' parameter specifies an image which data need not be
-streamed from. This image will be used as the backing file for the
-destination image when the operation is finished.
-
-In the first example above, the command would be:
-
-(qemu) block_stream virtio0 file-A.img
-
-In order to specify a destination image different from the active
-(rightmost) one we can use its node name instead.
-
-In the second example above, the command would be:
-
-(qemu) block_stream node-D file-B.img
-
-Live block copy
-===============
-
-To copy an in use image to another destination in the filesystem, one
-should create a live snapshot in the desired destination, then stream
-into that image. Example:
-
-(qemu) snapshot_blkdev ide0-hd0 /new-path/disk.img qcow2
-
-(qemu) block_stream ide0-hd0
-
-
-- 
2.9.4

The following changes since commit 56f9e46b841c7be478ca038d8d4085d776ab4b0d:

Merge remote-tracking branch 'remotes/armbru/tags/pull-qapi-2017-02-20' into staging (2017-02-20 17:42:47 +0000)

are available in the git repository at:

git://github.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to a7b91d35bab97a2d3e779d0c64c9b837b52a6cf7:

coroutine-lock: make CoRwlock thread-safe and fair (2017-02-21 11:39:40 +0000)

----------------------------------------------------------------
Pull request

v2:
 * Rebased to resolve scsi conflicts

----------------------------------------------------------------

Paolo Bonzini (24):
  block: move AioContext, QEMUTimer, main-loop to libqemuutil
  aio: introduce aio_co_schedule and aio_co_wake
  block-backend: allow blk_prw from coroutine context
  test-thread-pool: use generic AioContext infrastructure
  io: add methods to set I/O handlers on AioContext
  io: make qio_channel_yield aware of AioContexts
  nbd: convert to use qio_channel_yield
  coroutine-lock: reschedule coroutine on the AioContext it was running
    on
  blkdebug: reschedule coroutine on the AioContext it is running on
  qed: introduce qed_aio_start_io and qed_aio_next_io_cb
  aio: push aio_context_acquire/release down to dispatching
  block: explicitly acquire aiocontext in timers that need it
  block: explicitly acquire aiocontext in callbacks that need it
  block: explicitly acquire aiocontext in bottom halves that need it
  block: explicitly acquire aiocontext in aio callbacks that need it
  aio-posix: partially inline aio_dispatch into aio_poll
  async: remove unnecessary inc/dec pairs
  block: document fields protected by AioContext lock
  coroutine-lock: make CoMutex thread-safe
  coroutine-lock: add limited spinning to CoMutex
  test-aio-multithread: add performance comparison with thread-based
    mutexes
  coroutine-lock: place CoMutex before CoQueue in header
  coroutine-lock: add mutex argument to CoQueue APIs
  coroutine-lock: make CoRwlock thread-safe and fair

-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

AioContext is fairly self contained, the only dependency is QEMUTimer but
that in turn doesn't need anything else.  So move them out of block-obj-y
to avoid introducing a dependency from io/ to block-obj-y.

main-loop and its dependency iohandler also need to be moved, because
later in this series io/ will call iohandler_get_aio_context.

[Changed copyright "the QEMU team" to "other QEMU contributors" as
suggested by Daniel Berrange and agreed by Paolo.
--Stefan]

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213135235.12274-2-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 Makefile.objs                       |  4 ---
 stubs/Makefile.objs                 |  1 +
 tests/Makefile.include              | 11 ++++----
 util/Makefile.objs                  |  6 +++-
 block/io.c                          | 29 -------------------
 stubs/linux-aio.c                   | 32 +++++++++++++++++++++
 stubs/set-fd-handler.c              | 11 --------
 aio-posix.c => util/aio-posix.c     |  2 +-
 aio-win32.c => util/aio-win32.c     |  0
 util/aiocb.c                        | 55 +++++++++++++++++++++++++++++++++++++
 async.c => util/async.c             |  3 +-
 iohandler.c => util/iohandler.c     |  0
 main-loop.c => util/main-loop.c     |  0
 qemu-timer.c => util/qemu-timer.c   |  0
 thread-pool.c => util/thread-pool.c |  2 +-
 trace-events                        | 11 --------
 util/trace-events                   | 11 ++++++++
 17 files changed, 114 insertions(+), 64 deletions(-)
 create mode 100644 stubs/linux-aio.c
 rename aio-posix.c => util/aio-posix.c (99%)
 rename aio-win32.c => util/aio-win32.c (100%)
 create mode 100644 util/aiocb.c
 rename async.c => util/async.c (99%)
 rename iohandler.c => util/iohandler.c (100%)
 rename main-loop.c => util/main-loop.c (100%)
 rename qemu-timer.c => util/qemu-timer.c (100%)
 rename thread-pool.c => util/thread-pool.c (99%)

diff --git a/Makefile.objs b/Makefile.objs
index XXXXXXX..XXXXXXX 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -XXX,XX +XXX,XX @@ chardev-obj-y = chardev/
 #######################################################################
 # block-obj-y is code used by both qemu system emulation and qemu-img
 
-block-obj-y = async.o thread-pool.o
 block-obj-y += nbd/
 block-obj-y += block.o blockjob.o
-block-obj-y += main-loop.o iohandler.o qemu-timer.o
-block-obj-$(CONFIG_POSIX) += aio-posix.o
-block-obj-$(CONFIG_WIN32) += aio-win32.o
 block-obj-y += block/
 block-obj-y += qemu-io-cmds.o
 block-obj-$(CONFIG_REPLICATION) += replication.o
diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
index XXXXXXX..XXXXXXX 100644
--- a/stubs/Makefile.objs
+++ b/stubs/Makefile.objs
@@ -XXX,XX +XXX,XX @@ stub-obj-y += get-vm-name.o
 stub-obj-y += iothread.o
 stub-obj-y += iothread-lock.o
 stub-obj-y += is-daemonized.o
+stub-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 stub-obj-y += machine-init-done.o
 stub-obj-y += migr-blocker.o
 stub-obj-y += monitor.o
diff --git a/tests/Makefile.include b/tests/Makefile.include
index XXXXXXX..XXXXXXX 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -XXX,XX +XXX,XX @@ check-unit-y += tests/test-visitor-serialization$(EXESUF)
 check-unit-y += tests/test-iov$(EXESUF)
 gcov-files-test-iov-y = util/iov.c
 check-unit-y += tests/test-aio$(EXESUF)
+gcov-files-test-aio-y = util/async.c util/qemu-timer.o
+gcov-files-test-aio-$(CONFIG_WIN32) += util/aio-win32.c
+gcov-files-test-aio-$(CONFIG_POSIX) += util/aio-posix.c
 check-unit-y += tests/test-throttle$(EXESUF)
 gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c
 gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c
@@ -XXX,XX +XXX,XX @@ tests/check-qjson$(EXESUF): tests/check-qjson.o $(test-util-obj-y)
 tests/check-qom-interface$(EXESUF): tests/check-qom-interface.o $(test-qom-obj-y)
 tests/check-qom-proplist$(EXESUF): tests/check-qom-proplist.o $(test-qom-obj-y)
 
-tests/test-char$(EXESUF): tests/test-char.o qemu-timer.o \
-	$(test-util-obj-y) $(qtest-obj-y) $(test-block-obj-y) $(chardev-obj-y)
+tests/test-char$(EXESUF): tests/test-char.o $(test-util-obj-y) $(qtest-obj-y) $(test-io-obj-y) $(chardev-obj-y)
 tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
 tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
 tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
@@ -XXX,XX +XXX,XX @@ tests/test-vmstate$(EXESUF): tests/test-vmstate.o \
 	migration/vmstate.o migration/qemu-file.o \
         migration/qemu-file-channel.o migration/qjson.o \
 	$(test-io-obj-y)
-tests/test-timed-average$(EXESUF): tests/test-timed-average.o qemu-timer.o \
-	$(test-util-obj-y)
+tests/test-timed-average$(EXESUF): tests/test-timed-average.o $(test-util-obj-y)
 tests/test-base64$(EXESUF): tests/test-base64.o \
 	libqemuutil.a libqemustub.a
 tests/ptimer-test$(EXESUF): tests/ptimer-test.o tests/ptimer-test-stubs.o hw/core/ptimer.o libqemustub.a
@@ -XXX,XX +XXX,XX @@ tests/usb-hcd-ehci-test$(EXESUF): tests/usb-hcd-ehci-test.o $(libqos-usb-obj-y)
 tests/usb-hcd-xhci-test$(EXESUF): tests/usb-hcd-xhci-test.o $(libqos-usb-obj-y)
 tests/pc-cpu-test$(EXESUF): tests/pc-cpu-test.o
 tests/postcopy-test$(EXESUF): tests/postcopy-test.o
-tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-timer.o \
+tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o $(test-util-obj-y) \
 	$(qtest-obj-y) $(test-io-obj-y) $(libqos-virtio-obj-y) $(libqos-pc-obj-y) \
 	$(chardev-obj-y)
 tests/qemu-iotests/socket_scm_helper$(EXESUF): tests/qemu-iotests/socket_scm_helper.o
diff --git a/util/Makefile.objs b/util/Makefile.objs
index XXXXXXX..XXXXXXX 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -XXX,XX +XXX,XX @@
 util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o
 util-obj-y += bufferiszero.o
 util-obj-y += lockcnt.o
+util-obj-y += aiocb.o async.o thread-pool.o qemu-timer.o
+util-obj-y += main-loop.o iohandler.o
+util-obj-$(CONFIG_POSIX) += aio-posix.o
 util-obj-$(CONFIG_POSIX) += compatfd.o
 util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
 util-obj-$(CONFIG_POSIX) += mmap-alloc.o
 util-obj-$(CONFIG_POSIX) += oslib-posix.o
 util-obj-$(CONFIG_POSIX) += qemu-openpty.o
 util-obj-$(CONFIG_POSIX) += qemu-thread-posix.o
-util-obj-$(CONFIG_WIN32) += event_notifier-win32.o
 util-obj-$(CONFIG_POSIX) += memfd.o
+util-obj-$(CONFIG_WIN32) += aio-win32.o
+util-obj-$(CONFIG_WIN32) += event_notifier-win32.o
 util-obj-$(CONFIG_WIN32) += oslib-win32.o
 util-obj-$(CONFIG_WIN32) += qemu-thread-win32.o
 util-obj-y += envlist.o path.o module.o
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
     return &acb->common;
 }
 
-void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
-                   BlockCompletionFunc *cb, void *opaque)
-{
-    BlockAIOCB *acb;
-
-    acb = g_malloc(aiocb_info->aiocb_size);
-    acb->aiocb_info = aiocb_info;
-    acb->bs = bs;
-    acb->cb = cb;
-    acb->opaque = opaque;
-    acb->refcnt = 1;
-    return acb;
-}
-
-void qemu_aio_ref(void *p)
-{
-    BlockAIOCB *acb = p;
-    acb->refcnt++;
-}
-
-void qemu_aio_unref(void *p)
-{
-    BlockAIOCB *acb = p;
-    assert(acb->refcnt > 0);
-    if (--acb->refcnt == 0) {
-        g_free(acb);
-    }
-}
-
 /**************************************************************/
 /* Coroutine block device emulation */
 
diff --git a/stubs/linux-aio.c b/stubs/linux-aio.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/stubs/linux-aio.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Linux native AIO support.
+ *
+ * Copyright (C) 2009 IBM, Corp.
+ * Copyright (C) 2009 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "block/aio.h"
+#include "block/raw-aio.h"
+
+void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
+{
+    abort();
+}
+
+void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
+{
+    abort();
+}
+
+LinuxAioState *laio_init(void)
+{
+    abort();
+}
+
+void laio_cleanup(LinuxAioState *s)
+{
+    abort();
+}
diff --git a/stubs/set-fd-handler.c b/stubs/set-fd-handler.c
index XXXXXXX..XXXXXXX 100644
--- a/stubs/set-fd-handler.c
+++ b/stubs/set-fd-handler.c
@@ -XXX,XX +XXX,XX @@ void qemu_set_fd_handler(int fd,
 {
     abort();
 }
-
-void aio_set_fd_handler(AioContext *ctx,
-                        int fd,
-                        bool is_external,
-                        IOHandler *io_read,
-                        IOHandler *io_write,
-                        AioPollFn *io_poll,
-                        void *opaque)
-{
-    abort();
-}
diff --git a/aio-posix.c b/util/aio-posix.c
similarity index 99%
rename from aio-posix.c
rename to util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/rcu_queue.h"
 #include "qemu/sockets.h"
 #include "qemu/cutils.h"
-#include "trace-root.h"
+#include "trace.h"
 #ifdef CONFIG_EPOLL_CREATE1
 #include <sys/epoll.h>
 #endif
diff --git a/aio-win32.c b/util/aio-win32.c
similarity index 100%
rename from aio-win32.c
rename to util/aio-win32.c
diff --git a/util/aiocb.c b/util/aiocb.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/util/aiocb.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * BlockAIOCB allocation
+ *
+ * Copyright (c) 2003-2017 Fabrice Bellard and other QEMU contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "block/aio.h"
+
+void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
+                   BlockCompletionFunc *cb, void *opaque)
+{
+    BlockAIOCB *acb;
+
+    acb = g_malloc(aiocb_info->aiocb_size);
+    acb->aiocb_info = aiocb_info;
+    acb->bs = bs;
+    acb->cb = cb;
+    acb->opaque = opaque;
+    acb->refcnt = 1;
+    return acb;
+}
+
+void qemu_aio_ref(void *p)
+{
+    BlockAIOCB *acb = p;
+    acb->refcnt++;
+}
+
+void qemu_aio_unref(void *p)
+{
+    BlockAIOCB *acb = p;
+    assert(acb->refcnt > 0);
+    if (--acb->refcnt == 0) {
+        g_free(acb);
+    }
+}
diff --git a/async.c b/util/async.c
similarity index 99%
rename from async.c
rename to util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@
 /*
- * QEMU System Emulator
+ * Data plane event loop
  *
  * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2009-2017 QEMU contributors
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/iohandler.c b/util/iohandler.c
similarity index 100%
rename from iohandler.c
rename to util/iohandler.c
diff --git a/main-loop.c b/util/main-loop.c
similarity index 100%
rename from main-loop.c
rename to util/main-loop.c
diff --git a/qemu-timer.c b/util/qemu-timer.c
similarity index 100%
rename from qemu-timer.c
rename to util/qemu-timer.c
diff --git a/thread-pool.c b/util/thread-pool.c
similarity index 99%
rename from thread-pool.c
rename to util/thread-pool.c
index XXXXXXX..XXXXXXX 100644
--- a/thread-pool.c
+++ b/util/thread-pool.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/queue.h"
 #include "qemu/thread.h"
 #include "qemu/coroutine.h"
-#include "trace-root.h"
+#include "trace.h"
 #include "block/thread-pool.h"
 #include "qemu/main-loop.h"
 
diff --git a/trace-events b/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/trace-events
+++ b/trace-events
@@ -XXX,XX +XXX,XX @@
 #
 # The <format-string> should be a sprintf()-compatible format string.
 
-# aio-posix.c
-run_poll_handlers_begin(void *ctx, int64_t max_ns) "ctx %p max_ns %"PRId64
-run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d"
-poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
-poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
-
-# thread-pool.c
-thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
-thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
-thread_pool_cancel(void *req, void *opaque) "req %p opaque %p"
-
 # ioport.c
 cpu_in(unsigned int addr, char size, unsigned int val) "addr %#x(%c) value %u"
 cpu_out(unsigned int addr, char size, unsigned int val) "addr %#x(%c) value %u"
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@
 # See docs/tracing.txt for syntax documentation.
 
+# util/aio-posix.c
+run_poll_handlers_begin(void *ctx, int64_t max_ns) "ctx %p max_ns %"PRId64
+run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d"
+poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
+poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
+
+# util/thread-pool.c
+thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
+thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
+thread_pool_cancel(void *req, void *opaque) "req %p opaque %p"
+
 # util/buffer.c
 buffer_resize(const char *buf, size_t olen, size_t len) "%s: old %zd, new %zd"
 buffer_move_empty(const char *buf, size_t len, const char *from) "%s: %zd bytes from %s"
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

aio_co_wake provides the infrastructure to start a coroutine on a "home"
AioContext.  It will be used by CoMutex and CoQueue, so that coroutines
don't jump from one context to another when they go to sleep on a
mutex or waitqueue.  However, it can also be used as a more efficient
alternative to one-shot bottom halves, and saves the effort of tracking
which AioContext a coroutine is running on.

aio_co_schedule is the part of aio_co_wake that starts a coroutine
on a remote AioContext, but it is also useful to implement e.g.
bdrv_set_aio_context callbacks.

The implementation of aio_co_schedule is based on a lock-free
multiple-producer, single-consumer queue.  The multiple producers use
cmpxchg to add to a LIFO stack.  The consumer (a per-AioContext bottom
half) grabs all items added so far, inverts the list to make it FIFO,
and goes through it one item at a time until it's empty.  The data
structure was inspired by OSv, which uses it in the very code we'll
"port" to QEMU for the thread-safe CoMutex.

Most of the new code is really tests.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213135235.12274-3-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/Makefile.include       |   8 +-
 include/block/aio.h          |  32 +++++++
 include/qemu/coroutine_int.h |  11 ++-
 tests/iothread.h             |  25 +++++
 tests/iothread.c             |  91 ++++++++++++++++++
 tests/test-aio-multithread.c | 213 +++++++++++++++++++++++++++++++++++++++++++
 util/async.c                 |  65 +++++++++++++
 util/qemu-coroutine.c        |   8 ++
 util/trace-events            |   4 +
 9 files changed, 453 insertions(+), 4 deletions(-)
 create mode 100644 tests/iothread.h
 create mode 100644 tests/iothread.c
 create mode 100644 tests/test-aio-multithread.c

diff --git a/tests/Makefile.include b/tests/Makefile.include
index XXXXXXX..XXXXXXX 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -XXX,XX +XXX,XX @@ check-unit-y += tests/test-aio$(EXESUF)
 gcov-files-test-aio-y = util/async.c util/qemu-timer.o
 gcov-files-test-aio-$(CONFIG_WIN32) += util/aio-win32.c
 gcov-files-test-aio-$(CONFIG_POSIX) += util/aio-posix.c
+check-unit-y += tests/test-aio-multithread$(EXESUF)
+gcov-files-test-aio-multithread-y = $(gcov-files-test-aio-y)
+gcov-files-test-aio-multithread-y += util/qemu-coroutine.c tests/iothread.c
 check-unit-y += tests/test-throttle$(EXESUF)
-gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c
-gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c
 check-unit-y += tests/test-thread-pool$(EXESUF)
 gcov-files-test-thread-pool-y = thread-pool.c
 gcov-files-test-hbitmap-y = util/hbitmap.c
@@ -XXX,XX +XXX,XX @@ test-qapi-obj-y = tests/test-qapi-visit.o tests/test-qapi-types.o \
 	$(test-qom-obj-y)
 test-crypto-obj-y = $(crypto-obj-y) $(test-qom-obj-y)
 test-io-obj-y = $(io-obj-y) $(test-crypto-obj-y)
-test-block-obj-y = $(block-obj-y) $(test-io-obj-y)
+test-block-obj-y = $(block-obj-y) $(test-io-obj-y) tests/iothread.o
 
 tests/check-qint$(EXESUF): tests/check-qint.o $(test-util-obj-y)
 tests/check-qstring$(EXESUF): tests/check-qstring.o $(test-util-obj-y)
@@ -XXX,XX +XXX,XX @@ tests/check-qom-proplist$(EXESUF): tests/check-qom-proplist.o $(test-qom-obj-y)
 tests/test-char$(EXESUF): tests/test-char.o $(test-util-obj-y) $(qtest-obj-y) $(test-io-obj-y) $(chardev-obj-y)
 tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
 tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
+tests/test-aio-multithread$(EXESUF): tests/test-aio-multithread.o $(test-block-obj-y)
 tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
 tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y)
diff --git a/include/block/aio.h b/include/block/aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ typedef void QEMUBHFunc(void *opaque);
 typedef bool AioPollFn(void *opaque);
 typedef void IOHandler(void *opaque);
 
+struct Coroutine;
 struct ThreadPool;
 struct LinuxAioState;
 
@@ -XXX,XX +XXX,XX @@ struct AioContext {
     bool notified;
     EventNotifier notifier;
 
+    QSLIST_HEAD(, Coroutine) scheduled_coroutines;
+    QEMUBH *co_schedule_bh;
+
     /* Thread pool for performing work and receiving completion callbacks.
      * Has its own locking.
      */
@@ -XXX,XX +XXX,XX @@ static inline bool aio_node_check(AioContext *ctx, bool is_external)
 }
 
 /**
+ * aio_co_schedule:
+ * @ctx: the aio context
+ * @co: the coroutine
+ *
+ * Start a coroutine on a remote AioContext.
+ *
+ * The coroutine must not be entered by anyone else while aio_co_schedule()
+ * is active.  In addition the coroutine must have yielded unless ctx
+ * is the context in which the coroutine is running (i.e. the value of
+ * qemu_get_current_aio_context() from the coroutine itself).
+ */
+void aio_co_schedule(AioContext *ctx, struct Coroutine *co);
+
+/**
+ * aio_co_wake:
+ * @co: the coroutine
+ *
+ * Restart a coroutine on the AioContext where it was running last, thus
+ * preventing coroutines from jumping from one context to another when they
+ * go to sleep.
+ *
+ * aio_co_wake may be executed either in coroutine or non-coroutine
+ * context.  The coroutine must not be entered by anyone else while
+ * aio_co_wake() is active.
+ */
+void aio_co_wake(struct Coroutine *co);
+
+/**
  * Return the AioContext whose event loop runs in the current thread.
  *
  * If called from an IOThread this will be the IOThread's AioContext.  If
diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine_int.h
+++ b/include/qemu/coroutine_int.h
@@ -XXX,XX +XXX,XX @@ struct Coroutine {
     CoroutineEntry *entry;
     void *entry_arg;
     Coroutine *caller;
+
+    /* Only used when the coroutine has terminated.  */
     QSLIST_ENTRY(Coroutine) pool_next;
+
     size_t locks_held;
 
-    /* Coroutines that should be woken up when we yield or terminate */
+    /* Coroutines that should be woken up when we yield or terminate.
+     * Only used when the coroutine is running.
+     */
     QSIMPLEQ_HEAD(, Coroutine) co_queue_wakeup;
+
+    /* Only used when the coroutine has yielded.  */
+    AioContext *ctx;
     QSIMPLEQ_ENTRY(Coroutine) co_queue_next;
+    QSLIST_ENTRY(Coroutine) co_scheduled_next;
 };
 
 Coroutine *qemu_coroutine_new(void);
diff --git a/tests/iothread.h b/tests/iothread.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/iothread.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * Event loop thread implementation for unit tests
+ *
+ * Copyright Red Hat Inc., 2013, 2016
+ *
+ * Authors:
+ *  Stefan Hajnoczi   <stefanha@redhat.com>
+ *  Paolo Bonzini     <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#ifndef TEST_IOTHREAD_H
+#define TEST_IOTHREAD_H
+
+#include "block/aio.h"
+#include "qemu/thread.h"
+
+typedef struct IOThread IOThread;
+
+IOThread *iothread_new(void);
+void iothread_join(IOThread *iothread);
+AioContext *iothread_get_aio_context(IOThread *iothread);
+
+#endif
diff --git a/tests/iothread.c b/tests/iothread.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/iothread.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Event loop thread implementation for unit tests
+ *
+ * Copyright Red Hat Inc., 2013, 2016
+ *
+ * Authors:
+ *  Stefan Hajnoczi   <stefanha@redhat.com>
+ *  Paolo Bonzini     <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "block/aio.h"
+#include "qemu/main-loop.h"
+#include "qemu/rcu.h"
+#include "iothread.h"
+
+struct IOThread {
+    AioContext *ctx;
+
+    QemuThread thread;
+    QemuMutex init_done_lock;
+    QemuCond init_done_cond;    /* is thread initialization done? */
+    bool stopping;
+};
+
+static __thread IOThread *my_iothread;
+
+AioContext *qemu_get_current_aio_context(void)
+{
+    return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
+}
+
+static void *iothread_run(void *opaque)
+{
+    IOThread *iothread = opaque;
+
+    rcu_register_thread();
+
+    my_iothread = iothread;
+    qemu_mutex_lock(&iothread->init_done_lock);
+    iothread->ctx = aio_context_new(&error_abort);
+    qemu_cond_signal(&iothread->init_done_cond);
+    qemu_mutex_unlock(&iothread->init_done_lock);
+
+    while (!atomic_read(&iothread->stopping)) {
+        aio_poll(iothread->ctx, true);
+    }
+
+    rcu_unregister_thread();
+    return NULL;
+}
+
+void iothread_join(IOThread *iothread)
+{
+    iothread->stopping = true;
+    aio_notify(iothread->ctx);
+    qemu_thread_join(&iothread->thread);
+    qemu_cond_destroy(&iothread->init_done_cond);
+    qemu_mutex_destroy(&iothread->init_done_lock);
+    aio_context_unref(iothread->ctx);
+    g_free(iothread);
+}
+
+IOThread *iothread_new(void)
+{
+    IOThread *iothread = g_new0(IOThread, 1);
+
+    qemu_mutex_init(&iothread->init_done_lock);
+    qemu_cond_init(&iothread->init_done_cond);
+    qemu_thread_create(&iothread->thread, NULL, iothread_run,
+                       iothread, QEMU_THREAD_JOINABLE);
+
+    /* Wait for initialization to complete */
+    qemu_mutex_lock(&iothread->init_done_lock);
+    while (iothread->ctx == NULL) {
+        qemu_cond_wait(&iothread->init_done_cond,
+                       &iothread->init_done_lock);
+    }
+    qemu_mutex_unlock(&iothread->init_done_lock);
+    return iothread;
+}
+
+AioContext *iothread_get_aio_context(IOThread *iothread)
+{
+    return iothread->ctx;
+}
diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/test-aio-multithread.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * AioContext multithreading tests
+ *
+ * Copyright Red Hat, Inc. 2016
+ *
+ * Authors:
+ *  Paolo Bonzini    <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include <glib.h>
+#include "block/aio.h"
+#include "qapi/error.h"
+#include "qemu/coroutine.h"
+#include "qemu/thread.h"
+#include "qemu/error-report.h"
+#include "iothread.h"
+
+/* AioContext management */
+
+#define NUM_CONTEXTS 5
+
+static IOThread *threads[NUM_CONTEXTS];
+static AioContext *ctx[NUM_CONTEXTS];
+static __thread int id = -1;
+
+static QemuEvent done_event;
+
+/* Run a function synchronously on a remote iothread. */
+
+typedef struct CtxRunData {
+    QEMUBHFunc *cb;
+    void *arg;
+} CtxRunData;
+
+static void ctx_run_bh_cb(void *opaque)
+{
+    CtxRunData *data = opaque;
+
+    data->cb(data->arg);
+    qemu_event_set(&done_event);
+}
+
+static void ctx_run(int i, QEMUBHFunc *cb, void *opaque)
+{
+    CtxRunData data = {
+        .cb = cb,
+        .arg = opaque
+    };
+
+    qemu_event_reset(&done_event);
+    aio_bh_schedule_oneshot(ctx[i], ctx_run_bh_cb, &data);
+    qemu_event_wait(&done_event);
+}
+
+/* Starting the iothreads. */
+
+static void set_id_cb(void *opaque)
+{
+    int *i = opaque;
+
+    id = *i;
+}
+
+static void create_aio_contexts(void)
+{
+    int i;
+
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        threads[i] = iothread_new();
+        ctx[i] = iothread_get_aio_context(threads[i]);
+    }
+
+    qemu_event_init(&done_event, false);
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        ctx_run(i, set_id_cb, &i);
+    }
+}
+
+/* Stopping the iothreads. */
+
+static void join_aio_contexts(void)
+{
+    int i;
+
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        aio_context_ref(ctx[i]);
+    }
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        iothread_join(threads[i]);
+    }
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        aio_context_unref(ctx[i]);
+    }
+    qemu_event_destroy(&done_event);
+}
+
+/* Basic test for the stuff above. */
+
+static void test_lifecycle(void)
+{
+    create_aio_contexts();
+    join_aio_contexts();
+}
+
+/* aio_co_schedule test.  */
+
+static Coroutine *to_schedule[NUM_CONTEXTS];
+
+static bool now_stopping;
+
+static int count_retry;
+static int count_here;
+static int count_other;
+
+static bool schedule_next(int n)
+{
+    Coroutine *co;
+
+    co = atomic_xchg(&to_schedule[n], NULL);
+    if (!co) {
+        atomic_inc(&count_retry);
+        return false;
+    }
+
+    if (n == id) {
+        atomic_inc(&count_here);
+    } else {
+        atomic_inc(&count_other);
+    }
+
+    aio_co_schedule(ctx[n], co);
+    return true;
+}
+
+static void finish_cb(void *opaque)
+{
+    schedule_next(id);
+}
+
+static coroutine_fn void test_multi_co_schedule_entry(void *opaque)
+{
+    g_assert(to_schedule[id] == NULL);
+    atomic_mb_set(&to_schedule[id], qemu_coroutine_self());
+
+    while (!atomic_mb_read(&now_stopping)) {
+        int n;
+
+        n = g_test_rand_int_range(0, NUM_CONTEXTS);
+        schedule_next(n);
+        qemu_coroutine_yield();
+
+        g_assert(to_schedule[id] == NULL);
+        atomic_mb_set(&to_schedule[id], qemu_coroutine_self());
+    }
+}
+
+
+static void test_multi_co_schedule(int seconds)
+{
+    int i;
+
+    count_here = count_other = count_retry = 0;
+    now_stopping = false;
+
+    create_aio_contexts();
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        Coroutine *co1 = qemu_coroutine_create(test_multi_co_schedule_entry, NULL);
+        aio_co_schedule(ctx[i], co1);
+    }
+
+    g_usleep(seconds * 1000000);
+
+    atomic_mb_set(&now_stopping, true);
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        ctx_run(i, finish_cb, NULL);
+        to_schedule[i] = NULL;
+    }
+
+    join_aio_contexts();
+    g_test_message("scheduled %d, queued %d, retry %d, total %d\n",
+                  count_other, count_here, count_retry,
+                  count_here + count_other + count_retry);
+}
+
+static void test_multi_co_schedule_1(void)
+{
+    test_multi_co_schedule(1);
+}
+
+static void test_multi_co_schedule_10(void)
+{
+    test_multi_co_schedule(10);
+}
+
+/* End of tests.  */
+
+int main(int argc, char **argv)
+{
+    init_clocks();
+
+    g_test_init(&argc, &argv, NULL);
+    g_test_add_func("/aio/multi/lifecycle", test_lifecycle);
+    if (g_test_quick()) {
+        g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1);
+    } else {
+        g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10);
+    }
+    return g_test_run();
+}
diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/main-loop.h"
 #include "qemu/atomic.h"
 #include "block/raw-aio.h"
+#include "qemu/coroutine_int.h"
+#include "trace.h"
 
 /***********************************************************/
 /* bottom halves (can be seen as timers which expire ASAP) */
@@ -XXX,XX +XXX,XX @@ aio_ctx_finalize(GSource     *source)
     }
 #endif
 
+    assert(QSLIST_EMPTY(&ctx->scheduled_coroutines));
+    qemu_bh_delete(ctx->co_schedule_bh);
+
     qemu_lockcnt_lock(&ctx->list_lock);
     assert(!qemu_lockcnt_count(&ctx->list_lock));
     while (ctx->first_bh) {
@@ -XXX,XX +XXX,XX @@ static bool event_notifier_poll(void *opaque)
     return atomic_read(&ctx->notified);
 }
 
+static void co_schedule_bh_cb(void *opaque)
+{
+    AioContext *ctx = opaque;
+    QSLIST_HEAD(, Coroutine) straight, reversed;
+
+    QSLIST_MOVE_ATOMIC(&reversed, &ctx->scheduled_coroutines);
+    QSLIST_INIT(&straight);
+
+    while (!QSLIST_EMPTY(&reversed)) {
+        Coroutine *co = QSLIST_FIRST(&reversed);
+        QSLIST_REMOVE_HEAD(&reversed, co_scheduled_next);
+        QSLIST_INSERT_HEAD(&straight, co, co_scheduled_next);
+    }
+
+    while (!QSLIST_EMPTY(&straight)) {
+        Coroutine *co = QSLIST_FIRST(&straight);
+        QSLIST_REMOVE_HEAD(&straight, co_scheduled_next);
+        trace_aio_co_schedule_bh_cb(ctx, co);
+        qemu_coroutine_enter(co);
+    }
+}
+
 AioContext *aio_context_new(Error **errp)
 {
     int ret;
@@ -XXX,XX +XXX,XX @@ AioContext *aio_context_new(Error **errp)
     }
     g_source_set_can_recurse(&ctx->source, true);
     qemu_lockcnt_init(&ctx->list_lock);
+
+    ctx->co_schedule_bh = aio_bh_new(ctx, co_schedule_bh_cb, ctx);
+    QSLIST_INIT(&ctx->scheduled_coroutines);
+
     aio_set_event_notifier(ctx, &ctx->notifier,
                            false,
                            (EventNotifierHandler *)
@@ -XXX,XX +XXX,XX @@ fail:
     return NULL;
 }
 
+void aio_co_schedule(AioContext *ctx, Coroutine *co)
+{
+    trace_aio_co_schedule(ctx, co);
+    QSLIST_INSERT_HEAD_ATOMIC(&ctx->scheduled_coroutines,
+                              co, co_scheduled_next);
+    qemu_bh_schedule(ctx->co_schedule_bh);
+}
+
+void aio_co_wake(struct Coroutine *co)
+{
+    AioContext *ctx;
+
+    /* Read coroutine before co->ctx.  Matches smp_wmb in
+     * qemu_coroutine_enter.
+     */
+    smp_read_barrier_depends();
+    ctx = atomic_read(&co->ctx);
+
+    if (ctx != qemu_get_current_aio_context()) {
+        aio_co_schedule(ctx, co);
+        return;
+    }
+
+    if (qemu_in_coroutine()) {
+        Coroutine *self = qemu_coroutine_self();
+        assert(self != co);
+        QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, co, co_queue_next);
+    } else {
+        aio_context_acquire(ctx);
+        qemu_coroutine_enter(co);
+        aio_context_release(ctx);
+    }
+}
+
 void aio_context_ref(AioContext *ctx)
 {
     g_source_ref(&ctx->source);
diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine.c
+++ b/util/qemu-coroutine.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/atomic.h"
 #include "qemu/coroutine.h"
 #include "qemu/coroutine_int.h"
+#include "block/aio.h"
 
 enum {
     POOL_BATCH_SIZE = 64,
@@ -XXX,XX +XXX,XX @@ void qemu_coroutine_enter(Coroutine *co)
     }
 
     co->caller = self;
+    co->ctx = qemu_get_current_aio_context();
+
+    /* Store co->ctx before anything that stores co.  Matches
+     * barrier in aio_co_wake.
+     */
+    smp_wmb();
+
     ret = qemu_coroutine_switch(self, co, COROUTINE_ENTER);
 
     qemu_co_queue_run_restart(co);
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d"
 poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
 poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
 
+# util/async.c
+aio_co_schedule(void *ctx, void *co) "ctx %p co %p"
+aio_co_schedule_bh_cb(void *ctx, void *co) "ctx %p co %p"
+
 # util/thread-pool.c
 thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
 thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

qcow2_create2 ends up in blk_prw() while already running in coroutine
context.  In that case, do not run a nested event loop, as that breaks
when aio_co_wake tries to queue the coroutine on the co_queue_wakeup
list of the currently running one.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213135235.12274-4-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/block-backend.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
 {
     QEMUIOVector qiov;
     struct iovec iov;
-    Coroutine *co;
     BlkRwCo rwco;
 
     iov = (struct iovec) {
@@ -XXX,XX +XXX,XX @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
         .ret    = NOT_DONE,
     };
 
-    co = qemu_coroutine_create(co_entry, &rwco);
-    qemu_coroutine_enter(co);
-    BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
+    if (qemu_in_coroutine()) {
+        /* Fast-path if already in coroutine context */
+        co_entry(&rwco);
+    } else {
+        Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
+        qemu_coroutine_enter(co);
+        BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
+    }
 
     return rwco.ret;
 }
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Once the thread pool starts using aio_co_wake, it will also need
qemu_get_current_aio_context().  Make test-thread-pool create
an AioContext with qemu_init_main_loop, so that stubs/iothread.c
and tests/iothread.c can provide the rest.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213135235.12274-5-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/test-thread-pool.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/tests/test-thread-pool.c b/tests/test-thread-pool.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-thread-pool.c
+++ b/tests/test-thread-pool.c
@@ -XXX,XX +XXX,XX @@
 #include "qapi/error.h"
 #include "qemu/timer.h"
 #include "qemu/error-report.h"
+#include "qemu/main-loop.h"
 
 static AioContext *ctx;
 static ThreadPool *pool;
@@ -XXX,XX +XXX,XX @@ static void test_cancel_async(void)
 int main(int argc, char **argv)
 {
     int ret;
-    Error *local_error = NULL;
 
-    init_clocks();
-
-    ctx = aio_context_new(&local_error);
-    if (!ctx) {
-        error_reportf_err(local_error, "Failed to create AIO Context: ");
-        exit(1);
-    }
+    qemu_init_main_loop(&error_abort);
+    ctx = qemu_get_current_aio_context();
     pool = aio_get_thread_pool(ctx);
 
     g_test_init(&argc, &argv, NULL);
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
 
     ret = g_test_run();
 
-    aio_context_unref(ctx);
     return ret;
 }
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

This is in preparation for making qio_channel_yield work on
AioContexts other than the main one.

Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213135235.12274-6-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/io/channel.h | 25 +++++++++++++++++++++++++
 io/channel-command.c | 13 +++++++++++++
 io/channel-file.c    | 11 +++++++++++
 io/channel-socket.c  | 16 +++++++++++-----
 io/channel-tls.c     | 12 ++++++++++++
 io/channel-watch.c   |  6 ++++++
 io/channel.c         | 11 +++++++++++
 7 files changed, 89 insertions(+), 5 deletions(-)

diff --git a/include/io/channel.h b/include/io/channel.h
index XXXXXXX..XXXXXXX 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu-common.h"
 #include "qom/object.h"
+#include "block/aio.h"
 
 #define TYPE_QIO_CHANNEL "qio-channel"
 #define QIO_CHANNEL(obj)                                    \
@@ -XXX,XX +XXX,XX @@ struct QIOChannelClass {
                      off_t offset,
                      int whence,
                      Error **errp);
+    void (*io_set_aio_fd_handler)(QIOChannel *ioc,
+                                  AioContext *ctx,
+                                  IOHandler *io_read,
+                                  IOHandler *io_write,
+                                  void *opaque);
 };
 
 /* General I/O handling functions */
@@ -XXX,XX +XXX,XX @@ void qio_channel_yield(QIOChannel *ioc,
 void qio_channel_wait(QIOChannel *ioc,
                       GIOCondition condition);
 
+/**
+ * qio_channel_set_aio_fd_handler:
+ * @ioc: the channel object
+ * @ctx: the AioContext to set the handlers on
+ * @io_read: the read handler
+ * @io_write: the write handler
+ * @opaque: the opaque value passed to the handler
+ *
+ * This is used internally by qio_channel_yield().  It can
+ * be used by channel implementations to forward the handlers
+ * to another channel (e.g. from #QIOChannelTLS to the
+ * underlying socket).
+ */
+void qio_channel_set_aio_fd_handler(QIOChannel *ioc,
+                                    AioContext *ctx,
+                                    IOHandler *io_read,
+                                    IOHandler *io_write,
+                                    void *opaque);
+
 #endif /* QIO_CHANNEL_H */
diff --git a/io/channel-command.c b/io/channel-command.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel-command.c
+++ b/io/channel-command.c
@@ -XXX,XX +XXX,XX @@ static int qio_channel_command_close(QIOChannel *ioc,
 }
 
 
+static void qio_channel_command_set_aio_fd_handler(QIOChannel *ioc,
+                                                   AioContext *ctx,
+                                                   IOHandler *io_read,
+                                                   IOHandler *io_write,
+                                                   void *opaque)
+{
+    QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc);
+    aio_set_fd_handler(ctx, cioc->readfd, false, io_read, NULL, NULL, opaque);
+    aio_set_fd_handler(ctx, cioc->writefd, false, NULL, io_write, NULL, opaque);
+}
+
+
 static GSource *qio_channel_command_create_watch(QIOChannel *ioc,
                                                  GIOCondition condition)
 {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_command_class_init(ObjectClass *klass,
     ioc_klass->io_set_blocking = qio_channel_command_set_blocking;
     ioc_klass->io_close = qio_channel_command_close;
     ioc_klass->io_create_watch = qio_channel_command_create_watch;
+    ioc_klass->io_set_aio_fd_handler = qio_channel_command_set_aio_fd_handler;
 }
 
 static const TypeInfo qio_channel_command_info = {
diff --git a/io/channel-file.c b/io/channel-file.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel-file.c
+++ b/io/channel-file.c
@@ -XXX,XX +XXX,XX @@ static int qio_channel_file_close(QIOChannel *ioc,
 }
 
 
+static void qio_channel_file_set_aio_fd_handler(QIOChannel *ioc,
+                                                AioContext *ctx,
+                                                IOHandler *io_read,
+                                                IOHandler *io_write,
+                                                void *opaque)
+{
+    QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc);
+    aio_set_fd_handler(ctx, fioc->fd, false, io_read, io_write, NULL, opaque);
+}
+
 static GSource *qio_channel_file_create_watch(QIOChannel *ioc,
                                               GIOCondition condition)
 {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_file_class_init(ObjectClass *klass,
     ioc_klass->io_seek = qio_channel_file_seek;
     ioc_klass->io_close = qio_channel_file_close;
     ioc_klass->io_create_watch = qio_channel_file_create_watch;
+    ioc_klass->io_set_aio_fd_handler = qio_channel_file_set_aio_fd_handler;
 }
 
 static const TypeInfo qio_channel_file_info = {
diff --git a/io/channel-socket.c b/io/channel-socket.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -XXX,XX +XXX,XX @@ qio_channel_socket_set_blocking(QIOChannel *ioc,
         qemu_set_block(sioc->fd);
     } else {
         qemu_set_nonblock(sioc->fd);
-#ifdef WIN32
-        WSAEventSelect(sioc->fd, ioc->event,
-                       FD_READ | FD_ACCEPT | FD_CLOSE |
-                       FD_CONNECT | FD_WRITE | FD_OOB);
-#endif
     }
     return 0;
 }
@@ -XXX,XX +XXX,XX @@ qio_channel_socket_shutdown(QIOChannel *ioc,
     return 0;
 }
 
+static void qio_channel_socket_set_aio_fd_handler(QIOChannel *ioc,
+                                                  AioContext *ctx,
+                                                  IOHandler *io_read,
+                                                  IOHandler *io_write,
+                                                  void *opaque)
+{
+    QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
+    aio_set_fd_handler(ctx, sioc->fd, false, io_read, io_write, NULL, opaque);
+}
+
 static GSource *qio_channel_socket_create_watch(QIOChannel *ioc,
                                                 GIOCondition condition)
 {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_socket_class_init(ObjectClass *klass,
     ioc_klass->io_set_cork = qio_channel_socket_set_cork;
     ioc_klass->io_set_delay = qio_channel_socket_set_delay;
     ioc_klass->io_create_watch = qio_channel_socket_create_watch;
+    ioc_klass->io_set_aio_fd_handler = qio_channel_socket_set_aio_fd_handler;
 }
 
 static const TypeInfo qio_channel_socket_info = {
diff --git a/io/channel-tls.c b/io/channel-tls.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel-tls.c
+++ b/io/channel-tls.c
@@ -XXX,XX +XXX,XX @@ static int qio_channel_tls_close(QIOChannel *ioc,
     return qio_channel_close(tioc->master, errp);
 }
 
+static void qio_channel_tls_set_aio_fd_handler(QIOChannel *ioc,
+                                               AioContext *ctx,
+                                               IOHandler *io_read,
+                                               IOHandler *io_write,
+                                               void *opaque)
+{
+    QIOChannelTLS *tioc = QIO_CHANNEL_TLS(ioc);
+
+    qio_channel_set_aio_fd_handler(tioc->master, ctx, io_read, io_write, opaque);
+}
+
 static GSource *qio_channel_tls_create_watch(QIOChannel *ioc,
                                              GIOCondition condition)
 {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_tls_class_init(ObjectClass *klass,
     ioc_klass->io_close = qio_channel_tls_close;
     ioc_klass->io_shutdown = qio_channel_tls_shutdown;
     ioc_klass->io_create_watch = qio_channel_tls_create_watch;
+    ioc_klass->io_set_aio_fd_handler = qio_channel_tls_set_aio_fd_handler;
 }
 
 static const TypeInfo qio_channel_tls_info = {
diff --git a/io/channel-watch.c b/io/channel-watch.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel-watch.c
+++ b/io/channel-watch.c
@@ -XXX,XX +XXX,XX @@ GSource *qio_channel_create_socket_watch(QIOChannel *ioc,
     GSource *source;
     QIOChannelSocketSource *ssource;
 
+#ifdef WIN32
+    WSAEventSelect(socket, ioc->event,
+                   FD_READ | FD_ACCEPT | FD_CLOSE |
+                   FD_CONNECT | FD_WRITE | FD_OOB);
+#endif
+
     source = g_source_new(&qio_channel_socket_source_funcs,
                           sizeof(QIOChannelSocketSource));
     ssource = (QIOChannelSocketSource *)source;
diff --git a/io/channel.c b/io/channel.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel.c
+++ b/io/channel.c
@@ -XXX,XX +XXX,XX @@ GSource *qio_channel_create_watch(QIOChannel *ioc,
 }
 
 
+void qio_channel_set_aio_fd_handler(QIOChannel *ioc,
+                                    AioContext *ctx,
+                                    IOHandler *io_read,
+                                    IOHandler *io_write,
+                                    void *opaque)
+{
+    QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
+
+    klass->io_set_aio_fd_handler(ioc, ctx, io_read, io_write, opaque);
+}
+
 guint qio_channel_add_watch(QIOChannel *ioc,
                             GIOCondition condition,
                             QIOChannelFunc func,
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Support separate coroutines for reading and writing, and place the
read/write handlers on the AioContext that the QIOChannel is registered
with.

Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213135235.12274-7-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/io/channel.h | 47 ++++++++++++++++++++++++++--
 io/channel.c         | 86 +++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 109 insertions(+), 24 deletions(-)

diff --git a/include/io/channel.h b/include/io/channel.h
index XXXXXXX..XXXXXXX 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu-common.h"
 #include "qom/object.h"
+#include "qemu/coroutine.h"
 #include "block/aio.h"
 
 #define TYPE_QIO_CHANNEL "qio-channel"
@@ -XXX,XX +XXX,XX @@ struct QIOChannel {
     Object parent;
     unsigned int features; /* bitmask of QIOChannelFeatures */
     char *name;
+    AioContext *ctx;
+    Coroutine *read_coroutine;
+    Coroutine *write_coroutine;
 #ifdef _WIN32
     HANDLE event; /* For use with GSource on Win32 */
 #endif
@@ -XXX,XX +XXX,XX @@ guint qio_channel_add_watch(QIOChannel *ioc,
 
 
 /**
+ * qio_channel_attach_aio_context:
+ * @ioc: the channel object
+ * @ctx: the #AioContext to set the handlers on
+ *
+ * Request that qio_channel_yield() sets I/O handlers on
+ * the given #AioContext.  If @ctx is %NULL, qio_channel_yield()
+ * uses QEMU's main thread event loop.
+ *
+ * You can move a #QIOChannel from one #AioContext to another even if
+ * I/O handlers are set for a coroutine.  However, #QIOChannel provides
+ * no synchronization between the calls to qio_channel_yield() and
+ * qio_channel_attach_aio_context().
+ *
+ * Therefore you should first call qio_channel_detach_aio_context()
+ * to ensure that the coroutine is not entered concurrently.  Then,
+ * while the coroutine has yielded, call qio_channel_attach_aio_context(),
+ * and then aio_co_schedule() to place the coroutine on the new
+ * #AioContext.  The calls to qio_channel_detach_aio_context()
+ * and qio_channel_attach_aio_context() should be protected with
+ * aio_context_acquire() and aio_context_release().
+ */
+void qio_channel_attach_aio_context(QIOChannel *ioc,
+                                    AioContext *ctx);
+
+/**
+ * qio_channel_detach_aio_context:
+ * @ioc: the channel object
+ *
+ * Disable any I/O handlers set by qio_channel_yield().  With the
+ * help of aio_co_schedule(), this allows moving a coroutine that was
+ * paused by qio_channel_yield() to another context.
+ */
+void qio_channel_detach_aio_context(QIOChannel *ioc);
+
+/**
  * qio_channel_yield:
  * @ioc: the channel object
  * @condition: the I/O condition to wait for
  *
- * Yields execution from the current coroutine until
- * the condition indicated by @condition becomes
- * available.
+ * Yields execution from the current coroutine until the condition
+ * indicated by @condition becomes available.  @condition must
+ * be either %G_IO_IN or %G_IO_OUT; it cannot contain both.  In
+ * addition, no two coroutines can be waiting on the same condition
+ * and channel at the same time.
  *
  * This must only be called from coroutine context
  */
diff --git a/io/channel.c b/io/channel.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel.c
+++ b/io/channel.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "io/channel.h"
 #include "qapi/error.h"
-#include "qemu/coroutine.h"
+#include "qemu/main-loop.h"
 
 bool qio_channel_has_feature(QIOChannel *ioc,
                              QIOChannelFeature feature)
@@ -XXX,XX +XXX,XX @@ off_t qio_channel_io_seek(QIOChannel *ioc,
 }
 
 
-typedef struct QIOChannelYieldData QIOChannelYieldData;
-struct QIOChannelYieldData {
-    QIOChannel *ioc;
-    Coroutine *co;
-};
+static void qio_channel_set_aio_fd_handlers(QIOChannel *ioc);
 
+static void qio_channel_restart_read(void *opaque)
+{
+    QIOChannel *ioc = opaque;
+    Coroutine *co = ioc->read_coroutine;
+
+    ioc->read_coroutine = NULL;
+    qio_channel_set_aio_fd_handlers(ioc);
+    aio_co_wake(co);
+}
 
-static gboolean qio_channel_yield_enter(QIOChannel *ioc,
-                                        GIOCondition condition,
-                                        gpointer opaque)
+static void qio_channel_restart_write(void *opaque)
 {
-    QIOChannelYieldData *data = opaque;
-    qemu_coroutine_enter(data->co);
-    return FALSE;
+    QIOChannel *ioc = opaque;
+    Coroutine *co = ioc->write_coroutine;
+
+    ioc->write_coroutine = NULL;
+    qio_channel_set_aio_fd_handlers(ioc);
+    aio_co_wake(co);
 }
 
+static void qio_channel_set_aio_fd_handlers(QIOChannel *ioc)
+{
+    IOHandler *rd_handler = NULL, *wr_handler = NULL;
+    AioContext *ctx;
+
+    if (ioc->read_coroutine) {
+        rd_handler = qio_channel_restart_read;
+    }
+    if (ioc->write_coroutine) {
+        wr_handler = qio_channel_restart_write;
+    }
+
+    ctx = ioc->ctx ? ioc->ctx : iohandler_get_aio_context();
+    qio_channel_set_aio_fd_handler(ioc, ctx, rd_handler, wr_handler, ioc);
+}
+
+void qio_channel_attach_aio_context(QIOChannel *ioc,
+                                    AioContext *ctx)
+{
+    AioContext *old_ctx;
+    if (ioc->ctx == ctx) {
+        return;
+    }
+
+    old_ctx = ioc->ctx ? ioc->ctx : iohandler_get_aio_context();
+    qio_channel_set_aio_fd_handler(ioc, old_ctx, NULL, NULL, NULL);
+    ioc->ctx = ctx;
+    qio_channel_set_aio_fd_handlers(ioc);
+}
+
+void qio_channel_detach_aio_context(QIOChannel *ioc)
+{
+    ioc->read_coroutine = NULL;
+    ioc->write_coroutine = NULL;
+    qio_channel_set_aio_fd_handlers(ioc);
+    ioc->ctx = NULL;
+}
 
 void coroutine_fn qio_channel_yield(QIOChannel *ioc,
                                     GIOCondition condition)
 {
-    QIOChannelYieldData data;
-
     assert(qemu_in_coroutine());
-    data.ioc = ioc;
-    data.co = qemu_coroutine_self();
-    qio_channel_add_watch(ioc,
-                          condition,
-                          qio_channel_yield_enter,
-                          &data,
-                          NULL);
+    if (condition == G_IO_IN) {
+        assert(!ioc->read_coroutine);
+        ioc->read_coroutine = qemu_coroutine_self();
+    } else if (condition == G_IO_OUT) {
+        assert(!ioc->write_coroutine);
+        ioc->write_coroutine = qemu_coroutine_self();
+    } else {
+        abort();
+    }
+    qio_channel_set_aio_fd_handlers(ioc);
     qemu_coroutine_yield();
 }
 
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

In the client, read the reply headers from a coroutine, switching the
read side between the "read header" coroutine and the I/O coroutine that
reads the body of the reply.

In the server, if the server can read more requests it will create a new
"read request" coroutine as soon as a request has been read.  Otherwise,
the new coroutine is created in nbd_request_put.

diff --git a/block/nbd-client.h b/block/nbd-client.h
index XXXXXXX..XXXXXXX 100644
--- a/block/nbd-client.h
+++ b/block/nbd-client.h
@@ -XXX,XX +XXX,XX @@ typedef struct NBDClientSession {
 
     CoMutex send_mutex;
     CoQueue free_sema;
-    Coroutine *send_coroutine;
+    Coroutine *read_reply_co;
     int in_flight;
 
     Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
diff --git a/block/nbd-client.c b/block/nbd-client.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -XXX,XX +XXX,XX @@
 #define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs))
 #define INDEX_TO_HANDLE(bs, index)  ((index)  ^ ((uint64_t)(intptr_t)bs))
 
-static void nbd_recv_coroutines_enter_all(NBDClientSession *s)
+static void nbd_recv_coroutines_enter_all(BlockDriverState *bs)
 {
+    NBDClientSession *s = nbd_get_client_session(bs);
     int i;
 
     for (i = 0; i < MAX_NBD_REQUESTS; i++) {
@@ -XXX,XX +XXX,XX @@ static void nbd_recv_coroutines_enter_all(NBDClientSession *s)
             qemu_coroutine_enter(s->recv_coroutine[i]);
         }
     }
+    BDRV_POLL_WHILE(bs, s->read_reply_co);
 }
 
 static void nbd_teardown_connection(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ static void nbd_teardown_connection(BlockDriverState *bs)
     qio_channel_shutdown(client->ioc,
                          QIO_CHANNEL_SHUTDOWN_BOTH,
                          NULL);
-    nbd_recv_coroutines_enter_all(client);
+    nbd_recv_coroutines_enter_all(bs);
 
     nbd_client_detach_aio_context(bs);
     object_unref(OBJECT(client->sioc));
@@ -XXX,XX +XXX,XX @@ static void nbd_teardown_connection(BlockDriverState *bs)
     client->ioc = NULL;
 }
 
-static void nbd_reply_ready(void *opaque)
+static coroutine_fn void nbd_read_reply_entry(void *opaque)
 {
-    BlockDriverState *bs = opaque;
-    NBDClientSession *s = nbd_get_client_session(bs);
+    NBDClientSession *s = opaque;
     uint64_t i;
     int ret;
 
-    if (!s->ioc) { /* Already closed */
-        return;
-    }
-
-    if (s->reply.handle == 0) {
-        /* No reply already in flight.  Fetch a header.  It is possible
-         * that another thread has done the same thing in parallel, so
-         * the socket is not readable anymore.
-         */
+    for (;;) {
+        assert(s->reply.handle == 0);
         ret = nbd_receive_reply(s->ioc, &s->reply);
-        if (ret == -EAGAIN) {
-            return;
-        }
         if (ret < 0) {
-            s->reply.handle = 0;
-            goto fail;
+            break;
         }
-    }
 
-    /* There's no need for a mutex on the receive side, because the
-     * handler acts as a synchronization point and ensures that only
-     * one coroutine is called until the reply finishes.  */
-    i = HANDLE_TO_INDEX(s, s->reply.handle);
-    if (i >= MAX_NBD_REQUESTS) {
-        goto fail;
-    }
+        /* There's no need for a mutex on the receive side, because the
+         * handler acts as a synchronization point and ensures that only
+         * one coroutine is called until the reply finishes.
+         */
+        i = HANDLE_TO_INDEX(s, s->reply.handle);
+        if (i >= MAX_NBD_REQUESTS || !s->recv_coroutine[i]) {
+            break;
+        }
 
-    if (s->recv_coroutine[i]) {
-        qemu_coroutine_enter(s->recv_coroutine[i]);
-        return;
+        /* We're woken up by the recv_coroutine itself.  Note that there
+         * is no race between yielding and reentering read_reply_co.  This
+         * is because:
+         *
+         * - if recv_coroutine[i] runs on the same AioContext, it is only
+         *   entered after we yield
+         *
+         * - if recv_coroutine[i] runs on a different AioContext, reentering
+         *   read_reply_co happens through a bottom half, which can only
+         *   run after we yield.
+         */
+        aio_co_wake(s->recv_coroutine[i]);
+        qemu_coroutine_yield();
     }
-
-fail:
-    nbd_teardown_connection(bs);
-}
-
-static void nbd_restart_write(void *opaque)
-{
-    BlockDriverState *bs = opaque;
-
-    qemu_coroutine_enter(nbd_get_client_session(bs)->send_coroutine);
+    s->read_reply_co = NULL;
 }
 
 static int nbd_co_send_request(BlockDriverState *bs,
@@ -XXX,XX +XXX,XX @@ static int nbd_co_send_request(BlockDriverState *bs,
                                QEMUIOVector *qiov)
 {
     NBDClientSession *s = nbd_get_client_session(bs);
-    AioContext *aio_context;
     int rc, ret, i;
 
     qemu_co_mutex_lock(&s->send_mutex);
@@ -XXX,XX +XXX,XX @@ static int nbd_co_send_request(BlockDriverState *bs,
         return -EPIPE;
     }
 
-    s->send_coroutine = qemu_coroutine_self();
-    aio_context = bdrv_get_aio_context(bs);
-
-    aio_set_fd_handler(aio_context, s->sioc->fd, false,
-                       nbd_reply_ready, nbd_restart_write, NULL, bs);
     if (qiov) {
         qio_channel_set_cork(s->ioc, true);
         rc = nbd_send_request(s->ioc, request);
@@ -XXX,XX +XXX,XX @@ static int nbd_co_send_request(BlockDriverState *bs,
     } else {
         rc = nbd_send_request(s->ioc, request);
     }
-    aio_set_fd_handler(aio_context, s->sioc->fd, false,
-                       nbd_reply_ready, NULL, NULL, bs);
-    s->send_coroutine = NULL;
     qemu_co_mutex_unlock(&s->send_mutex);
     return rc;
 }
@@ -XXX,XX +XXX,XX @@ static void nbd_co_receive_reply(NBDClientSession *s,
 {
     int ret;
 
-    /* Wait until we're woken up by the read handler.  TODO: perhaps
-     * peek at the next reply and avoid yielding if it's ours?  */
+    /* Wait until we're woken up by nbd_read_reply_entry.  */
     qemu_coroutine_yield();
     *reply = s->reply;
     if (reply->handle != request->handle ||
@@ -XXX,XX +XXX,XX @@ static void nbd_coroutine_start(NBDClientSession *s,
     /* s->recv_coroutine[i] is set as soon as we get the send_lock.  */
 }
 
-static void nbd_coroutine_end(NBDClientSession *s,
+static void nbd_coroutine_end(BlockDriverState *bs,
                               NBDRequest *request)
 {
+    NBDClientSession *s = nbd_get_client_session(bs);
     int i = HANDLE_TO_INDEX(s, request->handle);
+
     s->recv_coroutine[i] = NULL;
-    if (s->in_flight-- == MAX_NBD_REQUESTS) {
-        qemu_co_queue_next(&s->free_sema);
+    s->in_flight--;
+    qemu_co_queue_next(&s->free_sema);
+
+    /* Kick the read_reply_co to get the next reply.  */
+    if (s->read_reply_co) {
+        aio_co_wake(s->read_reply_co);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
     } else {
         nbd_co_receive_reply(client, &request, &reply, qiov);
     }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
     return -reply.error;
 }
 
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
     } else {
         nbd_co_receive_reply(client, &request, &reply, NULL);
     }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
     return -reply.error;
 }
 
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
     } else {
         nbd_co_receive_reply(client, &request, &reply, NULL);
     }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
     return -reply.error;
 }
 
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_flush(BlockDriverState *bs)
     } else {
         nbd_co_receive_reply(client, &request, &reply, NULL);
     }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
     return -reply.error;
 }
 
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count)
     } else {
         nbd_co_receive_reply(client, &request, &reply, NULL);
     }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
     return -reply.error;
 
 }
 
 void nbd_client_detach_aio_context(BlockDriverState *bs)
 {
-    aio_set_fd_handler(bdrv_get_aio_context(bs),
-                       nbd_get_client_session(bs)->sioc->fd,
-                       false, NULL, NULL, NULL, NULL);
+    NBDClientSession *client = nbd_get_client_session(bs);
+    qio_channel_detach_aio_context(QIO_CHANNEL(client->sioc));
 }
 
 void nbd_client_attach_aio_context(BlockDriverState *bs,
                                    AioContext *new_context)
 {
-    aio_set_fd_handler(new_context, nbd_get_client_session(bs)->sioc->fd,
-                       false, nbd_reply_ready, NULL, NULL, bs);
+    NBDClientSession *client = nbd_get_client_session(bs);
+    qio_channel_attach_aio_context(QIO_CHANNEL(client->sioc), new_context);
+    aio_co_schedule(new_context, client->read_reply_co);
 }
 
 void nbd_client_close(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ int nbd_client_init(BlockDriverState *bs,
     /* Now that we're connected, set the socket to be non-blocking and
      * kick the reply mechanism.  */
     qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL);
-
+    client->read_reply_co = qemu_coroutine_create(nbd_read_reply_entry, client);
     nbd_client_attach_aio_context(bs, bdrv_get_aio_context(bs));
 
     logout("Established connection with NBD server\n");
diff --git a/nbd/client.c b/nbd/client.c
index XXXXXXX..XXXXXXX 100644
--- a/nbd/client.c
+++ b/nbd/client.c
@@ -XXX,XX +XXX,XX @@ ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply)
     ssize_t ret;
 
     ret = read_sync(ioc, buf, sizeof(buf));
-    if (ret < 0) {
+    if (ret <= 0) {
         return ret;
     }
 
diff --git a/nbd/common.c b/nbd/common.c
index XXXXXXX..XXXXXXX 100644
--- a/nbd/common.c
+++ b/nbd/common.c
@@ -XXX,XX +XXX,XX @@ ssize_t nbd_wr_syncv(QIOChannel *ioc,
         }
         if (len == QIO_CHANNEL_ERR_BLOCK) {
             if (qemu_in_coroutine()) {
-                /* XXX figure out if we can create a variant on
-                 * qio_channel_yield() that works with AIO contexts
-                 * and consider using that in this branch */
-                qemu_coroutine_yield();
-            } else if (done) {
-                /* XXX this is needed by nbd_reply_ready.  */
-                qio_channel_wait(ioc,
-                                 do_read ? G_IO_IN : G_IO_OUT);
+                qio_channel_yield(ioc, do_read ? G_IO_IN : G_IO_OUT);
             } else {
                 return -EAGAIN;
             }
diff --git a/nbd/server.c b/nbd/server.c
index XXXXXXX..XXXXXXX 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -XXX,XX +XXX,XX @@ struct NBDClient {
     CoMutex send_lock;
     Coroutine *send_coroutine;
 
-    bool can_read;
-
     QTAILQ_ENTRY(NBDClient) next;
     int nb_requests;
     bool closing;
@@ -XXX,XX +XXX,XX @@ struct NBDClient {
 
 /* That's all folks */
 
-static void nbd_set_handlers(NBDClient *client);
-static void nbd_unset_handlers(NBDClient *client);
-static void nbd_update_can_read(NBDClient *client);
+static void nbd_client_receive_next_request(NBDClient *client);
 
 static gboolean nbd_negotiate_continue(QIOChannel *ioc,
                                        GIOCondition condition,
@@ -XXX,XX +XXX,XX @@ void nbd_client_put(NBDClient *client)
          */
         assert(client->closing);
 
-        nbd_unset_handlers(client);
+        qio_channel_detach_aio_context(client->ioc);
         object_unref(OBJECT(client->sioc));
         object_unref(OBJECT(client->ioc));
         if (client->tlscreds) {
@@ -XXX,XX +XXX,XX @@ static NBDRequestData *nbd_request_get(NBDClient *client)
 
     assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
     client->nb_requests++;
-    nbd_update_can_read(client);
 
     req = g_new0(NBDRequestData, 1);
     nbd_client_get(client);
@@ -XXX,XX +XXX,XX @@ static void nbd_request_put(NBDRequestData *req)
     g_free(req);
 
     client->nb_requests--;
-    nbd_update_can_read(client);
+    nbd_client_receive_next_request(client);
+
     nbd_client_put(client);
 }
 
@@ -XXX,XX +XXX,XX @@ static void blk_aio_attached(AioContext *ctx, void *opaque)
     exp->ctx = ctx;
 
     QTAILQ_FOREACH(client, &exp->clients, next) {
-        nbd_set_handlers(client);
+        qio_channel_attach_aio_context(client->ioc, ctx);
+        if (client->recv_coroutine) {
+            aio_co_schedule(ctx, client->recv_coroutine);
+        }
+        if (client->send_coroutine) {
+            aio_co_schedule(ctx, client->send_coroutine);
+        }
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void blk_aio_detach(void *opaque)
     TRACE("Export %s: Detaching clients from AIO context %p\n", exp->name, exp->ctx);
 
     QTAILQ_FOREACH(client, &exp->clients, next) {
-        nbd_unset_handlers(client);
+        qio_channel_detach_aio_context(client->ioc);
     }
 
     exp->ctx = NULL;
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply,
     g_assert(qemu_in_coroutine());
     qemu_co_mutex_lock(&client->send_lock);
     client->send_coroutine = qemu_coroutine_self();
-    nbd_set_handlers(client);
 
     if (!len) {
         rc = nbd_send_reply(client->ioc, reply);
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply,
     }
 
     client->send_coroutine = NULL;
-    nbd_set_handlers(client);
     qemu_co_mutex_unlock(&client->send_lock);
     return rc;
 }
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_receive_request(NBDRequestData *req,
     ssize_t rc;
 
     g_assert(qemu_in_coroutine());
-    client->recv_coroutine = qemu_coroutine_self();
-    nbd_update_can_read(client);
-
+    assert(client->recv_coroutine == qemu_coroutine_self());
     rc = nbd_receive_request(client->ioc, request);
     if (rc < 0) {
         if (rc != -EAGAIN) {
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_receive_request(NBDRequestData *req,
 
 out:
     client->recv_coroutine = NULL;
-    nbd_update_can_read(client);
+    nbd_client_receive_next_request(client);
 
     return rc;
 }
 
-static void nbd_trip(void *opaque)
+/* Owns a reference to the NBDClient passed as opaque.  */
+static coroutine_fn void nbd_trip(void *opaque)
 {
     NBDClient *client = opaque;
     NBDExport *exp = client->exp;
     NBDRequestData *req;
-    NBDRequest request;
+    NBDRequest request = { 0 };    /* GCC thinks it can be used uninitialized */
     NBDReply reply;
     ssize_t ret;
     int flags;
 
     TRACE("Reading request.");
     if (client->closing) {
+        nbd_client_put(client);
         return;
     }
 
@@ -XXX,XX +XXX,XX @@ static void nbd_trip(void *opaque)
 
 done:
     nbd_request_put(req);
+    nbd_client_put(client);
     return;
 
 out:
     nbd_request_put(req);
     client_close(client);
+    nbd_client_put(client);
 }
 
-static void nbd_read(void *opaque)
+static void nbd_client_receive_next_request(NBDClient *client)
 {
-    NBDClient *client = opaque;
-
-    if (client->recv_coroutine) {
-        qemu_coroutine_enter(client->recv_coroutine);
-    } else {
-        qemu_coroutine_enter(qemu_coroutine_create(nbd_trip, client));
-    }
-}
-
-static void nbd_restart_write(void *opaque)
-{
-    NBDClient *client = opaque;
-
-    qemu_coroutine_enter(client->send_coroutine);
-}
-
-static void nbd_set_handlers(NBDClient *client)
-{
-    if (client->exp && client->exp->ctx) {
-        aio_set_fd_handler(client->exp->ctx, client->sioc->fd, true,
-                           client->can_read ? nbd_read : NULL,
-                           client->send_coroutine ? nbd_restart_write : NULL,
-                           NULL, client);
-    }
-}
-
-static void nbd_unset_handlers(NBDClient *client)
-{
-    if (client->exp && client->exp->ctx) {
-        aio_set_fd_handler(client->exp->ctx, client->sioc->fd, true, NULL,
-                           NULL, NULL, NULL);
-    }
-}
-
-static void nbd_update_can_read(NBDClient *client)
-{
-    bool can_read = client->recv_coroutine ||
-                    client->nb_requests < MAX_NBD_REQUESTS;
-
-    if (can_read != client->can_read) {
-        client->can_read = can_read;
-        nbd_set_handlers(client);
-
-        /* There is no need to invoke aio_notify(), since aio_set_fd_handler()
-         * in nbd_set_handlers() will have taken care of that */
+    if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS) {
+        nbd_client_get(client);
+        client->recv_coroutine = qemu_coroutine_create(nbd_trip, client);
+        aio_co_schedule(client->exp->ctx, client->recv_coroutine);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void nbd_co_client_start(void *opaque)
         goto out;
     }
     qemu_co_mutex_init(&client->send_lock);
-    nbd_set_handlers(client);
 
     if (exp) {
         QTAILQ_INSERT_TAIL(&exp->clients, client, next);
     }
+
+    nbd_client_receive_next_request(client);
+
 out:
     g_free(data);
 }
@@ -XXX,XX +XXX,XX @@ void nbd_client_new(NBDExport *exp,
     object_ref(OBJECT(client->sioc));
     client->ioc = QIO_CHANNEL(sioc);
     object_ref(OBJECT(client->ioc));
-    client->can_read = true;
     client->close = close_fn;
 
     data->client = client;
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

As a small step towards the introduction of multiqueue, we want
coroutines to remain on the same AioContext that started them,
unless they are moved explicitly with e.g. aio_co_schedule.  This patch
prevents coroutines from switching AioContext when they use a CoMutex.
For now it does not make much of a difference, because the CoMutex
is not thread-safe and the AioContext itself is used to protect the
CoMutex from concurrent access.  However, this is going to change.

diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/coroutine.h"
 #include "qemu/coroutine_int.h"
 #include "qemu/queue.h"
+#include "block/aio.h"
 #include "trace.h"
 
 void qemu_co_queue_init(CoQueue *queue)
@@ -XXX,XX +XXX,XX @@ void qemu_co_queue_run_restart(Coroutine *co)
 
 static bool qemu_co_queue_do_restart(CoQueue *queue, bool single)
 {
-    Coroutine *self = qemu_coroutine_self();
     Coroutine *next;
 
     if (QSIMPLEQ_EMPTY(&queue->entries)) {
@@ -XXX,XX +XXX,XX @@ static bool qemu_co_queue_do_restart(CoQueue *queue, bool single)
 
     while ((next = QSIMPLEQ_FIRST(&queue->entries)) != NULL) {
         QSIMPLEQ_REMOVE_HEAD(&queue->entries, co_queue_next);
-        QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, next, co_queue_next);
-        trace_qemu_co_queue_next(next);
+        aio_co_wake(next);
         if (single) {
             break;
         }
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ qemu_coroutine_terminate(void *co) "self %p"
 
 # util/qemu-coroutine-lock.c
 qemu_co_queue_run_restart(void *co) "co %p"
-qemu_co_queue_next(void *nxt) "next %p"
 qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p"
 qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p"
 qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p"
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Keep the coroutine on the same AioContext.  Without this change,
there would be a race between yielding the coroutine and reentering it.
While the race cannot happen now, because the code only runs from a single
AioContext, this will change with multiqueue support in the block layer.

While doing the change, replace the custom bottom half with aio_co_schedule.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-10-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/blkdebug.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index XXXXXXX..XXXXXXX 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -XXX,XX +XXX,XX @@ out:
     return ret;
 }
 
-static void error_callback_bh(void *opaque)
-{
-    Coroutine *co = opaque;
-    qemu_coroutine_enter(co);
-}
-
 static int inject_error(BlockDriverState *bs, BlkdebugRule *rule)
 {
     BDRVBlkdebugState *s = bs->opaque;
@@ -XXX,XX +XXX,XX @@ static int inject_error(BlockDriverState *bs, BlkdebugRule *rule)
     }
 
     if (!immediately) {
-        aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), error_callback_bh,
-                                qemu_coroutine_self());
+        aio_co_schedule(qemu_get_current_aio_context(), qemu_coroutine_self());
         qemu_coroutine_yield();
     }
 
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

qed_aio_start_io and qed_aio_next_io will not have to acquire/release
the AioContext, while qed_aio_next_io_cb will.  Split the functionality
and gain a little type-safety in the process.

diff --git a/block/qed.c b/block/qed.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
     return l2_table;
 }
 
-static void qed_aio_next_io(void *opaque, int ret);
+static void qed_aio_next_io(QEDAIOCB *acb, int ret);
+
+static void qed_aio_start_io(QEDAIOCB *acb)
+{
+    qed_aio_next_io(acb, 0);
+}
+
+static void qed_aio_next_io_cb(void *opaque, int ret)
+{
+    QEDAIOCB *acb = opaque;
+
+    qed_aio_next_io(acb, ret);
+}
 
 static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
 {
@@ -XXX,XX +XXX,XX @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
 
     acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
     if (acb) {
-        qed_aio_next_io(acb, 0);
+        qed_aio_start_io(acb);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
         QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next);
         acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
         if (acb) {
-            qed_aio_next_io(acb, 0);
+            qed_aio_start_io(acb);
         } else if (s->header.features & QED_F_NEED_CHECK) {
             qed_start_need_check_timer(s);
         }
@@ -XXX,XX +XXX,XX @@ static void qed_commit_l2_update(void *opaque, int ret)
     acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
     assert(acb->request.l2_table != NULL);
 
-    qed_aio_next_io(opaque, ret);
+    qed_aio_next_io(acb, ret);
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
     if (need_alloc) {
         /* Write out the whole new L2 table */
         qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true,
-                            qed_aio_write_l1_update, acb);
+                           qed_aio_write_l1_update, acb);
     } else {
         /* Write out only the updated part of the L2 table */
         qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false,
-                            qed_aio_next_io, acb);
+                           qed_aio_next_io_cb, acb);
     }
     return;
 
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_main(void *opaque, int ret)
     }
 
     if (acb->find_cluster_ret == QED_CLUSTER_FOUND) {
-        next_fn = qed_aio_next_io;
+        next_fn = qed_aio_next_io_cb;
     } else {
         if (s->bs->backing) {
             next_fn = qed_aio_write_flush_before_l2_update;
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
     if (acb->flags & QED_AIOCB_ZERO) {
         /* Skip ahead if the clusters are already zero */
         if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
-            qed_aio_next_io(acb, 0);
+            qed_aio_start_io(acb);
             return;
         }
 
@@ -XXX,XX +XXX,XX @@ static void qed_aio_read_data(void *opaque, int ret,
     /* Handle zero cluster and backing file reads */
     if (ret == QED_CLUSTER_ZERO) {
         qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
-        qed_aio_next_io(acb, 0);
+        qed_aio_start_io(acb);
         return;
     } else if (ret != QED_CLUSTER_FOUND) {
         qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
-                              &acb->backing_qiov, qed_aio_next_io, acb);
+                              &acb->backing_qiov, qed_aio_next_io_cb, acb);
         return;
     }
 
     BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
     bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE,
                    &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
-                   qed_aio_next_io, acb);
+                   qed_aio_next_io_cb, acb);
     return;
 
 err:
@@ -XXX,XX +XXX,XX @@ err:
 /**
  * Begin next I/O or complete the request
  */
-static void qed_aio_next_io(void *opaque, int ret)
+static void qed_aio_next_io(QEDAIOCB *acb, int ret)
 {
-    QEDAIOCB *acb = opaque;
     BDRVQEDState *s = acb_to_s(acb);
     QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
                                 qed_aio_write_data : qed_aio_read_data;
@@ -XXX,XX +XXX,XX @@ static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
     qemu_iovec_init(&acb->cur_qiov, qiov->niov);
 
     /* Start request */
-    qed_aio_next_io(acb, 0);
+    qed_aio_start_io(acb);
     return &acb->common;
 }
 
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

The AioContext data structures are now protected by list_lock and/or
they are walked with FOREACH_RCU primitives.  There is no need anymore
to acquire the AioContext for the entire duration of aio_dispatch.
Instead, just acquire it before invoking each callback and release it
afterwards.  The next step is then to push it further down.

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
             (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
             aio_node_check(ctx, node->is_external) &&
             node->io_read) {
+            aio_context_acquire(ctx);
             node->io_read(node->opaque);
+            aio_context_release(ctx);
 
             /* aio_notify() does not count as progress */
             if (node->opaque != &ctx->notifier) {
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
             (revents & (G_IO_OUT | G_IO_ERR)) &&
             aio_node_check(ctx, node->is_external) &&
             node->io_write) {
+            aio_context_acquire(ctx);
             node->io_write(node->opaque);
+            aio_context_release(ctx);
             progress = true;
         }
 
@@ -XXX,XX +XXX,XX @@ bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
     }
 
     /* Run our timers */
+    aio_context_acquire(ctx);
     progress |= timerlistgroup_run_timers(&ctx->tlg);
+    aio_context_release(ctx);
 
     return progress;
 }
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
     int64_t timeout;
     int64_t start = 0;
 
-    aio_context_acquire(ctx);
-    progress = false;
-
     /* aio_notify can avoid the expensive event_notifier_set if
      * everything (file descriptors, bottom halves, timers) will
      * be re-evaluated before the next blocking poll().  This is
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     }
 
-    if (try_poll_mode(ctx, blocking)) {
-        progress = true;
-    } else {
+    aio_context_acquire(ctx);
+    progress = try_poll_mode(ctx, blocking);
+    aio_context_release(ctx);
+
+    if (!progress) {
         assert(npfd == 0);
 
         /* fill pollfds */
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         timeout = blocking ? aio_compute_timeout(ctx) : 0;
 
         /* wait until next event */
-        if (timeout) {
-            aio_context_release(ctx);
-        }
         if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
             AioHandler epoll_handler;
 
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         } else  {
             ret = qemu_poll_ns(pollfds, npfd, timeout);
         }
-        if (timeout) {
-            aio_context_acquire(ctx);
-        }
     }
 
     if (blocking) {
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         progress = true;
     }
 
-    aio_context_release(ctx);
-
     return progress;
 }
 
diff --git a/util/aio-win32.c b/util/aio-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
             (revents || event_notifier_get_handle(node->e) == event) &&
             node->io_notify) {
             node->pfd.revents = 0;
+            aio_context_acquire(ctx);
             node->io_notify(node->e);
+            aio_context_release(ctx);
 
             /* aio_notify() does not count as progress */
             if (node->e != &ctx->notifier) {
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
             (node->io_read || node->io_write)) {
             node->pfd.revents = 0;
             if ((revents & G_IO_IN) && node->io_read) {
+                aio_context_acquire(ctx);
                 node->io_read(node->opaque);
+                aio_context_release(ctx);
                 progress = true;
             }
             if ((revents & G_IO_OUT) && node->io_write) {
+                aio_context_acquire(ctx);
                 node->io_write(node->opaque);
+                aio_context_release(ctx);
                 progress = true;
             }
 
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
     int count;
     int timeout;
 
-    aio_context_acquire(ctx);
     progress = false;
 
     /* aio_notify can avoid the expensive event_notifier_set if
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
 
         timeout = blocking && !have_select_revents
             ? qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)) : 0;
-        if (timeout) {
-            aio_context_release(ctx);
-        }
         ret = WaitForMultipleObjects(count, events, FALSE, timeout);
         if (blocking) {
             assert(first);
             atomic_sub(&ctx->notify_me, 2);
         }
-        if (timeout) {
-            aio_context_acquire(ctx);
-        }
 
         if (first) {
             aio_notify_accept(ctx);
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         progress |= aio_dispatch_handlers(ctx, event);
     } while (count > 0);
 
+    aio_context_acquire(ctx);
     progress |= timerlistgroup_run_timers(&ctx->tlg);
-
     aio_context_release(ctx);
     return progress;
 }
diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
                 ret = 1;
             }
             bh->idle = 0;
+            aio_context_acquire(ctx);
             aio_bh_call(bh);
+            aio_context_release(ctx);
         }
         if (bh->deleted) {
             deleted = true;
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-13-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qed.h                 |  3 +++
 block/curl.c                |  2 ++
 block/io.c                  |  5 +++++
 block/iscsi.c               |  8 ++++++--
 block/null.c                |  4 ++++
 block/qed.c                 | 12 ++++++++++++
 block/throttle-groups.c     |  2 ++
 util/aio-posix.c            |  2 --
 util/aio-win32.c            |  2 --
 util/qemu-coroutine-sleep.c |  2 +-
 10 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/block/qed.h b/block/qed.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.h
+++ b/block/qed.h
@@ -XXX,XX +XXX,XX @@ enum {
  */
 typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len);
 
+void qed_acquire(BDRVQEDState *s);
+void qed_release(BDRVQEDState *s);
+
 /**
  * Generic callback for chaining async callbacks
  */
diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static void curl_multi_timeout_do(void *arg)
         return;
     }
 
+    aio_context_acquire(s->aio_context);
     curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
 
     curl_multi_check_completion(s);
+    aio_context_release(s->aio_context);
 #else
     abort();
 #endif
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_aio_cancel(BlockAIOCB *acb)
         if (acb->aiocb_info->get_aio_context) {
             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
         } else if (acb->bs) {
+            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
+             * assert that we're not using an I/O thread.  Thread-safe
+             * code should use bdrv_aio_cancel_async exclusively.
+             */
+            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
             aio_poll(bdrv_get_aio_context(acb->bs), true);
         } else {
             abort();
diff --git a/block/iscsi.c b/block/iscsi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -XXX,XX +XXX,XX @@ static void iscsi_retry_timer_expired(void *opaque)
     struct IscsiTask *iTask = opaque;
     iTask->complete = 1;
     if (iTask->co) {
-        qemu_coroutine_enter(iTask->co);
+        aio_co_wake(iTask->co);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void iscsi_nop_timed_event(void *opaque)
 {
     IscsiLun *iscsilun = opaque;
 
+    aio_context_acquire(iscsilun->aio_context);
     if (iscsi_get_nops_in_flight(iscsilun->iscsi) >= MAX_NOP_FAILURES) {
         error_report("iSCSI: NOP timeout. Reconnecting...");
         iscsilun->request_timed_out = true;
     } else if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) {
         error_report("iSCSI: failed to sent NOP-Out. Disabling NOP messages.");
-        return;
+        goto out;
     }
 
     timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL);
     iscsi_set_events(iscsilun);
+
+out:
+    aio_context_release(iscsilun->aio_context);
 }
 
 static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp)
diff --git a/block/null.c b/block/null.c
index XXXXXXX..XXXXXXX 100644
--- a/block/null.c
+++ b/block/null.c
@@ -XXX,XX +XXX,XX @@ static void null_bh_cb(void *opaque)
 static void null_timer_cb(void *opaque)
 {
     NullAIOCB *acb = opaque;
+    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
+
+    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, 0);
+    aio_context_release(ctx);
     timer_deinit(&acb->timer);
     qemu_aio_unref(acb);
 }
diff --git a/block/qed.c b/block/qed.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static void qed_need_check_timer_cb(void *opaque)
 
     trace_qed_need_check_timer_cb(s);
 
+    qed_acquire(s);
     qed_plug_allocating_write_reqs(s);
 
     /* Ensure writes are on disk before clearing flag */
     bdrv_aio_flush(s->bs->file->bs, qed_clear_need_check, s);
+    qed_release(s);
+}
+
+void qed_acquire(BDRVQEDState *s)
+{
+    aio_context_acquire(bdrv_get_aio_context(s->bs));
+}
+
+void qed_release(BDRVQEDState *s)
+{
+    aio_context_release(bdrv_get_aio_context(s->bs));
 }
 
 static void qed_start_need_check_timer(BDRVQEDState *s)
diff --git a/block/throttle-groups.c b/block/throttle-groups.c
index XXXXXXX..XXXXXXX 100644
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -XXX,XX +XXX,XX @@ static void timer_cb(BlockBackend *blk, bool is_write)
     qemu_mutex_unlock(&tg->lock);
 
     /* Run the request that was waiting for this timer */
+    aio_context_acquire(blk_get_aio_context(blk));
     empty_queue = !qemu_co_enter_next(&blkp->throttled_reqs[is_write]);
+    aio_context_release(blk_get_aio_context(blk));
 
     /* If the request queue was empty then we have to take care of
      * scheduling the next one */
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
     }
 
     /* Run our timers */
-    aio_context_acquire(ctx);
     progress |= timerlistgroup_run_timers(&ctx->tlg);
-    aio_context_release(ctx);
 
     return progress;
 }
diff --git a/util/aio-win32.c b/util/aio-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         progress |= aio_dispatch_handlers(ctx, event);
     } while (count > 0);
 
-    aio_context_acquire(ctx);
     progress |= timerlistgroup_run_timers(&ctx->tlg);
-    aio_context_release(ctx);
     return progress;
 }
 
diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-sleep.c
+++ b/util/qemu-coroutine-sleep.c
@@ -XXX,XX +XXX,XX @@ static void co_sleep_cb(void *opaque)
 {
     CoSleepCB *sleep_cb = opaque;
 
-    qemu_coroutine_enter(sleep_cb->co);
+    aio_co_wake(sleep_cb->co);
 }
 
 void coroutine_fn co_aio_sleep_ns(AioContext *ctx, QEMUClockType type,
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

This covers both file descriptor callbacks and polling callbacks,
since they execute related code.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-14-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/curl.c          | 16 +++++++++++++---
 block/iscsi.c         |  4 ++++
 block/linux-aio.c     |  4 ++++
 block/nfs.c           |  6 ++++++
 block/sheepdog.c      | 29 +++++++++++++++--------------
 block/ssh.c           | 29 +++++++++--------------------
 block/win32-aio.c     | 10 ++++++----
 hw/block/virtio-blk.c |  5 ++++-
 hw/scsi/virtio-scsi.c |  7 +++++++
 util/aio-posix.c      |  7 -------
 util/aio-win32.c      |  6 ------
 11 files changed, 68 insertions(+), 55 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s)
     }
 }
 
-static void curl_multi_do(void *arg)
+static void curl_multi_do_locked(CURLState *s)
 {
-    CURLState *s = (CURLState *)arg;
     CURLSocket *socket, *next_socket;
     int running;
     int r;
@@ -XXX,XX +XXX,XX @@ static void curl_multi_do(void *arg)
     }
 }
 
+static void curl_multi_do(void *arg)
+{
+    CURLState *s = (CURLState *)arg;
+
+    aio_context_acquire(s->s->aio_context);
+    curl_multi_do_locked(s);
+    aio_context_release(s->s->aio_context);
+}
+
 static void curl_multi_read(void *arg)
 {
     CURLState *s = (CURLState *)arg;
 
-    curl_multi_do(arg);
+    aio_context_acquire(s->s->aio_context);
+    curl_multi_do_locked(s);
     curl_multi_check_completion(s->s);
+    aio_context_release(s->s->aio_context);
 }
 
 static void curl_multi_timeout_do(void *arg)
diff --git a/block/iscsi.c b/block/iscsi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -XXX,XX +XXX,XX @@ iscsi_process_read(void *arg)
     IscsiLun *iscsilun = arg;
     struct iscsi_context *iscsi = iscsilun->iscsi;
 
+    aio_context_acquire(iscsilun->aio_context);
     iscsi_service(iscsi, POLLIN);
     iscsi_set_events(iscsilun);
+    aio_context_release(iscsilun->aio_context);
 }
 
 static void
@@ -XXX,XX +XXX,XX @@ iscsi_process_write(void *arg)
     IscsiLun *iscsilun = arg;
     struct iscsi_context *iscsi = iscsilun->iscsi;
 
+    aio_context_acquire(iscsilun->aio_context);
     iscsi_service(iscsi, POLLOUT);
     iscsi_set_events(iscsilun);
+    aio_context_release(iscsilun->aio_context);
 }
 
 static int64_t sector_lun2qemu(int64_t sector, IscsiLun *iscsilun)
diff --git a/block/linux-aio.c b/block/linux-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_completion_cb(EventNotifier *e)
     LinuxAioState *s = container_of(e, LinuxAioState, e);
 
     if (event_notifier_test_and_clear(&s->e)) {
+        aio_context_acquire(s->aio_context);
         qemu_laio_process_completions_and_submit(s);
+        aio_context_release(s->aio_context);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static bool qemu_laio_poll_cb(void *opaque)
         return false;
     }
 
+    aio_context_acquire(s->aio_context);
     qemu_laio_process_completions_and_submit(s);
+    aio_context_release(s->aio_context);
     return true;
 }
 
diff --git a/block/nfs.c b/block/nfs.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -XXX,XX +XXX,XX @@ static void nfs_set_events(NFSClient *client)
 static void nfs_process_read(void *arg)
 {
     NFSClient *client = arg;
+
+    aio_context_acquire(client->aio_context);
     nfs_service(client->context, POLLIN);
     nfs_set_events(client);
+    aio_context_release(client->aio_context);
 }
 
 static void nfs_process_write(void *arg)
 {
     NFSClient *client = arg;
+
+    aio_context_acquire(client->aio_context);
     nfs_service(client->context, POLLOUT);
     nfs_set_events(client);
+    aio_context_release(client->aio_context);
 }
 
 static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
diff --git a/block/sheepdog.c b/block/sheepdog.c
index XXXXXXX..XXXXXXX 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
     return ret;
 }
 
-static void restart_co_req(void *opaque)
-{
-    Coroutine *co = opaque;
-
-    qemu_coroutine_enter(co);
-}
-
 typedef struct SheepdogReqCo {
     int sockfd;
     BlockDriverState *bs;
@@ -XXX,XX +XXX,XX @@ typedef struct SheepdogReqCo {
     unsigned int *rlen;
     int ret;
     bool finished;
+    Coroutine *co;
 } SheepdogReqCo;
 
+static void restart_co_req(void *opaque)
+{
+    SheepdogReqCo *srco = opaque;
+
+    aio_co_wake(srco->co);
+}
+
 static coroutine_fn void do_co_req(void *opaque)
 {
     int ret;
-    Coroutine *co;
     SheepdogReqCo *srco = opaque;
     int sockfd = srco->sockfd;
     SheepdogReq *hdr = srco->hdr;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void do_co_req(void *opaque)
     unsigned int *wlen = srco->wlen;
     unsigned int *rlen = srco->rlen;
 
-    co = qemu_coroutine_self();
+    srco->co = qemu_coroutine_self();
     aio_set_fd_handler(srco->aio_context, sockfd, false,
-                       NULL, restart_co_req, NULL, co);
+                       NULL, restart_co_req, NULL, srco);
 
     ret = send_co_req(sockfd, hdr, data, wlen);
     if (ret < 0) {
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void do_co_req(void *opaque)
     }
 
     aio_set_fd_handler(srco->aio_context, sockfd, false,
-                       restart_co_req, NULL, NULL, co);
+                       restart_co_req, NULL, NULL, srco);
 
     ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
     if (ret != sizeof(*hdr)) {
@@ -XXX,XX +XXX,XX @@ out:
     aio_set_fd_handler(srco->aio_context, sockfd, false,
                        NULL, NULL, NULL, NULL);
 
+    srco->co = NULL;
     srco->ret = ret;
     srco->finished = true;
     if (srco->bs) {
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn aio_read_response(void *opaque)
          * We've finished all requests which belong to the AIOCB, so
          * we can switch back to sd_co_readv/writev now.
          */
-        qemu_coroutine_enter(acb->coroutine);
+        aio_co_wake(acb->coroutine);
     }
 
     return;
@@ -XXX,XX +XXX,XX @@ static void co_read_response(void *opaque)
         s->co_recv = qemu_coroutine_create(aio_read_response, opaque);
     }
 
-    qemu_coroutine_enter(s->co_recv);
+    aio_co_wake(s->co_recv);
 }
 
 static void co_write_request(void *opaque)
 {
     BDRVSheepdogState *s = opaque;
 
-    qemu_coroutine_enter(s->co_send);
+    aio_co_wake(s->co_send);
 }
 
 /*
diff --git a/block/ssh.c b/block/ssh.c
index XXXXXXX..XXXXXXX 100644
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -XXX,XX +XXX,XX @@ static void restart_coroutine(void *opaque)
 
     DPRINTF("co=%p", co);
 
-    qemu_coroutine_enter(co);
+    aio_co_wake(co);
 }
 
-static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs)
+/* A non-blocking call returned EAGAIN, so yield, ensuring the
+ * handlers are set up so that we'll be rescheduled when there is an
+ * interesting event on the socket.
+ */
+static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs)
 {
     int r;
     IOHandler *rd_handler = NULL, *wr_handler = NULL;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs)
 
     aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock,
                        false, rd_handler, wr_handler, NULL, co);
-}
-
-static coroutine_fn void clear_fd_handler(BDRVSSHState *s,
-                                          BlockDriverState *bs)
-{
-    DPRINTF("s->sock=%d", s->sock);
-    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock,
-                       false, NULL, NULL, NULL, NULL);
-}
-
-/* A non-blocking call returned EAGAIN, so yield, ensuring the
- * handlers are set up so that we'll be rescheduled when there is an
- * interesting event on the socket.
- */
-static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs)
-{
-    set_fd_handler(s, bs);
     qemu_coroutine_yield();
-    clear_fd_handler(s, bs);
+    DPRINTF("s->sock=%d - back", s->sock);
+    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, false,
+                       NULL, NULL, NULL, NULL);
 }
 
 /* SFTP has a function `libssh2_sftp_seek64' which seeks to a position
diff --git a/block/win32-aio.c b/block/win32-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/win32-aio.c
+++ b/block/win32-aio.c
@@ -XXX,XX +XXX,XX @@ struct QEMUWin32AIOState {
     HANDLE hIOCP;
     EventNotifier e;
     int count;
-    bool is_aio_context_attached;
+    AioContext *aio_ctx;
 };
 
 typedef struct QEMUWin32AIOCB {
@@ -XXX,XX +XXX,XX @@ static void win32_aio_process_completion(QEMUWin32AIOState *s,
     }
 
 
+    aio_context_acquire(s->aio_ctx);
     waiocb->common.cb(waiocb->common.opaque, ret);
+    aio_context_release(s->aio_ctx);
     qemu_aio_unref(waiocb);
 }
 
@@ -XXX,XX +XXX,XX @@ void win32_aio_detach_aio_context(QEMUWin32AIOState *aio,
                                   AioContext *old_context)
 {
     aio_set_event_notifier(old_context, &aio->e, false, NULL, NULL);
-    aio->is_aio_context_attached = false;
+    aio->aio_ctx = NULL;
 }
 
 void win32_aio_attach_aio_context(QEMUWin32AIOState *aio,
                                   AioContext *new_context)
 {
-    aio->is_aio_context_attached = true;
+    aio->aio_ctx = new_context;
     aio_set_event_notifier(new_context, &aio->e, false,
                            win32_aio_completion_cb, NULL);
 }
@@ -XXX,XX +XXX,XX @@ out_free_state:
 
 void win32_aio_cleanup(QEMUWin32AIOState *aio)
 {
-    assert(!aio->is_aio_context_attached);
+    assert(!aio->aio_ctx);
     CloseHandle(aio->hIOCP);
     event_notifier_cleanup(&aio->e);
     g_free(aio);
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_ioctl_complete(void *opaque, int status)
 {
     VirtIOBlockIoctlReq *ioctl_req = opaque;
     VirtIOBlockReq *req = ioctl_req->req;
-    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
     struct virtio_scsi_inhdr *scsi;
     struct sg_io_hdr *hdr;
 
@@ -XXX,XX +XXX,XX @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
     MultiReqBuffer mrb = {};
     bool progress = false;
 
+    aio_context_acquire(blk_get_aio_context(s->blk));
     blk_io_plug(s->blk);
 
     do {
@@ -XXX,XX +XXX,XX @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
     }
 
     blk_io_unplug(s->blk);
+    aio_context_release(blk_get_aio_context(s->blk));
     return progress;
 }
 
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq)
     VirtIOSCSIReq *req;
     bool progress = false;
 
+    virtio_scsi_acquire(s);
     while ((req = virtio_scsi_pop_req(s, vq))) {
         progress = true;
         virtio_scsi_handle_ctrl_req(s, req);
     }
+    virtio_scsi_release(s);
     return progress;
 }
 
@@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
 
     QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs);
 
+    virtio_scsi_acquire(s);
     do {
         virtio_queue_set_notification(vq, 0);
 
@@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
     QTAILQ_FOREACH_SAFE(req, &reqs, next, next) {
         virtio_scsi_handle_cmd_req_submit(s, req);
     }
+    virtio_scsi_release(s);
     return progress;
 }
 
@@ -XXX,XX +XXX,XX @@ out:
 
 bool virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq)
 {
+    virtio_scsi_acquire(s);
     if (s->events_dropped) {
         virtio_scsi_push_event(s, NULL, VIRTIO_SCSI_T_NO_EVENT, 0);
+        virtio_scsi_release(s);
         return true;
     }
+    virtio_scsi_release(s);
     return false;
 }
 
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
             (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
             aio_node_check(ctx, node->is_external) &&
             node->io_read) {
-            aio_context_acquire(ctx);
             node->io_read(node->opaque);
-            aio_context_release(ctx);
 
             /* aio_notify() does not count as progress */
             if (node->opaque != &ctx->notifier) {
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
             (revents & (G_IO_OUT | G_IO_ERR)) &&
             aio_node_check(ctx, node->is_external) &&
             node->io_write) {
-            aio_context_acquire(ctx);
             node->io_write(node->opaque);
-            aio_context_release(ctx);
             progress = true;
         }
 
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     }
 
-    aio_context_acquire(ctx);
     progress = try_poll_mode(ctx, blocking);
-    aio_context_release(ctx);
-
     if (!progress) {
         assert(npfd == 0);
 
diff --git a/util/aio-win32.c b/util/aio-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
             (revents || event_notifier_get_handle(node->e) == event) &&
             node->io_notify) {
             node->pfd.revents = 0;
-            aio_context_acquire(ctx);
             node->io_notify(node->e);
-            aio_context_release(ctx);
 
             /* aio_notify() does not count as progress */
             if (node->e != &ctx->notifier) {
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
             (node->io_read || node->io_write)) {
             node->pfd.revents = 0;
             if ((revents & G_IO_IN) && node->io_read) {
-                aio_context_acquire(ctx);
                 node->io_read(node->opaque);
-                aio_context_release(ctx);
                 progress = true;
             }
             if ((revents & G_IO_OUT) && node->io_write) {
-                aio_context_acquire(ctx);
                 node->io_write(node->opaque);
-                aio_context_release(ctx);
                 progress = true;
             }
 
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-15-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/archipelago.c   |  3 +++
 block/blkreplay.c     |  2 +-
 block/block-backend.c |  6 ++++++
 block/curl.c          | 26 ++++++++++++++++++--------
 block/gluster.c       |  9 +--------
 block/io.c            |  6 +++++-
 block/iscsi.c         |  6 +++++-
 block/linux-aio.c     | 15 +++++++++------
 block/nfs.c           |  3 ++-
 block/null.c          |  4 ++++
 block/qed.c           |  3 +++
 block/rbd.c           |  4 ++++
 dma-helpers.c         |  2 ++
 hw/block/virtio-blk.c |  2 ++
 hw/scsi/scsi-bus.c    |  2 ++
 util/async.c          |  4 ++--
 util/thread-pool.c    |  2 ++
 17 files changed, 71 insertions(+), 28 deletions(-)

diff --git a/block/archipelago.c b/block/archipelago.c
index XXXXXXX..XXXXXXX 100644
--- a/block/archipelago.c
+++ b/block/archipelago.c
@@ -XXX,XX +XXX,XX @@ static void qemu_archipelago_complete_aio(void *opaque)
 {
     AIORequestData *reqdata = (AIORequestData *) opaque;
     ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb;
+    AioContext *ctx = bdrv_get_aio_context(aio_cb->common.bs);
 
+    aio_context_acquire(ctx);
     aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret);
+    aio_context_release(ctx);
     aio_cb->status = 0;
 
     qemu_aio_unref(aio_cb);
diff --git a/block/blkreplay.c b/block/blkreplay.c
index XXXXXXX..XXXXXXX 100755
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -XXX,XX +XXX,XX @@ static int64_t blkreplay_getlength(BlockDriverState *bs)
 static void blkreplay_bh_cb(void *opaque)
 {
     Request *req = opaque;
-    qemu_coroutine_enter(req->co);
+    aio_co_wake(req->co);
     qemu_bh_delete(req->bh);
     g_free(req);
 }
diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
 static void error_callback_bh(void *opaque)
 {
     struct BlockBackendAIOCB *acb = opaque;
+    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
 
     bdrv_dec_in_flight(acb->common.bs);
+    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, acb->ret);
+    aio_context_release(ctx);
     qemu_aio_unref(acb);
 }
 
@@ -XXX,XX +XXX,XX @@ static void blk_aio_complete(BlkAioEmAIOCB *acb)
 static void blk_aio_complete_bh(void *opaque)
 {
     BlkAioEmAIOCB *acb = opaque;
+    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
 
     assert(acb->has_returned);
+    aio_context_acquire(ctx);
     blk_aio_complete(acb);
+    aio_context_release(ctx);
 }
 
 static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
 {
     CURLState *state;
     int running;
+    int ret = -EINPROGRESS;
 
     CURLAIOCB *acb = p;
-    BDRVCURLState *s = acb->common.bs->opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVCURLState *s = bs->opaque;
+    AioContext *ctx = bdrv_get_aio_context(bs);
 
     size_t start = acb->sector_num * BDRV_SECTOR_SIZE;
     size_t end;
 
+    aio_context_acquire(ctx);
+
     // In case we have the requested data already (e.g. read-ahead),
     // we can just call the callback and be done.
     switch (curl_find_buf(s, start, acb->nb_sectors * BDRV_SECTOR_SIZE, acb)) {
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
             qemu_aio_unref(acb);
             // fall through
         case FIND_RET_WAIT:
-            return;
+            goto out;
         default:
             break;
     }
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
     // No cache found, so let's start a new request
     state = curl_init_state(acb->common.bs, s);
     if (!state) {
-        acb->common.cb(acb->common.opaque, -EIO);
-        qemu_aio_unref(acb);
-        return;
+        ret = -EIO;
+        goto out;
     }
 
     acb->start = 0;
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
     state->orig_buf = g_try_malloc(state->buf_len);
     if (state->buf_len && state->orig_buf == NULL) {
         curl_clean_state(state);
-        acb->common.cb(acb->common.opaque, -ENOMEM);
-        qemu_aio_unref(acb);
-        return;
+        ret = -ENOMEM;
+        goto out;
     }
     state->acb[0] = acb;
 
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
 
     /* Tell curl it needs to kick things off */
     curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
+
+out:
+    if (ret != -EINPROGRESS) {
+        acb->common.cb(acb->common.opaque, ret);
+        qemu_aio_unref(acb);
+    }
+    aio_context_release(ctx);
 }
 
 static BlockAIOCB *curl_aio_readv(BlockDriverState *bs,
diff --git a/block/gluster.c b/block/gluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -XXX,XX +XXX,XX @@ static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf,
     return qemu_gluster_glfs_init(gconf, errp);
 }
 
-static void qemu_gluster_complete_aio(void *opaque)
-{
-    GlusterAIOCB *acb = (GlusterAIOCB *)opaque;
-
-    qemu_coroutine_enter(acb->coroutine);
-}
-
 /*
  * AIO callback routine called from GlusterFS thread.
  */
@@ -XXX,XX +XXX,XX @@ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
         acb->ret = -EIO; /* Partial read/write - fail it */
     }
 
-    aio_bh_schedule_oneshot(acb->aio_context, qemu_gluster_complete_aio, acb);
+    aio_co_schedule(acb->aio_context, acb->coroutine);
 }
 
 static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags)
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
     bdrv_dec_in_flight(bs);
     bdrv_drained_begin(bs);
     data->done = true;
-    qemu_coroutine_enter(co);
+    aio_co_wake(co);
 }
 
 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
 static void bdrv_co_em_bh(void *opaque)
 {
     BlockAIOCBCoroutine *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    AioContext *ctx = bdrv_get_aio_context(bs);
 
     assert(!acb->need_bh);
+    aio_context_acquire(ctx);
     bdrv_co_complete(acb);
+    aio_context_release(ctx);
 }
 
 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
diff --git a/block/iscsi.c b/block/iscsi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -XXX,XX +XXX,XX @@ static void
 iscsi_bh_cb(void *p)
 {
     IscsiAIOCB *acb = p;
+    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
 
     qemu_bh_delete(acb->bh);
 
     g_free(acb->buf);
     acb->buf = NULL;
 
+    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, acb->status);
+    aio_context_release(ctx);
 
     if (acb->task != NULL) {
         scsi_free_scsi_task(acb->task);
@@ -XXX,XX +XXX,XX @@ iscsi_schedule_bh(IscsiAIOCB *acb)
 static void iscsi_co_generic_bh_cb(void *opaque)
 {
     struct IscsiTask *iTask = opaque;
+
     iTask->complete = 1;
-    qemu_coroutine_enter(iTask->co);
+    aio_co_wake(iTask->co);
 }
 
 static void iscsi_retry_timer_expired(void *opaque)
diff --git a/block/linux-aio.c b/block/linux-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -XXX,XX +XXX,XX @@ struct LinuxAioState {
     io_context_t ctx;
     EventNotifier e;
 
-    /* io queue for submit at batch */
+    /* io queue for submit at batch.  Protected by AioContext lock. */
     LaioQueue io_q;
 
-    /* I/O completion processing */
+    /* I/O completion processing.  Only runs in I/O thread.  */
     QEMUBH *completion_bh;
     int event_idx;
     int event_max;
@@ -XXX,XX +XXX,XX @@ static inline ssize_t io_event_ret(struct io_event *ev)
  */
 static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
 {
+    LinuxAioState *s = laiocb->ctx;
     int ret;
 
     ret = laiocb->ret;
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
     }
 
     laiocb->ret = ret;
+    aio_context_acquire(s->aio_context);
     if (laiocb->co) {
         /* If the coroutine is already entered it must be in ioq_submit() and
          * will notice laio->ret has been filled in when it eventually runs
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
         laiocb->common.cb(laiocb->common.opaque, ret);
         qemu_aio_unref(laiocb);
     }
+    aio_context_release(s->aio_context);
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completions(LinuxAioState *s)
 static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
 {
     qemu_laio_process_completions(s);
+
+    aio_context_acquire(s->aio_context);
     if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
         ioq_submit(s);
     }
+    aio_context_release(s->aio_context);
 }
 
 static void qemu_laio_completion_bh(void *opaque)
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_completion_cb(EventNotifier *e)
     LinuxAioState *s = container_of(e, LinuxAioState, e);
 
     if (event_notifier_test_and_clear(&s->e)) {
-        aio_context_acquire(s->aio_context);
         qemu_laio_process_completions_and_submit(s);
-        aio_context_release(s->aio_context);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static bool qemu_laio_poll_cb(void *opaque)
         return false;
     }
 
-    aio_context_acquire(s->aio_context);
     qemu_laio_process_completions_and_submit(s);
-    aio_context_release(s->aio_context);
     return true;
 }
 
@@ -XXX,XX +XXX,XX @@ void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
 {
     aio_set_event_notifier(old_context, &s->e, false, NULL, NULL);
     qemu_bh_delete(s->completion_bh);
+    s->aio_context = NULL;
 }
 
 void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
diff --git a/block/nfs.c b/block/nfs.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -XXX,XX +XXX,XX @@ static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
 static void nfs_co_generic_bh_cb(void *opaque)
 {
     NFSRPC *task = opaque;
+
     task->complete = 1;
-    qemu_coroutine_enter(task->co);
+    aio_co_wake(task->co);
 }
 
 static void
diff --git a/block/null.c b/block/null.c
index XXXXXXX..XXXXXXX 100644
--- a/block/null.c
+++ b/block/null.c
@@ -XXX,XX +XXX,XX @@ static const AIOCBInfo null_aiocb_info = {
 static void null_bh_cb(void *opaque)
 {
     NullAIOCB *acb = opaque;
+    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
+
+    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, 0);
+    aio_context_release(ctx);
     qemu_aio_unref(acb);
 }
 
diff --git a/block/qed.c b/block/qed.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
 static void qed_aio_complete_bh(void *opaque)
 {
     QEDAIOCB *acb = opaque;
+    BDRVQEDState *s = acb_to_s(acb);
     BlockCompletionFunc *cb = acb->common.cb;
     void *user_opaque = acb->common.opaque;
     int ret = acb->bh_ret;
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete_bh(void *opaque)
     qemu_aio_unref(acb);
 
     /* Invoke callback */
+    qed_acquire(s);
     cb(user_opaque, ret);
+    qed_release(s);
 }
 
 static void qed_aio_complete(QEDAIOCB *acb, int ret)
diff --git a/block/rbd.c b/block/rbd.c
index XXXXXXX..XXXXXXX 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -XXX,XX +XXX,XX @@ shutdown:
 static void qemu_rbd_complete_aio(RADOSCB *rcb)
 {
     RBDAIOCB *acb = rcb->acb;
+    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
     int64_t r;
 
     r = rcb->ret;
@@ -XXX,XX +XXX,XX @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
     }
     qemu_vfree(acb->bounce);
+
+    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
+    aio_context_release(ctx);
 
     qemu_aio_unref(acb);
 }
diff --git a/dma-helpers.c b/dma-helpers.c
index XXXXXXX..XXXXXXX 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -XXX,XX +XXX,XX @@ static void dma_blk_cb(void *opaque, int ret)
                                 QEMU_ALIGN_DOWN(dbs->iov.size, dbs->align));
     }
 
+    aio_context_acquire(dbs->ctx);
     dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
                             dma_blk_cb, dbs, dbs->io_func_opaque);
+    aio_context_release(dbs->ctx);
     assert(dbs->acb);
 }
 
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_dma_restart_bh(void *opaque)
 
     s->rq = NULL;
 
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
     while (req) {
         VirtIOBlockReq *next = req->next;
         if (virtio_blk_handle_request(req, &mrb)) {
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_dma_restart_bh(void *opaque)
     if (mrb.num_reqs) {
         virtio_blk_submit_multireq(s->blk, &mrb);
     }
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
 }
 
 static void virtio_blk_dma_restart_cb(void *opaque, int running,
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -XXX,XX +XXX,XX @@ static void scsi_dma_restart_bh(void *opaque)
     qemu_bh_delete(s->bh);
     s->bh = NULL;
 
+    aio_context_acquire(blk_get_aio_context(s->conf.blk));
     QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) {
         scsi_req_ref(req);
         if (req->retry) {
@@ -XXX,XX +XXX,XX @@ static void scsi_dma_restart_bh(void *opaque)
         }
         scsi_req_unref(req);
     }
+    aio_context_release(blk_get_aio_context(s->conf.blk));
 }
 
 void scsi_req_retry(SCSIRequest *req)
diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
                 ret = 1;
             }
             bh->idle = 0;
-            aio_context_acquire(ctx);
             aio_bh_call(bh);
-            aio_context_release(ctx);
         }
         if (bh->deleted) {
             deleted = true;
@@ -XXX,XX +XXX,XX @@ static void co_schedule_bh_cb(void *opaque)
         Coroutine *co = QSLIST_FIRST(&straight);
         QSLIST_REMOVE_HEAD(&straight, co_scheduled_next);
         trace_aio_co_schedule_bh_cb(ctx, co);
+        aio_context_acquire(ctx);
         qemu_coroutine_enter(co);
+        aio_context_release(ctx);
     }
 }
 
diff --git a/util/thread-pool.c b/util/thread-pool.c
index XXXXXXX..XXXXXXX 100644
--- a/util/thread-pool.c
+++ b/util/thread-pool.c
@@ -XXX,XX +XXX,XX @@ static void thread_pool_completion_bh(void *opaque)
     ThreadPool *pool = opaque;
     ThreadPoolElement *elem, *next;
 
+    aio_context_acquire(pool->ctx);
 restart:
     QLIST_FOREACH_SAFE(elem, &pool->head, all, next) {
         if (elem->state != THREAD_DONE) {
@@ -XXX,XX +XXX,XX @@ restart:
             qemu_aio_unref(elem);
         }
     }
+    aio_context_release(pool->ctx);
 }
 
 static void thread_pool_cancel(BlockAIOCB *acb)
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-16-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/archipelago.c    |  3 ---
 block/block-backend.c  |  7 -------
 block/curl.c           |  2 +-
 block/io.c             |  6 +-----
 block/iscsi.c          |  3 ---
 block/linux-aio.c      |  5 +----
 block/mirror.c         | 12 +++++++++---
 block/null.c           |  8 --------
 block/qed-cluster.c    |  2 ++
 block/qed-table.c      | 12 ++++++++++--
 block/qed.c            |  4 ++--
 block/rbd.c            |  4 ----
 block/win32-aio.c      |  3 ---
 hw/block/virtio-blk.c  | 12 +++++++++++-
 hw/scsi/scsi-disk.c    | 15 +++++++++++++++
 hw/scsi/scsi-generic.c | 20 +++++++++++++++++---
 util/thread-pool.c     |  4 +++-
 17 files changed, 72 insertions(+), 50 deletions(-)

diff --git a/block/archipelago.c b/block/archipelago.c
index XXXXXXX..XXXXXXX 100644
--- a/block/archipelago.c
+++ b/block/archipelago.c
@@ -XXX,XX +XXX,XX @@ static void qemu_archipelago_complete_aio(void *opaque)
 {
     AIORequestData *reqdata = (AIORequestData *) opaque;
     ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb;
-    AioContext *ctx = bdrv_get_aio_context(aio_cb->common.bs);
 
-    aio_context_acquire(ctx);
     aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret);
-    aio_context_release(ctx);
     aio_cb->status = 0;
 
     qemu_aio_unref(aio_cb);
diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
 static void error_callback_bh(void *opaque)
 {
     struct BlockBackendAIOCB *acb = opaque;
-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
 
     bdrv_dec_in_flight(acb->common.bs);
-    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, acb->ret);
-    aio_context_release(ctx);
     qemu_aio_unref(acb);
 }
 
@@ -XXX,XX +XXX,XX @@ static void blk_aio_complete(BlkAioEmAIOCB *acb)
 static void blk_aio_complete_bh(void *opaque)
 {
     BlkAioEmAIOCB *acb = opaque;
-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
-
     assert(acb->has_returned);
-    aio_context_acquire(ctx);
     blk_aio_complete(acb);
-    aio_context_release(ctx);
 }
 
 static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
     curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
 
 out:
+    aio_context_release(ctx);
     if (ret != -EINPROGRESS) {
         acb->common.cb(acb->common.opaque, ret);
         qemu_aio_unref(acb);
     }
-    aio_context_release(ctx);
 }
 
 static BlockAIOCB *curl_aio_readv(BlockDriverState *bs,
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_io_em_complete(void *opaque, int ret)
     CoroutineIOCompletion *co = opaque;
 
     co->ret = ret;
-    qemu_coroutine_enter(co->coroutine);
+    aio_co_wake(co->coroutine);
 }
 
 static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
 static void bdrv_co_em_bh(void *opaque)
 {
     BlockAIOCBCoroutine *acb = opaque;
-    BlockDriverState *bs = acb->common.bs;
-    AioContext *ctx = bdrv_get_aio_context(bs);
 
     assert(!acb->need_bh);
-    aio_context_acquire(ctx);
     bdrv_co_complete(acb);
-    aio_context_release(ctx);
 }
 
 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
diff --git a/block/iscsi.c b/block/iscsi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -XXX,XX +XXX,XX @@ static void
 iscsi_bh_cb(void *p)
 {
     IscsiAIOCB *acb = p;
-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
 
     qemu_bh_delete(acb->bh);
 
     g_free(acb->buf);
     acb->buf = NULL;
 
-    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, acb->status);
-    aio_context_release(ctx);
 
     if (acb->task != NULL) {
         scsi_free_scsi_task(acb->task);
diff --git a/block/linux-aio.c b/block/linux-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -XXX,XX +XXX,XX @@ static inline ssize_t io_event_ret(struct io_event *ev)
  */
 static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
 {
-    LinuxAioState *s = laiocb->ctx;
     int ret;
 
     ret = laiocb->ret;
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
     }
 
     laiocb->ret = ret;
-    aio_context_acquire(s->aio_context);
     if (laiocb->co) {
         /* If the coroutine is already entered it must be in ioq_submit() and
          * will notice laio->ret has been filled in when it eventually runs
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
          * that!
          */
         if (!qemu_coroutine_entered(laiocb->co)) {
-            qemu_coroutine_enter(laiocb->co);
+            aio_co_wake(laiocb->co);
         }
     } else {
         laiocb->common.cb(laiocb->common.opaque, ret);
         qemu_aio_unref(laiocb);
     }
-    aio_context_release(s->aio_context);
 }
 
 /**
diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ static void mirror_write_complete(void *opaque, int ret)
 {
     MirrorOp *op = opaque;
     MirrorBlockJob *s = op->s;
+
+    aio_context_acquire(blk_get_aio_context(s->common.blk));
     if (ret < 0) {
         BlockErrorAction action;
 
@@ -XXX,XX +XXX,XX @@ static void mirror_write_complete(void *opaque, int ret)
         }
     }
     mirror_iteration_done(op, ret);
+    aio_context_release(blk_get_aio_context(s->common.blk));
 }
 
 static void mirror_read_complete(void *opaque, int ret)
 {
     MirrorOp *op = opaque;
     MirrorBlockJob *s = op->s;
+
+    aio_context_acquire(blk_get_aio_context(s->common.blk));
     if (ret < 0) {
         BlockErrorAction action;
 
@@ -XXX,XX +XXX,XX @@ static void mirror_read_complete(void *opaque, int ret)
         }
 
         mirror_iteration_done(op, ret);
-        return;
+    } else {
+        blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
+                        0, mirror_write_complete, op);
     }
-    blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
-                    0, mirror_write_complete, op);
+    aio_context_release(blk_get_aio_context(s->common.blk));
 }
 
 static inline void mirror_clip_sectors(MirrorBlockJob *s,
diff --git a/block/null.c b/block/null.c
index XXXXXXX..XXXXXXX 100644
--- a/block/null.c
+++ b/block/null.c
@@ -XXX,XX +XXX,XX @@ static const AIOCBInfo null_aiocb_info = {
 static void null_bh_cb(void *opaque)
 {
     NullAIOCB *acb = opaque;
-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
-
-    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, 0);
-    aio_context_release(ctx);
     qemu_aio_unref(acb);
 }
 
 static void null_timer_cb(void *opaque)
 {
     NullAIOCB *acb = opaque;
-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
-
-    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, 0);
-    aio_context_release(ctx);
     timer_deinit(&acb->timer);
     qemu_aio_unref(acb);
 }
diff --git a/block/qed-cluster.c b/block/qed-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed-cluster.c
+++ b/block/qed-cluster.c
@@ -XXX,XX +XXX,XX @@ static void qed_find_cluster_cb(void *opaque, int ret)
     unsigned int index;
     unsigned int n;
 
+    qed_acquire(s);
     if (ret) {
         goto out;
     }
@@ -XXX,XX +XXX,XX @@ static void qed_find_cluster_cb(void *opaque, int ret)
 
 out:
     find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len);
+    qed_release(s);
     g_free(find_cluster_cb);
 }
 
diff --git a/block/qed-table.c b/block/qed-table.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -XXX,XX +XXX,XX @@ static void qed_read_table_cb(void *opaque, int ret)
 {
     QEDReadTableCB *read_table_cb = opaque;
     QEDTable *table = read_table_cb->table;
+    BDRVQEDState *s = read_table_cb->s;
     int noffsets = read_table_cb->qiov.size / sizeof(uint64_t);
     int i;
 
@@ -XXX,XX +XXX,XX @@ static void qed_read_table_cb(void *opaque, int ret)
     }
 
     /* Byteswap offsets */
+    qed_acquire(s);
     for (i = 0; i < noffsets; i++) {
         table->offsets[i] = le64_to_cpu(table->offsets[i]);
     }
+    qed_release(s);
 
 out:
     /* Completion */
-    trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret);
+    trace_qed_read_table_cb(s, read_table_cb->table, ret);
     gencb_complete(&read_table_cb->gencb, ret);
 }
 
@@ -XXX,XX +XXX,XX @@ typedef struct {
 static void qed_write_table_cb(void *opaque, int ret)
 {
     QEDWriteTableCB *write_table_cb = opaque;
+    BDRVQEDState *s = write_table_cb->s;
 
-    trace_qed_write_table_cb(write_table_cb->s,
+    trace_qed_write_table_cb(s,
                              write_table_cb->orig_table,
                              write_table_cb->flush,
                              ret);
@@ -XXX,XX +XXX,XX @@ static void qed_write_table_cb(void *opaque, int ret)
     if (write_table_cb->flush) {
         /* We still need to flush first */
         write_table_cb->flush = false;
+        qed_acquire(s);
         bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb,
                        write_table_cb);
+        qed_release(s);
         return;
     }
 
@@ -XXX,XX +XXX,XX @@ static void qed_read_l2_table_cb(void *opaque, int ret)
     CachedL2Table *l2_table = request->l2_table;
     uint64_t l2_offset = read_l2_table_cb->l2_offset;
 
+    qed_acquire(s);
     if (ret) {
         /* can't trust loaded L2 table anymore */
         qed_unref_l2_cache_entry(l2_table);
@@ -XXX,XX +XXX,XX @@ static void qed_read_l2_table_cb(void *opaque, int ret)
         request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
         assert(request->l2_table != NULL);
     }
+    qed_release(s);
 
     gencb_complete(&read_l2_table_cb->gencb, ret);
 }
diff --git a/block/qed.c b/block/qed.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t l
     }
 
     if (cb->co) {
-        qemu_coroutine_enter(cb->co);
+        aio_co_wake(cb->co);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret)
     cb->done = true;
     cb->ret = ret;
     if (cb->co) {
-        qemu_coroutine_enter(cb->co);
+        aio_co_wake(cb->co);
     }
 }
 
diff --git a/block/rbd.c b/block/rbd.c
index XXXXXXX..XXXXXXX 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -XXX,XX +XXX,XX @@ shutdown:
 static void qemu_rbd_complete_aio(RADOSCB *rcb)
 {
     RBDAIOCB *acb = rcb->acb;
-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
     int64_t r;
 
     r = rcb->ret;
@@ -XXX,XX +XXX,XX @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
     }
     qemu_vfree(acb->bounce);
-
-    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
-    aio_context_release(ctx);
 
     qemu_aio_unref(acb);
 }
diff --git a/block/win32-aio.c b/block/win32-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/win32-aio.c
+++ b/block/win32-aio.c
@@ -XXX,XX +XXX,XX @@ static void win32_aio_process_completion(QEMUWin32AIOState *s,
         qemu_vfree(waiocb->buf);
     }
 
-
-    aio_context_acquire(s->aio_ctx);
     waiocb->common.cb(waiocb->common.opaque, ret);
-    aio_context_release(s->aio_ctx);
     qemu_aio_unref(waiocb);
 }
 
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
 static void virtio_blk_rw_complete(void *opaque, int ret)
 {
     VirtIOBlockReq *next = opaque;
+    VirtIOBlock *s = next->dev;
 
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
     while (next) {
         VirtIOBlockReq *req = next;
         next = req->mr_next;
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_rw_complete(void *opaque, int ret)
         block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
         virtio_blk_free_request(req);
     }
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
 }
 
 static void virtio_blk_flush_complete(void *opaque, int ret)
 {
     VirtIOBlockReq *req = opaque;
+    VirtIOBlock *s = req->dev;
 
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
     if (ret) {
         if (virtio_blk_handle_rw_error(req, -ret, 0)) {
-            return;
+            goto out;
         }
     }
 
     virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
     block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
     virtio_blk_free_request(req);
+
+out:
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
 }
 
 #ifdef __linux__
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_ioctl_complete(void *opaque, int status)
     virtio_stl_p(vdev, &scsi->data_len, hdr->dxfer_len);
 
 out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
     virtio_blk_req_complete(req, status);
     virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
     g_free(ioctl_req);
 }
 
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -XXX,XX +XXX,XX @@ static void scsi_aio_complete(void *opaque, int ret)
 
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
+    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
     if (scsi_disk_req_check_error(r, ret, true)) {
         goto done;
     }
@@ -XXX,XX +XXX,XX @@ static void scsi_aio_complete(void *opaque, int ret)
     scsi_req_complete(&r->req, GOOD);
 
 done:
+    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
     scsi_req_unref(&r->req);
 }
 
@@ -XXX,XX +XXX,XX @@ static void scsi_dma_complete(void *opaque, int ret)
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
 
+    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
     if (ret < 0) {
         block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
     } else {
         block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
     }
     scsi_dma_complete_noio(r, ret);
+    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
 }
 
 static void scsi_read_complete(void * opaque, int ret)
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
 
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
+    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
     if (scsi_disk_req_check_error(r, ret, true)) {
         goto done;
     }
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
 
 done:
     scsi_req_unref(&r->req);
+    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
 }
 
 /* Actually issue a read to the block device.  */
@@ -XXX,XX +XXX,XX @@ static void scsi_do_read_cb(void *opaque, int ret)
     assert (r->req.aiocb != NULL);
     r->req.aiocb = NULL;
 
+    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
     if (ret < 0) {
         block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
     } else {
         block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
     }
     scsi_do_read(opaque, ret);
+    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
 }
 
 /* Read more data from scsi device into buffer.  */
@@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret)
     assert (r->req.aiocb != NULL);
     r->req.aiocb = NULL;
 
+    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
     if (ret < 0) {
         block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
     } else {
         block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
     }
     scsi_write_complete_noio(r, ret);
+    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
 }
 
 static void scsi_write_data(SCSIRequest *req)
@@ -XXX,XX +XXX,XX @@ static void scsi_unmap_complete(void *opaque, int ret)
 {
     UnmapCBData *data = opaque;
     SCSIDiskReq *r = data->r;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
 
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
 
+    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
     scsi_unmap_complete_noio(data, ret);
+    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
 }
 
 static void scsi_disk_emulate_unmap(SCSIDiskReq *r, uint8_t *inbuf)
@@ -XXX,XX +XXX,XX @@ static void scsi_write_same_complete(void *opaque, int ret)
 
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
+    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
     if (scsi_disk_req_check_error(r, ret, true)) {
         goto done;
     }
@@ -XXX,XX +XXX,XX @@ done:
     scsi_req_unref(&r->req);
     qemu_vfree(data->iov.iov_base);
     g_free(data);
+    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
 }
 
 static void scsi_disk_emulate_write_same(SCSIDiskReq *r, uint8_t *inbuf)
diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -XXX,XX +XXX,XX @@ done:
 static void scsi_command_complete(void *opaque, int ret)
 {
     SCSIGenericReq *r = (SCSIGenericReq *)opaque;
+    SCSIDevice *s = r->req.dev;
 
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
+
+    aio_context_acquire(blk_get_aio_context(s->conf.blk));
     scsi_command_complete_noio(r, ret);
+    aio_context_release(blk_get_aio_context(s->conf.blk));
 }
 
 static int execute_command(BlockBackend *blk,
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
 
+    aio_context_acquire(blk_get_aio_context(s->conf.blk));
+
     if (ret || r->req.io_canceled) {
         scsi_command_complete_noio(r, ret);
-        return;
+        goto done;
     }
 
     len = r->io_header.dxfer_len - r->io_header.resid;
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
     r->len = -1;
     if (len == 0) {
         scsi_command_complete_noio(r, 0);
-        return;
+        goto done;
     }
 
     /* Snoop READ CAPACITY output to set the blocksize.  */
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
     }
     scsi_req_data(&r->req, len);
     scsi_req_unref(&r->req);
+
+done:
+    aio_context_release(blk_get_aio_context(s->conf.blk));
 }
 
 /* Read more data from scsi device into buffer.  */
@@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret)
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
 
+    aio_context_acquire(blk_get_aio_context(s->conf.blk));
+
     if (ret || r->req.io_canceled) {
         scsi_command_complete_noio(r, ret);
-        return;
+        goto done;
     }
 
     if (r->req.cmd.buf[0] == MODE_SELECT && r->req.cmd.buf[4] == 12 &&
@@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret)
     }
 
     scsi_command_complete_noio(r, ret);
+
+done:
+    aio_context_release(blk_get_aio_context(s->conf.blk));
 }
 
 /* Write data to a scsi device.  Returns nonzero on failure.
diff --git a/util/thread-pool.c b/util/thread-pool.c
index XXXXXXX..XXXXXXX 100644
--- a/util/thread-pool.c
+++ b/util/thread-pool.c
@@ -XXX,XX +XXX,XX @@ restart:
              */
             qemu_bh_schedule(pool->completion_bh);
 
+            aio_context_release(pool->ctx);
             elem->common.cb(elem->common.opaque, elem->ret);
+            aio_context_acquire(pool->ctx);
             qemu_aio_unref(elem);
             goto restart;
         } else {
@@ -XXX,XX +XXX,XX @@ static void thread_pool_co_cb(void *opaque, int ret)
     ThreadPoolCo *co = opaque;
 
     co->ret = ret;
-    qemu_coroutine_enter(co->co);
+    aio_co_wake(co->co);
 }
 
 int coroutine_fn thread_pool_submit_co(ThreadPool *pool, ThreadPoolFunc *func,
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

This patch prepares for the removal of unnecessary lockcnt inc/dec pairs.
Extract the dispatching loop for file descriptor handlers into a new
function aio_dispatch_handlers, and then inline aio_dispatch into
aio_poll.

aio_dispatch can now become void.

diff --git a/include/block/aio.h b/include/block/aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ bool aio_pending(AioContext *ctx);
 /* Dispatch any pending callbacks from the GSource attached to the AioContext.
  *
  * This is used internally in the implementation of the GSource.
- *
- * @dispatch_fds: true to process fds, false to skip them
- *                (can be used as an optimization by callers that know there
- *                are no fds ready)
  */
-bool aio_dispatch(AioContext *ctx, bool dispatch_fds);
+void aio_dispatch(AioContext *ctx);
 
 /* Progress in completing AIO work to occur.  This can issue new pending
  * aio as a result of executing I/O completion or bh callbacks.
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
     AioHandler *node, *tmp;
     bool progress = false;
 
-    /*
-     * We have to walk very carefully in case aio_set_fd_handler is
-     * called while we're walking.
-     */
-    qemu_lockcnt_inc(&ctx->list_lock);
-
     QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
         int revents;
 
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
         }
     }
 
-    qemu_lockcnt_dec(&ctx->list_lock);
     return progress;
 }
 
-/*
- * Note that dispatch_fds == false has the side-effect of post-poning the
- * freeing of deleted handlers.
- */
-bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
+void aio_dispatch(AioContext *ctx)
 {
-    bool progress;
+    aio_bh_poll(ctx);
 
-    /*
-     * If there are callbacks left that have been queued, we need to call them.
-     * Do not call select in this case, because it is possible that the caller
-     * does not need a complete flush (as is the case for aio_poll loops).
-     */
-    progress = aio_bh_poll(ctx);
+    qemu_lockcnt_inc(&ctx->list_lock);
+    aio_dispatch_handlers(ctx);
+    qemu_lockcnt_dec(&ctx->list_lock);
 
-    if (dispatch_fds) {
-        progress |= aio_dispatch_handlers(ctx);
-    }
-
-    /* Run our timers */
-    progress |= timerlistgroup_run_timers(&ctx->tlg);
-
-    return progress;
+    timerlistgroup_run_timers(&ctx->tlg);
 }
 
 /* These thread-local variables are used only in a small part of aio_poll
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
     npfd = 0;
     qemu_lockcnt_dec(&ctx->list_lock);
 
-    /* Run dispatch even if there were no readable fds to run timers */
-    if (aio_dispatch(ctx, ret > 0)) {
-        progress = true;
+    progress |= aio_bh_poll(ctx);
+
+    if (ret > 0) {
+        qemu_lockcnt_inc(&ctx->list_lock);
+        progress |= aio_dispatch_handlers(ctx);
+        qemu_lockcnt_dec(&ctx->list_lock);
     }
 
+    progress |= timerlistgroup_run_timers(&ctx->tlg);
+
     return progress;
 }
 
diff --git a/util/aio-win32.c b/util/aio-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
     return progress;
 }
 
-bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
+void aio_dispatch(AioContext *ctx)
 {
-    bool progress;
-
-    progress = aio_bh_poll(ctx);
-    if (dispatch_fds) {
-        progress |= aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
-    }
-    progress |= timerlistgroup_run_timers(&ctx->tlg);
-    return progress;
+    aio_bh_poll(ctx);
+    aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
+    timerlistgroup_run_timers(&ctx->tlg);
 }
 
 bool aio_poll(AioContext *ctx, bool blocking)
diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ aio_ctx_dispatch(GSource     *source,
     AioContext *ctx = (AioContext *) source;
 
     assert(callback == NULL);
-    aio_dispatch(ctx, true);
+    aio_dispatch(ctx);
     return true;
 }
 
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Pull the increment/decrement pair out of aio_bh_poll and into the
callers.

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
 
 void aio_dispatch(AioContext *ctx)
 {
+    qemu_lockcnt_inc(&ctx->list_lock);
     aio_bh_poll(ctx);
-
-    qemu_lockcnt_inc(&ctx->list_lock);
     aio_dispatch_handlers(ctx);
     qemu_lockcnt_dec(&ctx->list_lock);
 
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
     }
 
     npfd = 0;
-    qemu_lockcnt_dec(&ctx->list_lock);
 
     progress |= aio_bh_poll(ctx);
 
     if (ret > 0) {
-        qemu_lockcnt_inc(&ctx->list_lock);
         progress |= aio_dispatch_handlers(ctx);
-        qemu_lockcnt_dec(&ctx->list_lock);
     }
 
+    qemu_lockcnt_dec(&ctx->list_lock);
+
     progress |= timerlistgroup_run_timers(&ctx->tlg);
 
     return progress;
diff --git a/util/aio-win32.c b/util/aio-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
     bool progress = false;
     AioHandler *tmp;
 
-    qemu_lockcnt_inc(&ctx->list_lock);
-
     /*
      * We have to walk very carefully in case aio_set_fd_handler is
      * called while we're walking.
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
         }
     }
 
-    qemu_lockcnt_dec(&ctx->list_lock);
     return progress;
 }
 
 void aio_dispatch(AioContext *ctx)
 {
+    qemu_lockcnt_inc(&ctx->list_lock);
     aio_bh_poll(ctx);
     aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
+    qemu_lockcnt_dec(&ctx->list_lock);
     timerlistgroup_run_timers(&ctx->tlg);
 }
 
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         }
     }
 
-    qemu_lockcnt_dec(&ctx->list_lock);
     first = true;
 
     /* ctx->notifier is always registered.  */
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         progress |= aio_dispatch_handlers(ctx, event);
     } while (count > 0);
 
+    qemu_lockcnt_dec(&ctx->list_lock);
+
     progress |= timerlistgroup_run_timers(&ctx->tlg);
     return progress;
 }
diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ void aio_bh_call(QEMUBH *bh)
     bh->cb(bh->opaque);
 }
 
-/* Multiple occurrences of aio_bh_poll cannot be called concurrently */
+/* Multiple occurrences of aio_bh_poll cannot be called concurrently.
+ * The count in ctx->list_lock is incremented before the call, and is
+ * not affected by the call.
+ */
 int aio_bh_poll(AioContext *ctx)
 {
     QEMUBH *bh, **bhp, *next;
     int ret;
     bool deleted = false;
 
-    qemu_lockcnt_inc(&ctx->list_lock);
-
     ret = 0;
     for (bh = atomic_rcu_read(&ctx->first_bh); bh; bh = next) {
         next = atomic_rcu_read(&bh->next);
@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
 
     /* remove deleted bhs */
     if (!deleted) {
-        qemu_lockcnt_dec(&ctx->list_lock);
         return ret;
     }
 
-    if (qemu_lockcnt_dec_and_lock(&ctx->list_lock)) {
+    if (qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
         bhp = &ctx->first_bh;
         while (*bhp) {
             bh = *bhp;
@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
                 bhp = &bh->next;
             }
         }
-        qemu_lockcnt_unlock(&ctx->list_lock);
+        qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
     }
     return ret;
 }
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

diff --git a/include/block/block_int.h b/include/block/block_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ struct BdrvChild {
  * copied as well.
  */
 struct BlockDriverState {
-    int64_t total_sectors; /* if we are reading a disk image, give its
-                              size in sectors */
+    /* Protected by big QEMU lock or read-only after opening.  No special
+     * locking needed during I/O...
+     */
     int open_flags; /* flags used to open the file, re-used for re-open */
     bool read_only; /* if true, the media is read only */
     bool encrypted; /* if true, the media is encrypted */
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
     bool sg;        /* if true, the device is a /dev/sg* */
     bool probed;    /* if true, format was probed rather than specified */
 
-    int copy_on_read; /* if nonzero, copy read backing sectors into image.
-                         note this is a reference count */
-
-    CoQueue flush_queue;            /* Serializing flush queue */
-    bool active_flush_req;          /* Flush request in flight? */
-    unsigned int write_gen;         /* Current data generation */
-    unsigned int flushed_gen;       /* Flushed write generation */
-
     BlockDriver *drv; /* NULL means no media */
     void *opaque;
 
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
     BdrvChild *backing;
     BdrvChild *file;
 
-    /* Callback before write request is processed */
-    NotifierWithReturnList before_write_notifiers;
-
-    /* number of in-flight requests; overall and serialising */
-    unsigned int in_flight;
-    unsigned int serialising_in_flight;
-
-    bool wakeup;
-
-    /* Offset after the highest byte written to */
-    uint64_t wr_highest_offset;
-
     /* I/O Limits */
     BlockLimits bl;
 
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
     QTAILQ_ENTRY(BlockDriverState) bs_list;
     /* element of the list of monitor-owned BDS */
     QTAILQ_ENTRY(BlockDriverState) monitor_list;
-    QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
     int refcnt;
 
-    QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
-
     /* operation blockers */
     QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX];
 
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
     /* The error object in use for blocking operations on backing_hd */
     Error *backing_blocker;
 
+    /* Protected by AioContext lock */
+
+    /* If nonzero, copy read backing sectors into image.  Can be >1 if more
+     * than one client has requested copy-on-read.
+     */
+    int copy_on_read;
+
+    /* If we are reading a disk image, give its size in sectors.
+     * Generally read-only; it is written to by load_vmstate and save_vmstate,
+     * but the block layer is quiescent during those.
+     */
+    int64_t total_sectors;
+
+    /* Callback before write request is processed */
+    NotifierWithReturnList before_write_notifiers;
+
+    /* number of in-flight requests; overall and serialising */
+    unsigned int in_flight;
+    unsigned int serialising_in_flight;
+
+    bool wakeup;
+
+    /* Offset after the highest byte written to */
+    uint64_t wr_highest_offset;
+
     /* threshold limit for writes, in bytes. "High water mark". */
     uint64_t write_threshold_offset;
     NotifierWithReturn write_threshold_notifier;
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
     /* counter for nested bdrv_io_plug */
     unsigned io_plugged;
 
+    QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
+    CoQueue flush_queue;                  /* Serializing flush queue */
+    bool active_flush_req;                /* Flush request in flight? */
+    unsigned int write_gen;               /* Current data generation */
+    unsigned int flushed_gen;             /* Flushed write generation */
+
+    QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
+
+    /* do we need to tell the guest if we have a volatile write cache? */
+    int enable_write_cache;
+
     int quiesce_counter;
 };
 
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index XXXXXXX..XXXXXXX 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -XXX,XX +XXX,XX @@ typedef struct BlockDevOps {
  * fields that must be public. This is in particular for QLIST_ENTRY() and
  * friends so that BlockBackends can be kept in lists outside block-backend.c */
 typedef struct BlockBackendPublic {
-    /* I/O throttling.
-     * throttle_state tells us if this BlockBackend has I/O limits configured.
-     * io_limits_disabled tells us if they are currently being enforced */
+    /* I/O throttling has its own locking, but also some fields are
+     * protected by the AioContext lock.
+     */
+
+    /* Protected by AioContext lock.  */
     CoQueue      throttled_reqs[2];
+
+    /* Nonzero if the I/O limits are currently being ignored; generally
+     * it is zero.  */
     unsigned int io_limits_disabled;
 
     /* The following fields are protected by the ThrottleGroup lock.
-     * See the ThrottleGroup documentation for details. */
+     * See the ThrottleGroup documentation for details.
+     * throttle_state tells us if I/O limits are configured. */
     ThrottleState *throttle_state;
     ThrottleTimers throttle_timers;
     unsigned       pending_reqs[2];
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

This uses the lock-free mutex described in the paper '"Blocking without
Locking", or LFTHREADS: A lock-free thread library' by Gidenstam and
Papatriantafilou.  The same technique is used in OSv, and in fact
the code is essentially a conversion to C of OSv's code.

[Added missing coroutine_fn in tests/test-aio-multithread.c.
--Stefan]

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213181244.16297-2-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/coroutine.h     |  17 ++++-
 tests/test-aio-multithread.c |  86 ++++++++++++++++++++++++
 util/qemu-coroutine-lock.c   | 155 ++++++++++++++++++++++++++++++++++++++++---
 util/trace-events            |   1 +
 4 files changed, 246 insertions(+), 13 deletions(-)

diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine.h
+++ b/include/qemu/coroutine.h
@@ -XXX,XX +XXX,XX @@ bool qemu_co_queue_empty(CoQueue *queue);
 /**
  * Provides a mutex that can be used to synchronise coroutines
  */
+struct CoWaitRecord;
 typedef struct CoMutex {
-    bool locked;
+    /* Count of pending lockers; 0 for a free mutex, 1 for an
+     * uncontended mutex.
+     */
+    unsigned locked;
+
+    /* A queue of waiters.  Elements are added atomically in front of
+     * from_push.  to_pop is only populated, and popped from, by whoever
+     * is in charge of the next wakeup.  This can be an unlocker or,
+     * through the handoff protocol, a locker that is about to go to sleep.
+     */
+    QSLIST_HEAD(, CoWaitRecord) from_push, to_pop;
+
+    unsigned handoff, sequence;
+
     Coroutine *holder;
-    CoQueue queue;
 } CoMutex;
 
 /**
diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-aio-multithread.c
+++ b/tests/test-aio-multithread.c
@@ -XXX,XX +XXX,XX @@ static void test_multi_co_schedule_10(void)
     test_multi_co_schedule(10);
 }
 
+/* CoMutex thread-safety.  */
+
+static uint32_t atomic_counter;
+static uint32_t running;
+static uint32_t counter;
+static CoMutex comutex;
+
+static void coroutine_fn test_multi_co_mutex_entry(void *opaque)
+{
+    while (!atomic_mb_read(&now_stopping)) {
+        qemu_co_mutex_lock(&comutex);
+        counter++;
+        qemu_co_mutex_unlock(&comutex);
+
+        /* Increase atomic_counter *after* releasing the mutex.  Otherwise
+         * there is a chance (it happens about 1 in 3 runs) that the iothread
+         * exits before the coroutine is woken up, causing a spurious
+         * assertion failure.
+         */
+        atomic_inc(&atomic_counter);
+    }
+    atomic_dec(&running);
+}
+
+static void test_multi_co_mutex(int threads, int seconds)
+{
+    int i;
+
+    qemu_co_mutex_init(&comutex);
+    counter = 0;
+    atomic_counter = 0;
+    now_stopping = false;
+
+    create_aio_contexts();
+    assert(threads <= NUM_CONTEXTS);
+    running = threads;
+    for (i = 0; i < threads; i++) {
+        Coroutine *co1 = qemu_coroutine_create(test_multi_co_mutex_entry, NULL);
+        aio_co_schedule(ctx[i], co1);
+    }
+
+    g_usleep(seconds * 1000000);
+
+    atomic_mb_set(&now_stopping, true);
+    while (running > 0) {
+        g_usleep(100000);
+    }
+
+    join_aio_contexts();
+    g_test_message("%d iterations/second\n", counter / seconds);
+    g_assert_cmpint(counter, ==, atomic_counter);
+}
+
+/* Testing with NUM_CONTEXTS threads focuses on the queue.  The mutex however
+ * is too contended (and the threads spend too much time in aio_poll)
+ * to actually stress the handoff protocol.
+ */
+static void test_multi_co_mutex_1(void)
+{
+    test_multi_co_mutex(NUM_CONTEXTS, 1);
+}
+
+static void test_multi_co_mutex_10(void)
+{
+    test_multi_co_mutex(NUM_CONTEXTS, 10);
+}
+
+/* Testing with fewer threads stresses the handoff protocol too.  Still, the
+ * case where the locker _can_ pick up a handoff is very rare, happening
+ * about 10 times in 1 million, so increase the runtime a bit compared to
+ * other "quick" testcases that only run for 1 second.
+ */
+static void test_multi_co_mutex_2_3(void)
+{
+    test_multi_co_mutex(2, 3);
+}
+
+static void test_multi_co_mutex_2_30(void)
+{
+    test_multi_co_mutex(2, 30);
+}
+
 /* End of tests.  */
 
 int main(int argc, char **argv)
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/aio/multi/lifecycle", test_lifecycle);
     if (g_test_quick()) {
         g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1);
+        g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_1);
+        g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_3);
     } else {
         g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10);
+        g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_10);
+        g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_30);
     }
     return g_test_run();
 }
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
+ *
+ * The lock-free mutex implementation is based on OSv
+ * (core/lfmutex.cc, include/lockfree/mutex.hh).
+ * Copyright (C) 2013 Cloudius Systems, Ltd.
  */
 
 #include "qemu/osdep.h"
@@ -XXX,XX +XXX,XX @@ bool qemu_co_queue_empty(CoQueue *queue)
     return QSIMPLEQ_FIRST(&queue->entries) == NULL;
 }
 
+/* The wait records are handled with a multiple-producer, single-consumer
+ * lock-free queue.  There cannot be two concurrent pop_waiter() calls
+ * because pop_waiter() can only be called while mutex->handoff is zero.
+ * This can happen in three cases:
+ * - in qemu_co_mutex_unlock, before the hand-off protocol has started.
+ *   In this case, qemu_co_mutex_lock will see mutex->handoff == 0 and
+ *   not take part in the handoff.
+ * - in qemu_co_mutex_lock, if it steals the hand-off responsibility from
+ *   qemu_co_mutex_unlock.  In this case, qemu_co_mutex_unlock will fail
+ *   the cmpxchg (it will see either 0 or the next sequence value) and
+ *   exit.  The next hand-off cannot begin until qemu_co_mutex_lock has
+ *   woken up someone.
+ * - in qemu_co_mutex_unlock, if it takes the hand-off token itself.
+ *   In this case another iteration starts with mutex->handoff == 0;
+ *   a concurrent qemu_co_mutex_lock will fail the cmpxchg, and
+ *   qemu_co_mutex_unlock will go back to case (1).
+ *
+ * The following functions manage this queue.
+ */
+typedef struct CoWaitRecord {
+    /* Coroutine that queued this record; filled in by push_waiter().  */
+    Coroutine *co;
+    /* Link used while the record sits on mutex->from_push or
+     * mutex->to_pop.
+     */
+    QSLIST_ENTRY(CoWaitRecord) next;
+} CoWaitRecord;
+
+/* Queue a wait record for the calling coroutine.
+ *
+ * w->co is set to the calling coroutine and the record is then inserted
+ * atomically at the head of mutex->from_push, so the newest waiter is
+ * always first on that list.
+ */
+static void push_waiter(CoMutex *mutex, CoWaitRecord *w)
+{
+    w->co = qemu_coroutine_self();
+    QSLIST_INSERT_HEAD_ATOMIC(&mutex->from_push, w, next);
+}
+
+/* Drain mutex->from_push into mutex->to_pop.
+ *
+ * The whole from_push list is detached atomically; its records are then
+ * pushed one by one onto the head of to_pop, which reverses their order:
+ * the record that was pushed first ends up at the head of to_pop.
+ */
+static void move_waiters(CoMutex *mutex)
+{
+    QSLIST_HEAD(, CoWaitRecord) detached;
+    CoWaitRecord *rec;
+
+    QSLIST_MOVE_ATOMIC(&detached, &mutex->from_push);
+    while ((rec = QSLIST_FIRST(&detached)) != NULL) {
+        QSLIST_REMOVE_HEAD(&detached, next);
+        QSLIST_INSERT_HEAD(&mutex->to_pop, rec, next);
+    }
+}
+
+/* Remove and return the record at the head of mutex->to_pop, refilling
+ * to_pop from from_push first if it is empty.  Returns NULL when no
+ * record is queued on either list.
+ */
+static CoWaitRecord *pop_waiter(CoMutex *mutex)
+{
+    CoWaitRecord *head;
+
+    if (QSLIST_EMPTY(&mutex->to_pop)) {
+        move_waiters(mutex);
+    }
+
+    head = QSLIST_FIRST(&mutex->to_pop);
+    if (head != NULL) {
+        QSLIST_REMOVE_HEAD(&mutex->to_pop, next);
+    }
+    return head;
+}
+
+/* Return true if at least one wait record is queued on the mutex, either
+ * already moved to to_pop or still sitting on from_push.
+ *
+ * Both lists must be tested for being non-empty: the unlock path relies
+ * on a false result to leave the hand-off to a locker that has not yet
+ * pushed its record.
+ */
+static bool has_waiters(CoMutex *mutex)
+{
+    return !QSLIST_EMPTY(&mutex->to_pop) ||
+           !QSLIST_EMPTY(&mutex->from_push);
+}
+
 void qemu_co_mutex_init(CoMutex *mutex)
 {
     memset(mutex, 0, sizeof(*mutex));
-    qemu_co_queue_init(&mutex->queue);
 }
 
-void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
+static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
 {
     Coroutine *self = qemu_coroutine_self();
+    CoWaitRecord w;
+    unsigned old_handoff;
 
     trace_qemu_co_mutex_lock_entry(mutex, self);
+    w.co = self;
+    push_waiter(mutex, &w);
 
-    while (mutex->locked) {
-        qemu_co_queue_wait(&mutex->queue);
+    /* This is the "Responsibility Hand-Off" protocol; a lock() picks from
+     * a concurrent unlock() the responsibility of waking somebody up.
+     */
+    old_handoff = atomic_mb_read(&mutex->handoff);
+    if (old_handoff &&
+        has_waiters(mutex) &&
+        atomic_cmpxchg(&mutex->handoff, old_handoff, 0) == old_handoff) {
+        /* There can be no concurrent pops, because there can be only
+         * one active handoff at a time.
+         */
+        CoWaitRecord *to_wake = pop_waiter(mutex);
+        Coroutine *co = to_wake->co;
+        if (co == self) {
+            /* We got the lock ourselves!  */
+            assert(to_wake == &w);
+            return;
+        }
+
+        aio_co_wake(co);
     }
 
-    mutex->locked = true;
-    mutex->holder = self;
-    self->locks_held++;
-
+    qemu_coroutine_yield();
     trace_qemu_co_mutex_lock_return(mutex, self);
 }
 
+void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
+{
+    Coroutine *self = qemu_coroutine_self();
+
+    if (atomic_fetch_inc(&mutex->locked) == 0) {
+        /* Uncontended.  */
+        trace_qemu_co_mutex_lock_uncontended(mutex, self);
+    } else {
+        qemu_co_mutex_lock_slowpath(mutex);
+    }
+    mutex->holder = self;
+    self->locks_held++;
+}
+
 void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
 {
     Coroutine *self = qemu_coroutine_self();
 
     trace_qemu_co_mutex_unlock_entry(mutex, self);
 
-    assert(mutex->locked == true);
+    assert(mutex->locked);
     assert(mutex->holder == self);
     assert(qemu_in_coroutine());
 
-    mutex->locked = false;
     mutex->holder = NULL;
     self->locks_held--;
-    qemu_co_queue_next(&mutex->queue);
+    if (atomic_fetch_dec(&mutex->locked) == 1) {
+        /* No waiting qemu_co_mutex_lock().  Pfew, that was easy!  */
+        return;
+    }
+
+    for (;;) {
+        CoWaitRecord *to_wake = pop_waiter(mutex);
+        unsigned our_handoff;
+
+        if (to_wake) {
+            Coroutine *co = to_wake->co;
+            aio_co_wake(co);
+            break;
+        }
+
+        /* Some concurrent lock() is in progress (we know this because
+         * mutex->locked was >1) but it hasn't yet put itself on the wait
+         * queue.  Pick a sequence number for the handoff protocol (not 0).
+         */
+        if (++mutex->sequence == 0) {
+            mutex->sequence = 1;
+        }
+
+        our_handoff = mutex->sequence;
+        atomic_mb_set(&mutex->handoff, our_handoff);
+        if (!has_waiters(mutex)) {
+            /* The concurrent lock has not added itself yet, so it
+             * will be able to pick our handoff.
+             */
+            break;
+        }
+
+        /* Try to do the handoff protocol ourselves; if somebody else has
+         * already taken it, however, we're done and they're responsible.
+         */
+        if (atomic_cmpxchg(&mutex->handoff, our_handoff, 0) != our_handoff) {
+            break;
+        }
+    }
 
     trace_qemu_co_mutex_unlock_return(mutex, self);
 }
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ qemu_coroutine_terminate(void *co) "self %p"
 
 # util/qemu-coroutine-lock.c
 qemu_co_queue_run_restart(void *co) "co %p"
+qemu_co_mutex_lock_uncontended(void *mutex, void *self) "mutex %p self %p"
 qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p"
 qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p"
 qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p"
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Running a very small critical section on pthread_mutex_t and CoMutex
shows that pthread_mutex_t is much faster because it doesn't actually
go to sleep.  What happens is that the critical section is shorter
than the latency of entering the kernel and thus FUTEX_WAIT always
fails.  With CoMutex there is no such latency but you still want to
avoid wait and wakeup.  So introduce it artificially.

This only works with one waiter; because CoMutex is fair, it will
always have more waits and wakeups than a pthread_mutex_t.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213181244.16297-3-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/coroutine.h   |  5 +++++
 util/qemu-coroutine-lock.c | 51 ++++++++++++++++++++++++++++++++++++++++------
 util/qemu-coroutine.c      |  2 +-
 3 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine.h
+++ b/include/qemu/coroutine.h
@@ -XXX,XX +XXX,XX @@ typedef struct CoMutex {
      */
     unsigned locked;
 
+    /* Context that is holding the lock.  Useful to avoid spinning
+     * when two coroutines on the same AioContext try to get the lock. :)
+     */
+    AioContext *ctx;
+
     /* A queue of waiters.  Elements are added atomically in front of
      * from_push.  to_pop is only populated, and popped from, by whoever
      * is in charge of the next wakeup.  This can be an unlocker or,
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu-common.h"
 #include "qemu/coroutine.h"
 #include "qemu/coroutine_int.h"
+#include "qemu/processor.h"
 #include "qemu/queue.h"
 #include "block/aio.h"
 #include "trace.h"
@@ -XXX,XX +XXX,XX @@ void qemu_co_mutex_init(CoMutex *mutex)
     memset(mutex, 0, sizeof(*mutex));
 }
 
-static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
+/* Pass the mutex on to @co: copy co->ctx into mutex->ctx, then wake @co
+ * with aio_co_wake().
+ */
+static void coroutine_fn qemu_co_mutex_wake(CoMutex *mutex, Coroutine *co)
+{
+    /* Read co before co->ctx; pairs with smp_wmb() in
+     * qemu_coroutine_enter().
+     */
+    smp_read_barrier_depends();
+    mutex->ctx = co->ctx;
+    aio_co_wake(co);
+}
+
+static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx,
+                                                     CoMutex *mutex)
 {
     Coroutine *self = qemu_coroutine_self();
     CoWaitRecord w;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
         if (co == self) {
             /* We got the lock ourselves!  */
             assert(to_wake == &w);
+            mutex->ctx = ctx;
             return;
         }
 
-        aio_co_wake(co);
+        qemu_co_mutex_wake(mutex, co);
     }
 
     qemu_coroutine_yield();
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
 
 void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
 {
+    AioContext *ctx = qemu_get_current_aio_context();
     Coroutine *self = qemu_coroutine_self();
+    int waiters, i;
 
-    if (atomic_fetch_inc(&mutex->locked) == 0) {
+    /* Running a very small critical section on pthread_mutex_t and CoMutex
+     * shows that pthread_mutex_t is much faster because it doesn't actually
+     * go to sleep.  What happens is that the critical section is shorter
+     * than the latency of entering the kernel and thus FUTEX_WAIT always
+     * fails.  With CoMutex there is no such latency but you still want to
+     * avoid wait and wakeup.  So introduce it artificially.
+     */
+    i = 0;
+retry_fast_path:
+    waiters = atomic_cmpxchg(&mutex->locked, 0, 1);
+    if (waiters != 0) {
+        while (waiters == 1 && ++i < 1000) {
+            if (atomic_read(&mutex->ctx) == ctx) {
+                break;
+            }
+            if (atomic_read(&mutex->locked) == 0) {
+                goto retry_fast_path;
+            }
+            cpu_relax();
+        }
+        waiters = atomic_fetch_inc(&mutex->locked);
+    }
+
+    if (waiters == 0) {
         /* Uncontended.  */
         trace_qemu_co_mutex_lock_uncontended(mutex, self);
+        mutex->ctx = ctx;
     } else {
-        qemu_co_mutex_lock_slowpath(mutex);
+        qemu_co_mutex_lock_slowpath(ctx, mutex);
     }
     mutex->holder = self;
     self->locks_held++;
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
     assert(mutex->holder == self);
     assert(qemu_in_coroutine());
 
+    mutex->ctx = NULL;
     mutex->holder = NULL;
     self->locks_held--;
     if (atomic_fetch_dec(&mutex->locked) == 1) {
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
         unsigned our_handoff;
 
         if (to_wake) {
-            Coroutine *co = to_wake->co;
-            aio_co_wake(co);
+            qemu_co_mutex_wake(mutex, to_wake->co);
             break;
         }
 
diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine.c
+++ b/util/qemu-coroutine.c
@@ -XXX,XX +XXX,XX @@ void qemu_coroutine_enter(Coroutine *co)
     co->ctx = qemu_get_current_aio_context();
 
     /* Store co->ctx before anything that stores co.  Matches
-     * barrier in aio_co_wake.
+     * barrier in aio_co_wake and qemu_co_mutex_wake.
      */
     smp_wmb();
 
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Add two implementations of the same benchmark as the previous patch,
but using pthreads.  One uses a normal QemuMutex, the other is Linux
only and implements a fair mutex based on MCS locks and futexes.
This shows that the slower performance of the 5-thread case is due to
the fairness of CoMutex, rather than to coroutines.  If fairness does
not matter, as is the case with two threads, CoMutex can actually be
faster than pthreads.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213181244.16297-4-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/test-aio-multithread.c | 164 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)

diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-aio-multithread.c
+++ b/tests/test-aio-multithread.c
@@ -XXX,XX +XXX,XX @@ static void test_multi_co_mutex_2_30(void)
     test_multi_co_mutex(2, 30);
 }
 
+/* Same test with fair mutexes, for performance comparison.  */
+
+#ifdef CONFIG_LINUX
+#include "qemu/futex.h"
+
+/* The nodes for the mutex reside in this structure (on which we try to avoid
+ * false sharing).  The head of the mutex is in the "mutex_head" variable.
+ */
+static struct {
+    /* next: index of the node queued behind this one, or -1 if none.
+     * locked: futex word; set to 1 by mcs_mutex_lock() before queueing and
+     * cleared to 0 by the mcs_mutex_unlock() that hands the mutex over.
+     */
+    int next, locked;
+    /* Filler: 2 + 14 ints per node, i.e. 64 bytes assuming 4-byte int,
+     * matching the 64-byte alignment requested below.
+     */
+    int padding[14];
+} nodes[NUM_CONTEXTS] __attribute__((__aligned__(64)));
+
+/* Index of the most recently queued node, or -1 when nobody holds or
+ * waits for the mutex.
+ */
+static int mutex_head = -1;
+
+/* Acquire the fair mutex on behalf of node "id".
+ *
+ * NOTE(review): "id" is defined outside this hunk; presumably it is the
+ * calling thread's index into nodes[] -- confirm against the full file.
+ */
+static void mcs_mutex_lock(void)
+{
+    int prev;
+
+    /* Prepare our node before it becomes reachable through mutex_head.  */
+    nodes[id].next = -1;
+    nodes[id].locked = 1;
+    /* Queue ourselves; prev is the node queued before us, or -1 if the
+     * mutex was free, in which case we own it immediately.
+     */
+    prev = atomic_xchg(&mutex_head, id);
+    if (prev != -1) {
+        /* Link ourselves behind prev so that its unlock can find us, then
+         * wait on our own "locked" word while it still reads 1.
+         */
+        atomic_set(&nodes[prev].next, id);
+        qemu_futex_wait(&nodes[id].locked, 1);
+    }
+}
+
+/* Release the fair mutex held by node "id".
+ *
+ * If no successor is linked yet, try to reset mutex_head to -1.  When that
+ * fails, a locker has already exchanged itself into mutex_head but has not
+ * yet stored nodes[id].next, so spin until the link appears.  Finally clear
+ * the successor's "locked" word and wake it through the futex.
+ *
+ * NOTE(review): "id" is defined outside this hunk; presumably it is the
+ * calling thread's index into nodes[] -- confirm against the full file.
+ */
+static void mcs_mutex_unlock(void)
+{
+    int next;
+
+    /* nodes[id].next is stored with atomic_set() by mcs_mutex_lock()
+     * running in another thread, so every read of it uses atomic_read()
+     * rather than a plain load.
+     */
+    if (atomic_read(&nodes[id].next) == -1) {
+        if (atomic_read(&mutex_head) == id &&
+            atomic_cmpxchg(&mutex_head, id, -1) == id) {
+            /* Last item in the list, exit.  */
+            return;
+        }
+        while (atomic_read(&nodes[id].next) == -1) {
+            /* mcs_mutex_lock did the xchg, but has not updated
+             * nodes[prev].next yet.
+             */
+        }
+    }
+
+    /* Wake up the next in line.  */
+    next = atomic_read(&nodes[id].next);
+    nodes[next].locked = 0;
+    qemu_futex_wake(&nodes[next].locked, 1);
+}
+
+/* Coroutine body for the fair-mutex benchmark: until now_stopping is set,
+ * bump "counter" under the MCS mutex and atomic_counter outside it; on
+ * exit, decrement "running".  @opaque is unused.
+ */
+static void test_multi_fair_mutex_entry(void *opaque)
+{
+    for (;;) {
+        if (atomic_mb_read(&now_stopping)) {
+            break;
+        }
+        mcs_mutex_lock();
+        counter++;
+        mcs_mutex_unlock();
+        atomic_inc(&atomic_counter);
+    }
+    atomic_dec(&running);
+}
+
+/* Benchmark the MCS/futex fair mutex.
+ *
+ * Schedules one test_multi_fair_mutex_entry coroutine on each of the
+ * first @threads AioContexts, lets them run for @seconds seconds, stops
+ * them, and checks that the counter incremented under the lock equals
+ * the atomically incremented one.
+ */
+static void test_multi_fair_mutex(int threads, int seconds)
+{
+    int n;
+
+    /* The mutex must be free when the test starts.  */
+    assert(mutex_head == -1);
+    now_stopping = false;
+    atomic_counter = 0;
+    counter = 0;
+
+    create_aio_contexts();
+    assert(threads <= NUM_CONTEXTS);
+    running = threads;
+    n = 0;
+    while (n < threads) {
+        Coroutine *co;
+
+        co = qemu_coroutine_create(test_multi_fair_mutex_entry, NULL);
+        aio_co_schedule(ctx[n], co);
+        n++;
+    }
+
+    g_usleep(seconds * 1000000);
+
+    /* Ask the workers to stop, then poll until each one has decremented
+     * "running" on its way out.
+     */
+    atomic_mb_set(&now_stopping, true);
+    while (running > 0) {
+        g_usleep(100000);
+    }
+
+    join_aio_contexts();
+    g_test_message("%d iterations/second\n", counter / seconds);
+    g_assert_cmpint(counter, ==, atomic_counter);
+}
+
+static void test_multi_fair_mutex_1(void)
+{
+    /* Registered in main() for g_test_quick() runs: NUM_CONTEXTS threads
+     * for 1 second.
+     */
+    test_multi_fair_mutex(NUM_CONTEXTS, 1);
+}
+
+static void test_multi_fair_mutex_10(void)
+{
+    /* Registered in main() for non-quick runs: NUM_CONTEXTS threads for
+     * 10 seconds.
+     */
+    test_multi_fair_mutex(NUM_CONTEXTS, 10);
+}
+#endif
+
+/* Same test with pthread mutexes, for performance comparison and
+ * portability.  */
+
+static QemuMutex mutex;
+
+/* Coroutine body for the QemuMutex benchmark: until now_stopping is set,
+ * bump "counter" under the mutex and atomic_counter outside it; on exit,
+ * decrement "running".  @opaque is unused.
+ */
+static void test_multi_mutex_entry(void *opaque)
+{
+    for (;;) {
+        if (atomic_mb_read(&now_stopping)) {
+            break;
+        }
+        qemu_mutex_lock(&mutex);
+        counter++;
+        qemu_mutex_unlock(&mutex);
+        atomic_inc(&atomic_counter);
+    }
+    atomic_dec(&running);
+}
+
+/* Benchmark a plain QemuMutex.
+ *
+ * Schedules one test_multi_mutex_entry coroutine on each of the first
+ * @threads AioContexts, lets them run for @seconds seconds, stops them,
+ * and checks that the counter incremented under the lock equals the
+ * atomically incremented one.
+ */
+static void test_multi_mutex(int threads, int seconds)
+{
+    int i;
+
+    qemu_mutex_init(&mutex);
+    counter = 0;
+    atomic_counter = 0;
+    now_stopping = false;
+
+    create_aio_contexts();
+    assert(threads <= NUM_CONTEXTS);
+    running = threads;
+    for (i = 0; i < threads; i++) {
+        Coroutine *co1 = qemu_coroutine_create(test_multi_mutex_entry, NULL);
+        aio_co_schedule(ctx[i], co1);
+    }
+
+    g_usleep(seconds * 1000000);
+
+    /* Ask the workers to stop, then poll until each one has decremented
+     * "running" on its way out.
+     */
+    atomic_mb_set(&now_stopping, true);
+    while (running > 0) {
+        g_usleep(100000);
+    }
+
+    join_aio_contexts();
+    /* All workers are gone; release the mutex initialised above.  */
+    qemu_mutex_destroy(&mutex);
+    g_test_message("%d iterations/second\n", counter / seconds);
+    g_assert_cmpint(counter, ==, atomic_counter);
+}
+
+static void test_multi_mutex_1(void)
+{
+    /* Registered in main() for g_test_quick() runs: NUM_CONTEXTS threads
+     * for 1 second.
+     */
+    test_multi_mutex(NUM_CONTEXTS, 1);
+}
+
+static void test_multi_mutex_10(void)
+{
+    /* Registered in main() for non-quick runs: NUM_CONTEXTS threads for
+     * 10 seconds.
+     */
+    test_multi_mutex(NUM_CONTEXTS, 10);
+}
+
 /* End of tests.  */
 
 int main(int argc, char **argv)
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
         g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1);
         g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_1);
         g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_3);
+#ifdef CONFIG_LINUX
+        g_test_add_func("/aio/multi/mutex/mcs", test_multi_fair_mutex_1);
+#endif
+        g_test_add_func("/aio/multi/mutex/pthread", test_multi_mutex_1);
     } else {
         g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10);
         g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_10);
         g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_30);
+#ifdef CONFIG_LINUX
+        g_test_add_func("/aio/multi/mutex/mcs", test_multi_fair_mutex_10);
+#endif
+        g_test_add_func("/aio/multi/mutex/pthread", test_multi_mutex_10);
     }
     return g_test_run();
 }
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

This will avoid forward references in the next patch.  It is also
more logical because CoQueue is no longer the basic primitive.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213181244.16297-5-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/coroutine.h | 89 ++++++++++++++++++++++++------------------------
 1 file changed, 44 insertions(+), 45 deletions(-)

diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine.h
+++ b/include/qemu/coroutine.h
@@ -XXX,XX +XXX,XX @@ bool qemu_in_coroutine(void);
  */
 bool qemu_coroutine_entered(Coroutine *co);
 
-
-/**
- * CoQueues are a mechanism to queue coroutines in order to continue executing
- * them later. They provide the fundamental primitives on which coroutine locks
- * are built.
- */
-typedef struct CoQueue {
-    QSIMPLEQ_HEAD(, Coroutine) entries;
-} CoQueue;
-
-/**
- * Initialise a CoQueue. This must be called before any other operation is used
- * on the CoQueue.
- */
-void qemu_co_queue_init(CoQueue *queue);
-
-/**
- * Adds the current coroutine to the CoQueue and transfers control to the
- * caller of the coroutine.
- */
-void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
-
-/**
- * Restarts the next coroutine in the CoQueue and removes it from the queue.
- *
- * Returns true if a coroutine was restarted, false if the queue is empty.
- */
-bool coroutine_fn qemu_co_queue_next(CoQueue *queue);
-
-/**
- * Restarts all coroutines in the CoQueue and leaves the queue empty.
- */
-void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue);
-
-/**
- * Enter the next coroutine in the queue
- */
-bool qemu_co_enter_next(CoQueue *queue);
-
-/**
- * Checks if the CoQueue is empty.
- */
-bool qemu_co_queue_empty(CoQueue *queue);
-
-
 /**
  * Provides a mutex that can be used to synchronise coroutines
  */
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex);
  */
 void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex);
 
+
+/**
+ * CoQueues are a mechanism to queue coroutines in order to continue executing
+ * them later.
+ */
+typedef struct CoQueue {
+    QSIMPLEQ_HEAD(, Coroutine) entries;
+} CoQueue;
+
+/**
+ * Initialise a CoQueue. This must be called before any other operation is used
+ * on the CoQueue.
+ */
+void qemu_co_queue_init(CoQueue *queue);
+
+/**
+ * Adds the current coroutine to the CoQueue and transfers control to the
+ * caller of the coroutine.
+ */
+void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
+
+/**
+ * Restarts the next coroutine in the CoQueue and removes it from the queue.
+ *
+ * Returns true if a coroutine was restarted, false if the queue is empty.
+ */
+bool coroutine_fn qemu_co_queue_next(CoQueue *queue);
+
+/**
+ * Restarts all coroutines in the CoQueue and leaves the queue empty.
+ */
+void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue);
+
+/**
+ * Enter the next coroutine in the queue
+ */
+bool qemu_co_enter_next(CoQueue *queue);
+
+/**
+ * Checks if the CoQueue is empty.
+ */
+bool qemu_co_queue_empty(CoQueue *queue);
+
+
 typedef struct CoRwlock {
     bool writer;
     int reader;
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

All that CoQueue needs in order to become thread-safe is help
from an external mutex.  Add this to the API.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213181244.16297-6-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/coroutine.h   |  8 +++++---
 block/backup.c             |  2 +-
 block/io.c                 |  4 ++--
 block/nbd-client.c         |  2 +-
 block/qcow2-cluster.c      |  4 +---
 block/sheepdog.c           |  2 +-
 block/throttle-groups.c    |  2 +-
 hw/9pfs/9p.c               |  2 +-
 util/qemu-coroutine-lock.c | 24 +++++++++++++++++++++---
 9 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine.h
+++ b/include/qemu/coroutine.h
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex);
 
 /**
  * CoQueues are a mechanism to queue coroutines in order to continue executing
- * them later.
+ * them later.  They are similar to condition variables, but they need help
+ * from an external mutex in order to maintain thread-safety.
  */
 typedef struct CoQueue {
     QSIMPLEQ_HEAD(, Coroutine) entries;
@@ -XXX,XX +XXX,XX @@ void qemu_co_queue_init(CoQueue *queue);
 
 /**
  * Adds the current coroutine to the CoQueue and transfers control to the
- * caller of the coroutine.
+ * caller of the coroutine.  The mutex is unlocked during the wait and
+ * locked again afterwards.
  */
-void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
+void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex);
 
 /**
  * Restarts the next coroutine in the CoQueue and removes it from the queue.
diff --git a/block/backup.c b/block/backup.c
index XXXXXXX..XXXXXXX 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
         retry = false;
         QLIST_FOREACH(req, &job->inflight_reqs, list) {
             if (end > req->start && start < req->end) {
-                qemu_co_queue_wait(&req->wait_queue);
+                qemu_co_queue_wait(&req->wait_queue, NULL);
                 retry = true;
                 break;
             }
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
                  * (instead of producing a deadlock in the former case). */
                 if (!req->waiting_for) {
                     self->waiting_for = req;
-                    qemu_co_queue_wait(&req->wait_queue);
+                    qemu_co_queue_wait(&req->wait_queue, NULL);
                     self->waiting_for = NULL;
                     retry = true;
                     waited = true;
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 
     /* Wait until any previous flushes are completed */
     while (bs->active_flush_req) {
-        qemu_co_queue_wait(&bs->flush_queue);
+        qemu_co_queue_wait(&bs->flush_queue, NULL);
     }
 
     bs->active_flush_req = true;
diff --git a/block/nbd-client.c b/block/nbd-client.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -XXX,XX +XXX,XX @@ static void nbd_coroutine_start(NBDClientSession *s,
     /* Poor man semaphore.  The free_sema is locked when no other request
      * can be accepted, and unlocked after receiving one reply.  */
     if (s->in_flight == MAX_NBD_REQUESTS) {
-        qemu_co_queue_wait(&s->free_sema);
+        qemu_co_queue_wait(&s->free_sema, NULL);
         assert(s->in_flight < MAX_NBD_REQUESTS);
     }
     s->in_flight++;
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
             if (bytes == 0) {
                 /* Wait for the dependency to complete. We need to recheck
                  * the free/allocated clusters when we continue. */
-                qemu_co_mutex_unlock(&s->lock);
-                qemu_co_queue_wait(&old_alloc->dependent_requests);
-                qemu_co_mutex_lock(&s->lock);
+                qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
                 return -EAGAIN;
             }
         }
diff --git a/block/sheepdog.c b/block/sheepdog.c
index XXXXXXX..XXXXXXX 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *acb)
 retry:
     QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
         if (AIOCBOverlapping(acb, cb)) {
-            qemu_co_queue_wait(&s->overlapping_queue);
+            qemu_co_queue_wait(&s->overlapping_queue, NULL);
             goto retry;
         }
     }
diff --git a/block/throttle-groups.c b/block/throttle-groups.c
index XXXXXXX..XXXXXXX 100644
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -XXX,XX +XXX,XX @@ void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk,
     if (must_wait || blkp->pending_reqs[is_write]) {
         blkp->pending_reqs[is_write]++;
         qemu_mutex_unlock(&tg->lock);
-        qemu_co_queue_wait(&blkp->throttled_reqs[is_write]);
+        qemu_co_queue_wait(&blkp->throttled_reqs[is_write], NULL);
         qemu_mutex_lock(&tg->lock);
         blkp->pending_reqs[is_write]--;
     }
diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn v9fs_flush(void *opaque)
         /*
          * Wait for pdu to complete.
          */
-        qemu_co_queue_wait(&cancel_pdu->complete);
+        qemu_co_queue_wait(&cancel_pdu->complete, NULL);
         cancel_pdu->cancelled = 0;
         pdu_free(cancel_pdu);
     }
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@ void qemu_co_queue_init(CoQueue *queue)
     QSIMPLEQ_INIT(&queue->entries);
 }
 
-void coroutine_fn qemu_co_queue_wait(CoQueue *queue)
+void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex)
 {
     Coroutine *self = qemu_coroutine_self();
     QSIMPLEQ_INSERT_TAIL(&queue->entries, self, co_queue_next);
+
+    if (mutex) {
+        qemu_co_mutex_unlock(mutex);
+    }
+
+    /* There is no race condition here.  Other threads will call
+     * aio_co_schedule on our AioContext, which can reenter this
+     * coroutine but only after this yield and after the main loop
+     * has gone through the next iteration.
+     */
     qemu_coroutine_yield();
     assert(qemu_in_coroutine());
+
+    /* TODO: OSv implements wait morphing here, where the wakeup
+     * primitive automatically places the woken coroutine on the
+     * mutex's queue.  This avoids the thundering herd effect.
+     */
+    if (mutex) {
+        qemu_co_mutex_lock(mutex);
+    }
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_rdlock(CoRwlock *lock)
     Coroutine *self = qemu_coroutine_self();
 
     while (lock->writer) {
-        qemu_co_queue_wait(&lock->queue);
+        qemu_co_queue_wait(&lock->queue, NULL);
     }
     lock->reader++;
     self->locks_held++;
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_wrlock(CoRwlock *lock)
     Coroutine *self = qemu_coroutine_self();
 
     while (lock->writer || lock->reader) {
-        qemu_co_queue_wait(&lock->queue);
+        qemu_co_queue_wait(&lock->queue, NULL);
     }
     lock->writer = true;
     self->locks_held++;
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

This adds a CoMutex around the existing CoQueue.  Because the write-side
can just take CoMutex, the old "writer" field is not necessary anymore.
Instead of removing it altogether, count the number of pending writers
during a read-side critical section and forbid further readers from
entering.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213181244.16297-7-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/coroutine.h   |  3 ++-
 util/qemu-coroutine-lock.c | 35 ++++++++++++++++++++++++-----------
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine.h
+++ b/include/qemu/coroutine.h
@@ -XXX,XX +XXX,XX @@ bool qemu_co_queue_empty(CoQueue *queue);
 
 
 typedef struct CoRwlock {
-    bool writer;
+    int pending_writer;
     int reader;
+    CoMutex mutex;
     CoQueue queue;
 } CoRwlock;
 
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_init(CoRwlock *lock)
 {
     memset(lock, 0, sizeof(*lock));
     qemu_co_queue_init(&lock->queue);
+    qemu_co_mutex_init(&lock->mutex);
 }
 
 void qemu_co_rwlock_rdlock(CoRwlock *lock)
 {
     Coroutine *self = qemu_coroutine_self();
 
-    while (lock->writer) {
-        qemu_co_queue_wait(&lock->queue, NULL);
+    qemu_co_mutex_lock(&lock->mutex);
+    /* For fairness, wait if a writer is in line.  */
+    while (lock->pending_writer) {
+        qemu_co_queue_wait(&lock->queue, &lock->mutex);
     }
     lock->reader++;
+    qemu_co_mutex_unlock(&lock->mutex);
+
+    /* The rest of the read-side critical section is run without the mutex.  */
     self->locks_held++;
 }
 
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_unlock(CoRwlock *lock)
     Coroutine *self = qemu_coroutine_self();
 
     assert(qemu_in_coroutine());
-    if (lock->writer) {
-        lock->writer = false;
+    if (!lock->reader) {
+        /* The critical section started in qemu_co_rwlock_wrlock.  */
         qemu_co_queue_restart_all(&lock->queue);
     } else {
+        self->locks_held--;
+
+        qemu_co_mutex_lock(&lock->mutex);
         lock->reader--;
         assert(lock->reader >= 0);
         /* Wakeup only one waiting writer */
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_unlock(CoRwlock *lock)
             qemu_co_queue_next(&lock->queue);
         }
     }
-    self->locks_held--;
+    qemu_co_mutex_unlock(&lock->mutex);
 }
 
 void qemu_co_rwlock_wrlock(CoRwlock *lock)
 {
-    Coroutine *self = qemu_coroutine_self();
-
-    while (lock->writer || lock->reader) {
-        qemu_co_queue_wait(&lock->queue, NULL);
+    qemu_co_mutex_lock(&lock->mutex);
+    lock->pending_writer++;
+    while (lock->reader) {
+        qemu_co_queue_wait(&lock->queue, &lock->mutex);
     }
-    lock->writer = true;
-    self->locks_held++;
+    lock->pending_writer--;
+
+    /* The rest of the write-side critical section is run with
+     * the mutex taken, so that lock->reader remains zero.
+     * There is no need to update self->locks_held.
+     */
 }
-- 
2.9.3