Series comparison

-[Qemu-devel] [PULL 00/20] Block patches
+[Qemu-devel] [PULL v2 00/24] Block patches
-The following changes since commit 5ec2eca83dc478ddf24077e02a8b34dd26cd3ff9:
+The following changes since commit 56f9e46b841c7be478ca038d8d4085d776ab4b0d:
-  Merge remote-tracking branch 'remotes/awilliam/tags/vfio-updates-20190613.0' into staging (2019-06-14 09:33:55 +0100)
+  Merge remote-tracking branch 'remotes/armbru/tags/pull-qapi-2017-02-20' into staging (2017-02-20 17:42:47 +0000)
-are available in the Git repository at:
+are available in the git repository at:
-  https://github.com/XanClic/qemu.git tags/pull-block-2019-06-14
+  git://github.com/stefanha/qemu.git tags/block-pull-request
-for you to fetch changes up to 21c1ce592a144188dfe59b9e156a97da412a59a2:
+for you to fetch changes up to a7b91d35bab97a2d3e779d0c64c9b837b52a6cf7:
-  iotests: Test qemu-img convert -C --salvage (2019-06-14 15:09:42 +0200)
+  coroutine-lock: make CoRwlock thread-safe and fair (2017-02-21 11:39:40 +0000)
 ----------------------------------------------------------------
-Block patches:
+Pull request
-- Allow blockdev-backup from nodes that are not in qemu's main AIO
-  context to newly added nodes
+v2:
-- Add salvaging mode to qemu-img convert
+ * Rebased to resolve scsi conflicts
 - Minor fixes to tests, documentation, and for less Valgrind annoyance
 ----------------------------------------------------------------
-Andrey Shinkevich (1):
-  hw/block/fdc: floppy command FIFO memory initialization
-John Snow (6):
+Paolo Bonzini (24):
-  blockdev-backup: don't check aio_context too early
+  block: move AioContext, QEMUTimer, main-loop to libqemuutil
-  iotests.py: do not use infinite waits
+  aio: introduce aio_co_schedule and aio_co_wake
-  QEMUMachine: add events_wait method
+  block-backend: allow blk_prw from coroutine context
-  iotests.py: rewrite run_job to be pickier
+  test-thread-pool: use generic AioContext infrastructure
-  iotests: add iotest 256 for testing blockdev-backup across iothread
+  io: add methods to set I/O handlers on AioContext
-    contexts
+  io: make qio_channel_yield aware of AioContexts
-  event_match: always match on None value
+  nbd: convert to use qio_channel_yield
   coroutine-lock: reschedule coroutine on the AioContext it was running
     on
   blkdebug: reschedule coroutine on the AioContext it is running on
   qed: introduce qed_aio_start_io and qed_aio_next_io_cb
   aio: push aio_context_acquire/release down to dispatching
   block: explicitly acquire aiocontext in timers that need it
   block: explicitly acquire aiocontext in callbacks that need it
   block: explicitly acquire aiocontext in bottom halves that need it
   block: explicitly acquire aiocontext in aio callbacks that need it
   aio-posix: partially inline aio_dispatch into aio_poll
   async: remove unnecessary inc/dec pairs
   block: document fields protected by AioContext lock
   coroutine-lock: make CoMutex thread-safe
   coroutine-lock: add limited spinning to CoMutex
   test-aio-multithread: add performance comparison with thread-based
     mutexes
   coroutine-lock: place CoMutex before CoQueue in header
   coroutine-lock: add mutex argument to CoQueue APIs
   coroutine-lock: make CoRwlock thread-safe and fair
-Max Reitz (12):
+ Makefile.objs                       |   4 -
-  iotests: Filter 175's allocation information
+ stubs/Makefile.objs                 |   1 +
-  iotests: Fix intermittent failure in 219
+ tests/Makefile.include              |  19 +-
-  qemu-img: Fix options leakage in img_rebase()
+ util/Makefile.objs                  |   6 +-
-  qapi/block-core: Overlays are not snapshots
+ block/nbd-client.h                  |   2 +-
-  blockdev: Overlays are not snapshots
+ block/qed.h                         |   3 +
-  qemu-img: Move quiet into ImgConvertState
+ include/block/aio.h                 |  38 ++-
-  qemu-img: Add salvaging mode to convert
+ include/block/block_int.h           |  64 +++--
-  blkdebug: Add @iotype error option
+ include/io/channel.h                |  72 +++++-
-  blkdebug: Add "none" event
+ include/qemu/coroutine.h            |  84 ++++---
-  blkdebug: Inject errors on .bdrv_co_block_status()
+ include/qemu/coroutine_int.h        |  11 +-
-  iotests: Test qemu-img convert --salvage
+ include/sysemu/block-backend.h      |  14 +-
-  iotests: Test qemu-img convert -C --salvage
+ tests/iothread.h                    |  25 ++
+ block/backup.c                      |   2 +-
-Vladimir Sementsov-Ogievskiy (1):
+ block/blkdebug.c                    |   9 +-
-  iotests: restrict 254 to support only qcow2
+ block/blkreplay.c                   |   2 +-
+ block/block-backend.c               |  13 +-
- qapi/block-core.json          |  53 ++++++++---
+ block/curl.c                        |  44 +++-
- block/blkdebug.c              |  60 ++++++++++--
+ block/gluster.c                     |   9 +-
- blockdev.c                    |  14 +--
+ block/io.c                          |  42 +---
- hw/block/fdc.c                |   1 +
+ block/iscsi.c                       |  15 +-
- qemu-img.c                    | 106 +++++++++++++++------
+ block/linux-aio.c                   |  10 +-
- python/qemu/__init__.py       |  67 ++++++++++----
+ block/mirror.c                      |  12 +-
- qemu-img-cmds.hx              |   4 +-
+ block/nbd-client.c                  | 119 +++++----
- qemu-img.texi                 |   4 +
+ block/nfs.c                         |   9 +-
- tests/qemu-iotests/082        |   1 +
+ block/qcow2-cluster.c               |   4 +-
- tests/qemu-iotests/082.out    |   3 +
+ block/qed-cluster.c                 |   2 +
- tests/qemu-iotests/085.out    |  10 +-
+ block/qed-table.c                   |  12 +-
- tests/qemu-iotests/175        |  26 +++++-
+ block/qed.c                         |  58 +++--
- tests/qemu-iotests/175.out    |   8 +-
+ block/sheepdog.c                    |  31 +--
- tests/qemu-iotests/219        |  13 ++-
+ block/ssh.c                         |  29 +--
- tests/qemu-iotests/251        | 170 ++++++++++++++++++++++++++++++++++
+ block/throttle-groups.c             |   4 +-
- tests/qemu-iotests/251.out    |  43 +++++++++
+ block/win32-aio.c                   |   9 +-
- tests/qemu-iotests/254        |   2 +
+ dma-helpers.c                       |   2 +
- tests/qemu-iotests/256        | 122 ++++++++++++++++++++++++
+ hw/9pfs/9p.c                        |   2 +-
- tests/qemu-iotests/256.out    | 119 ++++++++++++++++++++++++
+ hw/block/virtio-blk.c               |  19 +-
- tests/qemu-iotests/group      |   2 +
+ hw/scsi/scsi-bus.c                  |   2 +
- tests/qemu-iotests/iotests.py |  60 +++++++-----
+ hw/scsi/scsi-disk.c                 |  15 ++
-files changed, 772 insertions(+), 116 deletions(-)
+ hw/scsi/scsi-generic.c              |  20 +-
- create mode 100755 tests/qemu-iotests/251
+ hw/scsi/virtio-scsi.c               |   7 +
- create mode 100644 tests/qemu-iotests/251.out
+ io/channel-command.c                |  13 +
- create mode 100755 tests/qemu-iotests/256
+ io/channel-file.c                   |  11 +
- create mode 100644 tests/qemu-iotests/256.out
+ io/channel-socket.c                 |  16 +-
  io/channel-tls.c                    |  12 +
  io/channel-watch.c                  |   6 +
  io/channel.c                        |  97 ++++++--
  nbd/client.c                        |   2 +-
  nbd/common.c                        |   9 +-
  nbd/server.c                        |  94 +++-----
  stubs/linux-aio.c                   |  32 +++
  stubs/set-fd-handler.c              |  11 -
  tests/iothread.c                    |  91 +++++++
  tests/test-aio-multithread.c        | 463 ++++++++++++++++++++++++++++++++++++
  tests/test-thread-pool.c            |  12 +-
  aio-posix.c => util/aio-posix.c     |  62 ++---
  aio-win32.c => util/aio-win32.c     |  30 +--
  util/aiocb.c                        |  55 +++++
  async.c => util/async.c             |  84 ++++++-
  iohandler.c => util/iohandler.c     |   0
  main-loop.c => util/main-loop.c     |   0
  util/qemu-coroutine-lock.c          | 254 ++++++++++++++++++--
  util/qemu-coroutine-sleep.c         |   2 +-
  util/qemu-coroutine.c               |   8 +
  qemu-timer.c => util/qemu-timer.c   |   0
  thread-pool.c => util/thread-pool.c |   8 +-
  trace-events                        |  11 -
  util/trace-events                   |  17 +-
 files changed, 1712 insertions(+), 533 deletions(-)
  create mode 100644 tests/iothread.h
  create mode 100644 stubs/linux-aio.c
  create mode 100644 tests/iothread.c
  create mode 100644 tests/test-aio-multithread.c
  rename aio-posix.c => util/aio-posix.c (94%)
  rename aio-win32.c => util/aio-win32.c (95%)
  create mode 100644 util/aiocb.c
  rename async.c => util/async.c (82%)
  rename iohandler.c => util/iohandler.c (100%)
  rename main-loop.c => util/main-loop.c (100%)
  rename qemu-timer.c => util/qemu-timer.c (100%)
  rename thread-pool.c => util/thread-pool.c (97%)
 --
-.21.0
+.9.3

-[Qemu-devel] [PULL 12/20] qapi/block-core: Overlays are not snapshots
+[Qemu-devel] [PULL v2 01/24] block: move AioContext, QEMUTimer, main-loop to libqemuutil
-A snapshot is something that reflects the state of something at a
+From: Paolo Bonzini <pbonzini@redhat.com>
-certain point in time.  It does not change.
+AioContext is fairly self-contained; the only dependency is QEMUTimer, but
-The file our snapshot commands create (or the node they install) is not
+that in turn doesn't need anything else.  So move them out of block-obj-y
-a snapshot, as it does change over time.  It is an overlay.  We cannot
+to avoid introducing a dependency from io/ to block-obj-y.
-do anything about the parameter names, but we can at least adjust the
-descriptions to reflect that fact.
+main-loop and its dependency iohandler also need to be moved, because
+later in this series io/ will call iohandler_get_aio_context.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
+[Changed copyright "the QEMU team" to "other QEMU contributors" as
-Message-id: 20190603202236.1342-2-mreitz@redhat.com
+suggested by Daniel Berrange and agreed by Paolo.
-Reviewed-by: John Snow <jsnow@redhat.com>
+--Stefan]
-Reviewed-by: Alberto Garcia <berto@igalia.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Message-id: 20170213135235.12274-2-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- qapi/block-core.json | 20 ++++++++++----------
+ Makefile.objs                       |  4 ---
-file changed, 10 insertions(+), 10 deletions(-)
+ stubs/Makefile.objs                 |  1 +
+ tests/Makefile.include              | 11 ++++----
-diff --git a/qapi/block-core.json b/qapi/block-core.json
+ util/Makefile.objs                  |  6 +++-
-index XXXXXXX..XXXXXXX 100644
+ block/io.c                          | 29 -------------------
---- a/qapi/block-core.json
+ stubs/linux-aio.c                   | 32 +++++++++++++++++++++
-+++ b/qapi/block-core.json
+ stubs/set-fd-handler.c              | 11 --------
  aio-posix.c => util/aio-posix.c     |  2 +-
  aio-win32.c => util/aio-win32.c     |  0
  util/aiocb.c                        | 55 +++++++++++++++++++++++++++++++++++++
  async.c => util/async.c             |  3 +-
  iohandler.c => util/iohandler.c     |  0
  main-loop.c => util/main-loop.c     |  0
  qemu-timer.c => util/qemu-timer.c   |  0
  thread-pool.c => util/thread-pool.c |  2 +-
  trace-events                        | 11 --------
  util/trace-events                   | 11 ++++++++
 files changed, 114 insertions(+), 64 deletions(-)
  create mode 100644 stubs/linux-aio.c
  rename aio-posix.c => util/aio-posix.c (99%)
  rename aio-win32.c => util/aio-win32.c (100%)
  create mode 100644 util/aiocb.c
  rename async.c => util/async.c (99%)
  rename iohandler.c => util/iohandler.c (100%)
  rename main-loop.c => util/main-loop.c (100%)
  rename qemu-timer.c => util/qemu-timer.c (100%)
  rename thread-pool.c => util/thread-pool.c (99%)
 diff --git a/Makefile.objs b/Makefile.objs
 index XXXXXXX..XXXXXXX 100644
 --- a/Makefile.objs
 +++ b/Makefile.objs
@@ -XXX,XX +XXX,XX @@ chardev-obj-y = chardev/
  #######################################################################
  # block-obj-y is code used by both qemu system emulation and qemu-img
 -block-obj-y = async.o thread-pool.o
  block-obj-y += nbd/
  block-obj-y += block.o blockjob.o
 -block-obj-y += main-loop.o iohandler.o qemu-timer.o
 -block-obj-$(CONFIG_POSIX) += aio-posix.o
 -block-obj-$(CONFIG_WIN32) += aio-win32.o
  block-obj-y += block/
  block-obj-y += qemu-io-cmds.o
  block-obj-$(CONFIG_REPLICATION) += replication.o
 diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
 index XXXXXXX..XXXXXXX 100644
 --- a/stubs/Makefile.objs
 +++ b/stubs/Makefile.objs
@@ -XXX,XX +XXX,XX @@ stub-obj-y += get-vm-name.o
  stub-obj-y += iothread.o
  stub-obj-y += iothread-lock.o
  stub-obj-y += is-daemonized.o
 +stub-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
  stub-obj-y += machine-init-done.o
  stub-obj-y += migr-blocker.o
  stub-obj-y += monitor.o
 diff --git a/tests/Makefile.include b/tests/Makefile.include
 index XXXXXXX..XXXXXXX 100644
 --- a/tests/Makefile.include
 +++ b/tests/Makefile.include
@@ -XXX,XX +XXX,XX @@ check-unit-y += tests/test-visitor-serialization$(EXESUF)
  check-unit-y += tests/test-iov$(EXESUF)
  gcov-files-test-iov-y = util/iov.c
  check-unit-y += tests/test-aio$(EXESUF)
 +gcov-files-test-aio-y = util/async.c util/qemu-timer.o
 +gcov-files-test-aio-$(CONFIG_WIN32) += util/aio-win32.c
 +gcov-files-test-aio-$(CONFIG_POSIX) += util/aio-posix.c
  check-unit-y += tests/test-throttle$(EXESUF)
  gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c
  gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c
@@ -XXX,XX +XXX,XX @@ tests/check-qjson$(EXESUF): tests/check-qjson.o $(test-util-obj-y)
  tests/check-qom-interface$(EXESUF): tests/check-qom-interface.o $(test-qom-obj-y)
  tests/check-qom-proplist$(EXESUF): tests/check-qom-proplist.o $(test-qom-obj-y)
 -tests/test-char$(EXESUF): tests/test-char.o qemu-timer.o \
 -    $(test-util-obj-y) $(qtest-obj-y) $(test-block-obj-y) $(chardev-obj-y)
 +tests/test-char$(EXESUF): tests/test-char.o $(test-util-obj-y) $(qtest-obj-y) $(test-io-obj-y) $(chardev-obj-y)
  tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
  tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
  tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
@@ -XXX,XX +XXX,XX @@ tests/test-vmstate$(EXESUF): tests/test-vmstate.o \
      migration/vmstate.o migration/qemu-file.o \
          migration/qemu-file-channel.o migration/qjson.o \
      $(test-io-obj-y)
 -tests/test-timed-average$(EXESUF): tests/test-timed-average.o qemu-timer.o \
 -    $(test-util-obj-y)
 +tests/test-timed-average$(EXESUF): tests/test-timed-average.o $(test-util-obj-y)
  tests/test-base64$(EXESUF): tests/test-base64.o \
      libqemuutil.a libqemustub.a
  tests/ptimer-test$(EXESUF): tests/ptimer-test.o tests/ptimer-test-stubs.o hw/core/ptimer.o libqemustub.a
@@ -XXX,XX +XXX,XX @@ tests/usb-hcd-ehci-test$(EXESUF): tests/usb-hcd-ehci-test.o $(libqos-usb-obj-y)
  tests/usb-hcd-xhci-test$(EXESUF): tests/usb-hcd-xhci-test.o $(libqos-usb-obj-y)
  tests/pc-cpu-test$(EXESUF): tests/pc-cpu-test.o
  tests/postcopy-test$(EXESUF): tests/postcopy-test.o
 -tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-timer.o \
 +tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o $(test-util-obj-y) \
      $(qtest-obj-y) $(test-io-obj-y) $(libqos-virtio-obj-y) $(libqos-pc-obj-y) \
      $(chardev-obj-y)
  tests/qemu-iotests/socket_scm_helper$(EXESUF): tests/qemu-iotests/socket_scm_helper.o
 diff --git a/util/Makefile.objs b/util/Makefile.objs
 index XXXXXXX..XXXXXXX 100644
 --- a/util/Makefile.objs
 +++ b/util/Makefile.objs
@@ -XXX,XX +XXX,XX @@
  util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o
  util-obj-y += bufferiszero.o
  util-obj-y += lockcnt.o
 +util-obj-y += aiocb.o async.o thread-pool.o qemu-timer.o
 +util-obj-y += main-loop.o iohandler.o
 +util-obj-$(CONFIG_POSIX) += aio-posix.o
  util-obj-$(CONFIG_POSIX) += compatfd.o
  util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
  util-obj-$(CONFIG_POSIX) += mmap-alloc.o
  util-obj-$(CONFIG_POSIX) += oslib-posix.o
  util-obj-$(CONFIG_POSIX) += qemu-openpty.o
  util-obj-$(CONFIG_POSIX) += qemu-thread-posix.o
 -util-obj-$(CONFIG_WIN32) += event_notifier-win32.o
  util-obj-$(CONFIG_POSIX) += memfd.o
 +util-obj-$(CONFIG_WIN32) += aio-win32.o
 +util-obj-$(CONFIG_WIN32) += event_notifier-win32.o
  util-obj-$(CONFIG_WIN32) += oslib-win32.o
  util-obj-$(CONFIG_WIN32) += qemu-thread-win32.o
  util-obj-y += envlist.o path.o module.o
 diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/io.c
 +++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
      return &acb->common;
  }
 -void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
 -                   BlockCompletionFunc *cb, void *opaque)
 -{
 -    BlockAIOCB *acb;
 -
 -    acb = g_malloc(aiocb_info->aiocb_size);
 -    acb->aiocb_info = aiocb_info;
 -    acb->bs = bs;
 -    acb->cb = cb;
 -    acb->opaque = opaque;
 -    acb->refcnt = 1;
 -    return acb;
 -}
 -
 -void qemu_aio_ref(void *p)
 -{
 -    BlockAIOCB *acb = p;
 -    acb->refcnt++;
 -}
 -
 -void qemu_aio_unref(void *p)
 -{
 -    BlockAIOCB *acb = p;
 -    assert(acb->refcnt > 0);
 -    if (--acb->refcnt == 0) {
 -        g_free(acb);
 -    }
 -}
 -
  /**************************************************************/
  /* Coroutine block device emulation */
 diff --git a/stubs/linux-aio.c b/stubs/linux-aio.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/stubs/linux-aio.c
@@ -XXX,XX +XXX,XX @@
 +/*
 + * Linux native AIO support.
 + *
 + * Copyright (C) 2009 IBM, Corp.
 + * Copyright (C) 2009 Red Hat, Inc.
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
 + */
 +#include "qemu/osdep.h"
 +#include "block/aio.h"
 +#include "block/raw-aio.h"
 +
 +void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
 +{
 +    abort();
 +}
 +
 +void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
 +{
 +    abort();
 +}
 +
 +LinuxAioState *laio_init(void)
 +{
 +    abort();
 +}
 +
 +void laio_cleanup(LinuxAioState *s)
 +{
 +    abort();
 +}
 diff --git a/stubs/set-fd-handler.c b/stubs/set-fd-handler.c
 index XXXXXXX..XXXXXXX 100644
 --- a/stubs/set-fd-handler.c
 +++ b/stubs/set-fd-handler.c
@@ -XXX,XX +XXX,XX @@ void qemu_set_fd_handler(int fd,
  {
      abort();
  }
 -
 -void aio_set_fd_handler(AioContext *ctx,
 -                        int fd,
 -                        bool is_external,
 -                        IOHandler *io_read,
 -                        IOHandler *io_write,
 -                        AioPollFn *io_poll,
 -                        void *opaque)
 -{
 -    abort();
 -}
 diff --git a/aio-posix.c b/util/aio-posix.c
 similarity index 99%
 rename from aio-posix.c
 rename to util/aio-posix.c
 index XXXXXXX..XXXXXXX 100644
 --- a/aio-posix.c
 +++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/rcu_queue.h"
  #include "qemu/sockets.h"
  #include "qemu/cutils.h"
 -#include "trace-root.h"
 +#include "trace.h"
  #ifdef CONFIG_EPOLL_CREATE1
  #include <sys/epoll.h>
  #endif
 diff --git a/aio-win32.c b/util/aio-win32.c
 similarity index 100%
 rename from aio-win32.c
 rename to util/aio-win32.c
 diff --git a/util/aiocb.c b/util/aiocb.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/util/aiocb.c
@@ -XXX,XX +XXX,XX @@
 +/*
 + * BlockAIOCB allocation
 + *
 + * Copyright (c) 2003-2017 Fabrice Bellard and other QEMU contributors
 + *
 + * Permission is hereby granted, free of charge, to any person obtaining a copy
 + * of this software and associated documentation files (the "Software"), to deal
 + * in the Software without restriction, including without limitation the rights
 + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 + * copies of the Software, and to permit persons to whom the Software is
 + * furnished to do so, subject to the following conditions:
 + *
 + * The above copyright notice and this permission notice shall be included in
 + * all copies or substantial portions of the Software.
 + *
 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 + * THE SOFTWARE.
 + */
 +
 +#include "qemu/osdep.h"
 +#include "block/aio.h"
 +
 +void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
 +                   BlockCompletionFunc *cb, void *opaque)
 +{
 +    BlockAIOCB *acb;
 +
 +    acb = g_malloc(aiocb_info->aiocb_size);
 +    acb->aiocb_info = aiocb_info;
 +    acb->bs = bs;
 +    acb->cb = cb;
 +    acb->opaque = opaque;
 +    acb->refcnt = 1;
 +    return acb;
 +}
 +
 +void qemu_aio_ref(void *p)
 +{
 +    BlockAIOCB *acb = p;
 +    acb->refcnt++;
 +}
 +
 +void qemu_aio_unref(void *p)
 +{
 +    BlockAIOCB *acb = p;
 +    assert(acb->refcnt > 0);
 +    if (--acb->refcnt == 0) {
 +        g_free(acb);
 +    }
 +}
 diff --git a/async.c b/util/async.c
 similarity index 99%
 rename from async.c
 rename to util/async.c
 index XXXXXXX..XXXXXXX 100644
 --- a/async.c
 +++ b/util/async.c
@@ -XXX,XX +XXX,XX @@
  /*
 - * QEMU System Emulator
 + * Data plane event loop
   *
   * Copyright (c) 2003-2008 Fabrice Bellard
 + * Copyright (c) 2009-2017 QEMU contributors
   *
   * Permission is hereby granted, free of charge, to any person obtaining a copy
   * of this software and associated documentation files (the "Software"), to deal
 diff --git a/iohandler.c b/util/iohandler.c
 similarity index 100%
 rename from iohandler.c
 rename to util/iohandler.c
 diff --git a/main-loop.c b/util/main-loop.c
 similarity index 100%
 rename from main-loop.c
 rename to util/main-loop.c
 diff --git a/qemu-timer.c b/util/qemu-timer.c
 similarity index 100%
 rename from qemu-timer.c
 rename to util/qemu-timer.c
 diff --git a/thread-pool.c b/util/thread-pool.c
 similarity index 99%
 rename from thread-pool.c
 rename to util/thread-pool.c
 index XXXXXXX..XXXXXXX 100644
 --- a/thread-pool.c
 +++ b/util/thread-pool.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/queue.h"
  #include "qemu/thread.h"
  #include "qemu/coroutine.h"
 -#include "trace-root.h"
 +#include "trace.h"
  #include "block/thread-pool.h"
  #include "qemu/main-loop.h"
 diff --git a/trace-events b/trace-events
 index XXXXXXX..XXXXXXX 100644
 --- a/trace-events
 +++ b/trace-events
 @@ -XXX,XX +XXX,XX @@
  #
- # Either @device or @node-name must be set but not both.
+ # The <format-string> should be a sprintf()-compatible format string.
- #
--# @device: the name of the device to generate the snapshot from.
+-# aio-posix.c
-+# @device: the name of the device to take a snapshot of.
+-run_poll_handlers_begin(void *ctx, int64_t max_ns) "ctx %p max_ns %"PRId64
- #
+-run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d"
- # @node-name: graph node name to generate the snapshot from (Since 2.0)
+-poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
- #
+-poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
--# @snapshot-file: the target of the new image. If the file exists, or
+-
--# if it is a device, the snapshot will be created in the existing
+-# thread-pool.c
--# file/device. Otherwise, a new file will be created.
+-thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
-+# @snapshot-file: the target of the new overlay image. If the file
+-thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
-+# exists, or if it is a device, the overlay will be created in the
+-thread_pool_cancel(void *req, void *opaque) "req %p opaque %p"
-+# existing file/device. Otherwise, a new file will be created.
+-
- #
+ # ioport.c
- # @snapshot-node-name: the graph node name of the new image (Since 2.0)
+ cpu_in(unsigned int addr, char size, unsigned int val) "addr %#x(%c) value %u"
- #
+ cpu_out(unsigned int addr, char size, unsigned int val) "addr %#x(%c) value %u"
--# @format: the format of the snapshot image, default is 'qcow2'.
+diff --git a/util/trace-events b/util/trace-events
-+# @format: the format of the overlay image, default is 'qcow2'.
+index XXXXXXX..XXXXXXX 100644
- #
+--- a/util/trace-events
- # @mode: whether and how QEMU should create a new image, default is
++++ b/util/trace-events
- #        'absolute-paths'.
+@@ -XXX,XX +XXX,XX @@
-@@ -XXX,XX +XXX,XX @@
+ # See docs/tracing.txt for syntax documentation.
- ##
- # @BlockdevSnapshot:
++# util/aio-posix.c
- #
++run_poll_handlers_begin(void *ctx, int64_t max_ns) "ctx %p max_ns %"PRId64
--# @node: device or node name that will have a snapshot created.
++run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d"
-+# @node: device or node name that will have a snapshot taken.
++poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
- #
++poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
- # @overlay: reference to the existing block device that will become
++
--#           the overlay of @node, as part of creating the snapshot.
++# util/thread-pool.c
-+#           the overlay of @node, as part of taking the snapshot.
++thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
- #           It must not have a current backing file (this can be
++thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
- #           achieved by passing "backing": null to blockdev-add).
++thread_pool_cancel(void *req, void *opaque) "req %p opaque %p"
- #
++
-@@ -XXX,XX +XXX,XX @@
+ # util/buffer.c
- ##
+ buffer_resize(const char *buf, size_t olen, size_t len) "%s: old %zd, new %zd"
- # @blockdev-snapshot-sync:
+ buffer_move_empty(const char *buf, size_t len, const char *from) "%s: %zd bytes from %s"
  #
 -# Generates a synchronous snapshot of a block device.
 +# Takes a synchronous snapshot of a block device.
  #
  # For the arguments, see the documentation of BlockdevSnapshotSync.
  #
@@ -XXX,XX +XXX,XX @@
  ##
  # @blockdev-snapshot:
  #
 -# Generates a snapshot of a block device.
 +# Takes a snapshot of a block device.
  #
 -# Create a snapshot, by installing 'node' as the backing image of
 +# Take a snapshot, by installing 'node' as the backing image of
  # 'overlay'. Additionally, if 'node' is associated with a block
  # device, the block device changes to using 'overlay' as its new active
  # image.
 --
-.21.0
+.9.3

-[Qemu-devel] [PULL 05/20] iotests: add iotest 256 for testing blockdev-backup across iothread contexts
+[Qemu-devel] [PULL v2 02/24] aio: introduce aio_co_schedule and aio_co_wake
-From: John Snow <jsnow@redhat.com>
+From: Paolo Bonzini <pbonzini@redhat.com>
-Signed-off-by: John Snow <jsnow@redhat.com>
+aio_co_wake provides the infrastructure to start a coroutine on a "home"
-Message-id: 20190523170643.20794-6-jsnow@redhat.com
+AioContext.  It will be used by CoMutex and CoQueue, so that coroutines
-Reviewed-by: Max Reitz <mreitz@redhat.com>
+don't jump from one context to another when they go to sleep on a
-[mreitz: Moved from 250 to 256]
+mutex or waitqueue.  However, it can also be used as a more efficient
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+alternative to one-shot bottom halves, and saves the effort of tracking
 which AioContext a coroutine is running on.
 aio_co_schedule is the part of aio_co_wake that starts a coroutine
 on a remote AioContext, but it is also useful to implement e.g.
 bdrv_set_aio_context callbacks.
 The implementation of aio_co_schedule is based on a lock-free
 multiple-producer, single-consumer queue.  The multiple producers use
 cmpxchg to add to a LIFO stack.  The consumer (a per-AioContext bottom
 half) grabs all items added so far, inverts the list to make it FIFO,
 and goes through it one item at a time until it's empty.  The data
 structure was inspired by OSv, which uses it in the very code we'll
 "port" to QEMU for the thread-safe CoMutex.
 Most of the new code is really tests.
 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Message-id: 20170213135235.12274-3-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- tests/qemu-iotests/256     | 122 +++++++++++++++++++++++++++++++++++++
+ tests/Makefile.include       |   8 +-
- tests/qemu-iotests/256.out | 119 ++++++++++++++++++++++++++++++++++++
+ include/block/aio.h          |  32 +++++++
- tests/qemu-iotests/group   |   1 +
+ include/qemu/coroutine_int.h |  11 ++-
-files changed, 242 insertions(+)
+ tests/iothread.h             |  25 +++++
- create mode 100755 tests/qemu-iotests/256
+ tests/iothread.c             |  91 ++++++++++++++++++
- create mode 100644 tests/qemu-iotests/256.out
+ tests/test-aio-multithread.c | 213 +++++++++++++++++++++++++++++++++++++++++++
+ util/async.c                 |  65 +++++++++++++
-diff --git a/tests/qemu-iotests/256 b/tests/qemu-iotests/256
+ util/qemu-coroutine.c        |   8 ++
-new file mode 100755
+ util/trace-events            |   4 +
-index XXXXXXX..XXXXXXX
+files changed, 453 insertions(+), 4 deletions(-)
---- /dev/null
+ create mode 100644 tests/iothread.h
-+++ b/tests/qemu-iotests/256
+ create mode 100644 tests/iothread.c
-@@ -XXX,XX +XXX,XX @@
+ create mode 100644 tests/test-aio-multithread.c
-+#!/usr/bin/env python
-+#
+diff --git a/tests/Makefile.include b/tests/Makefile.include
-+# Test incremental/backup across iothread contexts
+index XXXXXXX..XXXXXXX 100644
-+#
+--- a/tests/Makefile.include
-+# Copyright (c) 2019 John Snow for Red Hat, Inc.
++++ b/tests/Makefile.include
-+#
+@@ -XXX,XX +XXX,XX @@ check-unit-y += tests/test-aio$(EXESUF)
-+# This program is free software; you can redistribute it and/or modify
+ gcov-files-test-aio-y = util/async.c util/qemu-timer.o
-+# it under the terms of the GNU General Public License as published by
+ gcov-files-test-aio-$(CONFIG_WIN32) += util/aio-win32.c
-+# the Free Software Foundation; either version 2 of the License, or
+ gcov-files-test-aio-$(CONFIG_POSIX) += util/aio-posix.c
-+# (at your option) any later version.
++check-unit-y += tests/test-aio-multithread$(EXESUF)
-+#
++gcov-files-test-aio-multithread-y = $(gcov-files-test-aio-y)
-+# This program is distributed in the hope that it will be useful,
++gcov-files-test-aio-multithread-y += util/qemu-coroutine.c tests/iothread.c
-+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+ check-unit-y += tests/test-throttle$(EXESUF)
-+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+-gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c
-+# GNU General Public License for more details.
+-gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c
-+#
+ check-unit-y += tests/test-thread-pool$(EXESUF)
-+# You should have received a copy of the GNU General Public License
+ gcov-files-test-thread-pool-y = thread-pool.c
-+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ gcov-files-test-hbitmap-y = util/hbitmap.c
-+#
+@@ -XXX,XX +XXX,XX @@ test-qapi-obj-y = tests/test-qapi-visit.o tests/test-qapi-types.o \
-+# owner=jsnow@redhat.com
+     $(test-qom-obj-y)
-+
+ test-crypto-obj-y = $(crypto-obj-y) $(test-qom-obj-y)
-+import os
+ test-io-obj-y = $(io-obj-y) $(test-crypto-obj-y)
-+import iotests
+-test-block-obj-y = $(block-obj-y) $(test-io-obj-y)
-+from iotests import log
++test-block-obj-y = $(block-obj-y) $(test-io-obj-y) tests/iothread.o
-+
-+iotests.verify_image_format(supported_fmts=['qcow2'])
+ tests/check-qint$(EXESUF): tests/check-qint.o $(test-util-obj-y)
-+size = 64 * 1024 * 1024
+ tests/check-qstring$(EXESUF): tests/check-qstring.o $(test-util-obj-y)
-+
+@@ -XXX,XX +XXX,XX @@ tests/check-qom-proplist$(EXESUF): tests/check-qom-proplist.o $(test-qom-obj-y)
-+with iotests.FilePath('img0') as img0_path, \
+ tests/test-char$(EXESUF): tests/test-char.o $(test-util-obj-y) $(qtest-obj-y) $(test-io-obj-y) $(chardev-obj-y)
-+     iotests.FilePath('img1') as img1_path, \
+ tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
-+     iotests.FilePath('img0-full') as img0_full_path, \
+ tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
-+     iotests.FilePath('img1-full') as img1_full_path, \
++tests/test-aio-multithread$(EXESUF): tests/test-aio-multithread.o $(test-block-obj-y)
-+     iotests.FilePath('img0-incr') as img0_incr_path, \
+ tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
-+     iotests.FilePath('img1-incr') as img1_incr_path, \
+ tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y)
-+     iotests.VM() as vm:
+ tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y)
-+
+diff --git a/include/block/aio.h b/include/block/aio.h
-+    def create_target(filepath, name, size):
+index XXXXXXX..XXXXXXX 100644
-+        basename = os.path.basename(filepath)
+--- a/include/block/aio.h
-+        nodename = "file_{}".format(basename)
++++ b/include/block/aio.h
-+        log(vm.command('blockdev-create', job_id='job1',
+@@ -XXX,XX +XXX,XX @@ typedef void QEMUBHFunc(void *opaque);
-+                       options={
+ typedef bool AioPollFn(void *opaque);
-+                           'driver': 'file',
+ typedef void IOHandler(void *opaque);
-+                           'filename': filepath,
-+                           'size': 0,
++struct Coroutine;
-+                       }))
+ struct ThreadPool;
-+        vm.run_job('job1')
+ struct LinuxAioState;
-+        log(vm.command('blockdev-add', driver='file',
-+                       node_name=nodename, filename=filepath))
+@@ -XXX,XX +XXX,XX @@ struct AioContext {
-+        log(vm.command('blockdev-create', job_id='job2',
+     bool notified;
-+                       options={
+     EventNotifier notifier;
-+                           'driver': iotests.imgfmt,
-+                           'file': nodename,
++    QSLIST_HEAD(, Coroutine) scheduled_coroutines;
-+                           'size': size,
++    QEMUBH *co_schedule_bh;
-+                       }))
++
-+        vm.run_job('job2')
+     /* Thread pool for performing work and receiving completion callbacks.
-+        log(vm.command('blockdev-add', driver=iotests.imgfmt,
+      * Has its own locking.
-+                       node_name=name,
+      */
-+                       file=nodename))
+@@ -XXX,XX +XXX,XX @@ static inline bool aio_node_check(AioContext *ctx, bool is_external)
-+
+ }
-+    log('--- Preparing images & VM ---\n')
-+    vm.add_object('iothread,id=iothread0')
+ /**
-+    vm.add_object('iothread,id=iothread1')
++ * aio_co_schedule:
-+    vm.add_device('virtio-scsi-pci,id=scsi0,iothread=iothread0')
++ * @ctx: the aio context
-+    vm.add_device('virtio-scsi-pci,id=scsi1,iothread=iothread1')
++ * @co: the coroutine
-+    iotests.qemu_img_create('-f', iotests.imgfmt, img0_path, str(size))
++ *
-+    iotests.qemu_img_create('-f', iotests.imgfmt, img1_path, str(size))
++ * Start a coroutine on a remote AioContext.
-+    vm.add_drive(img0_path, interface='none')
++ *
-+    vm.add_device('scsi-hd,id=device0,drive=drive0,bus=scsi0.0')
++ * The coroutine must not be entered by anyone else while aio_co_schedule()
-+    vm.add_drive(img1_path, interface='none')
++ * is active.  In addition the coroutine must have yielded unless ctx
-+    vm.add_device('scsi-hd,id=device1,drive=drive1,bus=scsi1.0')
++ * is the context in which the coroutine is running (i.e. the value of
-+
++ * qemu_get_current_aio_context() from the coroutine itself).
-+    log('--- Starting VM ---\n')
++ */
-+    vm.launch()
++void aio_co_schedule(AioContext *ctx, struct Coroutine *co);
 +
-+    log('--- Create Targets & Full Backups ---\n')
++/**
-+    create_target(img0_full_path, 'img0-full', size)
++ * aio_co_wake:
-+    create_target(img1_full_path, 'img1-full', size)
++ * @co: the coroutine
-+    ret = vm.qmp_log('transaction', indent=2, actions=[
++ *
-+        { 'type': 'block-dirty-bitmap-add',
++ * Restart a coroutine on the AioContext where it was running last, thus
-+          'data': { 'node': 'drive0', 'name': 'bitmap0' }},
++ * preventing coroutines from jumping from one context to another when they
-+        { 'type': 'block-dirty-bitmap-add',
++ * go to sleep.
-+          'data': { 'node': 'drive1', 'name': 'bitmap1' }},
++ *
-+        { 'type': 'blockdev-backup',
++ * aio_co_wake may be executed either in coroutine or non-coroutine
-+          'data': { 'device': 'drive0',
++ * context.  The coroutine must not be entered by anyone else while
-+                    'target': 'img0-full',
++ * aio_co_wake() is active.
-+                    'sync': 'full',
++ */
-+                    'job-id': 'j0' }},
++void aio_co_wake(struct Coroutine *co);
-+        { 'type': 'blockdev-backup',
++
-+          'data': { 'device': 'drive1',
++/**
-+                    'target': 'img1-full',
+  * Return the AioContext whose event loop runs in the current thread.
-+                    'sync': 'full',
+  *
-+                    'job-id': 'j1' }}
+  * If called from an IOThread this will be the IOThread's AioContext.  If
-+    ])
+diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h
-+    if "error" in ret:
+index XXXXXXX..XXXXXXX 100644
-+        raise Exception(ret['error']['desc'])
+--- a/include/qemu/coroutine_int.h
-+    vm.run_job('j0', auto_dismiss=True)
++++ b/include/qemu/coroutine_int.h
-+    vm.run_job('j1', auto_dismiss=True)
+@@ -XXX,XX +XXX,XX @@ struct Coroutine {
-+
+     CoroutineEntry *entry;
-+    log('\n--- Create Targets & Incremental Backups ---\n')
+     void *entry_arg;
-+    create_target(img0_incr_path, 'img0-incr', size)
+     Coroutine *caller;
-+    create_target(img1_incr_path, 'img1-incr', size)
++
-+    ret = vm.qmp_log('transaction', indent=2, actions=[
++    /* Only used when the coroutine has terminated.  */
-+        { 'type': 'blockdev-backup',
+     QSLIST_ENTRY(Coroutine) pool_next;
-+          'data': { 'device': 'drive0',
++
-+                    'target': 'img0-incr',
+     size_t locks_held;
-+                    'sync': 'incremental',
-+                    'bitmap': 'bitmap0',
+-    /* Coroutines that should be woken up when we yield or terminate */
-+                    'job-id': 'j2' }},
++    /* Coroutines that should be woken up when we yield or terminate.
-+        { 'type': 'blockdev-backup',
++     * Only used when the coroutine is running.
-+          'data': { 'device': 'drive1',
++     */
-+                    'target': 'img1-incr',
+     QSIMPLEQ_HEAD(, Coroutine) co_queue_wakeup;
-+                    'sync': 'incremental',
++
-+                    'bitmap': 'bitmap1',
++    /* Only used when the coroutine has yielded.  */
-+                    'job-id': 'j3' }}
++    AioContext *ctx;
-+    ])
+     QSIMPLEQ_ENTRY(Coroutine) co_queue_next;
-+    if "error" in ret:
++    QSLIST_ENTRY(Coroutine) co_scheduled_next;
-+        raise Exception(ret['error']['desc'])
+ };
-+    vm.run_job('j2', auto_dismiss=True)
-+    vm.run_job('j3', auto_dismiss=True)
+ Coroutine *qemu_coroutine_new(void);
-+
+diff --git a/tests/iothread.h b/tests/iothread.h
 +    log('\n--- Done ---')
 +    vm.shutdown()
 diff --git a/tests/qemu-iotests/256.out b/tests/qemu-iotests/256.out
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
-+++ b/tests/qemu-iotests/256.out
++++ b/tests/iothread.h
 @@ -XXX,XX +XXX,XX @@
-+--- Preparing images & VM ---
++/*
-+
++ * Event loop thread implementation for unit tests
-+--- Starting VM ---
++ *
-+
++ * Copyright Red Hat Inc., 2013, 2016
-+--- Create Targets & Full Backups ---
++ *
-+
++ * Authors:
-+{}
++ *  Stefan Hajnoczi   <stefanha@redhat.com>
-+{"execute": "job-dismiss", "arguments": {"id": "job1"}}
++ *  Paolo Bonzini     <pbonzini@redhat.com>
-+{"return": {}}
++ *
-+{}
++ * This work is licensed under the terms of the GNU GPL, version 2 or later.
-+{}
++ * See the COPYING file in the top-level directory.
-+{"execute": "job-dismiss", "arguments": {"id": "job2"}}
++ */
-+{"return": {}}
++#ifndef TEST_IOTHREAD_H
-+{}
++#define TEST_IOTHREAD_H
-+{}
++
-+{"execute": "job-dismiss", "arguments": {"id": "job1"}}
++#include "block/aio.h"
-+{"return": {}}
++#include "qemu/thread.h"
-+{}
++
-+{}
++typedef struct IOThread IOThread;
-+{"execute": "job-dismiss", "arguments": {"id": "job2"}}
++
-+{"return": {}}
++IOThread *iothread_new(void);
-+{}
++void iothread_join(IOThread *iothread);
-+{
++AioContext *iothread_get_aio_context(IOThread *iothread);
-+  "execute": "transaction",
++
-+  "arguments": {
++#endif
-+    "actions": [
+diff --git a/tests/iothread.c b/tests/iothread.c
-+      {
+new file mode 100644
-+        "data": {
+index XXXXXXX..XXXXXXX
-+          "name": "bitmap0",
+--- /dev/null
-+          "node": "drive0"
++++ b/tests/iothread.c
-+        },
+@@ -XXX,XX +XXX,XX @@
-+        "type": "block-dirty-bitmap-add"
++/*
-+      },
++ * Event loop thread implementation for unit tests
-+      {
++ *
-+        "data": {
++ * Copyright Red Hat Inc., 2013, 2016
-+          "name": "bitmap1",
++ *
-+          "node": "drive1"
++ * Authors:
-+        },
++ *  Stefan Hajnoczi   <stefanha@redhat.com>
-+        "type": "block-dirty-bitmap-add"
++ *  Paolo Bonzini     <pbonzini@redhat.com>
-+      },
++ *
-+      {
++ * This work is licensed under the terms of the GNU GPL, version 2 or later.
-+        "data": {
++ * See the COPYING file in the top-level directory.
-+          "device": "drive0",
++ *
-+          "job-id": "j0",
++ */
-+          "sync": "full",
++
-+          "target": "img0-full"
++#include "qemu/osdep.h"
-+        },
++#include "qapi/error.h"
-+        "type": "blockdev-backup"
++#include "block/aio.h"
-+      },
++#include "qemu/main-loop.h"
-+      {
++#include "qemu/rcu.h"
-+        "data": {
++#include "iothread.h"
-+          "device": "drive1",
++
-+          "job-id": "j1",
++struct IOThread {
-+          "sync": "full",
++    AioContext *ctx;
-+          "target": "img1-full"
++
-+        },
++    QemuThread thread;
-+        "type": "blockdev-backup"
++    QemuMutex init_done_lock;
-+      }
++    QemuCond init_done_cond;    /* is thread initialization done? */
-+    ]
++    bool stopping;
-+  }
++};
-+}
++
-+{
++static __thread IOThread *my_iothread;
-+  "return": {}
++
-+}
++AioContext *qemu_get_current_aio_context(void)
-+{"data": {"device": "j0", "len": 67108864, "offset": 67108864, "speed": 0, "type": "backup"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
++{
-+{"data": {"device": "j1", "len": 67108864, "offset": 67108864, "speed": 0, "type": "backup"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
++    return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
-+
++}
-+--- Create Targets & Incremental Backups ---
++
-+
++static void *iothread_run(void *opaque)
-+{}
++{
-+{"execute": "job-dismiss", "arguments": {"id": "job1"}}
++    IOThread *iothread = opaque;
-+{"return": {}}
++
-+{}
++    rcu_register_thread();
-+{}
++
-+{"execute": "job-dismiss", "arguments": {"id": "job2"}}
++    my_iothread = iothread;
-+{"return": {}}
++    qemu_mutex_lock(&iothread->init_done_lock);
-+{}
++    iothread->ctx = aio_context_new(&error_abort);
-+{}
++    qemu_cond_signal(&iothread->init_done_cond);
-+{"execute": "job-dismiss", "arguments": {"id": "job1"}}
++    qemu_mutex_unlock(&iothread->init_done_lock);
-+{"return": {}}
++
-+{}
++    while (!atomic_read(&iothread->stopping)) {
-+{}
++        aio_poll(iothread->ctx, true);
-+{"execute": "job-dismiss", "arguments": {"id": "job2"}}
++    }
-+{"return": {}}
++
-+{}
++    rcu_unregister_thread();
-+{
++    return NULL;
-+  "execute": "transaction",
++}
-+  "arguments": {
++
-+    "actions": [
++void iothread_join(IOThread *iothread)
-+      {
++{
-+        "data": {
++    iothread->stopping = true;
-+          "bitmap": "bitmap0",
++    aio_notify(iothread->ctx);
-+          "device": "drive0",
++    qemu_thread_join(&iothread->thread);
-+          "job-id": "j2",
++    qemu_cond_destroy(&iothread->init_done_cond);
-+          "sync": "incremental",
++    qemu_mutex_destroy(&iothread->init_done_lock);
-+          "target": "img0-incr"
++    aio_context_unref(iothread->ctx);
-+        },
++    g_free(iothread);
-+        "type": "blockdev-backup"
++}
-+      },
++
-+      {
++IOThread *iothread_new(void)
-+        "data": {
++{
-+          "bitmap": "bitmap1",
++    IOThread *iothread = g_new0(IOThread, 1);
-+          "device": "drive1",
++
-+          "job-id": "j3",
++    qemu_mutex_init(&iothread->init_done_lock);
-+          "sync": "incremental",
++    qemu_cond_init(&iothread->init_done_cond);
-+          "target": "img1-incr"
++    qemu_thread_create(&iothread->thread, NULL, iothread_run,
-+        },
++                       iothread, QEMU_THREAD_JOINABLE);
-+        "type": "blockdev-backup"
++
-+      }
++    /* Wait for initialization to complete */
-+    ]
++    qemu_mutex_lock(&iothread->init_done_lock);
-+  }
++    while (iothread->ctx == NULL) {
-+}
++        qemu_cond_wait(&iothread->init_done_cond,
-+{
++                       &iothread->init_done_lock);
-+  "return": {}
++    }
-+}
++    qemu_mutex_unlock(&iothread->init_done_lock);
-+{"data": {"device": "j2", "len": 67108864, "offset": 67108864, "speed": 0, "type": "backup"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
++    return iothread;
-+{"data": {"device": "j3", "len": 67108864, "offset": 67108864, "speed": 0, "type": "backup"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
++}
 +
-+--- Done ---
++AioContext *iothread_get_aio_context(IOThread *iothread)
-diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
++{
 +    return iothread->ctx;
 +}
 diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tests/test-aio-multithread.c
@@ -XXX,XX +XXX,XX @@
 +/*
 + * AioContext multithreading tests
 + *
 + * Copyright Red Hat, Inc. 2016
 + *
 + * Authors:
 + *  Paolo Bonzini    <pbonzini@redhat.com>
 + *
 + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 + * See the COPYING.LIB file in the top-level directory.
 + */
 +
 +#include "qemu/osdep.h"
 +#include <glib.h>
 +#include "block/aio.h"
 +#include "qapi/error.h"
 +#include "qemu/coroutine.h"
 +#include "qemu/thread.h"
 +#include "qemu/error-report.h"
 +#include "iothread.h"
 +
 +/* AioContext management */
 +
 +#define NUM_CONTEXTS 5
 +
 +static IOThread *threads[NUM_CONTEXTS];
 +static AioContext *ctx[NUM_CONTEXTS];
 +static __thread int id = -1;
 +
 +static QemuEvent done_event;
 +
 +/* Run a function synchronously on a remote iothread. */
 +
 +typedef struct CtxRunData {
 +    QEMUBHFunc *cb;
 +    void *arg;
 +} CtxRunData;
 +
 +static void ctx_run_bh_cb(void *opaque)
 +{
 +    CtxRunData *data = opaque;
 +
 +    data->cb(data->arg);
 +    qemu_event_set(&done_event);
 +}
 +
 +static void ctx_run(int i, QEMUBHFunc *cb, void *opaque)
 +{
 +    CtxRunData data = {
 +        .cb = cb,
 +        .arg = opaque
 +    };
 +
 +    qemu_event_reset(&done_event);
 +    aio_bh_schedule_oneshot(ctx[i], ctx_run_bh_cb, &data);
 +    qemu_event_wait(&done_event);
 +}
 +
 +/* Starting the iothreads. */
 +
 +static void set_id_cb(void *opaque)
 +{
 +    int *i = opaque;
 +
 +    id = *i;
 +}
 +
 +static void create_aio_contexts(void)
 +{
 +    int i;
 +
 +    for (i = 0; i < NUM_CONTEXTS; i++) {
 +        threads[i] = iothread_new();
 +        ctx[i] = iothread_get_aio_context(threads[i]);
 +    }
 +
 +    qemu_event_init(&done_event, false);
 +    for (i = 0; i < NUM_CONTEXTS; i++) {
 +        ctx_run(i, set_id_cb, &i);
 +    }
 +}
 +
 +/* Stopping the iothreads. */
 +
 +static void join_aio_contexts(void)
 +{
 +    int i;
 +
 +    for (i = 0; i < NUM_CONTEXTS; i++) {
 +        aio_context_ref(ctx[i]);
 +    }
 +    for (i = 0; i < NUM_CONTEXTS; i++) {
 +        iothread_join(threads[i]);
 +    }
 +    for (i = 0; i < NUM_CONTEXTS; i++) {
 +        aio_context_unref(ctx[i]);
 +    }
 +    qemu_event_destroy(&done_event);
 +}
 +
 +/* Basic test for the stuff above. */
 +
 +static void test_lifecycle(void)
 +{
 +    create_aio_contexts();
 +    join_aio_contexts();
 +}
 +
 +/* aio_co_schedule test.  */
 +
 +static Coroutine *to_schedule[NUM_CONTEXTS];
 +
 +static bool now_stopping;
 +
 +static int count_retry;
 +static int count_here;
 +static int count_other;
 +
 +static bool schedule_next(int n)
 +{
 +    Coroutine *co;
 +
 +    co = atomic_xchg(&to_schedule[n], NULL);
 +    if (!co) {
 +        atomic_inc(&count_retry);
 +        return false;
 +    }
 +
 +    if (n == id) {
 +        atomic_inc(&count_here);
 +    } else {
 +        atomic_inc(&count_other);
 +    }
 +
 +    aio_co_schedule(ctx[n], co);
 +    return true;
 +}
 +
 +static void finish_cb(void *opaque)
 +{
 +    schedule_next(id);
 +}
 +
 +static coroutine_fn void test_multi_co_schedule_entry(void *opaque)
 +{
 +    g_assert(to_schedule[id] == NULL);
 +    atomic_mb_set(&to_schedule[id], qemu_coroutine_self());
 +
 +    while (!atomic_mb_read(&now_stopping)) {
 +        int n;
 +
 +        n = g_test_rand_int_range(0, NUM_CONTEXTS);
 +        schedule_next(n);
 +        qemu_coroutine_yield();
 +
 +        g_assert(to_schedule[id] == NULL);
 +        atomic_mb_set(&to_schedule[id], qemu_coroutine_self());
 +    }
 +}
 +
 +
 +static void test_multi_co_schedule(int seconds)
 +{
 +    int i;
 +
 +    count_here = count_other = count_retry = 0;
 +    now_stopping = false;
 +
 +    create_aio_contexts();
 +    for (i = 0; i < NUM_CONTEXTS; i++) {
 +        Coroutine *co1 = qemu_coroutine_create(test_multi_co_schedule_entry, NULL);
 +        aio_co_schedule(ctx[i], co1);
 +    }
 +
 +    g_usleep(seconds * 1000000);
 +
 +    atomic_mb_set(&now_stopping, true);
 +    for (i = 0; i < NUM_CONTEXTS; i++) {
 +        ctx_run(i, finish_cb, NULL);
 +        to_schedule[i] = NULL;
 +    }
 +
 +    join_aio_contexts();
 +    g_test_message("scheduled %d, queued %d, retry %d, total %d\n",
 +                  count_other, count_here, count_retry,
 +                  count_here + count_other + count_retry);
 +}
 +
 +static void test_multi_co_schedule_1(void)
 +{
 +    test_multi_co_schedule(1);
 +}
 +
 +static void test_multi_co_schedule_10(void)
 +{
 +    test_multi_co_schedule(10);
 +}
 +
 +/* End of tests.  */
 +
 +int main(int argc, char **argv)
 +{
 +    init_clocks();
 +
 +    g_test_init(&argc, &argv, NULL);
 +    g_test_add_func("/aio/multi/lifecycle", test_lifecycle);
 +    if (g_test_quick()) {
 +        g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1);
 +    } else {
 +        g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10);
 +    }
 +    return g_test_run();
 +}
 diff --git a/util/async.c b/util/async.c
 index XXXXXXX..XXXXXXX 100644
---- a/tests/qemu-iotests/group
+--- a/util/async.c
-+++ b/tests/qemu-iotests/group
++++ b/util/async.c
 @@ -XXX,XX +XXX,XX @@
-rw auto quick
+ #include "qemu/main-loop.h"
-rw auto backing quick
+ #include "qemu/atomic.h"
-rw auto quick
+ #include "block/raw-aio.h"
-+256 rw auto quick
++#include "qemu/coroutine_int.h"
 +#include "trace.h"
  /***********************************************************/
  /* bottom halves (can be seen as timers which expire ASAP) */
@@ -XXX,XX +XXX,XX @@ aio_ctx_finalize(GSource     *source)
      }
  #endif
 +    assert(QSLIST_EMPTY(&ctx->scheduled_coroutines));
 +    qemu_bh_delete(ctx->co_schedule_bh);
 +
      qemu_lockcnt_lock(&ctx->list_lock);
      assert(!qemu_lockcnt_count(&ctx->list_lock));
      while (ctx->first_bh) {
@@ -XXX,XX +XXX,XX @@ static bool event_notifier_poll(void *opaque)
      return atomic_read(&ctx->notified);
  }
 +static void co_schedule_bh_cb(void *opaque)
 +{
 +    AioContext *ctx = opaque;
 +    QSLIST_HEAD(, Coroutine) straight, reversed;
 +
 +    QSLIST_MOVE_ATOMIC(&reversed, &ctx->scheduled_coroutines);
 +    QSLIST_INIT(&straight);
 +
 +    while (!QSLIST_EMPTY(&reversed)) {
 +        Coroutine *co = QSLIST_FIRST(&reversed);
 +        QSLIST_REMOVE_HEAD(&reversed, co_scheduled_next);
 +        QSLIST_INSERT_HEAD(&straight, co, co_scheduled_next);
 +    }
 +
 +    while (!QSLIST_EMPTY(&straight)) {
 +        Coroutine *co = QSLIST_FIRST(&straight);
 +        QSLIST_REMOVE_HEAD(&straight, co_scheduled_next);
 +        trace_aio_co_schedule_bh_cb(ctx, co);
 +        qemu_coroutine_enter(co);
 +    }
 +}
 +
  AioContext *aio_context_new(Error **errp)
  {
      int ret;
@@ -XXX,XX +XXX,XX @@ AioContext *aio_context_new(Error **errp)
      }
      g_source_set_can_recurse(&ctx->source, true);
      qemu_lockcnt_init(&ctx->list_lock);
 +
 +    ctx->co_schedule_bh = aio_bh_new(ctx, co_schedule_bh_cb, ctx);
 +    QSLIST_INIT(&ctx->scheduled_coroutines);
 +
      aio_set_event_notifier(ctx, &ctx->notifier,
                             false,
                             (EventNotifierHandler *)
@@ -XXX,XX +XXX,XX @@ fail:
      return NULL;
  }
 +void aio_co_schedule(AioContext *ctx, Coroutine *co)
 +{
 +    trace_aio_co_schedule(ctx, co);
 +    QSLIST_INSERT_HEAD_ATOMIC(&ctx->scheduled_coroutines,
 +                              co, co_scheduled_next);
 +    qemu_bh_schedule(ctx->co_schedule_bh);
 +}
 +
 +void aio_co_wake(struct Coroutine *co)
 +{
 +    AioContext *ctx;
 +
 +    /* Read coroutine before co->ctx.  Matches smp_wmb in
 +     * qemu_coroutine_enter.
 +     */
 +    smp_read_barrier_depends();
 +    ctx = atomic_read(&co->ctx);
 +
 +    if (ctx != qemu_get_current_aio_context()) {
 +        aio_co_schedule(ctx, co);
 +        return;
 +    }
 +
 +    if (qemu_in_coroutine()) {
 +        Coroutine *self = qemu_coroutine_self();
 +        assert(self != co);
 +        QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, co, co_queue_next);
 +    } else {
 +        aio_context_acquire(ctx);
 +        qemu_coroutine_enter(co);
 +        aio_context_release(ctx);
 +    }
 +}
 +
  void aio_context_ref(AioContext *ctx)
  {
      g_source_ref(&ctx->source);
 diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/qemu-coroutine.c
 +++ b/util/qemu-coroutine.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/atomic.h"
  #include "qemu/coroutine.h"
  #include "qemu/coroutine_int.h"
 +#include "block/aio.h"
  enum {
      POOL_BATCH_SIZE = 64,
@@ -XXX,XX +XXX,XX @@ void qemu_coroutine_enter(Coroutine *co)
      }
      co->caller = self;
 +    co->ctx = qemu_get_current_aio_context();
 +
 +    /* Store co->ctx before anything that stores co.  Matches
 +     * barrier in aio_co_wake.
 +     */
 +    smp_wmb();
 +
      ret = qemu_coroutine_switch(self, co, COROUTINE_ENTER);
      qemu_co_queue_run_restart(co);
 diff --git a/util/trace-events b/util/trace-events
 index XXXXXXX..XXXXXXX 100644
 --- a/util/trace-events
 +++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d"
  poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
  poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
 +# util/async.c
 +aio_co_schedule(void *ctx, void *co) "ctx %p co %p"
 +aio_co_schedule_bh_cb(void *ctx, void *co) "ctx %p co %p"
 +
  # util/thread-pool.c
  thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
  thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
 --
-.21.0
+.9.3

-[Qemu-devel] [PULL 18/20] blkdebug: Inject errors on .bdrv_co_block_status()
+[Qemu-devel] [PULL v2 03/24] block-backend: allow blk_prw from coroutine context
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+From: Paolo Bonzini <pbonzini@redhat.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
+qcow2_create2 calls this.  Do not run a nested event loop, as that
-Message-id: 20190507203508.18026-6-mreitz@redhat.com
+breaks when aio_co_wake tries to queue the coroutine on the co_queue_wakeup
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+list of the currently running one.
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Message-id: 20170213135235.12274-4-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- qapi/block-core.json | 5 ++++-
+ block/block-backend.c | 12 ++++++++----
- block/blkdebug.c     | 8 ++++++++
+file changed, 8 insertions(+), 4 deletions(-)
 files changed, 12 insertions(+), 1 deletion(-)
-diff --git a/qapi/block-core.json b/qapi/block-core.json
+diff --git a/block/block-backend.c b/block/block-backend.c
 index XXXXXXX..XXXXXXX 100644
---- a/qapi/block-core.json
+--- a/block/block-backend.c
-+++ b/qapi/block-core.json
++++ b/block/block-backend.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
  #
  # @flush: .bdrv_co_flush_to_disk()
  #
 +# @block-status: .bdrv_co_block_status()
 +#
  # Since: 4.1
  ##
  { 'enum': 'BlkdebugIOType', 'prefix': 'BLKDEBUG_IO_TYPE',
 -  'data': [ 'read', 'write', 'write-zeroes', 'discard', 'flush' ] }
 +  'data': [ 'read', 'write', 'write-zeroes', 'discard', 'flush',
 +            'block-status' ] }
  ##
  # @BlkdebugInjectErrorOptions:
 diff --git a/block/blkdebug.c b/block/blkdebug.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/blkdebug.c
 +++ b/block/blkdebug.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn blkdebug_co_block_status(BlockDriverState *bs,
                                                   int64_t *map,
                                                   BlockDriverState **file)
  {
-+    int err;
+     QEMUIOVector qiov;
-+
+     struct iovec iov;
-     assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
+-    Coroutine *co;
-+
+     BlkRwCo rwco;
-+    err = rule_check(bs, offset, bytes, BLKDEBUG_IO_TYPE_BLOCK_STATUS);
-+    if (err) {
+     iov = (struct iovec) {
-+        return err;
+@@ -XXX,XX +XXX,XX @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
          .ret    = NOT_DONE,
      };
 -    co = qemu_coroutine_create(co_entry, &rwco);
 -    qemu_coroutine_enter(co);
 -    BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
 +    if (qemu_in_coroutine()) {
 +        /* Fast-path if already in coroutine context */
 +        co_entry(&rwco);
 +    } else {
 +        Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
 +        qemu_coroutine_enter(co);
 +        BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
 +    }
-+
-     return bdrv_co_block_status_from_file(bs, want_zero, offset, bytes,
+     return rwco.ret;
                                            pnum, map, file);
  }
 --
-.21.0
+.9.3

-[Qemu-devel] [PULL 20/20] iotests: Test qemu-img convert -C --salvage
+[Qemu-devel] [PULL v2 04/24] test-thread-pool: use generic AioContext infrastructure
-We do not support this combination (yet), so this should yield an error
+From: Paolo Bonzini <pbonzini@redhat.com>
 message.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Once the thread pool starts using aio_co_wake, it will also need
-Tested-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
+qemu_get_current_aio_context().  Make test-thread-pool create
-Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
+an AioContext with qemu_init_main_loop, so that stubs/iothread.c
-Message-id: 20190507203508.18026-8-mreitz@redhat.com
+and tests/iothread.c can provide the rest.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Message-id: 20170213135235.12274-5-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- tests/qemu-iotests/082     | 1 +
+ tests/test-thread-pool.c | 12 +++---------
- tests/qemu-iotests/082.out | 3 +++
+file changed, 3 insertions(+), 9 deletions(-)
 files changed, 4 insertions(+)
-diff --git a/tests/qemu-iotests/082 b/tests/qemu-iotests/082
+diff --git a/tests/test-thread-pool.c b/tests/test-thread-pool.c
 index XXXXXXX..XXXXXXX 100755
 --- a/tests/qemu-iotests/082
 +++ b/tests/qemu-iotests/082
@@ -XXX,XX +XXX,XX @@ echo === convert: -C and other options ===
  run_qemu_img convert -C -S 4k -O $IMGFMT "$TEST_IMG" "$TEST_IMG".target
  run_qemu_img convert -C -S 8k -O $IMGFMT "$TEST_IMG" "$TEST_IMG".target
  run_qemu_img convert -C -c -O $IMGFMT "$TEST_IMG" "$TEST_IMG".target
 +run_qemu_img convert -C --salvage -O $IMGFMT "$TEST_IMG" "$TEST_IMG".target
  echo
  echo === amend: Options specified more than once ===
 diff --git a/tests/qemu-iotests/082.out b/tests/qemu-iotests/082.out
 index XXXXXXX..XXXXXXX 100644
---- a/tests/qemu-iotests/082.out
+--- a/tests/test-thread-pool.c
-+++ b/tests/qemu-iotests/082.out
++++ b/tests/test-thread-pool.c
-@@ -XXX,XX +XXX,XX @@ qemu-img: Cannot enable copy offloading when -S is used
+@@ -XXX,XX +XXX,XX @@
- Testing: convert -C -c -O qcow2 TEST_DIR/t.qcow2 TEST_DIR/t.qcow2.target
+ #include "qapi/error.h"
- qemu-img: Cannot enable copy offloading when -c is used
+ #include "qemu/timer.h"
+ #include "qemu/error-report.h"
-+Testing: convert -C --salvage -O qcow2 TEST_DIR/t.qcow2 TEST_DIR/t.qcow2.target
++#include "qemu/main-loop.h"
-+qemu-img: Cannot use copy offloading in salvaging mode
-+
+ static AioContext *ctx;
- === amend: Options specified more than once ===
+ static ThreadPool *pool;
+@@ -XXX,XX +XXX,XX @@ static void test_cancel_async(void)
- Testing: amend -f foo -f qcow2 -o lazy_refcounts=on TEST_DIR/t.qcow2
+ int main(int argc, char **argv)
  {
      int ret;
 -    Error *local_error = NULL;
 -    init_clocks();
 -
 -    ctx = aio_context_new(&local_error);
 -    if (!ctx) {
 -        error_reportf_err(local_error, "Failed to create AIO Context: ");
 -        exit(1);
 -    }
 +    qemu_init_main_loop(&error_abort);
 +    ctx = qemu_get_current_aio_context();
      pool = aio_get_thread_pool(ctx);
      g_test_init(&argc, &argv, NULL);
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
      ret = g_test_run();
 -    aio_context_unref(ctx);
      return ret;
  }
 --
-.21.0
+.9.3

-[Qemu-devel] [PULL 11/20] qemu-img: Fix options leakage in img_rebase()
+[Qemu-devel] [PULL v2 05/24] io: add methods to set I/O handlers on AioContext
-img_rebase() can leak a QDict on two occasions.  Fix it.
+From: Paolo Bonzini <pbonzini@redhat.com>
-Coverity: CID 1401416
+This is in preparation for making qio_channel_yield work on
-Fixes: d16699b64671466b42079c45b89127aeea1ca565
+AioContexts other than the main one.
-Fixes: 330c72957196e0ae382abcaa97ebf4eb9bc8574f
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
-Message-id: 20190528195338.12376-1-mreitz@redhat.com
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Reviewed-by: John Snow <jsnow@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
 Message-id: 20170213135235.12274-6-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- qemu-img.c | 3 +++
+ include/io/channel.h | 25 +++++++++++++++++++++++++
-file changed, 3 insertions(+)
+ io/channel-command.c | 13 +++++++++++++
+ io/channel-file.c    | 11 +++++++++++
-diff --git a/qemu-img.c b/qemu-img.c
+ io/channel-socket.c  | 16 +++++++++++-----
-index XXXXXXX..XXXXXXX 100644
+ io/channel-tls.c     | 12 ++++++++++++
---- a/qemu-img.c
+ io/channel-watch.c   |  6 ++++++
-+++ b/qemu-img.c
+ io/channel.c         | 11 +++++++++++
-@@ -XXX,XX +XXX,XX @@ static int img_rebase(int argc, char **argv)
+files changed, 89 insertions(+), 5 deletions(-)
-                                                              out_baseimg,
-                                                              &local_err);
+diff --git a/include/io/channel.h b/include/io/channel.h
-             if (local_err) {
+index XXXXXXX..XXXXXXX 100644
-+                qobject_unref(options);
+--- a/include/io/channel.h
-                 error_reportf_err(local_err,
++++ b/include/io/channel.h
-                                   "Could not resolve backing filename: ");
+@@ -XXX,XX +XXX,XX @@
-                 ret = -1;
-@@ -XXX,XX +XXX,XX @@ static int img_rebase(int argc, char **argv)
+ #include "qemu-common.h"
-              */
+ #include "qom/object.h"
-             prefix_chain_bs = bdrv_find_backing_image(bs, out_real_path);
++#include "block/aio.h"
-             if (prefix_chain_bs) {
-+                qobject_unref(options);
+ #define TYPE_QIO_CHANNEL "qio-channel"
-                 g_free(out_real_path);
+ #define QIO_CHANNEL(obj)                                    \
-+
+@@ -XXX,XX +XXX,XX @@ struct QIOChannelClass {
-                 blk_new_backing = blk_new(qemu_get_aio_context(),
+                      off_t offset,
-                                           BLK_PERM_CONSISTENT_READ,
+                      int whence,
-                                           BLK_PERM_ALL);
+                      Error **errp);
 +    void (*io_set_aio_fd_handler)(QIOChannel *ioc,
 +                                  AioContext *ctx,
 +                                  IOHandler *io_read,
 +                                  IOHandler *io_write,
 +                                  void *opaque);
  };
  /* General I/O handling functions */
@@ -XXX,XX +XXX,XX @@ void qio_channel_yield(QIOChannel *ioc,
  void qio_channel_wait(QIOChannel *ioc,
                        GIOCondition condition);
 +/**
 + * qio_channel_set_aio_fd_handler:
 + * @ioc: the channel object
 + * @ctx: the AioContext to set the handlers on
 + * @io_read: the read handler
 + * @io_write: the write handler
 + * @opaque: the opaque value passed to the handler
 + *
 + * This is used internally by qio_channel_yield().  It can
 + * be used by channel implementations to forward the handlers
 + * to another channel (e.g. from #QIOChannelTLS to the
 + * underlying socket).
 + */
 +void qio_channel_set_aio_fd_handler(QIOChannel *ioc,
 +                                    AioContext *ctx,
 +                                    IOHandler *io_read,
 +                                    IOHandler *io_write,
 +                                    void *opaque);
 +
  #endif /* QIO_CHANNEL_H */
 diff --git a/io/channel-command.c b/io/channel-command.c
 index XXXXXXX..XXXXXXX 100644
 --- a/io/channel-command.c
 +++ b/io/channel-command.c
@@ -XXX,XX +XXX,XX @@ static int qio_channel_command_close(QIOChannel *ioc,
  }
 +static void qio_channel_command_set_aio_fd_handler(QIOChannel *ioc,
 +                                                   AioContext *ctx,
 +                                                   IOHandler *io_read,
 +                                                   IOHandler *io_write,
 +                                                   void *opaque)
 +{
 +    QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc);
 +    aio_set_fd_handler(ctx, cioc->readfd, false, io_read, NULL, NULL, opaque);
 +    aio_set_fd_handler(ctx, cioc->writefd, false, NULL, io_write, NULL, opaque);
 +}
 +
 +
  static GSource *qio_channel_command_create_watch(QIOChannel *ioc,
                                                   GIOCondition condition)
  {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_command_class_init(ObjectClass *klass,
      ioc_klass->io_set_blocking = qio_channel_command_set_blocking;
      ioc_klass->io_close = qio_channel_command_close;
      ioc_klass->io_create_watch = qio_channel_command_create_watch;
 +    ioc_klass->io_set_aio_fd_handler = qio_channel_command_set_aio_fd_handler;
  }
  static const TypeInfo qio_channel_command_info = {
 diff --git a/io/channel-file.c b/io/channel-file.c
 index XXXXXXX..XXXXXXX 100644
 --- a/io/channel-file.c
 +++ b/io/channel-file.c
@@ -XXX,XX +XXX,XX @@ static int qio_channel_file_close(QIOChannel *ioc,
  }
 +static void qio_channel_file_set_aio_fd_handler(QIOChannel *ioc,
 +                                                AioContext *ctx,
 +                                                IOHandler *io_read,
 +                                                IOHandler *io_write,
 +                                                void *opaque)
 +{
 +    QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc);
 +    aio_set_fd_handler(ctx, fioc->fd, false, io_read, io_write, NULL, opaque);
 +}
 +
  static GSource *qio_channel_file_create_watch(QIOChannel *ioc,
                                                GIOCondition condition)
  {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_file_class_init(ObjectClass *klass,
      ioc_klass->io_seek = qio_channel_file_seek;
      ioc_klass->io_close = qio_channel_file_close;
      ioc_klass->io_create_watch = qio_channel_file_create_watch;
 +    ioc_klass->io_set_aio_fd_handler = qio_channel_file_set_aio_fd_handler;
  }
  static const TypeInfo qio_channel_file_info = {
 diff --git a/io/channel-socket.c b/io/channel-socket.c
 index XXXXXXX..XXXXXXX 100644
 --- a/io/channel-socket.c
 +++ b/io/channel-socket.c
@@ -XXX,XX +XXX,XX @@ qio_channel_socket_set_blocking(QIOChannel *ioc,
          qemu_set_block(sioc->fd);
      } else {
          qemu_set_nonblock(sioc->fd);
 -#ifdef WIN32
 -        WSAEventSelect(sioc->fd, ioc->event,
 -                       FD_READ | FD_ACCEPT | FD_CLOSE |
 -                       FD_CONNECT | FD_WRITE | FD_OOB);
 -#endif
      }
      return 0;
  }
@@ -XXX,XX +XXX,XX @@ qio_channel_socket_shutdown(QIOChannel *ioc,
      return 0;
  }
 +static void qio_channel_socket_set_aio_fd_handler(QIOChannel *ioc,
 +                                                  AioContext *ctx,
 +                                                  IOHandler *io_read,
 +                                                  IOHandler *io_write,
 +                                                  void *opaque)
 +{
 +    QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
 +    aio_set_fd_handler(ctx, sioc->fd, false, io_read, io_write, NULL, opaque);
 +}
 +
  static GSource *qio_channel_socket_create_watch(QIOChannel *ioc,
                                                  GIOCondition condition)
  {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_socket_class_init(ObjectClass *klass,
      ioc_klass->io_set_cork = qio_channel_socket_set_cork;
      ioc_klass->io_set_delay = qio_channel_socket_set_delay;
      ioc_klass->io_create_watch = qio_channel_socket_create_watch;
 +    ioc_klass->io_set_aio_fd_handler = qio_channel_socket_set_aio_fd_handler;
  }
  static const TypeInfo qio_channel_socket_info = {
 diff --git a/io/channel-tls.c b/io/channel-tls.c
 index XXXXXXX..XXXXXXX 100644
 --- a/io/channel-tls.c
 +++ b/io/channel-tls.c
@@ -XXX,XX +XXX,XX @@ static int qio_channel_tls_close(QIOChannel *ioc,
      return qio_channel_close(tioc->master, errp);
  }
 +static void qio_channel_tls_set_aio_fd_handler(QIOChannel *ioc,
 +                                               AioContext *ctx,
 +                                               IOHandler *io_read,
 +                                               IOHandler *io_write,
 +                                               void *opaque)
 +{
 +    QIOChannelTLS *tioc = QIO_CHANNEL_TLS(ioc);
 +
 +    qio_channel_set_aio_fd_handler(tioc->master, ctx, io_read, io_write, opaque);
 +}
 +
  static GSource *qio_channel_tls_create_watch(QIOChannel *ioc,
                                               GIOCondition condition)
  {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_tls_class_init(ObjectClass *klass,
      ioc_klass->io_close = qio_channel_tls_close;
      ioc_klass->io_shutdown = qio_channel_tls_shutdown;
      ioc_klass->io_create_watch = qio_channel_tls_create_watch;
 +    ioc_klass->io_set_aio_fd_handler = qio_channel_tls_set_aio_fd_handler;
  }
  static const TypeInfo qio_channel_tls_info = {
 diff --git a/io/channel-watch.c b/io/channel-watch.c
 index XXXXXXX..XXXXXXX 100644
 --- a/io/channel-watch.c
 +++ b/io/channel-watch.c
@@ -XXX,XX +XXX,XX @@ GSource *qio_channel_create_socket_watch(QIOChannel *ioc,
      GSource *source;
      QIOChannelSocketSource *ssource;
 +#ifdef WIN32
 +    WSAEventSelect(socket, ioc->event,
 +                   FD_READ | FD_ACCEPT | FD_CLOSE |
 +                   FD_CONNECT | FD_WRITE | FD_OOB);
 +#endif
 +
      source = g_source_new(&qio_channel_socket_source_funcs,
                            sizeof(QIOChannelSocketSource));
      ssource = (QIOChannelSocketSource *)source;
 diff --git a/io/channel.c b/io/channel.c
 index XXXXXXX..XXXXXXX 100644
 --- a/io/channel.c
 +++ b/io/channel.c
@@ -XXX,XX +XXX,XX @@ GSource *qio_channel_create_watch(QIOChannel *ioc,
  }
 +void qio_channel_set_aio_fd_handler(QIOChannel *ioc,
 +                                    AioContext *ctx,
 +                                    IOHandler *io_read,
 +                                    IOHandler *io_write,
 +                                    void *opaque)
 +{
 +    QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
 +
 +    klass->io_set_aio_fd_handler(ioc, ctx, io_read, io_write, opaque);
 +}
 +
  guint qio_channel_add_watch(QIOChannel *ioc,
                              GIOCondition condition,
                              QIOChannelFunc func,
 --
-.21.0
+.9.3

-New patch
+[Qemu-devel] [PULL v2 06/24] io: make qio_channel_yield aware of AioContexts
+From: Paolo Bonzini <pbonzini@redhat.com>
 Support separate coroutines for reading and writing, and place the
 read/write handlers on the AioContext that the QIOChannel is registered
 with.
 Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Message-id: 20170213135235.12274-7-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  include/io/channel.h | 47 ++++++++++++++++++++++++++--
  io/channel.c         | 86 +++++++++++++++++++++++++++++++++++++++-------------
 files changed, 109 insertions(+), 24 deletions(-)
 diff --git a/include/io/channel.h b/include/io/channel.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/io/channel.h
 +++ b/include/io/channel.h
@@ -XXX,XX +XXX,XX @@
  #include "qemu-common.h"
  #include "qom/object.h"
 +#include "qemu/coroutine.h"
  #include "block/aio.h"
  #define TYPE_QIO_CHANNEL "qio-channel"
@@ -XXX,XX +XXX,XX @@ struct QIOChannel {
      Object parent;
      unsigned int features; /* bitmask of QIOChannelFeatures */
      char *name;
 +    AioContext *ctx;
 +    Coroutine *read_coroutine;
 +    Coroutine *write_coroutine;
  #ifdef _WIN32
      HANDLE event; /* For use with GSource on Win32 */
  #endif
@@ -XXX,XX +XXX,XX @@ guint qio_channel_add_watch(QIOChannel *ioc,
  /**
 + * qio_channel_attach_aio_context:
 + * @ioc: the channel object
 + * @ctx: the #AioContext to set the handlers on
 + *
 + * Request that qio_channel_yield() sets I/O handlers on
 + * the given #AioContext.  If @ctx is %NULL, qio_channel_yield()
 + * uses QEMU's main thread event loop.
 + *
 + * You can move a #QIOChannel from one #AioContext to another even if
 + * I/O handlers are set for a coroutine.  However, #QIOChannel provides
 + * no synchronization between the calls to qio_channel_yield() and
 + * qio_channel_attach_aio_context().
 + *
 + * Therefore you should first call qio_channel_detach_aio_context()
 + * to ensure that the coroutine is not entered concurrently.  Then,
 + * while the coroutine has yielded, call qio_channel_attach_aio_context(),
 + * and then aio_co_schedule() to place the coroutine on the new
 + * #AioContext.  The calls to qio_channel_detach_aio_context()
 + * and qio_channel_attach_aio_context() should be protected with
 + * aio_context_acquire() and aio_context_release().
 + */
 +void qio_channel_attach_aio_context(QIOChannel *ioc,
 +                                    AioContext *ctx);
 +
 +/**
 + * qio_channel_detach_aio_context:
 + * @ioc: the channel object
 + *
 + * Disable any I/O handlers set by qio_channel_yield().  With the
 + * help of aio_co_schedule(), this allows moving a coroutine that was
 + * paused by qio_channel_yield() to another context.
 + */
 +void qio_channel_detach_aio_context(QIOChannel *ioc);
 +
 +/**
   * qio_channel_yield:
   * @ioc: the channel object
   * @condition: the I/O condition to wait for
   *
 - * Yields execution from the current coroutine until
 - * the condition indicated by @condition becomes
 - * available.
 + * Yields execution from the current coroutine until the condition
 + * indicated by @condition becomes available.  @condition must
 + * be either %G_IO_IN or %G_IO_OUT; it cannot contain both.  In
 + * addition, no two coroutines can be waiting on the same condition
 + * and channel at the same time.
   *
   * This must only be called from coroutine context
   */
 diff --git a/io/channel.c b/io/channel.c
 index XXXXXXX..XXXXXXX 100644
 --- a/io/channel.c
 +++ b/io/channel.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "io/channel.h"
  #include "qapi/error.h"
 -#include "qemu/coroutine.h"
 +#include "qemu/main-loop.h"
  bool qio_channel_has_feature(QIOChannel *ioc,
                               QIOChannelFeature feature)
@@ -XXX,XX +XXX,XX @@ off_t qio_channel_io_seek(QIOChannel *ioc,
  }
 -typedef struct QIOChannelYieldData QIOChannelYieldData;
 -struct QIOChannelYieldData {
 -    QIOChannel *ioc;
 -    Coroutine *co;
 -};
 +static void qio_channel_set_aio_fd_handlers(QIOChannel *ioc);
 +static void qio_channel_restart_read(void *opaque)
 +{
 +    QIOChannel *ioc = opaque;
 +    Coroutine *co = ioc->read_coroutine;
 +
 +    ioc->read_coroutine = NULL;
 +    qio_channel_set_aio_fd_handlers(ioc);
 +    aio_co_wake(co);
 +}
 -static gboolean qio_channel_yield_enter(QIOChannel *ioc,
 -                                        GIOCondition condition,
 -                                        gpointer opaque)
 +static void qio_channel_restart_write(void *opaque)
  {
 -    QIOChannelYieldData *data = opaque;
 -    qemu_coroutine_enter(data->co);
 -    return FALSE;
 +    QIOChannel *ioc = opaque;
 +    Coroutine *co = ioc->write_coroutine;
 +
 +    ioc->write_coroutine = NULL;
 +    qio_channel_set_aio_fd_handlers(ioc);
 +    aio_co_wake(co);
  }
 +static void qio_channel_set_aio_fd_handlers(QIOChannel *ioc)
 +{
 +    IOHandler *rd_handler = NULL, *wr_handler = NULL;
 +    AioContext *ctx;
 +
 +    if (ioc->read_coroutine) {
 +        rd_handler = qio_channel_restart_read;
 +    }
 +    if (ioc->write_coroutine) {
 +        wr_handler = qio_channel_restart_write;
 +    }
 +
 +    ctx = ioc->ctx ? ioc->ctx : iohandler_get_aio_context();
 +    qio_channel_set_aio_fd_handler(ioc, ctx, rd_handler, wr_handler, ioc);
 +}
 +
 +void qio_channel_attach_aio_context(QIOChannel *ioc,
 +                                    AioContext *ctx)
 +{
 +    AioContext *old_ctx;
 +    if (ioc->ctx == ctx) {
 +        return;
 +    }
 +
 +    old_ctx = ioc->ctx ? ioc->ctx : iohandler_get_aio_context();
 +    qio_channel_set_aio_fd_handler(ioc, old_ctx, NULL, NULL, NULL);
 +    ioc->ctx = ctx;
 +    qio_channel_set_aio_fd_handlers(ioc);
 +}
 +
 +void qio_channel_detach_aio_context(QIOChannel *ioc)
 +{
 +    ioc->read_coroutine = NULL;
 +    ioc->write_coroutine = NULL;
 +    qio_channel_set_aio_fd_handlers(ioc);
 +    ioc->ctx = NULL;
 +}
  void coroutine_fn qio_channel_yield(QIOChannel *ioc,
                                      GIOCondition condition)
  {
 -    QIOChannelYieldData data;
 -
      assert(qemu_in_coroutine());
 -    data.ioc = ioc;
 -    data.co = qemu_coroutine_self();
 -    qio_channel_add_watch(ioc,
 -                          condition,
 -                          qio_channel_yield_enter,
 -                          &data,
 -                          NULL);
 +    if (condition == G_IO_IN) {
 +        assert(!ioc->read_coroutine);
 +        ioc->read_coroutine = qemu_coroutine_self();
 +    } else if (condition == G_IO_OUT) {
 +        assert(!ioc->write_coroutine);
 +        ioc->write_coroutine = qemu_coroutine_self();
 +    } else {
 +        abort();
 +    }
 +    qio_channel_set_aio_fd_handlers(ioc);
      qemu_coroutine_yield();
  }
 --
 .9.3

-[Qemu-devel] [PULL 17/20] blkdebug: Add "none" event
+[Qemu-devel] [PULL v2 07/24] nbd: convert to use qio_channel_yield
-Together with @iotypes and @sector, this can be used to trap e.g. the
+From: Paolo Bonzini <pbonzini@redhat.com>
 first read or write access to a certain sector without having to know
 what happens internally in the block layer, i.e. which "real" events
 happen right before such an access.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+In the client, read the reply headers from a coroutine, switching the
-Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
+read side between the "read header" coroutine and the I/O coroutine that
-Message-id: 20190507203508.18026-5-mreitz@redhat.com
+reads the body of the reply.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
 In the server, if the server can read more requests it will create a new
 "read request" coroutine as soon as a request has been read.  Otherwise,
 the new coroutine is created in nbd_request_put.
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
 Message-id: 20170213135235.12274-8-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- qapi/block-core.json | 4 +++-
+ block/nbd-client.h |   2 +-
- block/blkdebug.c     | 2 ++
+ block/nbd-client.c | 117 ++++++++++++++++++++++++-----------------------------
-files changed, 5 insertions(+), 1 deletion(-)
+ nbd/client.c       |   2 +-
  nbd/common.c       |   9 +----
  nbd/server.c       |  94 +++++++++++++-----------------------------
 files changed, 83 insertions(+), 141 deletions(-)
-diff --git a/qapi/block-core.json b/qapi/block-core.json
+diff --git a/block/nbd-client.h b/block/nbd-client.h
 index XXXXXXX..XXXXXXX 100644
---- a/qapi/block-core.json
+--- a/block/nbd-client.h
-+++ b/qapi/block-core.json
++++ b/block/nbd-client.h
@@ -XXX,XX +XXX,XX @@ typedef struct NBDClientSession {
      CoMutex send_mutex;
      CoQueue free_sema;
 -    Coroutine *send_coroutine;
 +    Coroutine *read_reply_co;
      int in_flight;
      Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
 diff --git a/block/nbd-client.c b/block/nbd-client.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/nbd-client.c
 +++ b/block/nbd-client.c
 @@ -XXX,XX +XXX,XX @@
- #
+ #define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs))
- # @cluster_alloc_space: an allocation of file space for a cluster (since 4.1)
+ #define INDEX_TO_HANDLE(bs, index)  ((index)  ^ ((uint64_t)(intptr_t)bs))
- #
-+# @none: triggers once at creation of the blkdebug node (since 4.1)
+-static void nbd_recv_coroutines_enter_all(NBDClientSession *s)
-+#
++static void nbd_recv_coroutines_enter_all(BlockDriverState *bs)
- # Since: 2.9
+ {
- ##
++    NBDClientSession *s = nbd_get_client_session(bs);
- { 'enum': 'BlkdebugEvent', 'prefix': 'BLKDBG',
+     int i;
-@@ -XXX,XX +XXX,XX @@
-             'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev',
+     for (i = 0; i < MAX_NBD_REQUESTS; i++) {
-             'pwritev_zero', 'pwritev_done', 'empty_image_prepare',
+@@ -XXX,XX +XXX,XX @@ static void nbd_recv_coroutines_enter_all(NBDClientSession *s)
-             'l1_shrink_write_table', 'l1_shrink_free_l2_clusters',
+             qemu_coroutine_enter(s->recv_coroutine[i]);
--            'cor_write', 'cluster_alloc_space'] }
+         }
-+            'cor_write', 'cluster_alloc_space', 'none'] }
+     }
++    BDRV_POLL_WHILE(bs, s->read_reply_co);
- ##
+ }
- # @BlkdebugIOType:
-diff --git a/block/blkdebug.c b/block/blkdebug.c
+ static void nbd_teardown_connection(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ static void nbd_teardown_connection(BlockDriverState *bs)
      qio_channel_shutdown(client->ioc,
                           QIO_CHANNEL_SHUTDOWN_BOTH,
                           NULL);
 -    nbd_recv_coroutines_enter_all(client);
 +    nbd_recv_coroutines_enter_all(bs);
      nbd_client_detach_aio_context(bs);
      object_unref(OBJECT(client->sioc));
@@ -XXX,XX +XXX,XX @@ static void nbd_teardown_connection(BlockDriverState *bs)
      client->ioc = NULL;
  }
 -static void nbd_reply_ready(void *opaque)
 +static coroutine_fn void nbd_read_reply_entry(void *opaque)
  {
 -    BlockDriverState *bs = opaque;
 -    NBDClientSession *s = nbd_get_client_session(bs);
 +    NBDClientSession *s = opaque;
      uint64_t i;
      int ret;
 -    if (!s->ioc) { /* Already closed */
 -        return;
 -    }
 -
 -    if (s->reply.handle == 0) {
 -        /* No reply already in flight.  Fetch a header.  It is possible
 -         * that another thread has done the same thing in parallel, so
 -         * the socket is not readable anymore.
 -         */
 +    for (;;) {
 +        assert(s->reply.handle == 0);
          ret = nbd_receive_reply(s->ioc, &s->reply);
 -        if (ret == -EAGAIN) {
 -            return;
 -        }
          if (ret < 0) {
 -            s->reply.handle = 0;
 -            goto fail;
 +            break;
          }
 -    }
 -    /* There's no need for a mutex on the receive side, because the
 -     * handler acts as a synchronization point and ensures that only
 -     * one coroutine is called until the reply finishes.  */
 -    i = HANDLE_TO_INDEX(s, s->reply.handle);
 -    if (i >= MAX_NBD_REQUESTS) {
 -        goto fail;
 -    }
 +        /* There's no need for a mutex on the receive side, because the
 +         * handler acts as a synchronization point and ensures that only
 +         * one coroutine is called until the reply finishes.
 +         */
 +        i = HANDLE_TO_INDEX(s, s->reply.handle);
 +        if (i >= MAX_NBD_REQUESTS || !s->recv_coroutine[i]) {
 +            break;
 +        }
 -    if (s->recv_coroutine[i]) {
 -        qemu_coroutine_enter(s->recv_coroutine[i]);
 -        return;
 +        /* We're woken up by the recv_coroutine itself.  Note that there
 +         * is no race between yielding and reentering read_reply_co.  This
 +         * is because:
 +         *
 +         * - if recv_coroutine[i] runs on the same AioContext, it is only
 +         *   entered after we yield
 +         *
 +         * - if recv_coroutine[i] runs on a different AioContext, reentering
 +         *   read_reply_co happens through a bottom half, which can only
 +         *   run after we yield.
 +         */
 +        aio_co_wake(s->recv_coroutine[i]);
 +        qemu_coroutine_yield();
      }
 -
 -fail:
 -    nbd_teardown_connection(bs);
 -}
 -
 -static void nbd_restart_write(void *opaque)
 -{
 -    BlockDriverState *bs = opaque;
 -
 -    qemu_coroutine_enter(nbd_get_client_session(bs)->send_coroutine);
 +    s->read_reply_co = NULL;
  }
  static int nbd_co_send_request(BlockDriverState *bs,
@@ -XXX,XX +XXX,XX @@ static int nbd_co_send_request(BlockDriverState *bs,
                                 QEMUIOVector *qiov)
  {
      NBDClientSession *s = nbd_get_client_session(bs);
 -    AioContext *aio_context;
      int rc, ret, i;
      qemu_co_mutex_lock(&s->send_mutex);
@@ -XXX,XX +XXX,XX @@ static int nbd_co_send_request(BlockDriverState *bs,
          return -EPIPE;
      }
 -    s->send_coroutine = qemu_coroutine_self();
 -    aio_context = bdrv_get_aio_context(bs);
 -
 -    aio_set_fd_handler(aio_context, s->sioc->fd, false,
 -                       nbd_reply_ready, nbd_restart_write, NULL, bs);
      if (qiov) {
          qio_channel_set_cork(s->ioc, true);
          rc = nbd_send_request(s->ioc, request);
@@ -XXX,XX +XXX,XX @@ static int nbd_co_send_request(BlockDriverState *bs,
      } else {
          rc = nbd_send_request(s->ioc, request);
      }
 -    aio_set_fd_handler(aio_context, s->sioc->fd, false,
 -                       nbd_reply_ready, NULL, NULL, bs);
 -    s->send_coroutine = NULL;
      qemu_co_mutex_unlock(&s->send_mutex);
      return rc;
  }
@@ -XXX,XX +XXX,XX @@ static void nbd_co_receive_reply(NBDClientSession *s,
  {
      int ret;
 -    /* Wait until we're woken up by the read handler.  TODO: perhaps
 -     * peek at the next reply and avoid yielding if it's ours?  */
 +    /* Wait until we're woken up by nbd_read_reply_entry.  */
      qemu_coroutine_yield();
      *reply = s->reply;
      if (reply->handle != request->handle ||
@@ -XXX,XX +XXX,XX @@ static void nbd_coroutine_start(NBDClientSession *s,
      /* s->recv_coroutine[i] is set as soon as we get the send_lock.  */
  }
 -static void nbd_coroutine_end(NBDClientSession *s,
 +static void nbd_coroutine_end(BlockDriverState *bs,
                                NBDRequest *request)
  {
 +    NBDClientSession *s = nbd_get_client_session(bs);
      int i = HANDLE_TO_INDEX(s, request->handle);
 +
      s->recv_coroutine[i] = NULL;
 -    if (s->in_flight-- == MAX_NBD_REQUESTS) {
 -        qemu_co_queue_next(&s->free_sema);
 +    s->in_flight--;
 +    qemu_co_queue_next(&s->free_sema);
 +
 +    /* Kick the read_reply_co to get the next reply.  */
 +    if (s->read_reply_co) {
 +        aio_co_wake(s->read_reply_co);
      }
  }
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
      } else {
          nbd_co_receive_reply(client, &request, &reply, qiov);
      }
 -    nbd_coroutine_end(client, &request);
 +    nbd_coroutine_end(bs, &request);
      return -reply.error;
  }
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
      } else {
          nbd_co_receive_reply(client, &request, &reply, NULL);
      }
 -    nbd_coroutine_end(client, &request);
 +    nbd_coroutine_end(bs, &request);
      return -reply.error;
  }
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
      } else {
          nbd_co_receive_reply(client, &request, &reply, NULL);
      }
 -    nbd_coroutine_end(client, &request);
 +    nbd_coroutine_end(bs, &request);
      return -reply.error;
  }
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_flush(BlockDriverState *bs)
      } else {
          nbd_co_receive_reply(client, &request, &reply, NULL);
      }
 -    nbd_coroutine_end(client, &request);
 +    nbd_coroutine_end(bs, &request);
      return -reply.error;
  }
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count)
      } else {
          nbd_co_receive_reply(client, &request, &reply, NULL);
      }
 -    nbd_coroutine_end(client, &request);
 +    nbd_coroutine_end(bs, &request);
      return -reply.error;
  }
  void nbd_client_detach_aio_context(BlockDriverState *bs)
  {
 -    aio_set_fd_handler(bdrv_get_aio_context(bs),
 -                       nbd_get_client_session(bs)->sioc->fd,
 -                       false, NULL, NULL, NULL, NULL);
 +    NBDClientSession *client = nbd_get_client_session(bs);
 +    qio_channel_detach_aio_context(QIO_CHANNEL(client->sioc));
  }
  void nbd_client_attach_aio_context(BlockDriverState *bs,
                                     AioContext *new_context)
  {
 -    aio_set_fd_handler(new_context, nbd_get_client_session(bs)->sioc->fd,
 -                       false, nbd_reply_ready, NULL, NULL, bs);
 +    NBDClientSession *client = nbd_get_client_session(bs);
 +    qio_channel_attach_aio_context(QIO_CHANNEL(client->sioc), new_context);
 +    aio_co_schedule(new_context, client->read_reply_co);
  }
  void nbd_client_close(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ int nbd_client_init(BlockDriverState *bs,
      /* Now that we're connected, set the socket to be non-blocking and
       * kick the reply mechanism.  */
      qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL);
 -
 +    client->read_reply_co = qemu_coroutine_create(nbd_read_reply_entry, client);
      nbd_client_attach_aio_context(bs, bdrv_get_aio_context(bs));
      logout("Established connection with NBD server\n");
 diff --git a/nbd/client.c b/nbd/client.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/blkdebug.c
+--- a/nbd/client.c
-+++ b/block/blkdebug.c
++++ b/nbd/client.c
-@@ -XXX,XX +XXX,XX @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
+@@ -XXX,XX +XXX,XX @@ ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply)
      ssize_t ret;
      ret = read_sync(ioc, buf, sizeof(buf));
 -    if (ret < 0) {
 +    if (ret <= 0) {
          return ret;
      }
 diff --git a/nbd/common.c b/nbd/common.c
 index XXXXXXX..XXXXXXX 100644
 --- a/nbd/common.c
 +++ b/nbd/common.c
@@ -XXX,XX +XXX,XX @@ ssize_t nbd_wr_syncv(QIOChannel *ioc,
          }
          if (len == QIO_CHANNEL_ERR_BLOCK) {
              if (qemu_in_coroutine()) {
 -                /* XXX figure out if we can create a variant on
 -                 * qio_channel_yield() that works with AIO contexts
 -                 * and consider using that in this branch */
 -                qemu_coroutine_yield();
 -            } else if (done) {
 -                /* XXX this is needed by nbd_reply_ready.  */
 -                qio_channel_wait(ioc,
 -                                 do_read ? G_IO_IN : G_IO_OUT);
 +                qio_channel_yield(ioc, do_read ? G_IO_IN : G_IO_OUT);
              } else {
                  return -EAGAIN;
              }
 diff --git a/nbd/server.c b/nbd/server.c
 index XXXXXXX..XXXXXXX 100644
 --- a/nbd/server.c
 +++ b/nbd/server.c
@@ -XXX,XX +XXX,XX @@ struct NBDClient {
      CoMutex send_lock;
      Coroutine *send_coroutine;
 -    bool can_read;
 -
      QTAILQ_ENTRY(NBDClient) next;
      int nb_requests;
      bool closing;
@@ -XXX,XX +XXX,XX @@ struct NBDClient {
  /* That's all folks */
 -static void nbd_set_handlers(NBDClient *client);
 -static void nbd_unset_handlers(NBDClient *client);
 -static void nbd_update_can_read(NBDClient *client);
 +static void nbd_client_receive_next_request(NBDClient *client);
  static gboolean nbd_negotiate_continue(QIOChannel *ioc,
                                         GIOCondition condition,
@@ -XXX,XX +XXX,XX @@ void nbd_client_put(NBDClient *client)
           */
          assert(client->closing);
 -        nbd_unset_handlers(client);
 +        qio_channel_detach_aio_context(client->ioc);
          object_unref(OBJECT(client->sioc));
          object_unref(OBJECT(client->ioc));
          if (client->tlscreds) {
@@ -XXX,XX +XXX,XX @@ static NBDRequestData *nbd_request_get(NBDClient *client)
      assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
      client->nb_requests++;
 -    nbd_update_can_read(client);
      req = g_new0(NBDRequestData, 1);
      nbd_client_get(client);
@@ -XXX,XX +XXX,XX @@ static void nbd_request_put(NBDRequestData *req)
      g_free(req);
      client->nb_requests--;
 -    nbd_update_can_read(client);
 +    nbd_client_receive_next_request(client);
 +
      nbd_client_put(client);
  }
@@ -XXX,XX +XXX,XX @@ static void blk_aio_attached(AioContext *ctx, void *opaque)
      exp->ctx = ctx;
      QTAILQ_FOREACH(client, &exp->clients, next) {
 -        nbd_set_handlers(client);
 +        qio_channel_attach_aio_context(client->ioc, ctx);
 +        if (client->recv_coroutine) {
 +            aio_co_schedule(ctx, client->recv_coroutine);
 +        }
 +        if (client->send_coroutine) {
 +            aio_co_schedule(ctx, client->send_coroutine);
 +        }
      }
  }
@@ -XXX,XX +XXX,XX @@ static void blk_aio_detach(void *opaque)
      TRACE("Export %s: Detaching clients from AIO context %p\n", exp->name, exp->ctx);
      QTAILQ_FOREACH(client, &exp->clients, next) {
 -        nbd_unset_handlers(client);
 +        qio_channel_detach_aio_context(client->ioc);
      }
      exp->ctx = NULL;
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply,
      g_assert(qemu_in_coroutine());
      qemu_co_mutex_lock(&client->send_lock);
      client->send_coroutine = qemu_coroutine_self();
 -    nbd_set_handlers(client);
      if (!len) {
          rc = nbd_send_reply(client->ioc, reply);
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply,
      }
      client->send_coroutine = NULL;
 -    nbd_set_handlers(client);
      qemu_co_mutex_unlock(&client->send_lock);
      return rc;
  }
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_receive_request(NBDRequestData *req,
      ssize_t rc;
      g_assert(qemu_in_coroutine());
 -    client->recv_coroutine = qemu_coroutine_self();
 -    nbd_update_can_read(client);
 -
 +    assert(client->recv_coroutine == qemu_coroutine_self());
      rc = nbd_receive_request(client->ioc, request);
      if (rc < 0) {
          if (rc != -EAGAIN) {
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_receive_request(NBDRequestData *req,
  out:
      client->recv_coroutine = NULL;
 -    nbd_update_can_read(client);
 +    nbd_client_receive_next_request(client);
      return rc;
  }
 -static void nbd_trip(void *opaque)
 +/* Owns a reference to the NBDClient passed as opaque.  */
 +static coroutine_fn void nbd_trip(void *opaque)
  {
      NBDClient *client = opaque;
      NBDExport *exp = client->exp;
      NBDRequestData *req;
 -    NBDRequest request;
 +    NBDRequest request = { 0 };    /* GCC thinks it can be used uninitialized */
      NBDReply reply;
      ssize_t ret;
      int flags;
      TRACE("Reading request.");
      if (client->closing) {
 +        nbd_client_put(client);
          return;
      }
@@ -XXX,XX +XXX,XX @@ static void nbd_trip(void *opaque)
  done:
      nbd_request_put(req);
 +    nbd_client_put(client);
      return;
  out:
      nbd_request_put(req);
      client_close(client);
 +    nbd_client_put(client);
  }
 -static void nbd_read(void *opaque)
 +static void nbd_client_receive_next_request(NBDClient *client)
  {
 -    NBDClient *client = opaque;
 -
 -    if (client->recv_coroutine) {
 -        qemu_coroutine_enter(client->recv_coroutine);
 -    } else {
 -        qemu_coroutine_enter(qemu_coroutine_create(nbd_trip, client));
 -    }
 -}
 -
 -static void nbd_restart_write(void *opaque)
 -{
 -    NBDClient *client = opaque;
 -
 -    qemu_coroutine_enter(client->send_coroutine);
 -}
 -
 -static void nbd_set_handlers(NBDClient *client)
 -{
 -    if (client->exp && client->exp->ctx) {
 -        aio_set_fd_handler(client->exp->ctx, client->sioc->fd, true,
 -                           client->can_read ? nbd_read : NULL,
 -                           client->send_coroutine ? nbd_restart_write : NULL,
 -                           NULL, client);
 -    }
 -}
 -
 -static void nbd_unset_handlers(NBDClient *client)
 -{
 -    if (client->exp && client->exp->ctx) {
 -        aio_set_fd_handler(client->exp->ctx, client->sioc->fd, true, NULL,
 -                           NULL, NULL, NULL);
 -    }
 -}
 -
 -static void nbd_update_can_read(NBDClient *client)
 -{
 -    bool can_read = client->recv_coroutine ||
 -                    client->nb_requests < MAX_NBD_REQUESTS;
 -
 -    if (can_read != client->can_read) {
 -        client->can_read = can_read;
 -        nbd_set_handlers(client);
 -
 -        /* There is no need to invoke aio_notify(), since aio_set_fd_handler()
 -         * in nbd_set_handlers() will have taken care of that */
 +    if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS) {
 +        nbd_client_get(client);
 +        client->recv_coroutine = qemu_coroutine_create(nbd_trip, client);
 +        aio_co_schedule(client->exp->ctx, client->recv_coroutine);
      }
  }
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void nbd_co_client_start(void *opaque)
          goto out;
      }
+     qemu_co_mutex_init(&client->send_lock);
-+    bdrv_debug_event(bs, BLKDBG_NONE);
+-    nbd_set_handlers(client);
      if (exp) {
          QTAILQ_INSERT_TAIL(&exp->clients, client, next);
      }
 +
-     ret = 0;
++    nbd_client_receive_next_request(client);
 +
  out:
-     if (ret < 0) {
+     g_free(data);
  }
@@ -XXX,XX +XXX,XX @@ void nbd_client_new(NBDExport *exp,
      object_ref(OBJECT(client->sioc));
      client->ioc = QIO_CHANNEL(sioc);
      object_ref(OBJECT(client->ioc));
 -    client->can_read = true;
      client->close = close_fn;
      data->client = client;
 --
-.21.0
+.9.3

-[Qemu-devel] [PULL 09/20] hw/block/fdc: floppy command FIFO memory initialization
+[Qemu-devel] [PULL v2 08/24] coroutine-lock: reschedule coroutine on the AioContext it was running on
-From: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
+From: Paolo Bonzini <pbonzini@redhat.com>
-The uninitialized memory allocated for the command FIFO of the
+As a small step towards the introduction of multiqueue, we want
-floppy controller during the VM hardware initialization incurs
+coroutines to remain on the same AioContext that started them,
-many unwanted reports by Valgrind when VM state is being saved.
+unless they are moved explicitly with e.g. aio_co_schedule.  This patch
-That verbosity hinders the search for real memory issues when
+prevents coroutines from switching AioContext when they use a CoMutex.
-the iotests run. Particularly, the patch eliminates 20 unnecessary
+For now it does not make much of a difference, because the CoMutex
-reports of the Valgrind tool in the iotest #169.
+is not thread-safe and the AioContext itself is used to protect the
 CoMutex from concurrent access.  However, this is going to change.
-Signed-off-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Message-id: 1559154027-282547-1-git-send-email-andrey.shinkevich@virtuozzo.com
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-Reviewed-by: John Snow <jsnow@redhat.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
 Message-id: 20170213135235.12274-9-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- hw/block/fdc.c | 1 +
+ util/qemu-coroutine-lock.c | 5 ++---
-file changed, 1 insertion(+)
+ util/trace-events          | 1 -
 files changed, 2 insertions(+), 4 deletions(-)
-diff --git a/hw/block/fdc.c b/hw/block/fdc.c
+diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
 index XXXXXXX..XXXXXXX 100644
---- a/hw/block/fdc.c
+--- a/util/qemu-coroutine-lock.c
-+++ b/hw/block/fdc.c
++++ b/util/qemu-coroutine-lock.c
-@@ -XXX,XX +XXX,XX @@ static void fdctrl_realize_common(DeviceState *dev, FDCtrl *fdctrl,
+@@ -XXX,XX +XXX,XX @@
+ #include "qemu/coroutine.h"
-     FLOPPY_DPRINTF("init controller\n");
+ #include "qemu/coroutine_int.h"
-     fdctrl->fifo = qemu_memalign(512, FD_SECTOR_LEN);
+ #include "qemu/queue.h"
-+    memset(fdctrl->fifo, 0, FD_SECTOR_LEN);
++#include "block/aio.h"
-     fdctrl->fifo_size = 512;
+ #include "trace.h"
-     fdctrl->result_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
-                                              fdctrl_result_timer, fdctrl);
+ void qemu_co_queue_init(CoQueue *queue)
@@ -XXX,XX +XXX,XX @@ void qemu_co_queue_run_restart(Coroutine *co)
  static bool qemu_co_queue_do_restart(CoQueue *queue, bool single)
  {
 -    Coroutine *self = qemu_coroutine_self();
      Coroutine *next;
      if (QSIMPLEQ_EMPTY(&queue->entries)) {
@@ -XXX,XX +XXX,XX @@ static bool qemu_co_queue_do_restart(CoQueue *queue, bool single)
      while ((next = QSIMPLEQ_FIRST(&queue->entries)) != NULL) {
          QSIMPLEQ_REMOVE_HEAD(&queue->entries, co_queue_next);
 -        QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, next, co_queue_next);
 -        trace_qemu_co_queue_next(next);
 +        aio_co_wake(next);
          if (single) {
              break;
          }
 diff --git a/util/trace-events b/util/trace-events
 index XXXXXXX..XXXXXXX 100644
 --- a/util/trace-events
 +++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ qemu_coroutine_terminate(void *co) "self %p"
  # util/qemu-coroutine-lock.c
  qemu_co_queue_run_restart(void *co) "co %p"
 -qemu_co_queue_next(void *nxt) "next %p"
  qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p"
  qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p"
  qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p"
 --
-.21.0
+.9.3

-[Qemu-devel] [PULL 16/20] blkdebug: Add @iotype error option
+[Qemu-devel] [PULL v2 09/24] blkdebug: reschedule coroutine on the AioContext it is running on
-This new error option allows users of blkdebug to inject errors only on
+From: Paolo Bonzini <pbonzini@redhat.com>
 certain kinds of I/O operations.  Users usually want to make a very
 specific operation fail, not just any; but right now they simply hope
 that the event that triggers the error injection is followed up with
 that very operation.  That may not be true, however, because the block
 layer is changing (including blkdebug, which may increase the number of
 types of I/O operations on which to inject errors).
-The new option's default has been chosen to keep backwards
+Keep the coroutine on the same AioContext.  Without this change,
-compatibility.
+there would be a race between yielding the coroutine and reentering it.
 While the race cannot happen now, because the code only runs from a single
 AioContext, this will change with multiqueue support in the block layer.
-Note that similar to the internal representation, we could choose to
+While doing the change, replace custom bottom half with aio_co_schedule.
 expose this option as a list of I/O types.  But there is no practical
 use for this, because as described above, users usually know exactly
 which kind of operation they want to make fail, so there is no need to
 specify multiple I/O types at once.  In addition, exposing this option
 as a list would require non-trivial changes to qemu_opts_absorb_qdict().
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
-Message-id: 20190507203508.18026-4-mreitz@redhat.com
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
 Message-id: 20170213135235.12274-10-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- qapi/block-core.json | 26 +++++++++++++++++++++++
+ block/blkdebug.c | 9 +--------
- block/blkdebug.c     | 50 ++++++++++++++++++++++++++++++++++++--------
+file changed, 1 insertion(+), 8 deletions(-)
 files changed, 67 insertions(+), 9 deletions(-)
-diff --git a/qapi/block-core.json b/qapi/block-core.json
-index XXXXXXX..XXXXXXX 100644
---- a/qapi/block-core.json
-+++ b/qapi/block-core.json
-@@ -XXX,XX +XXX,XX @@
-             'l1_shrink_write_table', 'l1_shrink_free_l2_clusters',
-             'cor_write', 'cluster_alloc_space'] }
-+##
-+# @BlkdebugIOType:
-+#
-+# Kinds of I/O that blkdebug can inject errors in.
-+#
-+# @read: .bdrv_co_preadv()
-+#
-+# @write: .bdrv_co_pwritev()
-+#
-+# @write-zeroes: .bdrv_co_pwrite_zeroes()
-+#
-+# @discard: .bdrv_co_pdiscard()
-+#
-+# @flush: .bdrv_co_flush_to_disk()
-+#
-+# Since: 4.1
-+##
-+{ 'enum': 'BlkdebugIOType', 'prefix': 'BLKDEBUG_IO_TYPE',
-+  'data': [ 'read', 'write', 'write-zeroes', 'discard', 'flush' ] }
-+
- ##
- # @BlkdebugInjectErrorOptions:
- #
-@@ -XXX,XX +XXX,XX @@
- # @state:       the state identifier blkdebug needs to be in to
- #               actually trigger the event; defaults to "any"
- #
-+# @iotype:      the type of I/O operations on which this error should
-+#               be injected; defaults to "all read, write,
-+#               write-zeroes, discard, and flush operations"
-+#               (since: 4.1)
-+#
- # @errno:       error identifier (errno) to be returned; defaults to
- #               EIO
- #
-@@ -XXX,XX +XXX,XX @@
- { 'struct': 'BlkdebugInjectErrorOptions',
-   'data': { 'event': 'BlkdebugEvent',
-             '*state': 'int',
-+            '*iotype': 'BlkdebugIOType',
-             '*errno': 'int',
-             '*sector': 'int',
-             '*once': 'bool',
 diff --git a/block/blkdebug.c b/block/blkdebug.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/blkdebug.c
 +++ b/block/blkdebug.c
-@@ -XXX,XX +XXX,XX @@ typedef struct BlkdebugRule {
-     int state;
-     union {
-         struct {
-+            uint64_t iotype_mask;
-             int error;
-             int immediately;
-             int once;
-@@ -XXX,XX +XXX,XX @@ typedef struct BlkdebugRule {
-     QSIMPLEQ_ENTRY(BlkdebugRule) active_next;
- } BlkdebugRule;
-+QEMU_BUILD_BUG_MSG(BLKDEBUG_IO_TYPE__MAX > 64,
-+                   "BlkdebugIOType mask does not fit into an uint64_t");
-+
- static QemuOptsList inject_error_opts = {
-     .name = "inject-error",
-     .head = QTAILQ_HEAD_INITIALIZER(inject_error_opts.head),
-@@ -XXX,XX +XXX,XX @@ static QemuOptsList inject_error_opts = {
-             .name = "state",
-             .type = QEMU_OPT_NUMBER,
-         },
-+        {
-+            .name = "iotype",
-+            .type = QEMU_OPT_STRING,
-+        },
-         {
-             .name = "errno",
-             .type = QEMU_OPT_NUMBER,
-@@ -XXX,XX +XXX,XX @@ static int add_rule(void *opaque, QemuOpts *opts, Error **errp)
-     int event;
-     struct BlkdebugRule *rule;
-     int64_t sector;
-+    BlkdebugIOType iotype;
-+    Error *local_error = NULL;
-     /* Find the right event for the rule */
-     event_name = qemu_opt_get(opts, "event");
-@@ -XXX,XX +XXX,XX @@ static int add_rule(void *opaque, QemuOpts *opts, Error **errp)
-         sector = qemu_opt_get_number(opts, "sector", -1);
-         rule->options.inject.offset =
-             sector == -1 ? -1 : sector * BDRV_SECTOR_SIZE;
-+
-+        iotype = qapi_enum_parse(&BlkdebugIOType_lookup,
-+                                 qemu_opt_get(opts, "iotype"),
-+                                 BLKDEBUG_IO_TYPE__MAX, &local_error);
-+        if (local_error) {
-+            error_propagate(errp, local_error);
-+            return -1;
-+        }
-+        if (iotype != BLKDEBUG_IO_TYPE__MAX) {
-+            rule->options.inject.iotype_mask = (1ull << iotype);
-+        } else {
-+            /* Apply the default */
-+            rule->options.inject.iotype_mask =
-+                (1ull << BLKDEBUG_IO_TYPE_READ)
-+                | (1ull << BLKDEBUG_IO_TYPE_WRITE)
-+                | (1ull << BLKDEBUG_IO_TYPE_WRITE_ZEROES)
-+                | (1ull << BLKDEBUG_IO_TYPE_DISCARD)
-+                | (1ull << BLKDEBUG_IO_TYPE_FLUSH);
-+        }
-+
-         break;
-     case ACTION_SET_STATE:
 @@ -XXX,XX +XXX,XX @@ out:
      return ret;
  }
--static int rule_check(BlockDriverState *bs, uint64_t offset, uint64_t bytes)
+-static void error_callback_bh(void *opaque)
-+static int rule_check(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+-{
-+                      BlkdebugIOType iotype)
+-    Coroutine *co = opaque;
 -    qemu_coroutine_enter(co);
 -}
 -
  static int inject_error(BlockDriverState *bs, BlkdebugRule *rule)
  {
      BDRVBlkdebugState *s = bs->opaque;
-     BlkdebugRule *rule = NULL;
+@@ -XXX,XX +XXX,XX @@ static int inject_error(BlockDriverState *bs, BlkdebugRule *rule)
@@ -XXX,XX +XXX,XX @@ static int rule_check(BlockDriverState *bs, uint64_t offset, uint64_t bytes)
      QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
          uint64_t inject_offset = rule->options.inject.offset;
 -        if (inject_offset == -1 ||
 -            (bytes && inject_offset >= offset &&
 -             inject_offset < offset + bytes))
 +        if ((inject_offset == -1 ||
 +             (bytes && inject_offset >= offset &&
 +              inject_offset < offset + bytes)) &&
 +            (rule->options.inject.iotype_mask & (1ull << iotype)))
          {
              break;
          }
@@ -XXX,XX +XXX,XX @@ blkdebug_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
          assert(bytes <= bs->bl.max_transfer);
      }
--    err = rule_check(bs, offset, bytes);
+     if (!immediately) {
-+    err = rule_check(bs, offset, bytes, BLKDEBUG_IO_TYPE_READ);
+-        aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), error_callback_bh,
-     if (err) {
+-                                qemu_coroutine_self());
-         return err;
++        aio_co_schedule(qemu_get_current_aio_context(), qemu_coroutine_self());
          qemu_coroutine_yield();
      }
-@@ -XXX,XX +XXX,XX @@ blkdebug_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
          assert(bytes <= bs->bl.max_transfer);
      }
 -    err = rule_check(bs, offset, bytes);
 +    err = rule_check(bs, offset, bytes, BLKDEBUG_IO_TYPE_WRITE);
      if (err) {
          return err;
      }
@@ -XXX,XX +XXX,XX @@ blkdebug_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
  static int blkdebug_co_flush(BlockDriverState *bs)
  {
 -    int err = rule_check(bs, 0, 0);
 +    int err = rule_check(bs, 0, 0, BLKDEBUG_IO_TYPE_FLUSH);
      if (err) {
          return err;
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn blkdebug_co_pwrite_zeroes(BlockDriverState *bs,
          assert(bytes <= bs->bl.max_pwrite_zeroes);
      }
 -    err = rule_check(bs, offset, bytes);
 +    err = rule_check(bs, offset, bytes, BLKDEBUG_IO_TYPE_WRITE_ZEROES);
      if (err) {
          return err;
      }
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn blkdebug_co_pdiscard(BlockDriverState *bs,
          assert(bytes <= bs->bl.max_pdiscard);
      }
 -    err = rule_check(bs, offset, bytes);
 +    err = rule_check(bs, offset, bytes, BLKDEBUG_IO_TYPE_DISCARD);
      if (err) {
          return err;
      }
 --
-.21.0
+.9.3

-[Qemu-devel] [PULL 10/20] iotests: restrict 254 to support only qcow2
+[Qemu-devel] [PULL v2 10/24] qed: introduce qed_aio_start_io and qed_aio_next_io_cb
-From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
+From: Paolo Bonzini <pbonzini@redhat.com>
-Test fails at least for qcow, because of different cluster sizes in
+qed_aio_start_io and qed_aio_next_io will not have to acquire/release
-base and top (and therefore different granularities of bitmaps we are
+the AioContext, while qed_aio_next_io_cb will.  Split the functionality
-trying to merge).
+and gain a little type-safety in the process.
-The test aim is to check block-dirty-bitmap-merge between different
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-nodes functionality; there is no need to check all formats. So, let's just drop
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-support for anything except qcow2.
+Reviewed-by: Fam Zheng <famz@redhat.com>
 Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
 Message-id: 20170213135235.12274-11-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  block/qed.c | 39 +++++++++++++++++++++++++--------------
 file changed, 25 insertions(+), 14 deletions(-)
-Reported-by: Max Reitz <mreitz@redhat.com>
+diff --git a/block/qed.c b/block/qed.c
-Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
+index XXXXXXX..XXXXXXX 100644
-Message-id: 20190605155405.104384-1-vsementsov@virtuozzo.com
+--- a/block/qed.c
-Signed-off-by: Max Reitz <mreitz@redhat.com>
++++ b/block/qed.c
----
+@@ -XXX,XX +XXX,XX @@ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
- tests/qemu-iotests/254 | 2 ++
+     return l2_table;
-file changed, 2 insertions(+)
+ }
-diff --git a/tests/qemu-iotests/254 b/tests/qemu-iotests/254
+-static void qed_aio_next_io(void *opaque, int ret);
-index XXXXXXX..XXXXXXX 100755
++static void qed_aio_next_io(QEDAIOCB *acb, int ret);
 --- a/tests/qemu-iotests/254
 +++ b/tests/qemu-iotests/254
@@ -XXX,XX +XXX,XX @@
  import iotests
  from iotests import qemu_img_create, file_path, log
 +iotests.verify_image_format(supported_fmts=['qcow2'])
 +
- disk, top = file_path('disk', 'top')
++static void qed_aio_start_io(QEDAIOCB *acb)
- size = 1024 * 1024
++{
 +    qed_aio_next_io(acb, 0);
 +}
 +
 +static void qed_aio_next_io_cb(void *opaque, int ret)
 +{
 +    QEDAIOCB *acb = opaque;
 +
 +    qed_aio_next_io(acb, ret);
 +}
  static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
  {
@@ -XXX,XX +XXX,XX @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
      acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
      if (acb) {
 -        qed_aio_next_io(acb, 0);
 +        qed_aio_start_io(acb);
      }
  }
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
          QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next);
          acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
          if (acb) {
 -            qed_aio_next_io(acb, 0);
 +            qed_aio_start_io(acb);
          } else if (s->header.features & QED_F_NEED_CHECK) {
              qed_start_need_check_timer(s);
          }
@@ -XXX,XX +XXX,XX @@ static void qed_commit_l2_update(void *opaque, int ret)
      acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
      assert(acb->request.l2_table != NULL);
 -    qed_aio_next_io(opaque, ret);
 +    qed_aio_next_io(acb, ret);
  }
  /**
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
      if (need_alloc) {
          /* Write out the whole new L2 table */
          qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true,
 -                            qed_aio_write_l1_update, acb);
 +                           qed_aio_write_l1_update, acb);
      } else {
          /* Write out only the updated part of the L2 table */
          qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false,
 -                            qed_aio_next_io, acb);
 +                           qed_aio_next_io_cb, acb);
      }
      return;
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_main(void *opaque, int ret)
      }
      if (acb->find_cluster_ret == QED_CLUSTER_FOUND) {
 -        next_fn = qed_aio_next_io;
 +        next_fn = qed_aio_next_io_cb;
      } else {
          if (s->bs->backing) {
              next_fn = qed_aio_write_flush_before_l2_update;
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
      if (acb->flags & QED_AIOCB_ZERO) {
          /* Skip ahead if the clusters are already zero */
          if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
 -            qed_aio_next_io(acb, 0);
 +            qed_aio_start_io(acb);
              return;
          }
@@ -XXX,XX +XXX,XX @@ static void qed_aio_read_data(void *opaque, int ret,
      /* Handle zero cluster and backing file reads */
      if (ret == QED_CLUSTER_ZERO) {
          qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
 -        qed_aio_next_io(acb, 0);
 +        qed_aio_start_io(acb);
          return;
      } else if (ret != QED_CLUSTER_FOUND) {
          qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
 -                              &acb->backing_qiov, qed_aio_next_io, acb);
 +                              &acb->backing_qiov, qed_aio_next_io_cb, acb);
          return;
      }
      BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
      bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE,
                     &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
 -                   qed_aio_next_io, acb);
 +                   qed_aio_next_io_cb, acb);
      return;
  err:
@@ -XXX,XX +XXX,XX @@ err:
  /**
   * Begin next I/O or complete the request
   */
 -static void qed_aio_next_io(void *opaque, int ret)
 +static void qed_aio_next_io(QEDAIOCB *acb, int ret)
  {
 -    QEDAIOCB *acb = opaque;
      BDRVQEDState *s = acb_to_s(acb);
      QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
                                  qed_aio_write_data : qed_aio_read_data;
@@ -XXX,XX +XXX,XX @@ static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
      qemu_iovec_init(&acb->cur_qiov, qiov->niov);
      /* Start request */
 -    qed_aio_next_io(acb, 0);
 +    qed_aio_start_io(acb);
      return &acb->common;
  }
 --
-.21.0
+.9.3

-[Qemu-devel] [PULL 13/20] blockdev: Overlays are not snapshots
+[Qemu-devel] [PULL v2 11/24] aio: push aio_context_acquire/release down to dispatching
-There are error messages which refer to an overlay node as the snapshot.
+From: Paolo Bonzini <pbonzini@redhat.com>
 That is wrong, those are two different things.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+The AioContext data structures are now protected by list_lock and/or
-Reviewed-by: Eric Blake <eblake@redhat.com>
+they are walked with FOREACH_RCU primitives.  There is no need anymore
-Message-id: 20190603202236.1342-3-mreitz@redhat.com
+to acquire the AioContext for the entire duration of aio_dispatch.
-Reviewed-by: John Snow <jsnow@redhat.com>
+Instead, just acquire it before and release it after invoking the callbacks.
-Reviewed-by: Alberto Garcia <berto@igalia.com>
+The next step is then to push it further down.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
 Message-id: 20170213135235.12274-12-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- blockdev.c                 | 10 +++++-----
+ util/aio-posix.c | 25 +++++++++++--------------
- tests/qemu-iotests/085.out | 10 +++++-----
+ util/aio-win32.c | 15 +++++++--------
-files changed, 10 insertions(+), 10 deletions(-)
+ util/async.c     |  2 ++
 files changed, 20 insertions(+), 22 deletions(-)
-diff --git a/blockdev.c b/blockdev.c
+diff --git a/util/aio-posix.c b/util/aio-posix.c
 index XXXXXXX..XXXXXXX 100644
---- a/blockdev.c
+--- a/util/aio-posix.c
-+++ b/blockdev.c
++++ b/util/aio-posix.c
-@@ -XXX,XX +XXX,XX @@ static void external_snapshot_prepare(BlkActionState *common,
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
-             s->has_snapshot_node_name ? s->snapshot_node_name : NULL;
+             (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
+             aio_node_check(ctx, node->is_external) &&
-         if (node_name && !snapshot_node_name) {
+             node->io_read) {
--            error_setg(errp, "New snapshot node name missing");
++            aio_context_acquire(ctx);
-+            error_setg(errp, "New overlay node name missing");
+             node->io_read(node->opaque);
-             goto out;
++            aio_context_release(ctx);
              /* aio_notify() does not count as progress */
              if (node->opaque != &ctx->notifier) {
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
              (revents & (G_IO_OUT | G_IO_ERR)) &&
              aio_node_check(ctx, node->is_external) &&
              node->io_write) {
 +            aio_context_acquire(ctx);
              node->io_write(node->opaque);
 +            aio_context_release(ctx);
              progress = true;
          }
-         if (snapshot_node_name &&
+@@ -XXX,XX +XXX,XX @@ bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
-             bdrv_lookup_bs(snapshot_node_name, snapshot_node_name, NULL)) {
+     }
--            error_setg(errp, "New snapshot node name already in use");
-+            error_setg(errp, "New overlay node name already in use");
+     /* Run our timers */
-             goto out;
++    aio_context_acquire(ctx);
      progress |= timerlistgroup_run_timers(&ctx->tlg);
 +    aio_context_release(ctx);
      return progress;
  }
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
      int64_t timeout;
      int64_t start = 0;
 -    aio_context_acquire(ctx);
 -    progress = false;
 -
      /* aio_notify can avoid the expensive event_notifier_set if
       * everything (file descriptors, bottom halves, timers) will
       * be re-evaluated before the next blocking poll().  This is
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
          start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
      }
 -    if (try_poll_mode(ctx, blocking)) {
 -        progress = true;
 -    } else {
 +    aio_context_acquire(ctx);
 +    progress = try_poll_mode(ctx, blocking);
 +    aio_context_release(ctx);
 +
 +    if (!progress) {
          assert(npfd == 0);
          /* fill pollfds */
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
          timeout = blocking ? aio_compute_timeout(ctx) : 0;
          /* wait until next event */
 -        if (timeout) {
 -            aio_context_release(ctx);
 -        }
          if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
              AioHandler epoll_handler;
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
          } else  {
              ret = qemu_poll_ns(pollfds, npfd, timeout);
          }
+-        if (timeout) {
-@@ -XXX,XX +XXX,XX @@ static void external_snapshot_prepare(BlkActionState *common,
+-            aio_context_acquire(ctx);
 -        }
      }
-     if (bdrv_has_blk(state->new_bs)) {
+     if (blocking) {
--        error_setg(errp, "The snapshot is already in use");
+@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
-+        error_setg(errp, "The overlay is already in use");
+         progress = true;
          goto out;
      }
-@@ -XXX,XX +XXX,XX @@ static void external_snapshot_prepare(BlkActionState *common,
+-    aio_context_release(ctx);
-     }
+-
+     return progress;
-     if (state->new_bs->backing != NULL) {
+ }
--        error_setg(errp, "The snapshot already has a backing image");
-+        error_setg(errp, "The overlay already has a backing image");
+diff --git a/util/aio-win32.c b/util/aio-win32.c
          goto out;
      }
      if (!state->new_bs->drv->supports_backing) {
 -        error_setg(errp, "The snapshot does not support backing images");
 +        error_setg(errp, "The overlay does not support backing images");
          goto out;
      }
 diff --git a/tests/qemu-iotests/085.out b/tests/qemu-iotests/085.out
 index XXXXXXX..XXXXXXX 100644
---- a/tests/qemu-iotests/085.out
+--- a/util/aio-win32.c
-+++ b/tests/qemu-iotests/085.out
++++ b/util/aio-win32.c
-@@ -XXX,XX +XXX,XX @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
+             (revents || event_notifier_get_handle(node->e) == event) &&
- === Invalid command - cannot create a snapshot using a file BDS ===
+             node->io_notify) {
+             node->pfd.revents = 0;
--{"error": {"class": "GenericError", "desc": "The snapshot does not support backing images"}}
++            aio_context_acquire(ctx);
-+{"error": {"class": "GenericError", "desc": "The overlay does not support backing images"}}
+             node->io_notify(node->e);
++            aio_context_release(ctx);
- === Invalid command - snapshot node used as active layer ===
+             /* aio_notify() does not count as progress */
--{"error": {"class": "GenericError", "desc": "The snapshot is already in use"}}
+             if (node->e != &ctx->notifier) {
--{"error": {"class": "GenericError", "desc": "The snapshot is already in use"}}
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
--{"error": {"class": "GenericError", "desc": "The snapshot is already in use"}}
+             (node->io_read || node->io_write)) {
-+{"error": {"class": "GenericError", "desc": "The overlay is already in use"}}
+             node->pfd.revents = 0;
-+{"error": {"class": "GenericError", "desc": "The overlay is already in use"}}
+             if ((revents & G_IO_IN) && node->io_read) {
-+{"error": {"class": "GenericError", "desc": "The overlay is already in use"}}
++                aio_context_acquire(ctx);
+                 node->io_read(node->opaque);
- === Invalid command - snapshot node used as backing hd ===
++                aio_context_release(ctx);
+                 progress = true;
-@@ -XXX,XX +XXX,XX @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/
+             }
- Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134217728
+             if ((revents & G_IO_OUT) && node->io_write) {
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/t.IMGFMT.base
++                aio_context_acquire(ctx);
- {"return": {}}
+                 node->io_write(node->opaque);
--{"error": {"class": "GenericError", "desc": "The snapshot already has a backing image"}}
++                aio_context_release(ctx);
-+{"error": {"class": "GenericError", "desc": "The overlay already has a backing image"}}
+                 progress = true;
+             }
- === Invalid command - The node does not exist ===
+@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
      int count;
      int timeout;
 -    aio_context_acquire(ctx);
      progress = false;
      /* aio_notify can avoid the expensive event_notifier_set if
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
          timeout = blocking && !have_select_revents
              ? qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)) : 0;
 -        if (timeout) {
 -            aio_context_release(ctx);
 -        }
          ret = WaitForMultipleObjects(count, events, FALSE, timeout);
          if (blocking) {
              assert(first);
              atomic_sub(&ctx->notify_me, 2);
          }
 -        if (timeout) {
 -            aio_context_acquire(ctx);
 -        }
          if (first) {
              aio_notify_accept(ctx);
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
          progress |= aio_dispatch_handlers(ctx, event);
      } while (count > 0);
 +    aio_context_acquire(ctx);
      progress |= timerlistgroup_run_timers(&ctx->tlg);
 -
      aio_context_release(ctx);
      return progress;
  }
 diff --git a/util/async.c b/util/async.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/async.c
 +++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
                  ret = 1;
              }
              bh->idle = 0;
 +            aio_context_acquire(ctx);
              aio_bh_call(bh);
 +            aio_context_release(ctx);
          }
          if (bh->deleted) {
              deleted = true;
 --
-.21.0
+.9.3

-[Qemu-devel] [PULL 07/20] iotests: Filter 175's allocation information
+[Qemu-devel] [PULL v2 12/24] block: explicitly acquire aiocontext in timers that need it
-It is possible for an empty file to take up blocks on a filesystem, for
+From: Paolo Bonzini <pbonzini@redhat.com>
-example:
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-$ qemu-img create -f raw test.img 1G
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-Formatting 'test.img', fmt=raw size=1073741824
+Reviewed-by: Fam Zheng <famz@redhat.com>
-$ mkfs.ext4 -I 128 -q test.img
+Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
-$ mkdir test-mount
+Message-id: 20170213135235.12274-13-pbonzini@redhat.com
-$ sudo mount -o loop test.img test-mount
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 $ sudo touch test-mount/test-file
 $ stat -c 'blocks=%b' test-mount/test-file
 blocks=8
 These extra blocks (one cluster) are apparently used for metadata,
 because they are always there, on top of blocks used for data:
 $ sudo dd if=/dev/zero of=test-mount/test-file bs=1M count=1
 +0 records in
 +0 records out
 1048576 bytes (1.0 MB, 1.0 MiB) copied, 0.00135339 s, 775 MB/s
 $ stat -c 'blocks=%b' test-mount/test-file
 blocks=2056
 Make iotest 175 take this into account.
 Reported-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Nir Soffer <nsoffer@redhat.com>
 Message-id: 20190516144319.12570-1-mreitz@redhat.com
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
- tests/qemu-iotests/175     | 26 ++++++++++++++++++++++----
+ block/qed.h                 |  3 +++
- tests/qemu-iotests/175.out |  8 ++++----
+ block/curl.c                |  2 ++
-files changed, 26 insertions(+), 8 deletions(-)
+ block/io.c                  |  5 +++++
+ block/iscsi.c               |  8 ++++++--
-diff --git a/tests/qemu-iotests/175 b/tests/qemu-iotests/175
+ block/null.c                |  4 ++++
-index XXXXXXX..XXXXXXX 100755
+ block/qed.c                 | 12 ++++++++++++
---- a/tests/qemu-iotests/175
+ block/throttle-groups.c     |  2 ++
-+++ b/tests/qemu-iotests/175
+ util/aio-posix.c            |  2 --
-@@ -XXX,XX +XXX,XX @@ status=1    # failure is the default!
+ util/aio-win32.c            |  2 --
+ util/qemu-coroutine-sleep.c |  2 +-
- _cleanup()
+files changed, 35 insertions(+), 7 deletions(-)
 diff --git a/block/qed.h b/block/qed.h
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qed.h
 +++ b/block/qed.h
@@ -XXX,XX +XXX,XX @@ enum {
   */
  typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len);
 +void qed_acquire(BDRVQEDState *s);
 +void qed_release(BDRVQEDState *s);
 +
  /**
   * Generic callback for chaining async callbacks
   */
 diff --git a/block/curl.c b/block/curl.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/curl.c
 +++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static void curl_multi_timeout_do(void *arg)
          return;
      }
 +    aio_context_acquire(s->aio_context);
      curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
      curl_multi_check_completion(s);
 +    aio_context_release(s->aio_context);
  #else
      abort();
  #endif
 diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/io.c
 +++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_aio_cancel(BlockAIOCB *acb)
          if (acb->aiocb_info->get_aio_context) {
              aio_poll(acb->aiocb_info->get_aio_context(acb), true);
          } else if (acb->bs) {
 +            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
 +             * assert that we're not using an I/O thread.  Thread-safe
 +             * code should use bdrv_aio_cancel_async exclusively.
 +             */
 +            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
              aio_poll(bdrv_get_aio_context(acb->bs), true);
          } else {
              abort();
 diff --git a/block/iscsi.c b/block/iscsi.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/iscsi.c
 +++ b/block/iscsi.c
@@ -XXX,XX +XXX,XX @@ static void iscsi_retry_timer_expired(void *opaque)
      struct IscsiTask *iTask = opaque;
      iTask->complete = 1;
      if (iTask->co) {
 -        qemu_coroutine_enter(iTask->co);
 +        aio_co_wake(iTask->co);
      }
  }
@@ -XXX,XX +XXX,XX @@ static void iscsi_nop_timed_event(void *opaque)
  {
--    _cleanup_test_img
+     IscsiLun *iscsilun = opaque;
-+    _cleanup_test_img
-+    rm -f "$TEST_DIR/empty"
++    aio_context_acquire(iscsilun->aio_context);
- }
+     if (iscsi_get_nops_in_flight(iscsilun->iscsi) >= MAX_NOP_FAILURES) {
- trap "_cleanup; exit \$status" 0 1 2 3 15
+         error_report("iSCSI: NOP timeout. Reconnecting...");
+         iscsilun->request_timed_out = true;
-+# Some file systems sometimes allocate extra blocks independently of
+     } else if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) {
-+# the file size.  This function hides the resulting difference in the
+         error_report("iSCSI: failed to sent NOP-Out. Disabling NOP messages.");
-+# stat -c '%b' output.
+-        return;
-+# Parameter 1: Number of blocks an empty file occupies
++        goto out;
-+# Parameter 2: Image size in bytes
+     }
-+_filter_blocks()
      timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL);
      iscsi_set_events(iscsilun);
 +
 +out:
 +    aio_context_release(iscsilun->aio_context);
  }
  static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp)
 diff --git a/block/null.c b/block/null.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/null.c
 +++ b/block/null.c
@@ -XXX,XX +XXX,XX @@ static void null_bh_cb(void *opaque)
  static void null_timer_cb(void *opaque)
  {
      NullAIOCB *acb = opaque;
 +    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
 +
 +    aio_context_acquire(ctx);
      acb->common.cb(acb->common.opaque, 0);
 +    aio_context_release(ctx);
      timer_deinit(&acb->timer);
      qemu_aio_unref(acb);
  }
 diff --git a/block/qed.c b/block/qed.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qed.c
 +++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static void qed_need_check_timer_cb(void *opaque)
      trace_qed_need_check_timer_cb(s);
 +    qed_acquire(s);
      qed_plug_allocating_write_reqs(s);
      /* Ensure writes are on disk before clearing flag */
      bdrv_aio_flush(s->bs->file->bs, qed_clear_need_check, s);
 +    qed_release(s);
 +}
 +
 +void qed_acquire(BDRVQEDState *s)
 +{
-+    extra_blocks=$1
++    aio_context_acquire(bdrv_get_aio_context(s->bs));
 +    img_size=$2
 +
 +    sed -e "s/blocks=$extra_blocks\\(\$\\|[^0-9]\\)/nothing allocated/" \
 +        -e "s/blocks=$((extra_blocks + img_size / 512))\\(\$\\|[^0-9]\\)/everything allocated/"
 +}
 +
- # get standard environment, filters and checks
++void qed_release(BDRVQEDState *s)
- . ./common.rc
++{
- . ./common.filter
++    aio_context_release(bdrv_get_aio_context(s->bs));
-@@ -XXX,XX +XXX,XX @@ _supported_fmt raw
+ }
- _supported_proto file
- _supported_os Linux
+ static void qed_start_need_check_timer(BDRVQEDState *s)
+diff --git a/block/throttle-groups.c b/block/throttle-groups.c
--size=1m
+index XXXXXXX..XXXXXXX 100644
-+size=$((1 * 1024 * 1024))
+--- a/block/throttle-groups.c
-+
++++ b/block/throttle-groups.c
-+touch "$TEST_DIR/empty"
+@@ -XXX,XX +XXX,XX @@ static void timer_cb(BlockBackend *blk, bool is_write)
-+extra_blocks=$(stat -c '%b' "$TEST_DIR/empty")
+     qemu_mutex_unlock(&tg->lock);
- echo
+     /* Run the request that was waiting for this timer */
- echo "== creating image with default preallocation =="
++    aio_context_acquire(blk_get_aio_context(blk));
- _make_test_img $size | _filter_imgfmt
+     empty_queue = !qemu_co_enter_next(&blkp->throttled_reqs[is_write]);
--stat -c "size=%s, blocks=%b" $TEST_IMG
++    aio_context_release(blk_get_aio_context(blk));
-+stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $size
+     /* If the request queue was empty then we have to take care of
- for mode in off full falloc; do
+      * scheduling the next one */
-     echo
+diff --git a/util/aio-posix.c b/util/aio-posix.c
-     echo "== creating image with preallocation $mode =="
+index XXXXXXX..XXXXXXX 100644
-     IMGOPTS=preallocation=$mode _make_test_img $size | _filter_imgfmt
+--- a/util/aio-posix.c
--    stat -c "size=%s, blocks=%b" $TEST_IMG
++++ b/util/aio-posix.c
-+    stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $size
+@@ -XXX,XX +XXX,XX @@ bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
- done
+     }
- # success, all done
+     /* Run our timers */
-diff --git a/tests/qemu-iotests/175.out b/tests/qemu-iotests/175.out
+-    aio_context_acquire(ctx);
-index XXXXXXX..XXXXXXX 100644
+     progress |= timerlistgroup_run_timers(&ctx->tlg);
---- a/tests/qemu-iotests/175.out
+-    aio_context_release(ctx);
-+++ b/tests/qemu-iotests/175.out
-@@ -XXX,XX +XXX,XX @@ QA output created by 175
+     return progress;
+ }
- == creating image with default preallocation ==
+diff --git a/util/aio-win32.c b/util/aio-win32.c
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576
+index XXXXXXX..XXXXXXX 100644
--size=1048576, blocks=0
+--- a/util/aio-win32.c
-+size=1048576, nothing allocated
++++ b/util/aio-win32.c
+@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
- == creating image with preallocation off ==
+         progress |= aio_dispatch_handlers(ctx, event);
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=off
+     } while (count > 0);
--size=1048576, blocks=0
-+size=1048576, nothing allocated
+-    aio_context_acquire(ctx);
+     progress |= timerlistgroup_run_timers(&ctx->tlg);
- == creating image with preallocation full ==
+-    aio_context_release(ctx);
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=full
+     return progress;
--size=1048576, blocks=2048
+ }
-+size=1048576, everything allocated
+diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c
- == creating image with preallocation falloc ==
+index XXXXXXX..XXXXXXX 100644
- Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=falloc
+--- a/util/qemu-coroutine-sleep.c
--size=1048576, blocks=2048
++++ b/util/qemu-coroutine-sleep.c
-+size=1048576, everything allocated
+@@ -XXX,XX +XXX,XX @@ static void co_sleep_cb(void *opaque)
-  *** done
+ {
      CoSleepCB *sleep_cb = opaque;
 -    qemu_coroutine_enter(sleep_cb->co);
 +    aio_co_wake(sleep_cb->co);
  }
  void coroutine_fn co_aio_sleep_ns(AioContext *ctx, QEMUClockType type,
 --
-.21.0
+.9.3

-[Qemu-devel] [PULL 03/20] QEMUMachine: add events_wait method
+[Qemu-devel] [PULL v2 13/24] block: explicitly acquire aiocontext in callbacks that need it
-From: John Snow <jsnow@redhat.com>
+From: Paolo Bonzini <pbonzini@redhat.com>
-Instead of event_wait which looks for a single event, add an events_wait
+This covers both file descriptor callbacks and polling callbacks,
-which can look for any number of events simultaneously. However, it
+since they execute related code.
 will still only return one at a time, whichever happens first.
-Signed-off-by: John Snow <jsnow@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Message-id: 20190523170643.20794-4-jsnow@redhat.com
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
+Reviewed-by: Fam Zheng <famz@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
 Message-id: 20170213135235.12274-14-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- python/qemu/__init__.py | 69 +++++++++++++++++++++++++++++------------
+ block/curl.c          | 16 +++++++++++++---
-file changed, 49 insertions(+), 20 deletions(-)
+ block/iscsi.c         |  4 ++++
  block/linux-aio.c     |  4 ++++
  block/nfs.c           |  6 ++++++
  block/sheepdog.c      | 29 +++++++++++++++--------------
  block/ssh.c           | 29 +++++++++--------------------
  block/win32-aio.c     | 10 ++++++----
  hw/block/virtio-blk.c |  5 ++++-
  hw/scsi/virtio-scsi.c |  7 +++++++
  util/aio-posix.c      |  7 -------
  util/aio-win32.c      |  6 ------
 files changed, 68 insertions(+), 55 deletions(-)
-diff --git a/python/qemu/__init__.py b/python/qemu/__init__.py
+diff --git a/block/curl.c b/block/curl.c
 index XXXXXXX..XXXXXXX 100644
---- a/python/qemu/__init__.py
+--- a/block/curl.c
-+++ b/python/qemu/__init__.py
++++ b/block/curl.c
-@@ -XXX,XX +XXX,XX @@ class QEMUMachine(object):
+@@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s)
-         self._qmp.clear_events()
+     }
-         return events
+ }
--    def event_wait(self, name, timeout=60.0, match=None):
+-static void curl_multi_do(void *arg)
-+    @staticmethod
++static void curl_multi_do_locked(CURLState *s)
-+    def event_match(event, match=None):
+ {
-         """
+-    CURLState *s = (CURLState *)arg;
--        Wait for specified timeout on named event in QMP; optionally filter
+     CURLSocket *socket, *next_socket;
--        results by match.
+     int running;
-+        Check if an event matches optional match criteria.
+     int r;
+@@ -XXX,XX +XXX,XX @@ static void curl_multi_do(void *arg)
--        The 'match' is checked to be a recursive subset of the 'event'; skips
+     }
--        branch processing on match's value None
+ }
--           {"foo": {"bar": 1}} matches {"foo": None}
--           {"foo": {"bar": 1}} does not matches {"foo": {"baz": None}}
++static void curl_multi_do(void *arg)
-+        The match criteria takes the form of a matching subdict. The event is
++{
-+        checked to be a superset of the subdict, recursively, with matching
++    CURLState *s = (CURLState *)arg;
-+        values whenever those values are not None.
++
-+
++    aio_context_acquire(s->s->aio_context);
-+        Examples, with the subdict queries on the left:
++    curl_multi_do_locked(s);
-+         - None matches any object.
++    aio_context_release(s->s->aio_context);
-+         - {"foo": None} matches {"foo": {"bar": 1}}
++}
-+         - {"foo": {"baz": None}} does not match {"foo": {"bar": 1}}
++
-+         - {"foo": {"baz": 2}} matches {"foo": {"bar": 1, "baz": 2}}
+ static void curl_multi_read(void *arg)
-         """
+ {
--        def event_match(event, match=None):
+     CURLState *s = (CURLState *)arg;
--            if match is None:
--                return True
+-    curl_multi_do(arg);
-+        if match is None:
++    aio_context_acquire(s->s->aio_context);
-+            return True
++    curl_multi_do_locked(s);
+     curl_multi_check_completion(s->s);
--            for key in match:
++    aio_context_release(s->s->aio_context);
--                if key in event:
+ }
--                    if isinstance(event[key], dict):
--                        if not event_match(event[key], match[key]):
+ static void curl_multi_timeout_do(void *arg)
--                            return False
+diff --git a/block/iscsi.c b/block/iscsi.c
--                    elif event[key] != match[key]:
+index XXXXXXX..XXXXXXX 100644
-+        for key in match:
+--- a/block/iscsi.c
-+            if key in event:
++++ b/block/iscsi.c
-+                if isinstance(event[key], dict):
+@@ -XXX,XX +XXX,XX @@ iscsi_process_read(void *arg)
-+                    if not QEMUMachine.event_match(event[key], match[key]):
+     IscsiLun *iscsilun = arg;
-                         return False
+     struct iscsi_context *iscsi = iscsilun->iscsi;
--                else:
-+                elif event[key] != match[key]:
++    aio_context_acquire(iscsilun->aio_context);
-                     return False
+     iscsi_service(iscsi, POLLIN);
-+            else:
+     iscsi_set_events(iscsilun);
-+                return False
++    aio_context_release(iscsilun->aio_context);
-+        return True
+ }
--            return True
+ static void
-+    def event_wait(self, name, timeout=60.0, match=None):
+@@ -XXX,XX +XXX,XX @@ iscsi_process_write(void *arg)
-+        """
+     IscsiLun *iscsilun = arg;
-+        event_wait waits for and returns a named event from QMP with a timeout.
+     struct iscsi_context *iscsi = iscsilun->iscsi;
-+
-+        name: The event to wait for.
++    aio_context_acquire(iscsilun->aio_context);
-+        timeout: QEMUMonitorProtocol.pull_event timeout parameter.
+     iscsi_service(iscsi, POLLOUT);
-+        match: Optional match criteria. See event_match for details.
+     iscsi_set_events(iscsilun);
-+        """
++    aio_context_release(iscsilun->aio_context);
-+        return self.events_wait([(name, match)], timeout)
+ }
-+
-+    def events_wait(self, events, timeout=60.0):
+ static int64_t sector_lun2qemu(int64_t sector, IscsiLun *iscsilun)
-+        """
+diff --git a/block/linux-aio.c b/block/linux-aio.c
-+        events_wait waits for and returns a named event from QMP with a timeout.
+index XXXXXXX..XXXXXXX 100644
-+
+--- a/block/linux-aio.c
-+        events: a sequence of (name, match_criteria) tuples.
++++ b/block/linux-aio.c
-+                The match criteria are optional and may be None.
+@@ -XXX,XX +XXX,XX @@ static void qemu_laio_completion_cb(EventNotifier *e)
-+                See event_match for details.
+     LinuxAioState *s = container_of(e, LinuxAioState, e);
-+        timeout: QEMUMonitorProtocol.pull_event timeout parameter.
-+        """
+     if (event_notifier_test_and_clear(&s->e)) {
-+        def _match(event):
++        aio_context_acquire(s->aio_context);
-+            for name, match in events:
+         qemu_laio_process_completions_and_submit(s);
-+                if (event['event'] == name and
++        aio_context_release(s->aio_context);
-+                    self.event_match(event, match)):
+     }
-+                    return True
+ }
-+            return False
+@@ -XXX,XX +XXX,XX @@ static bool qemu_laio_poll_cb(void *opaque)
-         # Search cached events
+         return false;
-         for event in self._events:
+     }
--            if (event['event'] == name) and event_match(event, match):
-+            if _match(event):
++    aio_context_acquire(s->aio_context);
-                 self._events.remove(event)
+     qemu_laio_process_completions_and_submit(s);
-                 return event
++    aio_context_release(s->aio_context);
+     return true;
-         # Poll for new events
+ }
-         while True:
-             event = self._qmp.pull_event(wait=timeout)
+diff --git a/block/nfs.c b/block/nfs.c
--            if (event['event'] == name) and event_match(event, match):
+index XXXXXXX..XXXXXXX 100644
-+            if _match(event):
+--- a/block/nfs.c
-                 return event
++++ b/block/nfs.c
-             self._events.append(event)
+@@ -XXX,XX +XXX,XX @@ static void nfs_set_events(NFSClient *client)
  static void nfs_process_read(void *arg)
  {
      NFSClient *client = arg;
 +
 +    aio_context_acquire(client->aio_context);
      nfs_service(client->context, POLLIN);
      nfs_set_events(client);
 +    aio_context_release(client->aio_context);
  }
  static void nfs_process_write(void *arg)
  {
      NFSClient *client = arg;
 +
 +    aio_context_acquire(client->aio_context);
      nfs_service(client->context, POLLOUT);
      nfs_set_events(client);
 +    aio_context_release(client->aio_context);
  }
  static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
 diff --git a/block/sheepdog.c b/block/sheepdog.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/sheepdog.c
 +++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
      return ret;
  }
 -static void restart_co_req(void *opaque)
 -{
 -    Coroutine *co = opaque;
 -
 -    qemu_coroutine_enter(co);
 -}
 -
  typedef struct SheepdogReqCo {
      int sockfd;
      BlockDriverState *bs;
@@ -XXX,XX +XXX,XX @@ typedef struct SheepdogReqCo {
      unsigned int *rlen;
      int ret;
      bool finished;
 +    Coroutine *co;
  } SheepdogReqCo;
 +static void restart_co_req(void *opaque)
 +{
 +    SheepdogReqCo *srco = opaque;
 +
 +    aio_co_wake(srco->co);
 +}
 +
  static coroutine_fn void do_co_req(void *opaque)
  {
      int ret;
 -    Coroutine *co;
      SheepdogReqCo *srco = opaque;
      int sockfd = srco->sockfd;
      SheepdogReq *hdr = srco->hdr;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void do_co_req(void *opaque)
      unsigned int *wlen = srco->wlen;
      unsigned int *rlen = srco->rlen;
 -    co = qemu_coroutine_self();
 +    srco->co = qemu_coroutine_self();
      aio_set_fd_handler(srco->aio_context, sockfd, false,
 -                       NULL, restart_co_req, NULL, co);
 +                       NULL, restart_co_req, NULL, srco);
      ret = send_co_req(sockfd, hdr, data, wlen);
      if (ret < 0) {
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void do_co_req(void *opaque)
      }
      aio_set_fd_handler(srco->aio_context, sockfd, false,
 -                       restart_co_req, NULL, NULL, co);
 +                       restart_co_req, NULL, NULL, srco);
      ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
      if (ret != sizeof(*hdr)) {
@@ -XXX,XX +XXX,XX @@ out:
      aio_set_fd_handler(srco->aio_context, sockfd, false,
                         NULL, NULL, NULL, NULL);
 +    srco->co = NULL;
      srco->ret = ret;
      srco->finished = true;
      if (srco->bs) {
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn aio_read_response(void *opaque)
           * We've finished all requests which belong to the AIOCB, so
           * we can switch back to sd_co_readv/writev now.
           */
 -        qemu_coroutine_enter(acb->coroutine);
 +        aio_co_wake(acb->coroutine);
      }
      return;
@@ -XXX,XX +XXX,XX @@ static void co_read_response(void *opaque)
          s->co_recv = qemu_coroutine_create(aio_read_response, opaque);
      }
 -    qemu_coroutine_enter(s->co_recv);
 +    aio_co_wake(s->co_recv);
  }
  static void co_write_request(void *opaque)
  {
      BDRVSheepdogState *s = opaque;
 -    qemu_coroutine_enter(s->co_send);
 +    aio_co_wake(s->co_send);
  }
  /*
 diff --git a/block/ssh.c b/block/ssh.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/ssh.c
 +++ b/block/ssh.c
@@ -XXX,XX +XXX,XX @@ static void restart_coroutine(void *opaque)
      DPRINTF("co=%p", co);
 -    qemu_coroutine_enter(co);
 +    aio_co_wake(co);
  }
 -static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs)
 +/* A non-blocking call returned EAGAIN, so yield, ensuring the
 + * handlers are set up so that we'll be rescheduled when there is an
 + * interesting event on the socket.
 + */
 +static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs)
  {
      int r;
      IOHandler *rd_handler = NULL, *wr_handler = NULL;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs)
      aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock,
                         false, rd_handler, wr_handler, NULL, co);
 -}
 -
 -static coroutine_fn void clear_fd_handler(BDRVSSHState *s,
 -                                          BlockDriverState *bs)
 -{
 -    DPRINTF("s->sock=%d", s->sock);
 -    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock,
 -                       false, NULL, NULL, NULL, NULL);
 -}
 -
 -/* A non-blocking call returned EAGAIN, so yield, ensuring the
 - * handlers are set up so that we'll be rescheduled when there is an
 - * interesting event on the socket.
 - */
 -static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs)
 -{
 -    set_fd_handler(s, bs);
      qemu_coroutine_yield();
 -    clear_fd_handler(s, bs);
 +    DPRINTF("s->sock=%d - back", s->sock);
 +    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, false,
 +                       NULL, NULL, NULL, NULL);
  }
  /* SFTP has a function `libssh2_sftp_seek64' which seeks to a position
 diff --git a/block/win32-aio.c b/block/win32-aio.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/win32-aio.c
 +++ b/block/win32-aio.c
@@ -XXX,XX +XXX,XX @@ struct QEMUWin32AIOState {
      HANDLE hIOCP;
      EventNotifier e;
      int count;
 -    bool is_aio_context_attached;
 +    AioContext *aio_ctx;
  };
  typedef struct QEMUWin32AIOCB {
@@ -XXX,XX +XXX,XX @@ static void win32_aio_process_completion(QEMUWin32AIOState *s,
      }
 +    aio_context_acquire(s->aio_ctx);
      waiocb->common.cb(waiocb->common.opaque, ret);
 +    aio_context_release(s->aio_ctx);
      qemu_aio_unref(waiocb);
  }
@@ -XXX,XX +XXX,XX @@ void win32_aio_detach_aio_context(QEMUWin32AIOState *aio,
                                    AioContext *old_context)
  {
      aio_set_event_notifier(old_context, &aio->e, false, NULL, NULL);
 -    aio->is_aio_context_attached = false;
 +    aio->aio_ctx = NULL;
  }
  void win32_aio_attach_aio_context(QEMUWin32AIOState *aio,
                                    AioContext *new_context)
  {
 -    aio->is_aio_context_attached = true;
 +    aio->aio_ctx = new_context;
      aio_set_event_notifier(new_context, &aio->e, false,
                             win32_aio_completion_cb, NULL);
  }
@@ -XXX,XX +XXX,XX @@ out_free_state:
  void win32_aio_cleanup(QEMUWin32AIOState *aio)
  {
 -    assert(!aio->is_aio_context_attached);
 +    assert(!aio->aio_ctx);
      CloseHandle(aio->hIOCP);
      event_notifier_cleanup(&aio->e);
      g_free(aio);
 diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/block/virtio-blk.c
 +++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_ioctl_complete(void *opaque, int status)
  {
      VirtIOBlockIoctlReq *ioctl_req = opaque;
      VirtIOBlockReq *req = ioctl_req->req;
 -    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
 +    VirtIOBlock *s = req->dev;
 +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
      struct virtio_scsi_inhdr *scsi;
      struct sg_io_hdr *hdr;
@@ -XXX,XX +XXX,XX @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
      MultiReqBuffer mrb = {};
      bool progress = false;
 +    aio_context_acquire(blk_get_aio_context(s->blk));
      blk_io_plug(s->blk);
      do {
@@ -XXX,XX +XXX,XX @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
      }
      blk_io_unplug(s->blk);
 +    aio_context_release(blk_get_aio_context(s->blk));
      return progress;
  }
 diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/scsi/virtio-scsi.c
 +++ b/hw/scsi/virtio-scsi.c
@@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq)
      VirtIOSCSIReq *req;
      bool progress = false;
 +    virtio_scsi_acquire(s);
      while ((req = virtio_scsi_pop_req(s, vq))) {
          progress = true;
          virtio_scsi_handle_ctrl_req(s, req);
      }
 +    virtio_scsi_release(s);
      return progress;
  }
@@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
      QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs);
 +    virtio_scsi_acquire(s);
      do {
          virtio_queue_set_notification(vq, 0);
@@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
      QTAILQ_FOREACH_SAFE(req, &reqs, next, next) {
          virtio_scsi_handle_cmd_req_submit(s, req);
      }
 +    virtio_scsi_release(s);
      return progress;
  }
@@ -XXX,XX +XXX,XX @@ out:
  bool virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq)
  {
 +    virtio_scsi_acquire(s);
      if (s->events_dropped) {
          virtio_scsi_push_event(s, NULL, VIRTIO_SCSI_T_NO_EVENT, 0);
 +        virtio_scsi_release(s);
          return true;
      }
 +    virtio_scsi_release(s);
      return false;
  }
 diff --git a/util/aio-posix.c b/util/aio-posix.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/aio-posix.c
 +++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
              (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
              aio_node_check(ctx, node->is_external) &&
              node->io_read) {
 -            aio_context_acquire(ctx);
              node->io_read(node->opaque);
 -            aio_context_release(ctx);
              /* aio_notify() does not count as progress */
              if (node->opaque != &ctx->notifier) {
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
              (revents & (G_IO_OUT | G_IO_ERR)) &&
              aio_node_check(ctx, node->is_external) &&
              node->io_write) {
 -            aio_context_acquire(ctx);
              node->io_write(node->opaque);
 -            aio_context_release(ctx);
              progress = true;
          }
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
          start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
      }
 -    aio_context_acquire(ctx);
      progress = try_poll_mode(ctx, blocking);
 -    aio_context_release(ctx);
 -
      if (!progress) {
          assert(npfd == 0);
 diff --git a/util/aio-win32.c b/util/aio-win32.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/aio-win32.c
 +++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
              (revents || event_notifier_get_handle(node->e) == event) &&
              node->io_notify) {
              node->pfd.revents = 0;
 -            aio_context_acquire(ctx);
              node->io_notify(node->e);
 -            aio_context_release(ctx);
              /* aio_notify() does not count as progress */
              if (node->e != &ctx->notifier) {
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
              (node->io_read || node->io_write)) {
              node->pfd.revents = 0;
              if ((revents & G_IO_IN) && node->io_read) {
 -                aio_context_acquire(ctx);
                  node->io_read(node->opaque);
 -                aio_context_release(ctx);
                  progress = true;
              }
              if ((revents & G_IO_OUT) && node->io_write) {
 -                aio_context_acquire(ctx);
                  node->io_write(node->opaque);
 -                aio_context_release(ctx);
                  progress = true;
              }
 --
-.21.0
+.9.3

-[Qemu-devel] [PULL 08/20] iotests: Fix intermittent failure in 219
+[Qemu-devel] [PULL v2 14/24] block: explicitly acquire aiocontext in bottom halves that need it
-In 219, we wait for the job to make progress before we emit its status.
+From: Paolo Bonzini <pbonzini@redhat.com>
 This makes the output reliable.  We do not wait for any more progress if
 the job's current-progress already matches its total-progress.
-Unfortunately, there is a bug: Right after the job has been started,
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-it's possible that total-progress is still 0.  In that case, we may skip
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-the first progress-making step and keep ending up 64 kB short.
+Reviewed-by: Fam Zheng <famz@redhat.com>
 Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
 Message-id: 20170213135235.12274-15-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  block/archipelago.c   |  3 +++
  block/blkreplay.c     |  2 +-
  block/block-backend.c |  6 ++++++
  block/curl.c          | 26 ++++++++++++++++++--------
  block/gluster.c       |  9 +--------
  block/io.c            |  6 +++++-
  block/iscsi.c         |  6 +++++-
  block/linux-aio.c     | 15 +++++++++------
  block/nfs.c           |  3 ++-
  block/null.c          |  4 ++++
  block/qed.c           |  3 +++
  block/rbd.c           |  4 ++++
  dma-helpers.c         |  2 ++
  hw/block/virtio-blk.c |  2 ++
  hw/scsi/scsi-bus.c    |  2 ++
  util/async.c          |  4 ++--
  util/thread-pool.c    |  2 ++
  17 files changed, 71 insertions(+), 28 deletions(-)
-To fix that bug, we can simply wait for total-progress to reach 4 MB
+diff --git a/block/archipelago.c b/block/archipelago.c
-(the image size) after starting the job.
+index XXXXXXX..XXXXXXX 100644
+--- a/block/archipelago.c
-Reported-by: Karen Mezick <kmezick@redhat.com>
++++ b/block/archipelago.c
-Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1686651
+@@ -XXX,XX +XXX,XX @@ static void qemu_archipelago_complete_aio(void *opaque)
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+ {
-Message-id: 20190516161114.27596-1-mreitz@redhat.com
+     AIORequestData *reqdata = (AIORequestData *) opaque;
-Reviewed-by: John Snow <jsnow@redhat.com>
+     ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb;
-[mreitz: Adjusted commit message as per John's proposal]
++    AioContext *ctx = bdrv_get_aio_context(aio_cb->common.bs);
-Signed-off-by: Max Reitz <mreitz@redhat.com>
----
++    aio_context_acquire(ctx);
- tests/qemu-iotests/219 | 13 ++++++++++---
+     aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret);
- 1 file changed, 10 insertions(+), 3 deletions(-)
++    aio_context_release(ctx);
+     aio_cb->status = 0;
-diff --git a/tests/qemu-iotests/219 b/tests/qemu-iotests/219
      qemu_aio_unref(aio_cb);
 diff --git a/block/blkreplay.c b/block/blkreplay.c
 index XXXXXXX..XXXXXXX 100755
---- a/tests/qemu-iotests/219
+--- a/block/blkreplay.c
-+++ b/tests/qemu-iotests/219
++++ b/block/blkreplay.c
-@@ -XXX,XX +XXX,XX @@ import iotests
+@@ -XXX,XX +XXX,XX @@ static int64_t blkreplay_getlength(BlockDriverState *bs)
+ static void blkreplay_bh_cb(void *opaque)
- iotests.verify_image_format(supported_fmts=['qcow2'])
+ {
+     Request *req = opaque;
-+img_size = 4 * 1024 * 1024
+-    qemu_coroutine_enter(req->co);
-+
++    aio_co_wake(req->co);
- def pause_wait(vm, job_id):
+     qemu_bh_delete(req->bh);
-     with iotests.Timeout(3, "Timeout waiting for job to pause"):
+     g_free(req);
-         while True:
+ }
-@@ -XXX,XX +XXX,XX @@ def test_pause_resume(vm):
+diff --git a/block/block-backend.c b/block/block-backend.c
-                 iotests.log(vm.qmp('query-jobs'))
+index XXXXXXX..XXXXXXX 100644
+--- a/block/block-backend.c
- def test_job_lifecycle(vm, job, job_args, has_ready=False):
++++ b/block/block-backend.c
-+    global img_size
+@@ -XXX,XX +XXX,XX @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
-+
+ static void error_callback_bh(void *opaque)
-     iotests.log('')
+ {
-     iotests.log('')
+     struct BlockBackendAIOCB *acb = opaque;
-     iotests.log('Starting block job: %s (auto-finalize: %s; auto-dismiss: %s)' %
++    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
-@@ -XXX,XX +XXX,XX @@ def test_job_lifecycle(vm, job, job_args, has_ready=False):
-     iotests.log(iotests.filter_qmp_event(vm.event_wait('JOB_STATUS_CHANGE')))
+     bdrv_dec_in_flight(acb->common.bs);
-     iotests.log(iotests.filter_qmp_event(vm.event_wait('JOB_STATUS_CHANGE')))
++    aio_context_acquire(ctx);
+     acb->common.cb(acb->common.opaque, acb->ret);
-+    # Wait for total-progress to stabilize
++    aio_context_release(ctx);
-+    while vm.qmp('query-jobs')['return'][0]['total-progress'] < img_size:
+     qemu_aio_unref(acb);
-+        pass
+ }
-+
-     # RUNNING state:
+@@ -XXX,XX +XXX,XX @@ static void blk_aio_complete(BlkAioEmAIOCB *acb)
-     # pause/resume should work, complete/finalize/dismiss should error out
+ static void blk_aio_complete_bh(void *opaque)
-     iotests.log('')
+ {
-@@ -XXX,XX +XXX,XX @@ with iotests.FilePath('disk.img') as disk_path, \
+     BlkAioEmAIOCB *acb = opaque;
-      iotests.FilePath('copy.img') as copy_path, \
++    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
-      iotests.VM() as vm:
+     assert(acb->has_returned);
--    img_size = '4M'
++    aio_context_acquire(ctx);
--    iotests.qemu_img_create('-f', iotests.imgfmt, disk_path, img_size)
+     blk_aio_complete(acb);
--    iotests.qemu_io('-c', 'write 0 %s' % (img_size),
++    aio_context_release(ctx);
-+    iotests.qemu_img_create('-f', iotests.imgfmt, disk_path, str(img_size))
+ }
-+    iotests.qemu_io('-c', 'write 0 %i' % (img_size),
-                     '-f', iotests.imgfmt, disk_path)
+ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
+diff --git a/block/curl.c b/block/curl.c
-     iotests.log('Launching VM...')
+index XXXXXXX..XXXXXXX 100644
 --- a/block/curl.c
 +++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
  {
      CURLState *state;
      int running;
 +    int ret = -EINPROGRESS;
      CURLAIOCB *acb = p;
 -    BDRVCURLState *s = acb->common.bs->opaque;
 +    BlockDriverState *bs = acb->common.bs;
 +    BDRVCURLState *s = bs->opaque;
 +    AioContext *ctx = bdrv_get_aio_context(bs);
      size_t start = acb->sector_num * BDRV_SECTOR_SIZE;
      size_t end;
 +    aio_context_acquire(ctx);
 +
      // In case we have the requested data already (e.g. read-ahead),
      // we can just call the callback and be done.
      switch (curl_find_buf(s, start, acb->nb_sectors * BDRV_SECTOR_SIZE, acb)) {
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
              qemu_aio_unref(acb);
              // fall through
          case FIND_RET_WAIT:
 -            return;
 +            goto out;
          default:
              break;
      }
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
      // No cache found, so let's start a new request
      state = curl_init_state(acb->common.bs, s);
      if (!state) {
 -        acb->common.cb(acb->common.opaque, -EIO);
 -        qemu_aio_unref(acb);
 -        return;
 +        ret = -EIO;
 +        goto out;
      }
      acb->start = 0;
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
      state->orig_buf = g_try_malloc(state->buf_len);
      if (state->buf_len && state->orig_buf == NULL) {
          curl_clean_state(state);
 -        acb->common.cb(acb->common.opaque, -ENOMEM);
 -        qemu_aio_unref(acb);
 -        return;
 +        ret = -ENOMEM;
 +        goto out;
      }
      state->acb[0] = acb;
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
      /* Tell curl it needs to kick things off */
      curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
 +
 +out:
 +    if (ret != -EINPROGRESS) {
 +        acb->common.cb(acb->common.opaque, ret);
 +        qemu_aio_unref(acb);
 +    }
 +    aio_context_release(ctx);
  }
  static BlockAIOCB *curl_aio_readv(BlockDriverState *bs,
 diff --git a/block/gluster.c b/block/gluster.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/gluster.c
 +++ b/block/gluster.c
@@ -XXX,XX +XXX,XX @@ static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf,
      return qemu_gluster_glfs_init(gconf, errp);
  }
 -static void qemu_gluster_complete_aio(void *opaque)
 -{
 -    GlusterAIOCB *acb = (GlusterAIOCB *)opaque;
 -
 -    qemu_coroutine_enter(acb->coroutine);
 -}
 -
  /*
   * AIO callback routine called from GlusterFS thread.
   */
@@ -XXX,XX +XXX,XX @@ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
          acb->ret = -EIO; /* Partial read/write - fail it */
      }
 -    aio_bh_schedule_oneshot(acb->aio_context, qemu_gluster_complete_aio, acb);
 +    aio_co_schedule(acb->aio_context, acb->coroutine);
  }
  static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags)
 diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/io.c
 +++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
      bdrv_dec_in_flight(bs);
      bdrv_drained_begin(bs);
      data->done = true;
 -    qemu_coroutine_enter(co);
 +    aio_co_wake(co);
  }
  static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
  static void bdrv_co_em_bh(void *opaque)
  {
      BlockAIOCBCoroutine *acb = opaque;
 +    BlockDriverState *bs = acb->common.bs;
 +    AioContext *ctx = bdrv_get_aio_context(bs);
      assert(!acb->need_bh);
 +    aio_context_acquire(ctx);
      bdrv_co_complete(acb);
 +    aio_context_release(ctx);
  }
  static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
 diff --git a/block/iscsi.c b/block/iscsi.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/iscsi.c
 +++ b/block/iscsi.c
@@ -XXX,XX +XXX,XX @@ static void
  iscsi_bh_cb(void *p)
  {
      IscsiAIOCB *acb = p;
 +    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
      qemu_bh_delete(acb->bh);
      g_free(acb->buf);
      acb->buf = NULL;
 +    aio_context_acquire(ctx);
      acb->common.cb(acb->common.opaque, acb->status);
 +    aio_context_release(ctx);
      if (acb->task != NULL) {
          scsi_free_scsi_task(acb->task);
@@ -XXX,XX +XXX,XX @@ iscsi_schedule_bh(IscsiAIOCB *acb)
  static void iscsi_co_generic_bh_cb(void *opaque)
  {
      struct IscsiTask *iTask = opaque;
 +
      iTask->complete = 1;
 -    qemu_coroutine_enter(iTask->co);
 +    aio_co_wake(iTask->co);
  }
  static void iscsi_retry_timer_expired(void *opaque)
 diff --git a/block/linux-aio.c b/block/linux-aio.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/linux-aio.c
 +++ b/block/linux-aio.c
@@ -XXX,XX +XXX,XX @@ struct LinuxAioState {
      io_context_t ctx;
      EventNotifier e;
 -    /* io queue for submit at batch */
 +    /* io queue for submit at batch.  Protected by AioContext lock. */
      LaioQueue io_q;
 -    /* I/O completion processing */
 +    /* I/O completion processing.  Only runs in I/O thread.  */
      QEMUBH *completion_bh;
      int event_idx;
      int event_max;
@@ -XXX,XX +XXX,XX @@ static inline ssize_t io_event_ret(struct io_event *ev)
   */
  static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
  {
 +    LinuxAioState *s = laiocb->ctx;
      int ret;
      ret = laiocb->ret;
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
      }
      laiocb->ret = ret;
 +    aio_context_acquire(s->aio_context);
      if (laiocb->co) {
          /* If the coroutine is already entered it must be in ioq_submit() and
           * will notice laio->ret has been filled in when it eventually runs
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
          laiocb->common.cb(laiocb->common.opaque, ret);
          qemu_aio_unref(laiocb);
      }
 +    aio_context_release(s->aio_context);
  }
  /**
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completions(LinuxAioState *s)
  static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
  {
      qemu_laio_process_completions(s);
 +
 +    aio_context_acquire(s->aio_context);
      if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
          ioq_submit(s);
      }
 +    aio_context_release(s->aio_context);
  }
  static void qemu_laio_completion_bh(void *opaque)
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_completion_cb(EventNotifier *e)
      LinuxAioState *s = container_of(e, LinuxAioState, e);
      if (event_notifier_test_and_clear(&s->e)) {
 -        aio_context_acquire(s->aio_context);
          qemu_laio_process_completions_and_submit(s);
 -        aio_context_release(s->aio_context);
      }
  }
@@ -XXX,XX +XXX,XX @@ static bool qemu_laio_poll_cb(void *opaque)
          return false;
      }
 -    aio_context_acquire(s->aio_context);
      qemu_laio_process_completions_and_submit(s);
 -    aio_context_release(s->aio_context);
      return true;
  }
@@ -XXX,XX +XXX,XX @@ void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
  {
      aio_set_event_notifier(old_context, &s->e, false, NULL, NULL);
      qemu_bh_delete(s->completion_bh);
 +    s->aio_context = NULL;
  }
  void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
 diff --git a/block/nfs.c b/block/nfs.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/nfs.c
 +++ b/block/nfs.c
@@ -XXX,XX +XXX,XX @@ static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
  static void nfs_co_generic_bh_cb(void *opaque)
  {
      NFSRPC *task = opaque;
 +
      task->complete = 1;
 -    qemu_coroutine_enter(task->co);
 +    aio_co_wake(task->co);
  }
  static void
 diff --git a/block/null.c b/block/null.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/null.c
 +++ b/block/null.c
@@ -XXX,XX +XXX,XX @@ static const AIOCBInfo null_aiocb_info = {
  static void null_bh_cb(void *opaque)
  {
      NullAIOCB *acb = opaque;
 +    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
 +
 +    aio_context_acquire(ctx);
      acb->common.cb(acb->common.opaque, 0);
 +    aio_context_release(ctx);
      qemu_aio_unref(acb);
  }
 diff --git a/block/qed.c b/block/qed.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qed.c
 +++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
  static void qed_aio_complete_bh(void *opaque)
  {
      QEDAIOCB *acb = opaque;
 +    BDRVQEDState *s = acb_to_s(acb);
      BlockCompletionFunc *cb = acb->common.cb;
      void *user_opaque = acb->common.opaque;
      int ret = acb->bh_ret;
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete_bh(void *opaque)
      qemu_aio_unref(acb);
      /* Invoke callback */
 +    qed_acquire(s);
      cb(user_opaque, ret);
 +    qed_release(s);
  }
  static void qed_aio_complete(QEDAIOCB *acb, int ret)
 diff --git a/block/rbd.c b/block/rbd.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/rbd.c
 +++ b/block/rbd.c
@@ -XXX,XX +XXX,XX @@ shutdown:
  static void qemu_rbd_complete_aio(RADOSCB *rcb)
  {
      RBDAIOCB *acb = rcb->acb;
 +    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
      int64_t r;
      r = rcb->ret;
@@ -XXX,XX +XXX,XX @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
          qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
      }
      qemu_vfree(acb->bounce);
 +
 +    aio_context_acquire(ctx);
      acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
 +    aio_context_release(ctx);
      qemu_aio_unref(acb);
  }
 diff --git a/dma-helpers.c b/dma-helpers.c
 index XXXXXXX..XXXXXXX 100644
 --- a/dma-helpers.c
 +++ b/dma-helpers.c
@@ -XXX,XX +XXX,XX @@ static void dma_blk_cb(void *opaque, int ret)
                                  QEMU_ALIGN_DOWN(dbs->iov.size, dbs->align));
      }
 +    aio_context_acquire(dbs->ctx);
      dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
                              dma_blk_cb, dbs, dbs->io_func_opaque);
 +    aio_context_release(dbs->ctx);
      assert(dbs->acb);
  }
 diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/block/virtio-blk.c
 +++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_dma_restart_bh(void *opaque)
      s->rq = NULL;
 +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
      while (req) {
          VirtIOBlockReq *next = req->next;
          if (virtio_blk_handle_request(req, &mrb)) {
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_dma_restart_bh(void *opaque)
      if (mrb.num_reqs) {
          virtio_blk_submit_multireq(s->blk, &mrb);
      }
 +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
  }
  static void virtio_blk_dma_restart_cb(void *opaque, int running,
 diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/scsi/scsi-bus.c
 +++ b/hw/scsi/scsi-bus.c
@@ -XXX,XX +XXX,XX @@ static void scsi_dma_restart_bh(void *opaque)
      qemu_bh_delete(s->bh);
      s->bh = NULL;
 +    aio_context_acquire(blk_get_aio_context(s->conf.blk));
      QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) {
          scsi_req_ref(req);
          if (req->retry) {
@@ -XXX,XX +XXX,XX @@ static void scsi_dma_restart_bh(void *opaque)
          }
          scsi_req_unref(req);
      }
 +    aio_context_release(blk_get_aio_context(s->conf.blk));
  }
  void scsi_req_retry(SCSIRequest *req)
 diff --git a/util/async.c b/util/async.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/async.c
 +++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
                  ret = 1;
              }
              bh->idle = 0;
 -            aio_context_acquire(ctx);
              aio_bh_call(bh);
 -            aio_context_release(ctx);
          }
          if (bh->deleted) {
              deleted = true;
@@ -XXX,XX +XXX,XX @@ static void co_schedule_bh_cb(void *opaque)
          Coroutine *co = QSLIST_FIRST(&straight);
          QSLIST_REMOVE_HEAD(&straight, co_scheduled_next);
          trace_aio_co_schedule_bh_cb(ctx, co);
 +        aio_context_acquire(ctx);
          qemu_coroutine_enter(co);
 +        aio_context_release(ctx);
      }
  }
 diff --git a/util/thread-pool.c b/util/thread-pool.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/thread-pool.c
 +++ b/util/thread-pool.c
@@ -XXX,XX +XXX,XX @@ static void thread_pool_completion_bh(void *opaque)
      ThreadPool *pool = opaque;
      ThreadPoolElement *elem, *next;
 +    aio_context_acquire(pool->ctx);
  restart:
      QLIST_FOREACH_SAFE(elem, &pool->head, all, next) {
          if (elem->state != THREAD_DONE) {
@@ -XXX,XX +XXX,XX @@ restart:
              qemu_aio_unref(elem);
          }
      }
 +    aio_context_release(pool->ctx);
  }
  static void thread_pool_cancel(BlockAIOCB *acb)
 --
-.21.0
+.9.3

-[Qemu-devel] [PULL 06/20] event_match: always match on None value
+[Qemu-devel] [PULL v2 15/24] block: explicitly acquire aiocontext in aio callbacks that need it
-From: John Snow <jsnow@redhat.com>
+From: Paolo Bonzini <pbonzini@redhat.com>
-Before, event_match didn't always recurse if the event value was not a
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-dictionary, and would instead check for equality immediately.
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
 Message-id: 20170213135235.12274-16-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  block/archipelago.c    |  3 ---
  block/block-backend.c  |  7 -------
  block/curl.c           |  2 +-
  block/io.c             |  6 +-----
  block/iscsi.c          |  3 ---
  block/linux-aio.c      |  5 +----
  block/mirror.c         | 12 +++++++++---
  block/null.c           |  8 --------
  block/qed-cluster.c    |  2 ++
  block/qed-table.c      | 12 ++++++++++--
  block/qed.c            |  4 ++--
  block/rbd.c            |  4 ----
  block/win32-aio.c      |  3 ---
  hw/block/virtio-blk.c  | 12 +++++++++++-
  hw/scsi/scsi-disk.c    | 15 +++++++++++++++
  hw/scsi/scsi-generic.c | 20 +++++++++++++++++---
  util/thread-pool.c     |  4 +++-
  17 files changed, 72 insertions(+), 50 deletions(-)
-By delaying equality checking to post-recursion, we can allow leaf
+diff --git a/block/archipelago.c b/block/archipelago.c
-values like "5" to match "None" and take advantage of the generic
+index XXXXXXX..XXXXXXX 100644
-None-returns-True clause.
+--- a/block/archipelago.c
++++ b/block/archipelago.c
-This makes the matching a little more obviously consistent at the
+@@ -XXX,XX +XXX,XX @@ static void qemu_archipelago_complete_aio(void *opaque)
-expense of being able to check for explicit None values, which is
+ {
-probably not that important given what this function is used for.
+     AIORequestData *reqdata = (AIORequestData *) opaque;
+     ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb;
-Signed-off-by: John Snow <jsnow@redhat.com>
+-    AioContext *ctx = bdrv_get_aio_context(aio_cb->common.bs);
-Message-id: 20190528183857.26167-1-jsnow@redhat.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+-    aio_context_acquire(ctx);
----
+     aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret);
- python/qemu/__init__.py | 24 ++++++++++++++----------
+-    aio_context_release(ctx);
- 1 file changed, 14 insertions(+), 10 deletions(-)
+     aio_cb->status = 0;
-diff --git a/python/qemu/__init__.py b/python/qemu/__init__.py
+     qemu_aio_unref(aio_cb);
-index XXXXXXX..XXXXXXX 100644
+diff --git a/block/block-backend.c b/block/block-backend.c
---- a/python/qemu/__init__.py
+index XXXXXXX..XXXXXXX 100644
-+++ b/python/qemu/__init__.py
+--- a/block/block-backend.c
-@@ -XXX,XX +XXX,XX @@ class QEMUMachine(object):
++++ b/block/block-backend.c
+@@ -XXX,XX +XXX,XX @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
-         The match criteria takes the form of a matching subdict. The event is
+ static void error_callback_bh(void *opaque)
-         checked to be a superset of the subdict, recursively, with matching
+ {
--        values whenever those values are not None.
+     struct BlockBackendAIOCB *acb = opaque;
-+        values whenever the subdict values are not None.
+-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
-+
-+        This has a limitation that you cannot explicitly check for None values.
+     bdrv_dec_in_flight(acb->common.bs);
+-    aio_context_acquire(ctx);
-         Examples, with the subdict queries on the left:
+     acb->common.cb(acb->common.opaque, acb->ret);
-          - None matches any object.
+-    aio_context_release(ctx);
-          - {"foo": None} matches {"foo": {"bar": 1}}
+     qemu_aio_unref(acb);
--         - {"foo": {"baz": None}} does not match {"foo": {"bar": 1}}
+ }
--         - {"foo": {"baz": 2}} matches {"foo": {"bar": 1, "baz": 2}}
-+         - {"foo": None} matches {"foo": 5}
+@@ -XXX,XX +XXX,XX @@ static void blk_aio_complete(BlkAioEmAIOCB *acb)
-+         - {"foo": {"abc": None}} does not match {"foo": {"bar": 1}}
+ static void blk_aio_complete_bh(void *opaque)
-+         - {"foo": {"rab": 2}} matches {"foo": {"bar": 1, "rab": 2}}
+ {
-         """
+     BlkAioEmAIOCB *acb = opaque;
-         if match is None:
+-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
-             return True
+-
+     assert(acb->has_returned);
--        for key in match:
+-    aio_context_acquire(ctx);
--            if key in event:
+     blk_aio_complete(acb);
--                if isinstance(event[key], dict):
+-    aio_context_release(ctx);
-+        try:
+ }
-+            for key in match:
-+                if key in event:
+ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
-                     if not QEMUMachine.event_match(event[key], match[key]):
+diff --git a/block/curl.c b/block/curl.c
-                         return False
+index XXXXXXX..XXXXXXX 100644
--                elif event[key] != match[key]:
+--- a/block/curl.c
-+                else:
++++ b/block/curl.c
-                     return False
+@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
--            else:
+     curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
--                return False
--        return True
+ out:
-+            return True
++    aio_context_release(ctx);
-+        except TypeError:
+     if (ret != -EINPROGRESS) {
-+            # either match or event wasn't iterable (not a dict)
+         acb->common.cb(acb->common.opaque, ret);
-+            return match == event
+         qemu_aio_unref(acb);
+     }
-     def event_wait(self, name, timeout=60.0, match=None):
+-    aio_context_release(ctx);
-         """
+ }
  static BlockAIOCB *curl_aio_readv(BlockDriverState *bs,
 diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/io.c
 +++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_io_em_complete(void *opaque, int ret)
      CoroutineIOCompletion *co = opaque;
      co->ret = ret;
 -    qemu_coroutine_enter(co->coroutine);
 +    aio_co_wake(co->coroutine);
  }
  static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
  static void bdrv_co_em_bh(void *opaque)
  {
      BlockAIOCBCoroutine *acb = opaque;
 -    BlockDriverState *bs = acb->common.bs;
 -    AioContext *ctx = bdrv_get_aio_context(bs);
      assert(!acb->need_bh);
 -    aio_context_acquire(ctx);
      bdrv_co_complete(acb);
 -    aio_context_release(ctx);
  }
  static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
 diff --git a/block/iscsi.c b/block/iscsi.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/iscsi.c
 +++ b/block/iscsi.c
@@ -XXX,XX +XXX,XX @@ static void
  iscsi_bh_cb(void *p)
  {
      IscsiAIOCB *acb = p;
 -    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
      qemu_bh_delete(acb->bh);
      g_free(acb->buf);
      acb->buf = NULL;
 -    aio_context_acquire(ctx);
      acb->common.cb(acb->common.opaque, acb->status);
 -    aio_context_release(ctx);
      if (acb->task != NULL) {
          scsi_free_scsi_task(acb->task);
 diff --git a/block/linux-aio.c b/block/linux-aio.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/linux-aio.c
 +++ b/block/linux-aio.c
@@ -XXX,XX +XXX,XX @@ static inline ssize_t io_event_ret(struct io_event *ev)
   */
  static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
  {
 -    LinuxAioState *s = laiocb->ctx;
      int ret;
      ret = laiocb->ret;
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
      }
      laiocb->ret = ret;
 -    aio_context_acquire(s->aio_context);
      if (laiocb->co) {
          /* If the coroutine is already entered it must be in ioq_submit() and
           * will notice laio->ret has been filled in when it eventually runs
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
           * that!
           */
          if (!qemu_coroutine_entered(laiocb->co)) {
 -            qemu_coroutine_enter(laiocb->co);
 +            aio_co_wake(laiocb->co);
          }
      } else {
          laiocb->common.cb(laiocb->common.opaque, ret);
          qemu_aio_unref(laiocb);
      }
 -    aio_context_release(s->aio_context);
  }
  /**
 diff --git a/block/mirror.c b/block/mirror.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/mirror.c
 +++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ static void mirror_write_complete(void *opaque, int ret)
  {
      MirrorOp *op = opaque;
      MirrorBlockJob *s = op->s;
 +
 +    aio_context_acquire(blk_get_aio_context(s->common.blk));
      if (ret < 0) {
          BlockErrorAction action;
@@ -XXX,XX +XXX,XX @@ static void mirror_write_complete(void *opaque, int ret)
          }
      }
      mirror_iteration_done(op, ret);
 +    aio_context_release(blk_get_aio_context(s->common.blk));
  }
  static void mirror_read_complete(void *opaque, int ret)
  {
      MirrorOp *op = opaque;
      MirrorBlockJob *s = op->s;
 +
 +    aio_context_acquire(blk_get_aio_context(s->common.blk));
      if (ret < 0) {
          BlockErrorAction action;
@@ -XXX,XX +XXX,XX @@ static void mirror_read_complete(void *opaque, int ret)
          }
          mirror_iteration_done(op, ret);
 -        return;
 +    } else {
 +        blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
 +                        0, mirror_write_complete, op);
      }
 -    blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
 -                    0, mirror_write_complete, op);
 +    aio_context_release(blk_get_aio_context(s->common.blk));
  }
  static inline void mirror_clip_sectors(MirrorBlockJob *s,
 diff --git a/block/null.c b/block/null.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/null.c
 +++ b/block/null.c
@@ -XXX,XX +XXX,XX @@ static const AIOCBInfo null_aiocb_info = {
  static void null_bh_cb(void *opaque)
  {
      NullAIOCB *acb = opaque;
 -    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
 -
 -    aio_context_acquire(ctx);
      acb->common.cb(acb->common.opaque, 0);
 -    aio_context_release(ctx);
      qemu_aio_unref(acb);
  }
  static void null_timer_cb(void *opaque)
  {
      NullAIOCB *acb = opaque;
 -    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
 -
 -    aio_context_acquire(ctx);
      acb->common.cb(acb->common.opaque, 0);
 -    aio_context_release(ctx);
      timer_deinit(&acb->timer);
      qemu_aio_unref(acb);
  }
 diff --git a/block/qed-cluster.c b/block/qed-cluster.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qed-cluster.c
 +++ b/block/qed-cluster.c
@@ -XXX,XX +XXX,XX @@ static void qed_find_cluster_cb(void *opaque, int ret)
      unsigned int index;
      unsigned int n;
 +    qed_acquire(s);
      if (ret) {
          goto out;
      }
@@ -XXX,XX +XXX,XX @@ static void qed_find_cluster_cb(void *opaque, int ret)
  out:
      find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len);
 +    qed_release(s);
      g_free(find_cluster_cb);
  }
 diff --git a/block/qed-table.c b/block/qed-table.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qed-table.c
 +++ b/block/qed-table.c
@@ -XXX,XX +XXX,XX @@ static void qed_read_table_cb(void *opaque, int ret)
  {
      QEDReadTableCB *read_table_cb = opaque;
      QEDTable *table = read_table_cb->table;
 +    BDRVQEDState *s = read_table_cb->s;
      int noffsets = read_table_cb->qiov.size / sizeof(uint64_t);
      int i;
@@ -XXX,XX +XXX,XX @@ static void qed_read_table_cb(void *opaque, int ret)
      }
      /* Byteswap offsets */
 +    qed_acquire(s);
      for (i = 0; i < noffsets; i++) {
          table->offsets[i] = le64_to_cpu(table->offsets[i]);
      }
 +    qed_release(s);
  out:
      /* Completion */
 -    trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret);
 +    trace_qed_read_table_cb(s, read_table_cb->table, ret);
      gencb_complete(&read_table_cb->gencb, ret);
  }
@@ -XXX,XX +XXX,XX @@ typedef struct {
  static void qed_write_table_cb(void *opaque, int ret)
  {
      QEDWriteTableCB *write_table_cb = opaque;
 +    BDRVQEDState *s = write_table_cb->s;
 -    trace_qed_write_table_cb(write_table_cb->s,
 +    trace_qed_write_table_cb(s,
                               write_table_cb->orig_table,
                               write_table_cb->flush,
                               ret);
@@ -XXX,XX +XXX,XX @@ static void qed_write_table_cb(void *opaque, int ret)
      if (write_table_cb->flush) {
          /* We still need to flush first */
          write_table_cb->flush = false;
 +        qed_acquire(s);
          bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb,
                         write_table_cb);
 +        qed_release(s);
          return;
      }
@@ -XXX,XX +XXX,XX @@ static void qed_read_l2_table_cb(void *opaque, int ret)
      CachedL2Table *l2_table = request->l2_table;
      uint64_t l2_offset = read_l2_table_cb->l2_offset;
 +    qed_acquire(s);
      if (ret) {
          /* can't trust loaded L2 table anymore */
          qed_unref_l2_cache_entry(l2_table);
@@ -XXX,XX +XXX,XX @@ static void qed_read_l2_table_cb(void *opaque, int ret)
          request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
          assert(request->l2_table != NULL);
      }
 +    qed_release(s);
      gencb_complete(&read_l2_table_cb->gencb, ret);
  }
 diff --git a/block/qed.c b/block/qed.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qed.c
 +++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t l
      }
      if (cb->co) {
 -        qemu_coroutine_enter(cb->co);
 +        aio_co_wake(cb->co);
      }
  }
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret)
      cb->done = true;
      cb->ret = ret;
      if (cb->co) {
 -        qemu_coroutine_enter(cb->co);
 +        aio_co_wake(cb->co);
      }
  }
 diff --git a/block/rbd.c b/block/rbd.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/rbd.c
 +++ b/block/rbd.c
@@ -XXX,XX +XXX,XX @@ shutdown:
  static void qemu_rbd_complete_aio(RADOSCB *rcb)
  {
      RBDAIOCB *acb = rcb->acb;
 -    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
      int64_t r;
      r = rcb->ret;
@@ -XXX,XX +XXX,XX @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
          qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
      }
      qemu_vfree(acb->bounce);
 -
 -    aio_context_acquire(ctx);
      acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
 -    aio_context_release(ctx);
      qemu_aio_unref(acb);
  }
 diff --git a/block/win32-aio.c b/block/win32-aio.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/win32-aio.c
 +++ b/block/win32-aio.c
@@ -XXX,XX +XXX,XX @@ static void win32_aio_process_completion(QEMUWin32AIOState *s,
          qemu_vfree(waiocb->buf);
      }
 -
 -    aio_context_acquire(s->aio_ctx);
      waiocb->common.cb(waiocb->common.opaque, ret);
 -    aio_context_release(s->aio_ctx);
      qemu_aio_unref(waiocb);
  }
 diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/block/virtio-blk.c
 +++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
  static void virtio_blk_rw_complete(void *opaque, int ret)
  {
      VirtIOBlockReq *next = opaque;
 +    VirtIOBlock *s = next->dev;
 +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
      while (next) {
          VirtIOBlockReq *req = next;
          next = req->mr_next;
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_rw_complete(void *opaque, int ret)
          block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
          virtio_blk_free_request(req);
      }
 +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
  }
  static void virtio_blk_flush_complete(void *opaque, int ret)
  {
      VirtIOBlockReq *req = opaque;
 +    VirtIOBlock *s = req->dev;
 +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
      if (ret) {
          if (virtio_blk_handle_rw_error(req, -ret, 0)) {
 -            return;
 +            goto out;
          }
      }
      virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
      block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
      virtio_blk_free_request(req);
 +
 +out:
 +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
  }
  #ifdef __linux__
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_ioctl_complete(void *opaque, int status)
      virtio_stl_p(vdev, &scsi->data_len, hdr->dxfer_len);
  out:
 +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
      virtio_blk_req_complete(req, status);
      virtio_blk_free_request(req);
 +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
      g_free(ioctl_req);
  }
 diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/scsi/scsi-disk.c
 +++ b/hw/scsi/scsi-disk.c
@@ -XXX,XX +XXX,XX @@ static void scsi_aio_complete(void *opaque, int ret)
      assert(r->req.aiocb != NULL);
      r->req.aiocb = NULL;
 +    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
      if (scsi_disk_req_check_error(r, ret, true)) {
          goto done;
      }
@@ -XXX,XX +XXX,XX @@ static void scsi_aio_complete(void *opaque, int ret)
      scsi_req_complete(&r->req, GOOD);
  done:
 +    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
      scsi_req_unref(&r->req);
  }
@@ -XXX,XX +XXX,XX @@ static void scsi_dma_complete(void *opaque, int ret)
      assert(r->req.aiocb != NULL);
      r->req.aiocb = NULL;
 +    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
      if (ret < 0) {
          block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
      } else {
          block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
      }
      scsi_dma_complete_noio(r, ret);
 +    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
  }
  static void scsi_read_complete(void * opaque, int ret)
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
      assert(r->req.aiocb != NULL);
      r->req.aiocb = NULL;
 +    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
      if (scsi_disk_req_check_error(r, ret, true)) {
          goto done;
      }
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
  done:
      scsi_req_unref(&r->req);
 +    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
  }
  /* Actually issue a read to the block device.  */
@@ -XXX,XX +XXX,XX @@ static void scsi_do_read_cb(void *opaque, int ret)
      assert (r->req.aiocb != NULL);
      r->req.aiocb = NULL;
 +    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
      if (ret < 0) {
          block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
      } else {
          block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
      }
      scsi_do_read(opaque, ret);
 +    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
  }
  /* Read more data from scsi device into buffer.  */
@@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret)
      assert (r->req.aiocb != NULL);
      r->req.aiocb = NULL;
 +    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
      if (ret < 0) {
          block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
      } else {
          block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
      }
      scsi_write_complete_noio(r, ret);
 +    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
  }
  static void scsi_write_data(SCSIRequest *req)
@@ -XXX,XX +XXX,XX @@ static void scsi_unmap_complete(void *opaque, int ret)
  {
      UnmapCBData *data = opaque;
      SCSIDiskReq *r = data->r;
 +    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
      assert(r->req.aiocb != NULL);
      r->req.aiocb = NULL;
 +    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
      scsi_unmap_complete_noio(data, ret);
 +    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
  }
  static void scsi_disk_emulate_unmap(SCSIDiskReq *r, uint8_t *inbuf)
@@ -XXX,XX +XXX,XX @@ static void scsi_write_same_complete(void *opaque, int ret)
      assert(r->req.aiocb != NULL);
      r->req.aiocb = NULL;
 +    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
      if (scsi_disk_req_check_error(r, ret, true)) {
          goto done;
      }
@@ -XXX,XX +XXX,XX @@ done:
      scsi_req_unref(&r->req);
      qemu_vfree(data->iov.iov_base);
      g_free(data);
 +    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
  }
  static void scsi_disk_emulate_write_same(SCSIDiskReq *r, uint8_t *inbuf)
 diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/scsi/scsi-generic.c
 +++ b/hw/scsi/scsi-generic.c
@@ -XXX,XX +XXX,XX @@ done:
  static void scsi_command_complete(void *opaque, int ret)
  {
      SCSIGenericReq *r = (SCSIGenericReq *)opaque;
 +    SCSIDevice *s = r->req.dev;
      assert(r->req.aiocb != NULL);
      r->req.aiocb = NULL;
 +
 +    aio_context_acquire(blk_get_aio_context(s->conf.blk));
      scsi_command_complete_noio(r, ret);
 +    aio_context_release(blk_get_aio_context(s->conf.blk));
  }
  static int execute_command(BlockBackend *blk,
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
      assert(r->req.aiocb != NULL);
      r->req.aiocb = NULL;
 +    aio_context_acquire(blk_get_aio_context(s->conf.blk));
 +
      if (ret || r->req.io_canceled) {
          scsi_command_complete_noio(r, ret);
 -        return;
 +        goto done;
      }
      len = r->io_header.dxfer_len - r->io_header.resid;
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
      r->len = -1;
      if (len == 0) {
          scsi_command_complete_noio(r, 0);
 -        return;
 +        goto done;
      }
      /* Snoop READ CAPACITY output to set the blocksize.  */
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
      }
      scsi_req_data(&r->req, len);
      scsi_req_unref(&r->req);
 +
 +done:
 +    aio_context_release(blk_get_aio_context(s->conf.blk));
  }
  /* Read more data from scsi device into buffer.  */
@@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret)
      assert(r->req.aiocb != NULL);
      r->req.aiocb = NULL;
 +    aio_context_acquire(blk_get_aio_context(s->conf.blk));
 +
      if (ret || r->req.io_canceled) {
          scsi_command_complete_noio(r, ret);
 -        return;
 +        goto done;
      }
      if (r->req.cmd.buf[0] == MODE_SELECT && r->req.cmd.buf[4] == 12 &&
@@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret)
      }
      scsi_command_complete_noio(r, ret);
 +
 +done:
 +    aio_context_release(blk_get_aio_context(s->conf.blk));
  }
  /* Write data to a scsi device.  Returns nonzero on failure.
 diff --git a/util/thread-pool.c b/util/thread-pool.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/thread-pool.c
 +++ b/util/thread-pool.c
@@ -XXX,XX +XXX,XX @@ restart:
               */
              qemu_bh_schedule(pool->completion_bh);
 +            aio_context_release(pool->ctx);
              elem->common.cb(elem->common.opaque, elem->ret);
 +            aio_context_acquire(pool->ctx);
              qemu_aio_unref(elem);
              goto restart;
          } else {
@@ -XXX,XX +XXX,XX @@ static void thread_pool_co_cb(void *opaque, int ret)
      ThreadPoolCo *co = opaque;
      co->ret = ret;
 -    qemu_coroutine_enter(co->co);
 +    aio_co_wake(co->co);
  }
  int coroutine_fn thread_pool_submit_co(ThreadPool *pool, ThreadPoolFunc *func,
 --
-.21.0
+.9.3

-[Qemu-devel] [PULL 01/20] blockdev-backup: don't check aio_context too early
+[Qemu-devel] [PULL v2 16/24] aio-posix: partially inline aio_dispatch into aio_poll
-From: John Snow <jsnow@redhat.com>
+From: Paolo Bonzini <pbonzini@redhat.com>
-in blockdev_backup_prepare, we check to make sure that the target is
+This patch prepares for the removal of unnecessary lockcnt inc/dec pairs.
-associated with a compatible aio context. However, do_blockdev_backup is
+Extract the dispatching loop for file descriptor handlers into a new
-called later and has some logic to move the target to a compatible
+function aio_dispatch_handlers, and then inline aio_dispatch into
-aio_context. The transaction version will fail certain commands
+aio_poll.
 needlessly early as a result.
-Allow blockdev_backup_prepare to simply call do_blockdev_backup, which
+aio_dispatch can now become void.
 will ultimately decide if the contexts are compatible or not.
-Note: the transaction version has always disallowed this operation since
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-its initial commit bd8baecd (2014), whereas the version of
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-qmp_blockdev_backup at the time, from commit c29c1dd312f, tried to
+Reviewed-by: Fam Zheng <famz@redhat.com>
-enforce the aio_context switch instead. It's not clear, and I can't see
+Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
-from the mailing list archives at the time, why the two functions take a
+Message-id: 20170213135235.12274-17-pbonzini@redhat.com
-different approach. It wasn't until later in efd7556708b (2016) that the
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-standalone version tried to determine if it could set the context or
+---
-not.
+ include/block/aio.h |  6 +-----
  util/aio-posix.c    | 44 ++++++++++++++------------------------------
  util/aio-win32.c    | 13 ++++---------
  util/async.c        |  2 +-
 files changed, 20 insertions(+), 45 deletions(-)
-Reported-by: aihua liang <aliang@redhat.com>
+diff --git a/include/block/aio.h b/include/block/aio.h
 Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1683498
 Signed-off-by: John Snow <jsnow@redhat.com>
 Message-id: 20190523170643.20794-2-jsnow@redhat.com
 Reviewed-by: Max Reitz <mreitz@redhat.com>
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
  blockdev.c | 4 ----
 file changed, 4 deletions(-)
 diff --git a/blockdev.c b/blockdev.c
 index XXXXXXX..XXXXXXX 100644
---- a/blockdev.c
+--- a/include/block/aio.h
-+++ b/blockdev.c
++++ b/include/block/aio.h
-@@ -XXX,XX +XXX,XX @@ static void blockdev_backup_prepare(BlkActionState *common, Error **errp)
+@@ -XXX,XX +XXX,XX @@ bool aio_pending(AioContext *ctx);
  /* Dispatch any pending callbacks from the GSource attached to the AioContext.
   *
   * This is used internally in the implementation of the GSource.
 - *
 - * @dispatch_fds: true to process fds, false to skip them
 - *                (can be used as an optimization by callers that know there
 - *                are no fds ready)
   */
 -bool aio_dispatch(AioContext *ctx, bool dispatch_fds);
 +void aio_dispatch(AioContext *ctx);
  /* Progress in completing AIO work to occur.  This can issue new pending
   * aio as a result of executing I/O completion or bh callbacks.
 diff --git a/util/aio-posix.c b/util/aio-posix.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/aio-posix.c
 +++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
      AioHandler *node, *tmp;
      bool progress = false;
 -    /*
 -     * We have to walk very carefully in case aio_set_fd_handler is
 -     * called while we're walking.
 -     */
 -    qemu_lockcnt_inc(&ctx->list_lock);
 -
      QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
          int revents;
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
          }
      }
-     aio_context = bdrv_get_aio_context(bs);
+-    qemu_lockcnt_dec(&ctx->list_lock);
--    if (aio_context != bdrv_get_aio_context(target)) {
+     return progress;
--        error_setg(errp, "Backup between two IO threads is not implemented");
+ }
--        return;
 -/*
 - * Note that dispatch_fds == false has the side-effect of post-poning the
 - * freeing of deleted handlers.
 - */
 -bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
 +void aio_dispatch(AioContext *ctx)
  {
 -    bool progress;
 +    aio_bh_poll(ctx);
 -    /*
 -     * If there are callbacks left that have been queued, we need to call them.
 -     * Do not call select in this case, because it is possible that the caller
 -     * does not need a complete flush (as is the case for aio_poll loops).
 -     */
 -    progress = aio_bh_poll(ctx);
 +    qemu_lockcnt_inc(&ctx->list_lock);
 +    aio_dispatch_handlers(ctx);
 +    qemu_lockcnt_dec(&ctx->list_lock);
 -    if (dispatch_fds) {
 -        progress |= aio_dispatch_handlers(ctx);
 -    }
-     aio_context_acquire(aio_context);
+-
-     state->bs = bs;
+-    /* Run our timers */
 -    progress |= timerlistgroup_run_timers(&ctx->tlg);
 -
 -    return progress;
 +    timerlistgroup_run_timers(&ctx->tlg);
  }
  /* These thread-local variables are used only in a small part of aio_poll
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
      npfd = 0;
      qemu_lockcnt_dec(&ctx->list_lock);
 -    /* Run dispatch even if there were no readable fds to run timers */
 -    if (aio_dispatch(ctx, ret > 0)) {
 -        progress = true;
 +    progress |= aio_bh_poll(ctx);
 +
 +    if (ret > 0) {
 +        qemu_lockcnt_inc(&ctx->list_lock);
 +        progress |= aio_dispatch_handlers(ctx);
 +        qemu_lockcnt_dec(&ctx->list_lock);
      }
 +    progress |= timerlistgroup_run_timers(&ctx->tlg);
 +
      return progress;
  }
 diff --git a/util/aio-win32.c b/util/aio-win32.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/aio-win32.c
 +++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
      return progress;
  }
 -bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
 +void aio_dispatch(AioContext *ctx)
  {
 -    bool progress;
 -
 -    progress = aio_bh_poll(ctx);
 -    if (dispatch_fds) {
 -        progress |= aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
 -    }
 -    progress |= timerlistgroup_run_timers(&ctx->tlg);
 -    return progress;
 +    aio_bh_poll(ctx);
 +    aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
 +    timerlistgroup_run_timers(&ctx->tlg);
  }
  bool aio_poll(AioContext *ctx, bool blocking)
 diff --git a/util/async.c b/util/async.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/async.c
 +++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ aio_ctx_dispatch(GSource     *source,
      AioContext *ctx = (AioContext *) source;
      assert(callback == NULL);
 -    aio_dispatch(ctx, true);
 +    aio_dispatch(ctx);
      return true;
  }
 --
-.21.0
+.9.3

-[Qemu-devel] [PULL 14/20] qemu-img: Move quiet into ImgConvertState
+[Qemu-devel] [PULL v2 17/24] async: remove unnecessary inc/dec pairs
-Move img_convert()'s quiet flag into the ImgConvertState so it is
+From: Paolo Bonzini <pbonzini@redhat.com>
 accessible by nested functions.  -q dictates that it suppresses anything
 but errors, so if those functions want to emit warnings, they need to
 query this flag first.  (There currently are no such warnings, but there
 will be as of the next patch.)
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Pull the increment/decrement pair out of aio_bh_poll and into the
-Reviewed-by: Eric Blake <eblake@redhat.com>
+callers.
-Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
-Message-id: 20190507203508.18026-2-mreitz@redhat.com
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
 Message-id: 20170213135235.12274-18-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- qemu-img.c | 13 +++++++------
+ util/aio-posix.c |  8 +++-----
-file changed, 7 insertions(+), 6 deletions(-)
+ util/aio-win32.c |  8 ++++----
  util/async.c     | 12 ++++++------
 files changed, 13 insertions(+), 15 deletions(-)
-diff --git a/qemu-img.c b/qemu-img.c
+diff --git a/util/aio-posix.c b/util/aio-posix.c
 index XXXXXXX..XXXXXXX 100644
---- a/qemu-img.c
+--- a/util/aio-posix.c
-+++ b/qemu-img.c
++++ b/util/aio-posix.c
-@@ -XXX,XX +XXX,XX @@ typedef struct ImgConvertState {
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
-     int64_t target_backing_sectors; /* negative if unknown */
-     bool wr_in_order;
+ void aio_dispatch(AioContext *ctx)
-     bool copy_range;
+ {
-+    bool quiet;
++    qemu_lockcnt_inc(&ctx->list_lock);
-     int min_sparse;
+     aio_bh_poll(ctx);
-     int alignment;
+-
-     size_t cluster_sectors;
+-    qemu_lockcnt_inc(&ctx->list_lock);
-@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
+     aio_dispatch_handlers(ctx);
-     QDict *open_opts = NULL;
+     qemu_lockcnt_dec(&ctx->list_lock);
-     char *options = NULL;
-     Error *local_err = NULL;
+@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
 -    bool writethrough, src_writethrough, quiet = false, image_opts = false,
 +    bool writethrough, src_writethrough, image_opts = false,
           skip_create = false, progress = false, tgt_image_opts = false;
      int64_t ret = -EINVAL;
      bool force_share = false;
@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
              src_cache = optarg;
              break;
          case 'q':
 -            quiet = true;
 +            s.quiet = true;
              break;
          case 'n':
              skip_create = true;
@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
      }
-     /* Initialize before goto out */
+     npfd = 0;
--    if (quiet) {
+-    qemu_lockcnt_dec(&ctx->list_lock);
-+    if (s.quiet) {
-         progress = false;
+     progress |= aio_bh_poll(ctx);
      if (ret > 0) {
 -        qemu_lockcnt_inc(&ctx->list_lock);
          progress |= aio_dispatch_handlers(ctx);
 -        qemu_lockcnt_dec(&ctx->list_lock);
      }
-     qemu_progress_init(progress, 1.0);
-@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
++    qemu_lockcnt_dec(&ctx->list_lock);
++
-     for (bs_i = 0; bs_i < s.src_num; bs_i++) {
+     progress |= timerlistgroup_run_timers(&ctx->tlg);
-         s.src[bs_i] = img_open(image_opts, argv[optind + bs_i],
--                               fmt, src_flags, src_writethrough, quiet,
+     return progress;
-+                               fmt, src_flags, src_writethrough, s.quiet,
+diff --git a/util/aio-win32.c b/util/aio-win32.c
-                                force_share);
+index XXXXXXX..XXXXXXX 100644
-         if (!s.src[bs_i]) {
+--- a/util/aio-win32.c
-             ret = -1;
++++ b/util/aio-win32.c
-@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
+     bool progress = false;
-     if (skip_create) {
+     AioHandler *tmp;
-         s.target = img_open(tgt_image_opts, out_filename, out_fmt,
--                            flags, writethrough, quiet, false);
+-    qemu_lockcnt_inc(&ctx->list_lock);
-+                            flags, writethrough, s.quiet, false);
+-
-     } else {
+     /*
-         /* TODO ultimately we should allow --target-image-opts
+      * We have to walk very carefully in case aio_set_fd_handler is
-          * to be used even when -n is not given.
+      * called while we're walking.
-@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
+@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
-          * to allow filenames in option syntax
+         }
           */
          s.target = img_open_file(out_filename, open_opts, out_fmt,
 -                                 flags, writethrough, quiet, false);
 +                                 flags, writethrough, s.quiet, false);
          open_opts = NULL; /* blk_new_open will have freed it */
      }
-     if (!s.target) {
 -    qemu_lockcnt_dec(&ctx->list_lock);
      return progress;
  }
  void aio_dispatch(AioContext *ctx)
  {
 +    qemu_lockcnt_inc(&ctx->list_lock);
      aio_bh_poll(ctx);
      aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
 +    qemu_lockcnt_dec(&ctx->list_lock);
      timerlistgroup_run_timers(&ctx->tlg);
  }
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
          }
      }
 -    qemu_lockcnt_dec(&ctx->list_lock);
      first = true;
      /* ctx->notifier is always registered.  */
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
          progress |= aio_dispatch_handlers(ctx, event);
      } while (count > 0);
 +    qemu_lockcnt_dec(&ctx->list_lock);
 +
      progress |= timerlistgroup_run_timers(&ctx->tlg);
      return progress;
  }
 diff --git a/util/async.c b/util/async.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/async.c
 +++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ void aio_bh_call(QEMUBH *bh)
      bh->cb(bh->opaque);
  }
 -/* Multiple occurrences of aio_bh_poll cannot be called concurrently */
 +/* Multiple occurrences of aio_bh_poll cannot be called concurrently.
 + * The count in ctx->list_lock is incremented before the call, and is
 + * not affected by the call.
 + */
  int aio_bh_poll(AioContext *ctx)
  {
      QEMUBH *bh, **bhp, *next;
      int ret;
      bool deleted = false;
 -    qemu_lockcnt_inc(&ctx->list_lock);
 -
      ret = 0;
      for (bh = atomic_rcu_read(&ctx->first_bh); bh; bh = next) {
          next = atomic_rcu_read(&bh->next);
@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
      /* remove deleted bhs */
      if (!deleted) {
 -        qemu_lockcnt_dec(&ctx->list_lock);
          return ret;
      }
 -    if (qemu_lockcnt_dec_and_lock(&ctx->list_lock)) {
 +    if (qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
          bhp = &ctx->first_bh;
          while (*bhp) {
              bh = *bhp;
@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
                  bhp = &bh->next;
              }
          }
 -        qemu_lockcnt_unlock(&ctx->list_lock);
 +        qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
      }
      return ret;
  }
 --
-.21.0
+.9.3

-[Qemu-devel] [PULL 04/20] iotests.py: rewrite run_job to be pickier
+[Qemu-devel] [PULL v2 18/24] block: document fields protected by AioContext lock
-From: John Snow <jsnow@redhat.com>
+From: Paolo Bonzini <pbonzini@redhat.com>
-Don't pull events out of the queue that don't belong to us;
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-be choosier so that we can use this method to drive jobs that
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-were launched by transactions that may have more jobs.
+Reviewed-by: Fam Zheng <famz@redhat.com>
 Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
 Message-id: 20170213135235.12274-19-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  include/block/block_int.h      | 64 +++++++++++++++++++++++++-----------------
  include/sysemu/block-backend.h | 14 ++++++---
 files changed, 49 insertions(+), 29 deletions(-)
-Signed-off-by: John Snow <jsnow@redhat.com>
+diff --git a/include/block/block_int.h b/include/block/block_int.h
 Message-id: 20190523170643.20794-5-jsnow@redhat.com
 Reviewed-by: Max Reitz <mreitz@redhat.com>
 Signed-off-by: Max Reitz <mreitz@redhat.com>
 ---
  tests/qemu-iotests/iotests.py | 48 +++++++++++++++++++++--------------
 file changed, 29 insertions(+), 19 deletions(-)
 diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
 index XXXXXXX..XXXXXXX 100644
---- a/tests/qemu-iotests/iotests.py
+--- a/include/block/block_int.h
-+++ b/tests/qemu-iotests/iotests.py
++++ b/include/block/block_int.h
-@@ -XXX,XX +XXX,XX @@ class VM(qtest.QEMUQtestMachine):
+@@ -XXX,XX +XXX,XX @@ struct BdrvChild {
-     # Returns None on success, and an error string on failure
+  * copied as well.
-     def run_job(self, job, auto_finalize=True, auto_dismiss=False,
+  */
-                 pre_finalize=None, wait=60.0):
+ struct BlockDriverState {
-+        match_device = {'data': {'device': job}}
+-    int64_t total_sectors; /* if we are reading a disk image, give its
-+        match_id = {'data': {'id': job}}
+-                              size in sectors */
-+        events = [
++    /* Protected by big QEMU lock or read-only after opening.  No special
-+            ('BLOCK_JOB_COMPLETED', match_device),
++     * locking needed during I/O...
-+            ('BLOCK_JOB_CANCELLED', match_device),
++     */
-+            ('BLOCK_JOB_ERROR', match_device),
+     int open_flags; /* flags used to open the file, re-used for re-open */
-+            ('BLOCK_JOB_READY', match_device),
+     bool read_only; /* if true, the media is read only */
-+            ('BLOCK_JOB_PENDING', match_id),
+     bool encrypted; /* if true, the media is encrypted */
-+            ('JOB_STATUS_CHANGE', match_id)
+@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
-+        ]
+     bool sg;        /* if true, the device is a /dev/sg* */
-         error = None
+     bool probed;    /* if true, format was probed rather than specified */
-         while True:
--            for ev in self.get_qmp_events_filtered(wait=wait):
+-    int copy_on_read; /* if nonzero, copy read backing sectors into image.
--                if ev['event'] == 'JOB_STATUS_CHANGE':
+-                         note this is a reference count */
--                    status = ev['data']['status']
+-
--                    if status == 'aborting':
+-    CoQueue flush_queue;            /* Serializing flush queue */
--                        result = self.qmp('query-jobs')
+-    bool active_flush_req;          /* Flush request in flight? */
--                        for j in result['return']:
+-    unsigned int write_gen;         /* Current data generation */
--                            if j['id'] == job:
+-    unsigned int flushed_gen;       /* Flushed write generation */
--                                error = j['error']
+-
--                                log('Job failed: %s' % (j['error']))
+     BlockDriver *drv; /* NULL means no media */
--                    elif status == 'pending' and not auto_finalize:
+     void *opaque;
--                        if pre_finalize:
--                            pre_finalize()
+@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
--                        self.qmp_log('job-finalize', id=job)
+     BdrvChild *backing;
--                    elif status == 'concluded' and not auto_dismiss:
+     BdrvChild *file;
--                        self.qmp_log('job-dismiss', id=job)
--                    elif status == 'null':
+-    /* Callback before write request is processed */
--                        return error
+-    NotifierWithReturnList before_write_notifiers;
--                else:
+-
--                    log(ev)
+-    /* number of in-flight requests; overall and serialising */
-+            ev = filter_qmp_event(self.events_wait(events))
+-    unsigned int in_flight;
-+            if ev['event'] != 'JOB_STATUS_CHANGE':
+-    unsigned int serialising_in_flight;
-+                log(ev)
+-
-+                continue
+-    bool wakeup;
-+            status = ev['data']['status']
+-
-+            if status == 'aborting':
+-    /* Offset after the highest byte written to */
-+                result = self.qmp('query-jobs')
+-    uint64_t wr_highest_offset;
-+                for j in result['return']:
+-
-+                    if j['id'] == job:
+     /* I/O Limits */
-+                        error = j['error']
+     BlockLimits bl;
-+                        log('Job failed: %s' % (j['error']))
-+            elif status == 'pending' and not auto_finalize:
+@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
-+                if pre_finalize:
+     QTAILQ_ENTRY(BlockDriverState) bs_list;
-+                    pre_finalize()
+     /* element of the list of monitor-owned BDS */
-+                self.qmp_log('job-finalize', id=job)
+     QTAILQ_ENTRY(BlockDriverState) monitor_list;
-+            elif status == 'concluded' and not auto_dismiss:
+-    QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
-+                self.qmp_log('job-dismiss', id=job)
+     int refcnt;
-+            elif status == 'null':
-+                return error
+-    QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
+-
-     def node_info(self, node_name):
+     /* operation blockers */
-         nodes = self.qmp('query-named-block-nodes')
+     QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX];
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
      /* The error object in use for blocking operations on backing_hd */
      Error *backing_blocker;
 +    /* Protected by AioContext lock */
 +
 +    /* If nonzero, copy read backing sectors into image.  This is a reference
 +     * count: it can be >1 if more than one client has requested copy-on-read.
 +     */
 +    int copy_on_read;
 +
 +    /* If we are reading a disk image, give its size in sectors.
 +     * Generally read-only; it is written to by load_vmstate and save_vmstate,
 +     * but the block layer is quiescent during those.
 +     */
 +    int64_t total_sectors;
 +
 +    /* Callback before write request is processed */
 +    NotifierWithReturnList before_write_notifiers;
 +
 +    /* number of in-flight requests; overall and serialising */
 +    unsigned int in_flight;
 +    unsigned int serialising_in_flight;
 +
 +    bool wakeup;
 +
 +    /* Offset after the highest byte written to */
 +    uint64_t wr_highest_offset;
 +
      /* threshold limit for writes, in bytes. "High water mark". */
      uint64_t write_threshold_offset;
      NotifierWithReturn write_threshold_notifier;
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
      /* counter for nested bdrv_io_plug */
      unsigned io_plugged;
 +    QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
 +    CoQueue flush_queue;                  /* Serializing flush queue */
 +    bool active_flush_req;                /* Flush request in flight? */
 +    unsigned int write_gen;               /* Current data generation */
 +    unsigned int flushed_gen;             /* Flushed write generation */
 +
 +    QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
 +
 +    /* do we need to tell the guest if we have a volatile write cache? */
 +    int enable_write_cache;
 +
      int quiesce_counter;
  };
 diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/sysemu/block-backend.h
 +++ b/include/sysemu/block-backend.h
@@ -XXX,XX +XXX,XX @@ typedef struct BlockDevOps {
   * fields that must be public. This is in particular for QLIST_ENTRY() and
   * friends so that BlockBackends can be kept in lists outside block-backend.c */
  typedef struct BlockBackendPublic {
 -    /* I/O throttling.
 -     * throttle_state tells us if this BlockBackend has I/O limits configured.
 -     * io_limits_disabled tells us if they are currently being enforced */
 +    /* I/O throttling has its own locking, but also some fields are
 +     * protected by the AioContext lock.
 +     */
 +
 +    /* Protected by AioContext lock.  */
      CoQueue      throttled_reqs[2];
 +
 +    /* Nonzero if the I/O limits are currently being ignored; generally
 +     * it is zero.  */
      unsigned int io_limits_disabled;
      /* The following fields are protected by the ThrottleGroup lock.
 -     * See the ThrottleGroup documentation for details. */
 +     * See the ThrottleGroup documentation for details.
 +     * throttle_state tells us if I/O limits are configured. */
      ThrottleState *throttle_state;
      ThrottleTimers throttle_timers;
      unsigned       pending_reqs[2];
 --
-.21.0
+.9.3

-New patch
+[Qemu-devel] [PULL v2 19/24] coroutine-lock: make CoMutex thread-safe
+From: Paolo Bonzini <pbonzini@redhat.com>
 This uses the lock-free mutex described in the paper '"Blocking without
 Locking", or LFTHREADS: A lock-free thread library' by Gidenstam and
 Papatriantafilou.  The same technique is used in OSv, and in fact
 the code is essentially a conversion to C of OSv's code.
 [Added missing coroutine_fn in tests/test-aio-multithread.c.
 --Stefan]
 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Message-id: 20170213181244.16297-2-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  include/qemu/coroutine.h     |  17 ++++-
  tests/test-aio-multithread.c |  86 ++++++++++++++++++++++++
  util/qemu-coroutine-lock.c   | 155 ++++++++++++++++++++++++++++++++++++++++---
  util/trace-events            |   1 +
 files changed, 246 insertions(+), 13 deletions(-)
 diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/coroutine.h
 +++ b/include/qemu/coroutine.h
@@ -XXX,XX +XXX,XX @@ bool qemu_co_queue_empty(CoQueue *queue);
  /**
   * Provides a mutex that can be used to synchronise coroutines
   */
 +struct CoWaitRecord;
  typedef struct CoMutex {
 -    bool locked;
 +    /* Count of pending lockers; 0 for a free mutex, 1 for an
 +     * uncontended mutex.
 +     */
 +    unsigned locked;
 +
 +    /* A queue of waiters.  Elements are added atomically in front of
 +     * from_push.  to_pop is only populated, and popped from, by whoever
 +     * is in charge of the next wakeup.  This can be an unlocker or,
 +     * through the handoff protocol, a locker that is about to go to sleep.
 +     */
 +    QSLIST_HEAD(, CoWaitRecord) from_push, to_pop;
 +
 +    unsigned handoff, sequence;
 +
      Coroutine *holder;
 -    CoQueue queue;
  } CoMutex;
  /**
 diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tests/test-aio-multithread.c
 +++ b/tests/test-aio-multithread.c
@@ -XXX,XX +XXX,XX @@ static void test_multi_co_schedule_10(void)
      test_multi_co_schedule(10);
  }
 +/* CoMutex thread-safety.  */
 +
 +static uint32_t atomic_counter;
 +static uint32_t running;
 +static uint32_t counter;
 +static CoMutex comutex;
 +
 +static void coroutine_fn test_multi_co_mutex_entry(void *opaque)
 +{
 +    while (!atomic_mb_read(&now_stopping)) {
 +        qemu_co_mutex_lock(&comutex);
 +        counter++;
 +        qemu_co_mutex_unlock(&comutex);
 +
 +        /* Increase atomic_counter *after* releasing the mutex.  Otherwise
 +         * there is a chance (it happens about 1 in 3 runs) that the iothread
 +         * exits before the coroutine is woken up, causing a spurious
 +         * assertion failure.
 +         */
 +        atomic_inc(&atomic_counter);
 +    }
 +    atomic_dec(&running);
 +}
 +
 +static void test_multi_co_mutex(int threads, int seconds)
 +{
 +    int i;
 +
 +    qemu_co_mutex_init(&comutex);
 +    counter = 0;
 +    atomic_counter = 0;
 +    now_stopping = false;
 +
 +    create_aio_contexts();
 +    assert(threads <= NUM_CONTEXTS);
 +    running = threads;
 +    for (i = 0; i < threads; i++) {
 +        Coroutine *co1 = qemu_coroutine_create(test_multi_co_mutex_entry, NULL);
 +        aio_co_schedule(ctx[i], co1);
 +    }
 +
 +    g_usleep(seconds * 1000000);
 +
 +    atomic_mb_set(&now_stopping, true);
 +    while (running > 0) {
 +        g_usleep(100000);
 +    }
 +
 +    join_aio_contexts();
 +    g_test_message("%d iterations/second\n", counter / seconds);
 +    g_assert_cmpint(counter, ==, atomic_counter);
 +}
 +
 +/* Testing with NUM_CONTEXTS threads focuses on the queue.  The mutex however
 + * is too contended (and the threads spend too much time in aio_poll)
 + * to actually stress the handoff protocol.
 + */
 +static void test_multi_co_mutex_1(void)
 +{
 +    test_multi_co_mutex(NUM_CONTEXTS, 1);
 +}
 +
 +static void test_multi_co_mutex_10(void)
 +{
 +    test_multi_co_mutex(NUM_CONTEXTS, 10);
 +}
 +
 +/* Testing with fewer threads stresses the handoff protocol too.  Still, the
 + * case where the locker _can_ pick up a handoff is very rare, happening
 + * about 10 times in 1 million, so increase the runtime a bit compared to
 + * other "quick" testcases that only run for 1 second.
 + */
 +static void test_multi_co_mutex_2_3(void)
 +{
 +    test_multi_co_mutex(2, 3);
 +}
 +
 +static void test_multi_co_mutex_2_30(void)
 +{
 +    test_multi_co_mutex(2, 30);
 +}
 +
  /* End of tests.  */
  int main(int argc, char **argv)
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
      g_test_add_func("/aio/multi/lifecycle", test_lifecycle);
      if (g_test_quick()) {
          g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1);
 +        g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_1);
 +        g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_3);
      } else {
          g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10);
 +        g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_10);
 +        g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_30);
      }
      return g_test_run();
  }
 diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/qemu-coroutine-lock.c
 +++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@
   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
   * THE SOFTWARE.
 + *
 + * The lock-free mutex implementation is based on OSv
 + * (core/lfmutex.cc, include/lockfree/mutex.hh).
 + * Copyright (C) 2013 Cloudius Systems, Ltd.
   */
  #include "qemu/osdep.h"
@@ -XXX,XX +XXX,XX @@ bool qemu_co_queue_empty(CoQueue *queue)
      return QSIMPLEQ_FIRST(&queue->entries) == NULL;
  }
 +/* The wait records are handled with a multiple-producer, single-consumer
 + * lock-free queue.  There cannot be two concurrent pop_waiter() calls
 + * because pop_waiter() can only be called while mutex->handoff is zero.
 + * This can happen in three cases:
 + * - in qemu_co_mutex_unlock, before the hand-off protocol has started.
 + *   In this case, qemu_co_mutex_lock will see mutex->handoff == 0 and
 + *   not take part in the handoff.
 + * - in qemu_co_mutex_lock, if it steals the hand-off responsibility from
 + *   qemu_co_mutex_unlock.  In this case, qemu_co_mutex_unlock will fail
 + *   the cmpxchg (it will see either 0 or the next sequence value) and
 + *   exit.  The next hand-off cannot begin until qemu_co_mutex_lock has
 + *   woken up someone.
 + * - in qemu_co_mutex_unlock, if it takes the hand-off token itself.
 + *   In this case another iteration starts with mutex->handoff == 0;
 + *   a concurrent qemu_co_mutex_lock will fail the cmpxchg, and
 + *   qemu_co_mutex_unlock will go back to case (1).
 + *
 + * The following functions manage this queue.
 + */
 +typedef struct CoWaitRecord {
 +    Coroutine *co;
 +    QSLIST_ENTRY(CoWaitRecord) next;
 +} CoWaitRecord;
 +
 +static void push_waiter(CoMutex *mutex, CoWaitRecord *w)
 +{
 +    w->co = qemu_coroutine_self();
 +    QSLIST_INSERT_HEAD_ATOMIC(&mutex->from_push, w, next);
 +}
 +
 +static void move_waiters(CoMutex *mutex)
 +{
 +    QSLIST_HEAD(, CoWaitRecord) reversed;
 +    QSLIST_MOVE_ATOMIC(&reversed, &mutex->from_push);
 +    while (!QSLIST_EMPTY(&reversed)) {
 +        CoWaitRecord *w = QSLIST_FIRST(&reversed);
 +        QSLIST_REMOVE_HEAD(&reversed, next);
 +        QSLIST_INSERT_HEAD(&mutex->to_pop, w, next);
 +    }
 +}
 +
 +static CoWaitRecord *pop_waiter(CoMutex *mutex)
 +{
 +    CoWaitRecord *w;
 +
 +    if (QSLIST_EMPTY(&mutex->to_pop)) {
 +        move_waiters(mutex);
 +        if (QSLIST_EMPTY(&mutex->to_pop)) {
 +            return NULL;
 +        }
 +    }
 +    w = QSLIST_FIRST(&mutex->to_pop);
 +    QSLIST_REMOVE_HEAD(&mutex->to_pop, next);
 +    return w;
 +}
 +
 +static bool has_waiters(CoMutex *mutex)
 +{
 +    return QSLIST_EMPTY(&mutex->to_pop) || QSLIST_EMPTY(&mutex->from_push);
 +}
 +
  void qemu_co_mutex_init(CoMutex *mutex)
  {
      memset(mutex, 0, sizeof(*mutex));
 -    qemu_co_queue_init(&mutex->queue);
  }
 -void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
 +static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
  {
      Coroutine *self = qemu_coroutine_self();
 +    CoWaitRecord w;
 +    unsigned old_handoff;
      trace_qemu_co_mutex_lock_entry(mutex, self);
 +    w.co = self;
 +    push_waiter(mutex, &w);
 -    while (mutex->locked) {
 -        qemu_co_queue_wait(&mutex->queue);
 +    /* This is the "Responsibility Hand-Off" protocol; a lock() picks from
 +     * a concurrent unlock() the responsibility of waking somebody up.
 +     */
 +    old_handoff = atomic_mb_read(&mutex->handoff);
 +    if (old_handoff &&
 +        has_waiters(mutex) &&
 +        atomic_cmpxchg(&mutex->handoff, old_handoff, 0) == old_handoff) {
 +        /* There can be no concurrent pops, because there can be only
 +         * one active handoff at a time.
 +         */
 +        CoWaitRecord *to_wake = pop_waiter(mutex);
 +        Coroutine *co = to_wake->co;
 +        if (co == self) {
 +            /* We got the lock ourselves!  */
 +            assert(to_wake == &w);
 +            return;
 +        }
 +
 +        aio_co_wake(co);
      }
 -    mutex->locked = true;
 -    mutex->holder = self;
 -    self->locks_held++;
 -
 +    qemu_coroutine_yield();
      trace_qemu_co_mutex_lock_return(mutex, self);
  }
 +void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
 +{
 +    Coroutine *self = qemu_coroutine_self();
 +
 +    if (atomic_fetch_inc(&mutex->locked) == 0) {
 +        /* Uncontended.  */
 +        trace_qemu_co_mutex_lock_uncontended(mutex, self);
 +    } else {
 +        qemu_co_mutex_lock_slowpath(mutex);
 +    }
 +    mutex->holder = self;
 +    self->locks_held++;
 +}
 +
  void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
  {
      Coroutine *self = qemu_coroutine_self();
      trace_qemu_co_mutex_unlock_entry(mutex, self);
 -    assert(mutex->locked == true);
 +    assert(mutex->locked);
      assert(mutex->holder == self);
      assert(qemu_in_coroutine());
 -    mutex->locked = false;
      mutex->holder = NULL;
      self->locks_held--;
 -    qemu_co_queue_next(&mutex->queue);
 +    if (atomic_fetch_dec(&mutex->locked) == 1) {
 +        /* No waiting qemu_co_mutex_lock().  Phew, that was easy!  */
 +        return;
 +    }
 +
 +    for (;;) {
 +        CoWaitRecord *to_wake = pop_waiter(mutex);
 +        unsigned our_handoff;
 +
 +        if (to_wake) {
 +            Coroutine *co = to_wake->co;
 +            aio_co_wake(co);
 +            break;
 +        }
 +
 +        /* Some concurrent lock() is in progress (we know this because
 +         * mutex->locked was >1) but it hasn't yet put itself on the wait
 +         * queue.  Pick a sequence number for the handoff protocol (not 0).
 +         */
 +        if (++mutex->sequence == 0) {
 +            mutex->sequence = 1;
 +        }
 +
 +        our_handoff = mutex->sequence;
 +        atomic_mb_set(&mutex->handoff, our_handoff);
 +        if (!has_waiters(mutex)) {
 +            /* The concurrent lock has not added itself yet, so it
 +             * will be able to pick our handoff.
 +             */
 +            break;
 +        }
 +
 +        /* Try to do the handoff protocol ourselves; if somebody else has
 +         * already taken it, however, we're done and they're responsible.
 +         */
 +        if (atomic_cmpxchg(&mutex->handoff, our_handoff, 0) != our_handoff) {
 +            break;
 +        }
 +    }
      trace_qemu_co_mutex_unlock_return(mutex, self);
  }
 diff --git a/util/trace-events b/util/trace-events
 index XXXXXXX..XXXXXXX 100644
 --- a/util/trace-events
 +++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ qemu_coroutine_terminate(void *co) "self %p"
  # util/qemu-coroutine-lock.c
  qemu_co_queue_run_restart(void *co) "co %p"
 +qemu_co_mutex_lock_uncontended(void *mutex, void *self) "mutex %p self %p"
  qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p"
  qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p"
  qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p"
 --
 .9.3

-[Qemu-devel] [PULL 15/20] qemu-img: Add salvaging mode to convert
+[Qemu-devel] [PULL v2 20/24] coroutine-lock: add limited spinning to CoMutex
-This adds a salvaging mode (--salvage) to qemu-img convert which ignores
+From: Paolo Bonzini <pbonzini@redhat.com>
 read errors and treats the respective areas as containing only zeroes.
 This can be used for instance to at least partially recover the data
 from terminally corrupted qcow2 images.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Running a very small critical section on pthread_mutex_t and CoMutex
-Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
+shows that pthread_mutex_t is much faster because it doesn't actually
-Message-id: 20190507203508.18026-3-mreitz@redhat.com
+go to sleep.  What happens is that the critical section is shorter
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+than the latency of entering the kernel and thus FUTEX_WAIT always
 fails.  With CoMutex there is no such latency but you still want to
 avoid wait and wakeup.  So introduce it artificially.
 This only works with one waiter; because CoMutex is fair, it will
 always have more waits and wakeups than a pthread_mutex_t.
 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Message-id: 20170213181244.16297-3-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- qemu-img.c       | 90 +++++++++++++++++++++++++++++++++++++-----------
+ include/qemu/coroutine.h   |  5 +++++
- qemu-img-cmds.hx |  4 +--
+ util/qemu-coroutine-lock.c | 51 ++++++++++++++++++++++++++++++++++++++++------
- qemu-img.texi    |  4 +++
+ util/qemu-coroutine.c      |  2 +-
-files changed, 75 insertions(+), 23 deletions(-)
+files changed, 51 insertions(+), 7 deletions(-)
-diff --git a/qemu-img.c b/qemu-img.c
+diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
 index XXXXXXX..XXXXXXX 100644
---- a/qemu-img.c
+--- a/include/qemu/coroutine.h
-+++ b/qemu-img.c
++++ b/include/qemu/coroutine.h
-@@ -XXX,XX +XXX,XX @@ enum {
+@@ -XXX,XX +XXX,XX @@ typedef struct CoMutex {
-     OPTION_SIZE = 264,
+      */
-     OPTION_PREALLOCATION = 265,
+     unsigned locked;
-     OPTION_SHRINK = 266,
-+    OPTION_SALVAGE = 267,
++    /* Context that is holding the lock.  Useful to avoid spinning
- };
++     * when two coroutines on the same AioContext try to get the lock. :)
++     */
- typedef enum OutputFormat {
++    AioContext *ctx;
-@@ -XXX,XX +XXX,XX @@ typedef struct ImgConvertState {
++
-     int64_t target_backing_sectors; /* negative if unknown */
+     /* A queue of waiters.  Elements are added atomically in front of
-     bool wr_in_order;
+      * from_push.  to_pop is only populated, and popped from, by whoever
-     bool copy_range;
+      * is in charge of the next wakeup.  This can be an unlocker or,
-+    bool salvage;
+diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
-     bool quiet;
+index XXXXXXX..XXXXXXX 100644
-     int min_sparse;
+--- a/util/qemu-coroutine-lock.c
-     int alignment;
++++ b/util/qemu-coroutine-lock.c
-@@ -XXX,XX +XXX,XX @@ static int convert_iteration_sectors(ImgConvertState *s, int64_t sector_num)
+@@ -XXX,XX +XXX,XX @@
  #include "qemu-common.h"
  #include "qemu/coroutine.h"
  #include "qemu/coroutine_int.h"
 +#include "qemu/processor.h"
  #include "qemu/queue.h"
  #include "block/aio.h"
  #include "trace.h"
@@ -XXX,XX +XXX,XX @@ void qemu_co_mutex_init(CoMutex *mutex)
      memset(mutex, 0, sizeof(*mutex));
  }
 -static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
 +static void coroutine_fn qemu_co_mutex_wake(CoMutex *mutex, Coroutine *co)
 +{
 +    /* Read co before co->ctx; pairs with smp_wmb() in
 +     * qemu_coroutine_enter().
 +     */
 +    smp_read_barrier_depends();
 +    mutex->ctx = co->ctx;
 +    aio_co_wake(co);
 +}
 +
 +static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx,
 +                                                     CoMutex *mutex)
  {
      Coroutine *self = qemu_coroutine_self();
      CoWaitRecord w;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
          if (co == self) {
              /* We got the lock ourselves!  */
              assert(to_wake == &w);
 +            mutex->ctx = ctx;
              return;
          }
 -        aio_co_wake(co);
 +        qemu_co_mutex_wake(mutex, co);
      }
-     if (s->sector_next_status <= sector_num) {
+     qemu_coroutine_yield();
--        int64_t count = n * BDRV_SECTOR_SIZE;
+@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
-+        uint64_t offset = (sector_num - src_cur_offset) * BDRV_SECTOR_SIZE;
-+        int64_t count;
+ void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
+ {
--        if (s->target_has_backing) {
++    AioContext *ctx = qemu_get_current_aio_context();
-+        do {
+     Coroutine *self = qemu_coroutine_self();
-+            count = n * BDRV_SECTOR_SIZE;
++    int waiters, i;
-+
-+            if (s->target_has_backing) {
+-    if (atomic_fetch_inc(&mutex->locked) == 0) {
-+                ret = bdrv_block_status(blk_bs(s->src[src_cur]), offset,
++    /* Running a very small critical section on pthread_mutex_t and CoMutex
-+                                        count, &count, NULL, NULL);
++     * shows that pthread_mutex_t is much faster because it doesn't actually
-+            } else {
++     * go to sleep.  What happens is that the critical section is shorter
-+                ret = bdrv_block_status_above(blk_bs(s->src[src_cur]), NULL,
++     * than the latency of entering the kernel and thus FUTEX_WAIT always
-+                                              offset, count, &count, NULL,
++     * fails.  With CoMutex there is no such latency but you still want to
-+                                              NULL);
++     * avoid wait and wakeup.  So introduce it artificially.
 +     */
 +    i = 0;
 +retry_fast_path:
 +    waiters = atomic_cmpxchg(&mutex->locked, 0, 1);
 +    if (waiters != 0) {
 +        while (waiters == 1 && ++i < 1000) {
 +            if (atomic_read(&mutex->ctx) == ctx) {
 +                break;
 +            }
-+
++            if (atomic_read(&mutex->locked) == 0) {
-+            if (ret < 0) {
++                goto retry_fast_path;
 +                if (s->salvage) {
 +                    if (n == 1) {
 +                        if (!s->quiet) {
 +                            warn_report("error while reading block status at "
 +                                        "offset %" PRIu64 ": %s", offset,
 +                                        strerror(-ret));
 +                        }
 +                        /* Just try to read the data, then */
 +                        ret = BDRV_BLOCK_DATA;
 +                        count = BDRV_SECTOR_SIZE;
 +                    } else {
 +                        /* Retry on a shorter range */
 +                        n = DIV_ROUND_UP(n, 4);
 +                    }
 +                } else {
 +                    error_report("error while reading block status at offset "
 +                                 "%" PRIu64 ": %s", offset, strerror(-ret));
 +                    return ret;
 +                }
 +            }
-+        } while (ret < 0);
++            cpu_relax();
 -            ret = bdrv_block_status(blk_bs(s->src[src_cur]),
 -                                    (sector_num - src_cur_offset) *
 -                                    BDRV_SECTOR_SIZE,
 -                                    count, &count, NULL, NULL);
 -        } else {
 -            ret = bdrv_block_status_above(blk_bs(s->src[src_cur]), NULL,
 -                                          (sector_num - src_cur_offset) *
 -                                          BDRV_SECTOR_SIZE,
 -                                          count, &count, NULL, NULL);
 -        }
 -        if (ret < 0) {
 -            error_report("error while reading block status of sector %" PRId64
 -                         ": %s", sector_num, strerror(-ret));
 -            return ret;
 -        }
          n = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
          if (ret & BDRV_BLOCK_ZERO) {
@@ -XXX,XX +XXX,XX @@ static int convert_iteration_sectors(ImgConvertState *s, int64_t sector_num)
  static int coroutine_fn convert_co_read(ImgConvertState *s, int64_t sector_num,
                                          int nb_sectors, uint8_t *buf)
  {
 +    uint64_t single_read_until = 0;
      int n, ret;
      assert(nb_sectors <= s->buf_sectors);
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn convert_co_read(ImgConvertState *s, int64_t sector_num,
          BlockBackend *blk;
          int src_cur;
          int64_t bs_sectors, src_cur_offset;
 +        uint64_t offset;
          /* In the case of compression with multiple source files, we can get a
           * nb_sectors that spreads into the next part. So we must be able to
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn convert_co_read(ImgConvertState *s, int64_t sector_num,
          blk = s->src[src_cur];
          bs_sectors = s->src_sectors[src_cur];
 +        offset = (sector_num - src_cur_offset) << BDRV_SECTOR_BITS;
 +
          n = MIN(nb_sectors, bs_sectors - (sector_num - src_cur_offset));
 +        if (single_read_until > offset) {
 +            n = 1;
 +        }
++        waiters = atomic_fetch_inc(&mutex->locked);
 -        ret = blk_co_pread(
 -                blk, (sector_num - src_cur_offset) << BDRV_SECTOR_BITS,
 -                n << BDRV_SECTOR_BITS, buf, 0);
 +        ret = blk_co_pread(blk, offset, n << BDRV_SECTOR_BITS, buf, 0);
          if (ret < 0) {
 -            return ret;
 +            if (s->salvage) {
 +                if (n > 1) {
 +                    single_read_until = offset + (n << BDRV_SECTOR_BITS);
 +                    continue;
 +                } else {
 +                    if (!s->quiet) {
 +                        warn_report("error while reading offset %" PRIu64
 +                                    ": %s", offset, strerror(-ret));
 +                    }
 +                    memset(buf, 0, BDRV_SECTOR_SIZE);
 +                }
 +            } else {
 +                return ret;
 +            }
          }
          sector_num += n;
@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
              {"image-opts", no_argument, 0, OPTION_IMAGE_OPTS},
              {"force-share", no_argument, 0, 'U'},
              {"target-image-opts", no_argument, 0, OPTION_TARGET_IMAGE_OPTS},
 +            {"salvage", no_argument, 0, OPTION_SALVAGE},
              {0, 0, 0, 0}
          };
          c = getopt_long(argc, argv, ":hf:O:B:Cco:l:S:pt:T:qnm:WU",
@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
          case OPTION_IMAGE_OPTS:
              image_opts = true;
              break;
 +        case OPTION_SALVAGE:
 +            s.salvage = true;
 +            break;
          case OPTION_TARGET_IMAGE_OPTS:
              tgt_image_opts = true;
              break;
@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
          goto fail_getopt;
      }
 +    if (s.copy_range && s.salvage) {
 +        error_report("Cannot use copy offloading in salvaging mode");
 +        goto fail_getopt;
 +    }
 +
-     if (tgt_image_opts && !skip_create) {
++    if (waiters == 0) {
-         error_report("--target-image-opts requires use of -n flag");
+         /* Uncontended.  */
-         goto fail_getopt;
+         trace_qemu_co_mutex_lock_uncontended(mutex, self);
-diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
++        mutex->ctx = ctx;
      } else {
 -        qemu_co_mutex_lock_slowpath(mutex);
 +        qemu_co_mutex_lock_slowpath(ctx, mutex);
      }
      mutex->holder = self;
      self->locks_held++;
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
      assert(mutex->holder == self);
      assert(qemu_in_coroutine());
 +    mutex->ctx = NULL;
      mutex->holder = NULL;
      self->locks_held--;
      if (atomic_fetch_dec(&mutex->locked) == 1) {
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
          unsigned our_handoff;
          if (to_wake) {
 -            Coroutine *co = to_wake->co;
 -            aio_co_wake(co);
 +            qemu_co_mutex_wake(mutex, to_wake->co);
              break;
          }
 diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
 index XXXXXXX..XXXXXXX 100644
---- a/qemu-img-cmds.hx
+--- a/util/qemu-coroutine.c
-+++ b/qemu-img-cmds.hx
++++ b/util/qemu-coroutine.c
-@@ -XXX,XX +XXX,XX @@ STEXI
+@@ -XXX,XX +XXX,XX @@ void qemu_coroutine_enter(Coroutine *co)
- ETEXI
+     co->ctx = qemu_get_current_aio_context();
- DEF("convert", img_convert,
+     /* Store co->ctx before anything that stores co.  Matches
--    "convert [--object objectdef] [--image-opts] [--target-image-opts] [-U] [-C] [-c] [-p] [-q] [-n] [-f fmt] [-t cache] [-T src_cache] [-O output_fmt] [-B backing_file] [-o options] [-l snapshot_param] [-S sparse_size] [-m num_coroutines] [-W] filename [filename2 [...]] output_filename")
+-     * barrier in aio_co_wake.
-+    "convert [--object objectdef] [--image-opts] [--target-image-opts] [-U] [-C] [-c] [-p] [-q] [-n] [-f fmt] [-t cache] [-T src_cache] [-O output_fmt] [-B backing_file] [-o options] [-l snapshot_param] [-S sparse_size] [-m num_coroutines] [-W] [--salvage] filename [filename2 [...]] output_filename")
++     * barrier in aio_co_wake and qemu_co_mutex_wake.
- STEXI
+      */
--@item convert [--object @var{objectdef}] [--image-opts] [--target-image-opts] [-U] [-C] [-c] [-p] [-q] [-n] [-f @var{fmt}] [-t @var{cache}] [-T @var{src_cache}] [-O @var{output_fmt}] [-B @var{backing_file}] [-o @var{options}] [-l @var{snapshot_param}] [-S @var{sparse_size}] [-m @var{num_coroutines}] [-W] @var{filename} [@var{filename2} [...]] @var{output_filename}
+     smp_wmb();
-+@item convert [--object @var{objectdef}] [--image-opts] [--target-image-opts] [-U] [-C] [-c] [-p] [-q] [-n] [-f @var{fmt}] [-t @var{cache}] [-T @var{src_cache}] [-O @var{output_fmt}] [-B @var{backing_file}] [-o @var{options}] [-l @var{snapshot_param}] [-S @var{sparse_size}] [-m @var{num_coroutines}] [-W] [--salvage] @var{filename} [@var{filename2} [...]] @var{output_filename}
  ETEXI
  DEF("create", img_create,
 diff --git a/qemu-img.texi b/qemu-img.texi
 index XXXXXXX..XXXXXXX 100644
 --- a/qemu-img.texi
 +++ b/qemu-img.texi
@@ -XXX,XX +XXX,XX @@ improve performance if the data is remote, such as with NFS or iSCSI backends,
  but will not automatically sparsify zero sectors, and may result in a fully
  allocated target image depending on the host support for getting allocation
  information.
 +@item --salvage
 +Try to ignore I/O errors when reading.  Unless in quiet mode (@code{-q}), errors
 +will still be printed.  Areas that cannot be read from the source will be
 +treated as containing only zeroes.
  @end table
  Parameters to dd subcommand:
 --
-2.21.0
+2.9.3

-New patch
+[Qemu-devel] [PULL v2 21/24] test-aio-multithread: add performance comparison with thread-based mutexes
+From: Paolo Bonzini <pbonzini@redhat.com>
 Add two implementations of the same benchmark as the previous patch,
 but using pthreads.  One uses a normal QemuMutex, the other is Linux
 only and implements a fair mutex based on MCS locks and futexes.
 This shows that the slower performance of the 5-thread case is due to
 the fairness of CoMutex, rather than to coroutines.  If fairness does
 not matter, as is the case with two threads, CoMutex can actually be
 faster than pthreads.
 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Message-id: 20170213181244.16297-4-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  tests/test-aio-multithread.c | 164 +++++++++++++++++++++++++++++++++++++++++++
  1 file changed, 164 insertions(+)
 diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tests/test-aio-multithread.c
 +++ b/tests/test-aio-multithread.c
@@ -XXX,XX +XXX,XX @@ static void test_multi_co_mutex_2_30(void)
      test_multi_co_mutex(2, 30);
  }
 +/* Same test with fair mutexes, for performance comparison.  */
 +
 +#ifdef CONFIG_LINUX
 +#include "qemu/futex.h"
 +
 +/* The nodes for the mutex reside in this structure (on which we try to avoid
 + * false sharing).  The head of the mutex is in the "mutex_head" variable.
 + */
 +static struct {
 +    int next, locked;
 +    int padding[14];
 +} nodes[NUM_CONTEXTS] __attribute__((__aligned__(64)));
 +
 +static int mutex_head = -1;
 +
 +static void mcs_mutex_lock(void)
 +{
 +    int prev;
 +
 +    nodes[id].next = -1;
 +    nodes[id].locked = 1;
 +    prev = atomic_xchg(&mutex_head, id);
 +    if (prev != -1) {
 +        atomic_set(&nodes[prev].next, id);
 +        qemu_futex_wait(&nodes[id].locked, 1);
 +    }
 +}
 +
 +static void mcs_mutex_unlock(void)
 +{
 +    int next;
 +    if (nodes[id].next == -1) {
 +        if (atomic_read(&mutex_head) == id &&
 +            atomic_cmpxchg(&mutex_head, id, -1) == id) {
 +            /* Last item in the list, exit.  */
 +            return;
 +        }
 +        while (atomic_read(&nodes[id].next) == -1) {
 +            /* mcs_mutex_lock did the xchg, but has not updated
 +             * nodes[prev].next yet.
 +             */
 +        }
 +    }
 +
 +    /* Wake up the next in line.  */
 +    next = nodes[id].next;
 +    nodes[next].locked = 0;
 +    qemu_futex_wake(&nodes[next].locked, 1);
 +}
 +
 +static void test_multi_fair_mutex_entry(void *opaque)
 +{
 +    while (!atomic_mb_read(&now_stopping)) {
 +        mcs_mutex_lock();
 +        counter++;
 +        mcs_mutex_unlock();
 +        atomic_inc(&atomic_counter);
 +    }
 +    atomic_dec(&running);
 +}
 +
 +static void test_multi_fair_mutex(int threads, int seconds)
 +{
 +    int i;
 +
 +    assert(mutex_head == -1);
 +    counter = 0;
 +    atomic_counter = 0;
 +    now_stopping = false;
 +
 +    create_aio_contexts();
 +    assert(threads <= NUM_CONTEXTS);
 +    running = threads;
 +    for (i = 0; i < threads; i++) {
 +        Coroutine *co1 = qemu_coroutine_create(test_multi_fair_mutex_entry, NULL);
 +        aio_co_schedule(ctx[i], co1);
 +    }
 +
 +    g_usleep(seconds * 1000000);
 +
 +    atomic_mb_set(&now_stopping, true);
 +    while (running > 0) {
 +        g_usleep(100000);
 +    }
 +
 +    join_aio_contexts();
 +    g_test_message("%d iterations/second\n", counter / seconds);
 +    g_assert_cmpint(counter, ==, atomic_counter);
 +}
 +
 +static void test_multi_fair_mutex_1(void)
 +{
 +    test_multi_fair_mutex(NUM_CONTEXTS, 1);
 +}
 +
 +static void test_multi_fair_mutex_10(void)
 +{
 +    test_multi_fair_mutex(NUM_CONTEXTS, 10);
 +}
 +#endif
 +
 +/* Same test with pthread mutexes, for performance comparison and
 + * portability.  */
 +
 +static QemuMutex mutex;
 +
 +static void test_multi_mutex_entry(void *opaque)
 +{
 +    while (!atomic_mb_read(&now_stopping)) {
 +        qemu_mutex_lock(&mutex);
 +        counter++;
 +        qemu_mutex_unlock(&mutex);
 +        atomic_inc(&atomic_counter);
 +    }
 +    atomic_dec(&running);
 +}
 +
 +static void test_multi_mutex(int threads, int seconds)
 +{
 +    int i;
 +
 +    qemu_mutex_init(&mutex);
 +    counter = 0;
 +    atomic_counter = 0;
 +    now_stopping = false;
 +
 +    create_aio_contexts();
 +    assert(threads <= NUM_CONTEXTS);
 +    running = threads;
 +    for (i = 0; i < threads; i++) {
 +        Coroutine *co1 = qemu_coroutine_create(test_multi_mutex_entry, NULL);
 +        aio_co_schedule(ctx[i], co1);
 +    }
 +
 +    g_usleep(seconds * 1000000);
 +
 +    atomic_mb_set(&now_stopping, true);
 +    while (running > 0) {
 +        g_usleep(100000);
 +    }
 +
 +    join_aio_contexts();
 +    g_test_message("%d iterations/second\n", counter / seconds);
 +    g_assert_cmpint(counter, ==, atomic_counter);
 +}
 +
 +static void test_multi_mutex_1(void)
 +{
 +    test_multi_mutex(NUM_CONTEXTS, 1);
 +}
 +
 +static void test_multi_mutex_10(void)
 +{
 +    test_multi_mutex(NUM_CONTEXTS, 10);
 +}
 +
  /* End of tests.  */
  int main(int argc, char **argv)
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
          g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1);
          g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_1);
          g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_3);
 +#ifdef CONFIG_LINUX
 +        g_test_add_func("/aio/multi/mutex/mcs", test_multi_fair_mutex_1);
 +#endif
 +        g_test_add_func("/aio/multi/mutex/pthread", test_multi_mutex_1);
      } else {
          g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10);
          g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_10);
          g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_30);
 +#ifdef CONFIG_LINUX
 +        g_test_add_func("/aio/multi/mutex/mcs", test_multi_fair_mutex_10);
 +#endif
 +        g_test_add_func("/aio/multi/mutex/pthread", test_multi_mutex_10);
      }
      return g_test_run();
  }
 --
 2.9.3

-[Qemu-devel] [PULL 19/20] iotests: Test qemu-img convert --salvage
+[Qemu-devel] [PULL v2 22/24] coroutine-lock: place CoMutex before CoQueue in header
-This test converts a simple image to another, but blkdebug injects
+From: Paolo Bonzini <pbonzini@redhat.com>
 block_status and read faults at some offsets.  The resulting image
 should be the same as the input image, except that sectors that could
 not be read have to be 0.
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+This will avoid forward references in the next patch.  It is also
-Message-id: 20190507203508.18026-7-mreitz@redhat.com
+more logical because CoQueue is no longer the basic primitive.
-Tested-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
-Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-[mreitz: Dropped superfluous printf from _filter_offsets, as suggested
+Reviewed-by: Fam Zheng <famz@redhat.com>
-         by Vladimir; disable test for VDI and IMGOPTSSYNTAX]
+Message-id: 20170213181244.16297-5-pbonzini@redhat.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- tests/qemu-iotests/251     | 170 +++++++++++++++++++++++++++++++++++++
+ include/qemu/coroutine.h | 89 ++++++++++++++++++++++++------------------------
- tests/qemu-iotests/251.out |  43 ++++++++++
+ 1 file changed, 44 insertions(+), 45 deletions(-)
  tests/qemu-iotests/group   |   1 +
  3 files changed, 214 insertions(+)
  create mode 100755 tests/qemu-iotests/251
  create mode 100644 tests/qemu-iotests/251.out
-diff --git a/tests/qemu-iotests/251 b/tests/qemu-iotests/251
+diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
-new file mode 100755
+index XXXXXXX..XXXXXXX 100644
-index XXXXXXX..XXXXXXX
+--- a/include/qemu/coroutine.h
---- /dev/null
++++ b/include/qemu/coroutine.h
-+++ b/tests/qemu-iotests/251
+@@ -XXX,XX +XXX,XX @@ bool qemu_in_coroutine(void);
-@@ -XXX,XX +XXX,XX @@
+  */
-+#!/usr/bin/env bash
+ bool qemu_coroutine_entered(Coroutine *co);
-+#
-+# Test qemu-img convert --salvage
+-
-+#
+-/**
-+# Copyright (C) 2019 Red Hat, Inc.
+- * CoQueues are a mechanism to queue coroutines in order to continue executing
-+#
+- * them later. They provide the fundamental primitives on which coroutine locks
-+# This program is free software; you can redistribute it and/or modify
+- * are built.
-+# it under the terms of the GNU General Public License as published by
+- */
-+# the Free Software Foundation; either version 2 of the License, or
+-typedef struct CoQueue {
-+# (at your option) any later version.
+-    QSIMPLEQ_HEAD(, Coroutine) entries;
-+#
+-} CoQueue;
-+# This program is distributed in the hope that it will be useful,
+-
-+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+-/**
-+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+- * Initialise a CoQueue. This must be called before any other operation is used
-+# GNU General Public License for more details.
+- * on the CoQueue.
-+#
+- */
-+# You should have received a copy of the GNU General Public License
+-void qemu_co_queue_init(CoQueue *queue);
-+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+-
-+#
+-/**
 - * Adds the current coroutine to the CoQueue and transfers control to the
 - * caller of the coroutine.
 - */
 -void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
 -
 -/**
 - * Restarts the next coroutine in the CoQueue and removes it from the queue.
 - *
 - * Returns true if a coroutine was restarted, false if the queue is empty.
 - */
 -bool coroutine_fn qemu_co_queue_next(CoQueue *queue);
 -
 -/**
 - * Restarts all coroutines in the CoQueue and leaves the queue empty.
 - */
 -void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue);
 -
 -/**
 - * Enter the next coroutine in the queue
 - */
 -bool qemu_co_enter_next(CoQueue *queue);
 -
 -/**
 - * Checks if the CoQueue is empty.
 - */
 -bool qemu_co_queue_empty(CoQueue *queue);
 -
 -
  /**
   * Provides a mutex that can be used to synchronise coroutines
   */
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex);
   */
  void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex);
 +
-+# creator
++/**
-+owner=mreitz@redhat.com
++ * CoQueues are a mechanism to queue coroutines in order to continue executing
 + * them later.
 + */
 +typedef struct CoQueue {
 +    QSIMPLEQ_HEAD(, Coroutine) entries;
 +} CoQueue;
 +
-+seq=$(basename $0)
++/**
-+echo "QA output created by $seq"
++ * Initialise a CoQueue. This must be called before any other operation is used
 + * on the CoQueue.
 + */
 +void qemu_co_queue_init(CoQueue *queue);
 +
-+status=1    # failure is the default!
++/**
 + * Adds the current coroutine to the CoQueue and transfers control to the
 + * caller of the coroutine.
 + */
 +void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
 +
-+_cleanup()
++/**
-+{
++ * Restarts the next coroutine in the CoQueue and removes it from the queue.
-+    _cleanup_test_img
++ *
-+}
++ * Returns true if a coroutine was restarted, false if the queue is empty.
-+trap "_cleanup; exit \$status" 0 1 2 3 15
++ */
 +bool coroutine_fn qemu_co_queue_next(CoQueue *queue);
 +
-+# get standard environment, filters and checks
++/**
-+. ./common.rc
++ * Restarts all coroutines in the CoQueue and leaves the queue empty.
-+. ./common.filter
++ */
-+. ./common.qemu
++void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue);
 +
-+_supported_fmt generic
++/**
-+_supported_proto file
++ * Enter the next coroutine in the queue
-+_supported_os Linux
++ */
 +bool qemu_co_enter_next(CoQueue *queue);
 +
-+if [ "$IMGOPTSSYNTAX" = "true" ]; then
++/**
-+    # We use json:{} filenames here, so we cannot work with additional options.
++ * Checks if the CoQueue is empty.
-+    _unsupported_fmt $IMGFMT
++ */
-+else
++bool qemu_co_queue_empty(CoQueue *queue);
 +    # With VDI, the output is ordered differently.  Just disable it.
 +    _unsupported_fmt vdi
 +fi
 +
 +
-+TEST_IMG="$TEST_IMG.orig" _make_test_img 64M
+ typedef struct CoRwlock {
-+
+     bool writer;
-+$QEMU_IO -c 'write -P 42 0 64M' "$TEST_IMG.orig" | _filter_qemu_io
+     int reader;
 +
 +
 +sector_size=512
 +
 +# Offsets on which to fail block-status.  Keep in ascending order so
 +# the indexing done by _filter_offsets will appear in ascending order
 +# in the output as well.
 +status_fail_offsets="$((16 * 1024 * 1024 + 8192))
 +                     $((33 * 1024 * 1024 + 512))"
 +
 +# Offsets on which to fail reads.  Keep in ascending order for the
 +# same reason.
 +# The second element is shared with $status_fail_offsets on purpose.
 +# Starting with the third element, we test what happens when a
 +# continuous range of sectors is inaccessible.
 +read_fail_offsets="$((32 * 1024 * 1024 - 65536))
 +                   $((33 * 1024 * 1024 + 512))
 +                   $(seq $((34 * 1024 * 1024)) $sector_size \
 +                         $((34 * 1024 * 1024 + 4096 - $sector_size)))"
 +
 +
 +# blkdebug must be above the format layer so it can intercept all
 +# block-status events
 +source_img="json:{'driver': 'blkdebug',
 +                  'image': {
 +                      'driver': '$IMGFMT',
 +                      'file': {
 +                          'driver': 'file',
 +                          'filename': '$TEST_IMG.orig'
 +                      }
 +                  },
 +                  'inject-error': ["
 +
 +for ofs in $status_fail_offsets
 +do
 +    source_img+="{ 'event': 'none',
 +                   'iotype': 'block-status',
 +                   'errno': 5,
 +                   'sector': $((ofs / sector_size)) },"
 +done
 +
 +for ofs in $read_fail_offsets
 +do
 +    source_img+="{ 'event': 'none',
 +                   'iotype': 'read',
 +                   'errno': 5,
 +                   'sector': $((ofs / sector_size)) },"
 +done
 +
 +# Remove the trailing comma and terminate @inject-error and json:{}
 +source_img="${source_img%,} ] }"
 +
 +
 +echo
 +
 +
 +_filter_offsets() {
 +    filters=
 +
 +    index=0
 +    for ofs in $1
 +    do
 +        filters+=" -e s/$ofs/status_fail_offset_$index/"
 +        index=$((index + 1))
 +    done
 +
 +    index=0
 +    for ofs in $2
 +    do
 +        filters+=" -e s/$ofs/read_fail_offset_$index/"
 +        index=$((index + 1))
 +    done
 +
 +    sed $filters
 +}
 +
 +# While determining the number of allocated sectors in the input
 +# image, we should see one block status warning per element of
 +# $status_fail_offsets.
 +#
 +# Then, the image is read.  Since the block status is queried in
 +# basically the same way, the same warnings as in the previous step
 +# should reappear.  Interleaved with those we should see a read
 +# warning per element of $read_fail_offsets.
 +# Note that $read_fail_offsets and $status_fail_offsets share an
 +# element (read_fail_offset_1 == status_fail_offset_1), so
 +# "status_fail_offset_1" in the output is the same as
 +# "read_fail_offset_1".
 +$QEMU_IMG convert --salvage "$source_img" "$TEST_IMG" 2>&1 \
 +    | _filter_offsets "$status_fail_offsets" "$read_fail_offsets"
 +
 +echo
 +
 +# The offsets where the block status could not be determined should
 +# have been treated as containing data and thus should be correct in
 +# the output image.
 +# The offsets where reading failed altogether should be 0.  Make them
 +# 0 in the input image, too, so we can compare both images.
 +for ofs in $read_fail_offsets
 +do
 +    $QEMU_IO -c "write -z $ofs $sector_size" "$TEST_IMG.orig" \
 +        | _filter_qemu_io \
 +        | _filter_offsets '' "$read_fail_offsets"
 +done
 +
 +echo
 +
 +# These should be equal now.
 +$QEMU_IMG compare "$TEST_IMG.orig" "$TEST_IMG"
 +
 +
 +# success, all done
 +echo "*** done"
 +rm -f $seq.full
 +status=0
 diff --git a/tests/qemu-iotests/251.out b/tests/qemu-iotests/251.out
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tests/qemu-iotests/251.out
@@ -XXX,XX +XXX,XX @@
 +QA output created by 251
 +Formatting 'TEST_DIR/t.IMGFMT.orig', fmt=IMGFMT size=67108864
 +wrote 67108864/67108864 bytes at offset 0
 +64 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +
 +qemu-img: warning: error while reading block status at offset status_fail_offset_0: Input/output error
 +qemu-img: warning: error while reading block status at offset status_fail_offset_1: Input/output error
 +qemu-img: warning: error while reading block status at offset status_fail_offset_0: Input/output error
 +qemu-img: warning: error while reading offset read_fail_offset_0: Input/output error
 +qemu-img: warning: error while reading block status at offset status_fail_offset_1: Input/output error
 +qemu-img: warning: error while reading offset status_fail_offset_1: Input/output error
 +qemu-img: warning: error while reading offset read_fail_offset_2: Input/output error
 +qemu-img: warning: error while reading offset read_fail_offset_3: Input/output error
 +qemu-img: warning: error while reading offset read_fail_offset_4: Input/output error
 +qemu-img: warning: error while reading offset read_fail_offset_5: Input/output error
 +qemu-img: warning: error while reading offset read_fail_offset_6: Input/output error
 +qemu-img: warning: error while reading offset read_fail_offset_7: Input/output error
 +qemu-img: warning: error while reading offset read_fail_offset_8: Input/output error
 +qemu-img: warning: error while reading offset read_fail_offset_9: Input/output error
 +
 +wrote 512/512 bytes at offset read_fail_offset_0
 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +wrote 512/512 bytes at offset read_fail_offset_1
 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +wrote 512/512 bytes at offset read_fail_offset_2
 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +wrote 512/512 bytes at offset read_fail_offset_3
 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +wrote 512/512 bytes at offset read_fail_offset_4
 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +wrote 512/512 bytes at offset read_fail_offset_5
 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +wrote 512/512 bytes at offset read_fail_offset_6
 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +wrote 512/512 bytes at offset read_fail_offset_7
 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +wrote 512/512 bytes at offset read_fail_offset_8
 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +wrote 512/512 bytes at offset read_fail_offset_9
 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +
 +Images are identical.
 +*** done
 diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
 index XXXXXXX..XXXXXXX 100644
 --- a/tests/qemu-iotests/group
 +++ b/tests/qemu-iotests/group
@@ -XXX,XX +XXX,XX @@
 rw quick
 rw auto quick
 rw auto quick
 +251 rw auto quick
 rw auto backing quick
 rw auto quick
 rw auto backing quick
 --
-2.21.0
+2.9.3

-New patch
+[Qemu-devel] [PULL v2 23/24] coroutine-lock: add mutex argument to CoQueue APIs
+From: Paolo Bonzini <pbonzini@redhat.com>
 All that CoQueue needs in order to become thread-safe is help
 from an external mutex.  Add this to the API.
 Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Fam Zheng <famz@redhat.com>
 Message-id: 20170213181244.16297-6-pbonzini@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
  include/qemu/coroutine.h   |  8 +++++---
  block/backup.c             |  2 +-
  block/io.c                 |  4 ++--
  block/nbd-client.c         |  2 +-
  block/qcow2-cluster.c      |  4 +---
  block/sheepdog.c           |  2 +-
  block/throttle-groups.c    |  2 +-
  hw/9pfs/9p.c               |  2 +-
  util/qemu-coroutine-lock.c | 24 +++++++++++++++++++++---
  9 files changed, 34 insertions(+), 16 deletions(-)
 diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/coroutine.h
 +++ b/include/qemu/coroutine.h
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex);
  /**
   * CoQueues are a mechanism to queue coroutines in order to continue executing
 - * them later.
 + * them later.  They are similar to condition variables, but they need help
 + * from an external mutex in order to maintain thread-safety.
   */
  typedef struct CoQueue {
      QSIMPLEQ_HEAD(, Coroutine) entries;
@@ -XXX,XX +XXX,XX @@ void qemu_co_queue_init(CoQueue *queue);
  /**
   * Adds the current coroutine to the CoQueue and transfers control to the
 - * caller of the coroutine.
 + * caller of the coroutine.  The mutex is unlocked during the wait and
 + * locked again afterwards.
   */
 -void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
 +void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex);
  /**
   * Restarts the next coroutine in the CoQueue and removes it from the queue.
 diff --git a/block/backup.c b/block/backup.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/backup.c
 +++ b/block/backup.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
          retry = false;
          QLIST_FOREACH(req, &job->inflight_reqs, list) {
              if (end > req->start && start < req->end) {
 -                qemu_co_queue_wait(&req->wait_queue);
 +                qemu_co_queue_wait(&req->wait_queue, NULL);
                  retry = true;
                  break;
              }
 diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/io.c
 +++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
                   * (instead of producing a deadlock in the former case). */
                  if (!req->waiting_for) {
                      self->waiting_for = req;
 -                    qemu_co_queue_wait(&req->wait_queue);
 +                    qemu_co_queue_wait(&req->wait_queue, NULL);
                      self->waiting_for = NULL;
                      retry = true;
                      waited = true;
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
      /* Wait until any previous flushes are completed */
      while (bs->active_flush_req) {
 -        qemu_co_queue_wait(&bs->flush_queue);
 +        qemu_co_queue_wait(&bs->flush_queue, NULL);
      }
      bs->active_flush_req = true;
 diff --git a/block/nbd-client.c b/block/nbd-client.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/nbd-client.c
 +++ b/block/nbd-client.c
@@ -XXX,XX +XXX,XX @@ static void nbd_coroutine_start(NBDClientSession *s,
      /* Poor man semaphore.  The free_sema is locked when no other request
       * can be accepted, and unlocked after receiving one reply.  */
      if (s->in_flight == MAX_NBD_REQUESTS) {
 -        qemu_co_queue_wait(&s->free_sema);
 +        qemu_co_queue_wait(&s->free_sema, NULL);
          assert(s->in_flight < MAX_NBD_REQUESTS);
      }
      s->in_flight++;
 diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/qcow2-cluster.c
 +++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
              if (bytes == 0) {
                  /* Wait for the dependency to complete. We need to recheck
                   * the free/allocated clusters when we continue. */
 -                qemu_co_mutex_unlock(&s->lock);
 -                qemu_co_queue_wait(&old_alloc->dependent_requests);
 -                qemu_co_mutex_lock(&s->lock);
 +                qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
                  return -EAGAIN;
              }
          }
 diff --git a/block/sheepdog.c b/block/sheepdog.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/sheepdog.c
 +++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *acb)
  retry:
      QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
          if (AIOCBOverlapping(acb, cb)) {
 -            qemu_co_queue_wait(&s->overlapping_queue);
 +            qemu_co_queue_wait(&s->overlapping_queue, NULL);
              goto retry;
          }
      }
 diff --git a/block/throttle-groups.c b/block/throttle-groups.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/throttle-groups.c
 +++ b/block/throttle-groups.c
@@ -XXX,XX +XXX,XX @@ void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk,
      if (must_wait || blkp->pending_reqs[is_write]) {
          blkp->pending_reqs[is_write]++;
          qemu_mutex_unlock(&tg->lock);
 -        qemu_co_queue_wait(&blkp->throttled_reqs[is_write]);
 +        qemu_co_queue_wait(&blkp->throttled_reqs[is_write], NULL);
          qemu_mutex_lock(&tg->lock);
          blkp->pending_reqs[is_write]--;
      }
 diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/9pfs/9p.c
 +++ b/hw/9pfs/9p.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn v9fs_flush(void *opaque)
          /*
           * Wait for pdu to complete.
           */
 -        qemu_co_queue_wait(&cancel_pdu->complete);
 +        qemu_co_queue_wait(&cancel_pdu->complete, NULL);
          cancel_pdu->cancelled = 0;
          pdu_free(cancel_pdu);
      }
 diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/qemu-coroutine-lock.c
 +++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@ void qemu_co_queue_init(CoQueue *queue)
      QSIMPLEQ_INIT(&queue->entries);
  }
 -void coroutine_fn qemu_co_queue_wait(CoQueue *queue)
 +void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex)
  {
      Coroutine *self = qemu_coroutine_self();
      QSIMPLEQ_INSERT_TAIL(&queue->entries, self, co_queue_next);
 +
 +    if (mutex) {
 +        qemu_co_mutex_unlock(mutex);
 +    }
 +
 +    /* There is no race condition here.  Other threads will call
 +     * aio_co_schedule on our AioContext, which can reenter this
 +     * coroutine but only after this yield and after the main loop
 +     * has gone through the next iteration.
 +     */
      qemu_coroutine_yield();
      assert(qemu_in_coroutine());
 +
 +    /* TODO: OSv implements wait morphing here, where the wakeup
 +     * primitive automatically places the woken coroutine on the
 +     * mutex's queue.  This avoids the thundering herd effect.
 +     */
 +    if (mutex) {
 +        qemu_co_mutex_lock(mutex);
 +    }
  }
  /**
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_rdlock(CoRwlock *lock)
      Coroutine *self = qemu_coroutine_self();
      while (lock->writer) {
 -        qemu_co_queue_wait(&lock->queue);
 +        qemu_co_queue_wait(&lock->queue, NULL);
      }
      lock->reader++;
      self->locks_held++;
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_wrlock(CoRwlock *lock)
      Coroutine *self = qemu_coroutine_self();
      while (lock->writer || lock->reader) {
 -        qemu_co_queue_wait(&lock->queue);
 +        qemu_co_queue_wait(&lock->queue, NULL);
      }
      lock->writer = true;
      self->locks_held++;
 --
 .9.3

-[Qemu-devel] [PULL 02/20] iotests.py: do not use infinite waits
+[Qemu-devel] [PULL v2 24/24] coroutine-lock: make CoRwlock thread-safe and fair
-From: John Snow <jsnow@redhat.com>
+From: Paolo Bonzini <pbonzini@redhat.com>
-Cap waits to 60 seconds so that iotests can fail gracefully if something
+This adds a CoMutex around the existing CoQueue.  Because the write-side
-goes wrong.
+can just take CoMutex, the old "writer" field is not necessary anymore.
 Instead of removing it altogether, count the number of pending writers
 during a read-side critical section and forbid further readers from
 entering.
-Signed-off-by: John Snow <jsnow@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-Message-id: 20190523170643.20794-3-jsnow@redhat.com
+Reviewed-by: Fam Zheng <famz@redhat.com>
-Reviewed-by: Max Reitz <mreitz@redhat.com>
+Message-id: 20170213181244.16297-7-pbonzini@redhat.com
-Signed-off-by: Max Reitz <mreitz@redhat.com>
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- tests/qemu-iotests/iotests.py | 14 +++++++-------
+ include/qemu/coroutine.h   |  3 ++-
-file changed, 7 insertions(+), 7 deletions(-)
+ util/qemu-coroutine-lock.c | 35 ++++++++++++++++++++++++-----------
 files changed, 26 insertions(+), 12 deletions(-)
-diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
+diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
 index XXXXXXX..XXXXXXX 100644
---- a/tests/qemu-iotests/iotests.py
+--- a/include/qemu/coroutine.h
-+++ b/tests/qemu-iotests/iotests.py
++++ b/include/qemu/coroutine.h
-@@ -XXX,XX +XXX,XX @@ class VM(qtest.QEMUQtestMachine):
+@@ -XXX,XX +XXX,XX @@ bool qemu_co_queue_empty(CoQueue *queue);
-             output_list += [key + '=' + obj[key]]
-         return ','.join(output_list)
+ typedef struct CoRwlock {
--    def get_qmp_events_filtered(self, wait=True):
+-    bool writer;
-+    def get_qmp_events_filtered(self, wait=60.0):
++    int pending_writer;
-         result = []
+     int reader;
-         for ev in self.get_qmp_events(wait=wait):
++    CoMutex mutex;
-             result.append(filter_qmp_event(ev))
+     CoQueue queue;
-@@ -XXX,XX +XXX,XX @@ class VM(qtest.QEMUQtestMachine):
+ } CoRwlock;
-     # Returns None on success, and an error string on failure
+diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
-     def run_job(self, job, auto_finalize=True, auto_dismiss=False,
+index XXXXXXX..XXXXXXX 100644
--                pre_finalize=None):
+--- a/util/qemu-coroutine-lock.c
-+                pre_finalize=None, wait=60.0):
++++ b/util/qemu-coroutine-lock.c
-         error = None
+@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_init(CoRwlock *lock)
-         while True:
+ {
--            for ev in self.get_qmp_events_filtered(wait=True):
+     memset(lock, 0, sizeof(*lock));
-+            for ev in self.get_qmp_events_filtered(wait=wait):
+     qemu_co_queue_init(&lock->queue);
-                 if ev['event'] == 'JOB_STATUS_CHANGE':
++    qemu_co_mutex_init(&lock->mutex);
-                     status = ev['data']['status']
+ }
-                     if status == 'aborting':
-@@ -XXX,XX +XXX,XX @@ class QMPTestCase(unittest.TestCase):
+ void qemu_co_rwlock_rdlock(CoRwlock *lock)
-         self.assertEqual(self.vm.flatten_qmp_object(json.loads(json_filename[5:])),
+ {
-                          self.vm.flatten_qmp_object(reference))
+     Coroutine *self = qemu_coroutine_self();
--    def cancel_and_wait(self, drive='drive0', force=False, resume=False):
+-    while (lock->writer) {
-+    def cancel_and_wait(self, drive='drive0', force=False, resume=False, wait=60.0):
+-        qemu_co_queue_wait(&lock->queue, NULL);
-         '''Cancel a block job and wait for it to finish, returning the event'''
++    qemu_co_mutex_lock(&lock->mutex);
-         result = self.vm.qmp('block-job-cancel', device=drive, force=force)
++    /* For fairness, wait if a writer is in line.  */
-         self.assert_qmp(result, 'return', {})
++    while (lock->pending_writer) {
-@@ -XXX,XX +XXX,XX @@ class QMPTestCase(unittest.TestCase):
++        qemu_co_queue_wait(&lock->queue, &lock->mutex);
-         cancelled = False
+     }
-         result = None
+     lock->reader++;
-         while not cancelled:
++    qemu_co_mutex_unlock(&lock->mutex);
--            for event in self.vm.get_qmp_events(wait=True):
++
-+            for event in self.vm.get_qmp_events(wait=wait):
++    /* The rest of the read-side critical section is run without the mutex.  */
-                 if event['event'] == 'BLOCK_JOB_COMPLETED' or \
+     self->locks_held++;
-                    event['event'] == 'BLOCK_JOB_CANCELLED':
+ }
-                     self.assert_qmp(event, 'data/device', drive)
-@@ -XXX,XX +XXX,XX @@ class QMPTestCase(unittest.TestCase):
+@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_unlock(CoRwlock *lock)
-         self.assert_no_active_block_jobs()
+     Coroutine *self = qemu_coroutine_self();
-         return result
+     assert(qemu_in_coroutine());
--    def wait_until_completed(self, drive='drive0', check_offset=True):
+-    if (lock->writer) {
-+    def wait_until_completed(self, drive='drive0', check_offset=True, wait=60.0):
+-        lock->writer = false;
-         '''Wait for a block job to finish, returning the event'''
++    if (!lock->reader) {
-         while True:
++        /* The critical section started in qemu_co_rwlock_wrlock.  */
--            for event in self.vm.get_qmp_events(wait=True):
+         qemu_co_queue_restart_all(&lock->queue);
-+            for event in self.vm.get_qmp_events(wait=wait):
+     } else {
-                 if event['event'] == 'BLOCK_JOB_COMPLETED':
++        self->locks_held--;
-                     self.assert_qmp(event, 'data/device', drive)
++
-                     self.assert_qmp_absent(event, 'data/error')
++        qemu_co_mutex_lock(&lock->mutex);
          lock->reader--;
          assert(lock->reader >= 0);
          /* Wakeup only one waiting writer */
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_unlock(CoRwlock *lock)
              qemu_co_queue_next(&lock->queue);
          }
      }
 -    self->locks_held--;
 +    qemu_co_mutex_unlock(&lock->mutex);
  }
  void qemu_co_rwlock_wrlock(CoRwlock *lock)
  {
 -    Coroutine *self = qemu_coroutine_self();
 -
 -    while (lock->writer || lock->reader) {
 -        qemu_co_queue_wait(&lock->queue, NULL);
 +    qemu_co_mutex_lock(&lock->mutex);
 +    lock->pending_writer++;
 +    while (lock->reader) {
 +        qemu_co_queue_wait(&lock->queue, &lock->mutex);
      }
 -    lock->writer = true;
 -    self->locks_held++;
 +    lock->pending_writer--;
 +
 +    /* The rest of the write-side critical section is run with
 +     * the mutex taken, so that lock->reader remains zero.
 +     * There is no need to update self->locks_held.
 +     */
  }
 --
-.21.0
+.9.3

The following changes since commit 5ec2eca83dc478ddf24077e02a8b34dd26cd3ff9:

Merge remote-tracking branch 'remotes/awilliam/tags/vfio-updates-20190613.0' into staging (2019-06-14 09:33:55 +0100)

are available in the Git repository at:

https://github.com/XanClic/qemu.git tags/pull-block-2019-06-14

for you to fetch changes up to 21c1ce592a144188dfe59b9e156a97da412a59a2:

iotests: Test qemu-img convert -C --salvage (2019-06-14 15:09:42 +0200)

----------------------------------------------------------------
Block patches:
- Allow blockdev-backup from nodes that are not in qemu's main AIO
  context to newly added nodes
- Add salvaging mode to qemu-img convert
- Minor fixes to tests, documentation, and for less Valgrind annoyance

----------------------------------------------------------------
Andrey Shinkevich (1):
  hw/block/fdc: floppy command FIFO memory initialization

John Snow (6):
  blockdev-backup: don't check aio_context too early
  iotests.py: do not use infinite waits
  QEMUMachine: add events_wait method
  iotests.py: rewrite run_job to be pickier
  iotests: add iotest 256 for testing blockdev-backup across iothread
    contexts
  event_match: always match on None value

Max Reitz (12):
  iotests: Filter 175's allocation information
  iotests: Fix intermittent failure in 219
  qemu-img: Fix options leakage in img_rebase()
  qapi/block-core: Overlays are not snapshots
  blockdev: Overlays are not snapshots
  qemu-img: Move quiet into ImgConvertState
  qemu-img: Add salvaging mode to convert
  blkdebug: Add @iotype error option
  blkdebug: Add "none" event
  blkdebug: Inject errors on .bdrv_co_block_status()
  iotests: Test qemu-img convert --salvage
  iotests: Test qemu-img convert -C --salvage

Vladimir Sementsov-Ogievskiy (1):
  iotests: restrict 254 to support only qcow2

qapi/block-core.json          |  53 ++++++++---
 block/blkdebug.c              |  60 ++++++++++--
 blockdev.c                    |  14 +--
 hw/block/fdc.c                |   1 +
 qemu-img.c                    | 106 +++++++++++++++------
 python/qemu/__init__.py       |  67 ++++++++++----
 qemu-img-cmds.hx              |   4 +-
 qemu-img.texi                 |   4 +
 tests/qemu-iotests/082        |   1 +
 tests/qemu-iotests/082.out    |   3 +
 tests/qemu-iotests/085.out    |  10 +-
 tests/qemu-iotests/175        |  26 +++++-
 tests/qemu-iotests/175.out    |   8 +-
 tests/qemu-iotests/219        |  13 ++-
 tests/qemu-iotests/251        | 170 ++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/251.out    |  43 +++++++++
 tests/qemu-iotests/254        |   2 +
 tests/qemu-iotests/256        | 122 ++++++++++++++++++++++++
 tests/qemu-iotests/256.out    | 119 ++++++++++++++++++++++++
 tests/qemu-iotests/group      |   2 +
 tests/qemu-iotests/iotests.py |  60 +++++++-----
 21 files changed, 772 insertions(+), 116 deletions(-)
 create mode 100755 tests/qemu-iotests/251
 create mode 100644 tests/qemu-iotests/251.out
 create mode 100755 tests/qemu-iotests/256
 create mode 100644 tests/qemu-iotests/256.out

-- 
2.21.0

From: John Snow <jsnow@redhat.com>

In blockdev_backup_prepare, we check to make sure that the target is
associated with a compatible aio context. However, do_blockdev_backup is
called later and has some logic to move the target to a compatible
aio_context. The transaction version will fail certain commands
needlessly early as a result.

Allow blockdev_backup_prepare to simply call do_blockdev_backup, which
will ultimately decide if the contexts are compatible or not.

Note: the transaction version has always disallowed this operation since
its initial commit bd8baecd (2014), whereas the version of
qmp_blockdev_backup at the time, from commit c29c1dd312f, tried to
enforce the aio_context switch instead. It's not clear, and I can't see
from the mailing list archives at the time, why the two functions take a
different approach. It wasn't until later in efd7556708b (2016) that the
standalone version tried to determine if it could set the context or
not.

Reported-by: aihua liang <aliang@redhat.com>
Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1683498
Signed-off-by: John Snow <jsnow@redhat.com>
Message-id: 20190523170643.20794-2-jsnow@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 blockdev.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index XXXXXXX..XXXXXXX 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -XXX,XX +XXX,XX @@ static void blockdev_backup_prepare(BlkActionState *common, Error **errp)
     }
 
     aio_context = bdrv_get_aio_context(bs);
-    if (aio_context != bdrv_get_aio_context(target)) {
-        error_setg(errp, "Backup between two IO threads is not implemented");
-        return;
-    }
     aio_context_acquire(aio_context);
     state->bs = bs;
 
-- 
2.21.0

From: John Snow <jsnow@redhat.com>

Cap waits to 60 seconds so that iotests can fail gracefully if something
goes wrong.

Signed-off-by: John Snow <jsnow@redhat.com>
Message-id: 20190523170643.20794-3-jsnow@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/iotests.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -XXX,XX +XXX,XX @@ class VM(qtest.QEMUQtestMachine):
             output_list += [key + '=' + obj[key]]
         return ','.join(output_list)
 
-    def get_qmp_events_filtered(self, wait=True):
+    def get_qmp_events_filtered(self, wait=60.0):
         result = []
         for ev in self.get_qmp_events(wait=wait):
             result.append(filter_qmp_event(ev))
@@ -XXX,XX +XXX,XX @@ class VM(qtest.QEMUQtestMachine):
 
     # Returns None on success, and an error string on failure
     def run_job(self, job, auto_finalize=True, auto_dismiss=False,
-                pre_finalize=None):
+                pre_finalize=None, wait=60.0):
         error = None
         while True:
-            for ev in self.get_qmp_events_filtered(wait=True):
+            for ev in self.get_qmp_events_filtered(wait=wait):
                 if ev['event'] == 'JOB_STATUS_CHANGE':
                     status = ev['data']['status']
                     if status == 'aborting':
@@ -XXX,XX +XXX,XX @@ class QMPTestCase(unittest.TestCase):
         self.assertEqual(self.vm.flatten_qmp_object(json.loads(json_filename[5:])),
                          self.vm.flatten_qmp_object(reference))
 
-    def cancel_and_wait(self, drive='drive0', force=False, resume=False):
+    def cancel_and_wait(self, drive='drive0', force=False, resume=False, wait=60.0):
         '''Cancel a block job and wait for it to finish, returning the event'''
         result = self.vm.qmp('block-job-cancel', device=drive, force=force)
         self.assert_qmp(result, 'return', {})
@@ -XXX,XX +XXX,XX @@ class QMPTestCase(unittest.TestCase):
         cancelled = False
         result = None
         while not cancelled:
-            for event in self.vm.get_qmp_events(wait=True):
+            for event in self.vm.get_qmp_events(wait=wait):
                 if event['event'] == 'BLOCK_JOB_COMPLETED' or \
                    event['event'] == 'BLOCK_JOB_CANCELLED':
                     self.assert_qmp(event, 'data/device', drive)
@@ -XXX,XX +XXX,XX @@ class QMPTestCase(unittest.TestCase):
         self.assert_no_active_block_jobs()
         return result
 
-    def wait_until_completed(self, drive='drive0', check_offset=True):
+    def wait_until_completed(self, drive='drive0', check_offset=True, wait=60.0):
         '''Wait for a block job to finish, returning the event'''
         while True:
-            for event in self.vm.get_qmp_events(wait=True):
+            for event in self.vm.get_qmp_events(wait=wait):
                 if event['event'] == 'BLOCK_JOB_COMPLETED':
                     self.assert_qmp(event, 'data/device', drive)
                     self.assert_qmp_absent(event, 'data/error')
-- 
2.21.0

From: John Snow <jsnow@redhat.com>

In addition to event_wait, which looks for a single event, add an
events_wait which can look for any number of events simultaneously.
However, it will still only return one event at a time, whichever
happens first.

Signed-off-by: John Snow <jsnow@redhat.com>
Message-id: 20190523170643.20794-4-jsnow@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 python/qemu/__init__.py | 69 +++++++++++++++++++++++++++++------------
 1 file changed, 49 insertions(+), 20 deletions(-)

diff --git a/python/qemu/__init__.py b/python/qemu/__init__.py
index XXXXXXX..XXXXXXX 100644
--- a/python/qemu/__init__.py
+++ b/python/qemu/__init__.py
@@ -XXX,XX +XXX,XX @@ class QEMUMachine(object):
         self._qmp.clear_events()
         return events
 
-    def event_wait(self, name, timeout=60.0, match=None):
+    @staticmethod
+    def event_match(event, match=None):
         """
-        Wait for specified timeout on named event in QMP; optionally filter
-        results by match.
+        Check if an event matches optional match criteria.
 
-        The 'match' is checked to be a recursive subset of the 'event'; skips
-        branch processing on match's value None
-           {"foo": {"bar": 1}} matches {"foo": None}
-           {"foo": {"bar": 1}} does not matches {"foo": {"baz": None}}
+        The match criteria takes the form of a matching subdict. The event is
+        checked to be a superset of the subdict, recursively, with matching
+        values whenever those values are not None.
+
+        Examples, with the subdict queries on the left:
+         - None matches any object.
+         - {"foo": None} matches {"foo": {"bar": 1}}
+         - {"foo": {"baz": None}} does not match {"foo": {"bar": 1}}
+         - {"foo": {"baz": 2}} matches {"foo": {"bar": 1, "baz": 2}}
         """
-        def event_match(event, match=None):
-            if match is None:
-                return True
+        if match is None:
+            return True
 
-            for key in match:
-                if key in event:
-                    if isinstance(event[key], dict):
-                        if not event_match(event[key], match[key]):
-                            return False
-                    elif event[key] != match[key]:
+        for key in match:
+            if key in event:
+                if isinstance(event[key], dict):
+                    if not QEMUMachine.event_match(event[key], match[key]):
                         return False
-                else:
+                elif event[key] != match[key]:
                     return False
+            else:
+                return False
+        return True
 
-            return True
+    def event_wait(self, name, timeout=60.0, match=None):
+        """
+        event_wait waits for and returns a named event from QMP with a timeout.
+
+        name: The event to wait for.
+        timeout: QEMUMonitorProtocol.pull_event timeout parameter.
+        match: Optional match criteria. See event_match for details.
+        """
+        return self.events_wait([(name, match)], timeout)
+
+    def events_wait(self, events, timeout=60.0):
+        """
+        events_wait waits for and returns a named event from QMP with a timeout.
+
+        events: a sequence of (name, match_criteria) tuples.
+                The match criteria are optional and may be None.
+                See event_match for details.
+        timeout: QEMUMonitorProtocol.pull_event timeout parameter.
+        """
+        def _match(event):
+            for name, match in events:
+                if (event['event'] == name and
+                    self.event_match(event, match)):
+                    return True
+            return False
 
         # Search cached events
         for event in self._events:
-            if (event['event'] == name) and event_match(event, match):
+            if _match(event):
                 self._events.remove(event)
                 return event
 
         # Poll for new events
         while True:
             event = self._qmp.pull_event(wait=timeout)
-            if (event['event'] == name) and event_match(event, match):
+            if _match(event):
                 return event
             self._events.append(event)
 
-- 
2.21.0

From: John Snow <jsnow@redhat.com>

Don't pull events out of the queue that don't belong to us;
be choosier so that we can use this method to drive jobs that
were launched by transactions that may have more jobs.

Signed-off-by: John Snow <jsnow@redhat.com>
Message-id: 20190523170643.20794-5-jsnow@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/iotests.py | 48 +++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -XXX,XX +XXX,XX @@ class VM(qtest.QEMUQtestMachine):
     # Returns None on success, and an error string on failure
     def run_job(self, job, auto_finalize=True, auto_dismiss=False,
                 pre_finalize=None, wait=60.0):
+        match_device = {'data': {'device': job}}
+        match_id = {'data': {'id': job}}
+        events = [
+            ('BLOCK_JOB_COMPLETED', match_device),
+            ('BLOCK_JOB_CANCELLED', match_device),
+            ('BLOCK_JOB_ERROR', match_device),
+            ('BLOCK_JOB_READY', match_device),
+            ('BLOCK_JOB_PENDING', match_id),
+            ('JOB_STATUS_CHANGE', match_id)
+        ]
         error = None
         while True:
-            for ev in self.get_qmp_events_filtered(wait=wait):
-                if ev['event'] == 'JOB_STATUS_CHANGE':
-                    status = ev['data']['status']
-                    if status == 'aborting':
-                        result = self.qmp('query-jobs')
-                        for j in result['return']:
-                            if j['id'] == job:
-                                error = j['error']
-                                log('Job failed: %s' % (j['error']))
-                    elif status == 'pending' and not auto_finalize:
-                        if pre_finalize:
-                            pre_finalize()
-                        self.qmp_log('job-finalize', id=job)
-                    elif status == 'concluded' and not auto_dismiss:
-                        self.qmp_log('job-dismiss', id=job)
-                    elif status == 'null':
-                        return error
-                else:
-                    log(ev)
+            ev = filter_qmp_event(self.events_wait(events))
+            if ev['event'] != 'JOB_STATUS_CHANGE':
+                log(ev)
+                continue
+            status = ev['data']['status']
+            if status == 'aborting':
+                result = self.qmp('query-jobs')
+                for j in result['return']:
+                    if j['id'] == job:
+                        error = j['error']
+                        log('Job failed: %s' % (j['error']))
+            elif status == 'pending' and not auto_finalize:
+                if pre_finalize:
+                    pre_finalize()
+                self.qmp_log('job-finalize', id=job)
+            elif status == 'concluded' and not auto_dismiss:
+                self.qmp_log('job-dismiss', id=job)
+            elif status == 'null':
+                return error
 
     def node_info(self, node_name):
         nodes = self.qmp('query-named-block-nodes')
-- 
2.21.0

From: John Snow <jsnow@redhat.com>

Signed-off-by: John Snow <jsnow@redhat.com>
Message-id: 20190523170643.20794-6-jsnow@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
[mreitz: Moved from 250 to 256]
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/256     | 122 +++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/256.out | 119 ++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/group   |   1 +
 3 files changed, 242 insertions(+)
 create mode 100755 tests/qemu-iotests/256
 create mode 100644 tests/qemu-iotests/256.out

diff --git a/tests/qemu-iotests/256 b/tests/qemu-iotests/256
new file mode 100755
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/qemu-iotests/256
@@ -XXX,XX +XXX,XX @@
+#!/usr/bin/env python
+#
+# Test incremental/backup across iothread contexts
+#
+# Copyright (c) 2019 John Snow for Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# owner=jsnow@redhat.com
+
+import os
+import iotests
+from iotests import log
+
+iotests.verify_image_format(supported_fmts=['qcow2'])
+size = 64 * 1024 * 1024
+
+with iotests.FilePath('img0') as img0_path, \
+     iotests.FilePath('img1') as img1_path, \
+     iotests.FilePath('img0-full') as img0_full_path, \
+     iotests.FilePath('img1-full') as img1_full_path, \
+     iotests.FilePath('img0-incr') as img0_incr_path, \
+     iotests.FilePath('img1-incr') as img1_incr_path, \
+     iotests.VM() as vm:
+
+    def create_target(filepath, name, size):
+        basename = os.path.basename(filepath)
+        nodename = "file_{}".format(basename)
+        log(vm.command('blockdev-create', job_id='job1',
+                       options={
+                           'driver': 'file',
+                           'filename': filepath,
+                           'size': 0,
+                       }))
+        vm.run_job('job1')
+        log(vm.command('blockdev-add', driver='file',
+                       node_name=nodename, filename=filepath))
+        log(vm.command('blockdev-create', job_id='job2',
+                       options={
+                           'driver': iotests.imgfmt,
+                           'file': nodename,
+                           'size': size,
+                       }))
+        vm.run_job('job2')
+        log(vm.command('blockdev-add', driver=iotests.imgfmt,
+                       node_name=name,
+                       file=nodename))
+
+    log('--- Preparing images & VM ---\n')
+    vm.add_object('iothread,id=iothread0')
+    vm.add_object('iothread,id=iothread1')
+    vm.add_device('virtio-scsi-pci,id=scsi0,iothread=iothread0')
+    vm.add_device('virtio-scsi-pci,id=scsi1,iothread=iothread1')
+    iotests.qemu_img_create('-f', iotests.imgfmt, img0_path, str(size))
+    iotests.qemu_img_create('-f', iotests.imgfmt, img1_path, str(size))
+    vm.add_drive(img0_path, interface='none')
+    vm.add_device('scsi-hd,id=device0,drive=drive0,bus=scsi0.0')
+    vm.add_drive(img1_path, interface='none')
+    vm.add_device('scsi-hd,id=device1,drive=drive1,bus=scsi1.0')
+
+    log('--- Starting VM ---\n')
+    vm.launch()
+
+    log('--- Create Targets & Full Backups ---\n')
+    create_target(img0_full_path, 'img0-full', size)
+    create_target(img1_full_path, 'img1-full', size)
+    ret = vm.qmp_log('transaction', indent=2, actions=[
+        { 'type': 'block-dirty-bitmap-add',
+          'data': { 'node': 'drive0', 'name': 'bitmap0' }},
+        { 'type': 'block-dirty-bitmap-add',
+          'data': { 'node': 'drive1', 'name': 'bitmap1' }},
+        { 'type': 'blockdev-backup',
+          'data': { 'device': 'drive0',
+                    'target': 'img0-full',
+                    'sync': 'full',
+                    'job-id': 'j0' }},
+        { 'type': 'blockdev-backup',
+          'data': { 'device': 'drive1',
+                    'target': 'img1-full',
+                    'sync': 'full',
+                    'job-id': 'j1' }}
+    ])
+    if "error" in ret:
+        raise Exception(ret['error']['desc'])
+    vm.run_job('j0', auto_dismiss=True)
+    vm.run_job('j1', auto_dismiss=True)
+
+    log('\n--- Create Targets & Incremental Backups ---\n')
+    create_target(img0_incr_path, 'img0-incr', size)
+    create_target(img1_incr_path, 'img1-incr', size)
+    ret = vm.qmp_log('transaction', indent=2, actions=[
+        { 'type': 'blockdev-backup',
+          'data': { 'device': 'drive0',
+                    'target': 'img0-incr',
+                    'sync': 'incremental',
+                    'bitmap': 'bitmap0',
+                    'job-id': 'j2' }},
+        { 'type': 'blockdev-backup',
+          'data': { 'device': 'drive1',
+                    'target': 'img1-incr',
+                    'sync': 'incremental',
+                    'bitmap': 'bitmap1',
+                    'job-id': 'j3' }}
+    ])
+    if "error" in ret:
+        raise Exception(ret['error']['desc'])
+    vm.run_job('j2', auto_dismiss=True)
+    vm.run_job('j3', auto_dismiss=True)
+
+    log('\n--- Done ---')
+    vm.shutdown()
diff --git a/tests/qemu-iotests/256.out b/tests/qemu-iotests/256.out
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/qemu-iotests/256.out
@@ -XXX,XX +XXX,XX @@
+--- Preparing images & VM ---
+
+--- Starting VM ---
+
+--- Create Targets & Full Backups ---
+
+{}
+{"execute": "job-dismiss", "arguments": {"id": "job1"}}
+{"return": {}}
+{}
+{}
+{"execute": "job-dismiss", "arguments": {"id": "job2"}}
+{"return": {}}
+{}
+{}
+{"execute": "job-dismiss", "arguments": {"id": "job1"}}
+{"return": {}}
+{}
+{}
+{"execute": "job-dismiss", "arguments": {"id": "job2"}}
+{"return": {}}
+{}
+{
+  "execute": "transaction",
+  "arguments": {
+    "actions": [
+      {
+        "data": {
+          "name": "bitmap0",
+          "node": "drive0"
+        },
+        "type": "block-dirty-bitmap-add"
+      },
+      {
+        "data": {
+          "name": "bitmap1",
+          "node": "drive1"
+        },
+        "type": "block-dirty-bitmap-add"
+      },
+      {
+        "data": {
+          "device": "drive0",
+          "job-id": "j0",
+          "sync": "full",
+          "target": "img0-full"
+        },
+        "type": "blockdev-backup"
+      },
+      {
+        "data": {
+          "device": "drive1",
+          "job-id": "j1",
+          "sync": "full",
+          "target": "img1-full"
+        },
+        "type": "blockdev-backup"
+      }
+    ]
+  }
+}
+{
+  "return": {}
+}
+{"data": {"device": "j0", "len": 67108864, "offset": 67108864, "speed": 0, "type": "backup"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
+{"data": {"device": "j1", "len": 67108864, "offset": 67108864, "speed": 0, "type": "backup"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
+
+--- Create Targets & Incremental Backups ---
+
+{}
+{"execute": "job-dismiss", "arguments": {"id": "job1"}}
+{"return": {}}
+{}
+{}
+{"execute": "job-dismiss", "arguments": {"id": "job2"}}
+{"return": {}}
+{}
+{}
+{"execute": "job-dismiss", "arguments": {"id": "job1"}}
+{"return": {}}
+{}
+{}
+{"execute": "job-dismiss", "arguments": {"id": "job2"}}
+{"return": {}}
+{}
+{
+  "execute": "transaction",
+  "arguments": {
+    "actions": [
+      {
+        "data": {
+          "bitmap": "bitmap0",
+          "device": "drive0",
+          "job-id": "j2",
+          "sync": "incremental",
+          "target": "img0-incr"
+        },
+        "type": "blockdev-backup"
+      },
+      {
+        "data": {
+          "bitmap": "bitmap1",
+          "device": "drive1",
+          "job-id": "j3",
+          "sync": "incremental",
+          "target": "img1-incr"
+        },
+        "type": "blockdev-backup"
+      }
+    ]
+  }
+}
+{
+  "return": {}
+}
+{"data": {"device": "j2", "len": 67108864, "offset": 67108864, "speed": 0, "type": "backup"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
+{"data": {"device": "j3", "len": 67108864, "offset": 67108864, "speed": 0, "type": "backup"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
+
+--- Done ---
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -XXX,XX +XXX,XX @@
 253 rw auto quick
 254 rw auto backing quick
 255 rw auto quick
+256 rw auto quick
-- 
2.21.0

From: John Snow <jsnow@redhat.com>

Before, event_match didn't always recurse if the event value was not a
dictionary, and would instead check for equality immediately.

By delaying equality checking to post-recursion, we can allow leaf
values like "5" to match "None" and take advantage of the generic
None-returns-True clause.

This makes the matching a little more obviously consistent at the
expense of being able to check for explicit None values, which is
probably not that important given what this function is used for.

Signed-off-by: John Snow <jsnow@redhat.com>
Message-id: 20190528183857.26167-1-jsnow@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 python/qemu/__init__.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/python/qemu/__init__.py b/python/qemu/__init__.py
index XXXXXXX..XXXXXXX 100644
--- a/python/qemu/__init__.py
+++ b/python/qemu/__init__.py
@@ -XXX,XX +XXX,XX @@ class QEMUMachine(object):
 
         The match criteria takes the form of a matching subdict. The event is
         checked to be a superset of the subdict, recursively, with matching
-        values whenever those values are not None.
+        values whenever the subdict values are not None.
+
+        This has a limitation that you cannot explicitly check for None values.
 
         Examples, with the subdict queries on the left:
          - None matches any object.
          - {"foo": None} matches {"foo": {"bar": 1}}
-         - {"foo": {"baz": None}} does not match {"foo": {"bar": 1}}
-         - {"foo": {"baz": 2}} matches {"foo": {"bar": 1, "baz": 2}}
+         - {"foo": None} matches {"foo": 5}
+         - {"foo": {"abc": None}} does not match {"foo": {"bar": 1}}
+         - {"foo": {"rab": 2}} matches {"foo": {"bar": 1, "rab": 2}}
         """
         if match is None:
             return True
 
-        for key in match:
-            if key in event:
-                if isinstance(event[key], dict):
+        try:
+            for key in match:
+                if key in event:
                     if not QEMUMachine.event_match(event[key], match[key]):
                         return False
-                elif event[key] != match[key]:
+                else:
                     return False
-            else:
-                return False
-        return True
+            return True
+        except TypeError:
+            # either match or event wasn't iterable (not a dict)
+            return match == event
 
     def event_wait(self, name, timeout=60.0, match=None):
         """
-- 
2.21.0

It is possible for an empty file to take up blocks on a filesystem, for
example:

$ qemu-img create -f raw test.img 1G
Formatting 'test.img', fmt=raw size=1073741824
$ mkfs.ext4 -I 128 -q test.img
$ mkdir test-mount
$ sudo mount -o loop test.img test-mount
$ sudo touch test-mount/test-file
$ stat -c 'blocks=%b' test-mount/test-file
blocks=8

These extra blocks (one cluster) are apparently used for metadata,
because they are always there, on top of blocks used for data:

$ sudo dd if=/dev/zero of=test-mount/test-file bs=1M count=1
1+0 records in
1+0 records out
1048576 bytes (1.0 MB, 1.0 MiB) copied, 0.00135339 s, 775 MB/s
$ stat -c 'blocks=%b' test-mount/test-file
blocks=2056

Make iotest 175 take this into account.

Reported-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Nir Soffer <nsoffer@redhat.com>
Message-id: 20190516144319.12570-1-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/175     | 26 ++++++++++++++++++++++----
 tests/qemu-iotests/175.out |  8 ++++----
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/tests/qemu-iotests/175 b/tests/qemu-iotests/175
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/175
+++ b/tests/qemu-iotests/175
@@ -XXX,XX +XXX,XX @@ status=1	# failure is the default!
 
 _cleanup()
 {
-	_cleanup_test_img
+    _cleanup_test_img
+    rm -f "$TEST_DIR/empty"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
+# Some file systems sometimes allocate extra blocks independently of
+# the file size.  This function hides the resulting difference in the
+# stat -c '%b' output.
+# Parameter 1: Number of blocks an empty file occupies
+# Parameter 2: Image size in bytes
+_filter_blocks()
+{
+    extra_blocks=$1
+    img_size=$2
+
+    sed -e "s/blocks=$extra_blocks\\(\$\\|[^0-9]\\)/nothing allocated/" \
+        -e "s/blocks=$((extra_blocks + img_size / 512))\\(\$\\|[^0-9]\\)/everything allocated/"
+}
+
 # get standard environment, filters and checks
 . ./common.rc
 . ./common.filter
@@ -XXX,XX +XXX,XX @@ _supported_fmt raw
 _supported_proto file
 _supported_os Linux
 
-size=1m
+size=$((1 * 1024 * 1024))
+
+touch "$TEST_DIR/empty"
+extra_blocks=$(stat -c '%b' "$TEST_DIR/empty")
 
 echo
 echo "== creating image with default preallocation =="
 _make_test_img $size | _filter_imgfmt
-stat -c "size=%s, blocks=%b" $TEST_IMG
+stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $size
 
 for mode in off full falloc; do
     echo
     echo "== creating image with preallocation $mode =="
     IMGOPTS=preallocation=$mode _make_test_img $size | _filter_imgfmt
-    stat -c "size=%s, blocks=%b" $TEST_IMG
+    stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $size
 done
 
 # success, all done
diff --git a/tests/qemu-iotests/175.out b/tests/qemu-iotests/175.out
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/175.out
+++ b/tests/qemu-iotests/175.out
@@ -XXX,XX +XXX,XX @@ QA output created by 175
 
 == creating image with default preallocation ==
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576
-size=1048576, blocks=0
+size=1048576, nothing allocated
 
 == creating image with preallocation off ==
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=off
-size=1048576, blocks=0
+size=1048576, nothing allocated
 
 == creating image with preallocation full ==
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=full
-size=1048576, blocks=2048
+size=1048576, everything allocated
 
 == creating image with preallocation falloc ==
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=falloc
-size=1048576, blocks=2048
+size=1048576, everything allocated
  *** done
-- 
2.21.0

In 219, we wait for the job to make progress before we emit its status.
This makes the output reliable.  We do not wait for any more progress if
the job's current-progress already matches its total-progress.

Unfortunately, there is a bug: Right after the job has been started,
it's possible that total-progress is still 0.  In that case, we may skip
the first progress-making step and keep ending up 64 kB short.

To fix that bug, we can simply wait for total-progress to reach 4 MB
(the image size) after starting the job.

Reported-by: Karen Mezick <kmezick@redhat.com>
Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1686651
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190516161114.27596-1-mreitz@redhat.com
Reviewed-by: John Snow <jsnow@redhat.com>
[mreitz: Adjusted commit message as per John's proposal]
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/219 | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/tests/qemu-iotests/219 b/tests/qemu-iotests/219
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/219
+++ b/tests/qemu-iotests/219
@@ -XXX,XX +XXX,XX @@ import iotests
 
 iotests.verify_image_format(supported_fmts=['qcow2'])
 
+img_size = 4 * 1024 * 1024
+
 def pause_wait(vm, job_id):
     with iotests.Timeout(3, "Timeout waiting for job to pause"):
         while True:
@@ -XXX,XX +XXX,XX @@ def test_pause_resume(vm):
                 iotests.log(vm.qmp('query-jobs'))
 
 def test_job_lifecycle(vm, job, job_args, has_ready=False):
+    global img_size
+
     iotests.log('')
     iotests.log('')
     iotests.log('Starting block job: %s (auto-finalize: %s; auto-dismiss: %s)' %
@@ -XXX,XX +XXX,XX @@ def test_job_lifecycle(vm, job, job_args, has_ready=False):
     iotests.log(iotests.filter_qmp_event(vm.event_wait('JOB_STATUS_CHANGE')))
     iotests.log(iotests.filter_qmp_event(vm.event_wait('JOB_STATUS_CHANGE')))
 
+    # Wait for total-progress to stabilize
+    while vm.qmp('query-jobs')['return'][0]['total-progress'] < img_size:
+        pass
+
     # RUNNING state:
     # pause/resume should work, complete/finalize/dismiss should error out
     iotests.log('')
@@ -XXX,XX +XXX,XX @@ with iotests.FilePath('disk.img') as disk_path, \
      iotests.FilePath('copy.img') as copy_path, \
      iotests.VM() as vm:
 
-    img_size = '4M'
-    iotests.qemu_img_create('-f', iotests.imgfmt, disk_path, img_size)
-    iotests.qemu_io('-c', 'write 0 %s' % (img_size),
+    iotests.qemu_img_create('-f', iotests.imgfmt, disk_path, str(img_size))
+    iotests.qemu_io('-c', 'write 0 %i' % (img_size),
                     '-f', iotests.imgfmt, disk_path)
 
     iotests.log('Launching VM...')
-- 
2.21.0

From: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>

The uninitialized memory allocated for the command FIFO of the
floppy controller during the VM hardware initialization incurs
many unwanted reports by Valgrind when VM state is being saved.
That verbosity makes it harder to find the real memory issues when
the iotests run. In particular, the patch eliminates 20 unnecessary
Valgrind reports in iotest #169.

Signed-off-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
Message-id: 1559154027-282547-1-git-send-email-andrey.shinkevich@virtuozzo.com
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 hw/block/fdc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/block/fdc.c b/hw/block/fdc.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/fdc.c
+++ b/hw/block/fdc.c
@@ -XXX,XX +XXX,XX @@ static void fdctrl_realize_common(DeviceState *dev, FDCtrl *fdctrl,
 
     FLOPPY_DPRINTF("init controller\n");
     fdctrl->fifo = qemu_memalign(512, FD_SECTOR_LEN);
+    memset(fdctrl->fifo, 0, FD_SECTOR_LEN);
     fdctrl->fifo_size = 512;
     fdctrl->result_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                              fdctrl_result_timer, fdctrl);
-- 
2.21.0

From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>

The test fails at least for qcow, because of different cluster sizes in
base and top (and therefore different granularities of the bitmaps we
are trying to merge).

The aim of the test is to check that block-dirty-bitmap-merge works
between different nodes; there is no need to check all formats. So,
let's just drop support for anything except qcow2.

Reported-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20190605155405.104384-1-vsementsov@virtuozzo.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/254 | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/qemu-iotests/254 b/tests/qemu-iotests/254
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/254
+++ b/tests/qemu-iotests/254
@@ -XXX,XX +XXX,XX @@
 import iotests
 from iotests import qemu_img_create, file_path, log
 
+iotests.verify_image_format(supported_fmts=['qcow2'])
+
 disk, top = file_path('disk', 'top')
 size = 1024 * 1024
 
-- 
2.21.0

img_rebase() can leak a QDict on two occasions.  Fix it.

Coverity: CID 1401416
Fixes: d16699b64671466b42079c45b89127aeea1ca565
Fixes: 330c72957196e0ae382abcaa97ebf4eb9bc8574f
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190528195338.12376-1-mreitz@redhat.com
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qemu-img.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/qemu-img.c b/qemu-img.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -XXX,XX +XXX,XX @@ static int img_rebase(int argc, char **argv)
                                                              out_baseimg,
                                                              &local_err);
             if (local_err) {
+                qobject_unref(options);
                 error_reportf_err(local_err,
                                   "Could not resolve backing filename: ");
                 ret = -1;
@@ -XXX,XX +XXX,XX @@ static int img_rebase(int argc, char **argv)
              */
             prefix_chain_bs = bdrv_find_backing_image(bs, out_real_path);
             if (prefix_chain_bs) {
+                qobject_unref(options);
                 g_free(out_real_path);
+
                 blk_new_backing = blk_new(qemu_get_aio_context(),
                                           BLK_PERM_CONSISTENT_READ,
                                           BLK_PERM_ALL);
-- 
2.21.0

A snapshot is something that reflects the state of something at a
certain point in time.  It does not change.

The file our snapshot commands create (or the node they install) is not
a snapshot, as it does change over time.  It is an overlay.  We cannot
do anything about the parameter names, but we can at least adjust the
descriptions to reflect that fact.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20190603202236.1342-2-mreitz@redhat.com
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qapi/block-core.json | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -XXX,XX +XXX,XX @@
 #
 # Either @device or @node-name must be set but not both.
 #
-# @device: the name of the device to generate the snapshot from.
+# @device: the name of the device to take a snapshot of.
 #
 # @node-name: graph node name to generate the snapshot from (Since 2.0)
 #
-# @snapshot-file: the target of the new image. If the file exists, or
-# if it is a device, the snapshot will be created in the existing
-# file/device. Otherwise, a new file will be created.
+# @snapshot-file: the target of the new overlay image. If the file
+# exists, or if it is a device, the overlay will be created in the
+# existing file/device. Otherwise, a new file will be created.
 #
 # @snapshot-node-name: the graph node name of the new image (Since 2.0)
 #
-# @format: the format of the snapshot image, default is 'qcow2'.
+# @format: the format of the overlay image, default is 'qcow2'.
 #
 # @mode: whether and how QEMU should create a new image, default is
 #        'absolute-paths'.
@@ -XXX,XX +XXX,XX @@
 ##
 # @BlockdevSnapshot:
 #
-# @node: device or node name that will have a snapshot created.
+# @node: device or node name that will have a snapshot taken.
 #
 # @overlay: reference to the existing block device that will become
-#           the overlay of @node, as part of creating the snapshot.
+#           the overlay of @node, as part of taking the snapshot.
 #           It must not have a current backing file (this can be
 #           achieved by passing "backing": null to blockdev-add).
 #
@@ -XXX,XX +XXX,XX @@
 ##
 # @blockdev-snapshot-sync:
 #
-# Generates a synchronous snapshot of a block device.
+# Takes a synchronous snapshot of a block device.
 #
 # For the arguments, see the documentation of BlockdevSnapshotSync.
 #
@@ -XXX,XX +XXX,XX @@
 ##
 # @blockdev-snapshot:
 #
-# Generates a snapshot of a block device.
+# Takes a snapshot of a block device.
 #
-# Create a snapshot, by installing 'node' as the backing image of
+# Take a snapshot, by installing 'node' as the backing image of
 # 'overlay'. Additionally, if 'node' is associated with a block
 # device, the block device changes to using 'overlay' as its new active
 # image.
-- 
2.21.0

There are error messages which refer to an overlay node as the snapshot.
That is wrong, those are two different things.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20190603202236.1342-3-mreitz@redhat.com
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 blockdev.c                 | 10 +++++-----
 tests/qemu-iotests/085.out | 10 +++++-----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index XXXXXXX..XXXXXXX 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -XXX,XX +XXX,XX @@ static void external_snapshot_prepare(BlkActionState *common,
             s->has_snapshot_node_name ? s->snapshot_node_name : NULL;
 
         if (node_name && !snapshot_node_name) {
-            error_setg(errp, "New snapshot node name missing");
+            error_setg(errp, "New overlay node name missing");
             goto out;
         }
 
         if (snapshot_node_name &&
             bdrv_lookup_bs(snapshot_node_name, snapshot_node_name, NULL)) {
-            error_setg(errp, "New snapshot node name already in use");
+            error_setg(errp, "New overlay node name already in use");
             goto out;
         }
 
@@ -XXX,XX +XXX,XX @@ static void external_snapshot_prepare(BlkActionState *common,
     }
 
     if (bdrv_has_blk(state->new_bs)) {
-        error_setg(errp, "The snapshot is already in use");
+        error_setg(errp, "The overlay is already in use");
         goto out;
     }
 
@@ -XXX,XX +XXX,XX @@ static void external_snapshot_prepare(BlkActionState *common,
     }
 
     if (state->new_bs->backing != NULL) {
-        error_setg(errp, "The snapshot already has a backing image");
+        error_setg(errp, "The overlay already has a backing image");
         goto out;
     }
 
     if (!state->new_bs->drv->supports_backing) {
-        error_setg(errp, "The snapshot does not support backing images");
+        error_setg(errp, "The overlay does not support backing images");
         goto out;
     }
 
diff --git a/tests/qemu-iotests/085.out b/tests/qemu-iotests/085.out
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/085.out
+++ b/tests/qemu-iotests/085.out
@@ -XXX,XX +XXX,XX @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/
 
 === Invalid command - cannot create a snapshot using a file BDS ===
 
-{"error": {"class": "GenericError", "desc": "The snapshot does not support backing images"}}
+{"error": {"class": "GenericError", "desc": "The overlay does not support backing images"}}
 
 === Invalid command - snapshot node used as active layer ===
 
-{"error": {"class": "GenericError", "desc": "The snapshot is already in use"}}
-{"error": {"class": "GenericError", "desc": "The snapshot is already in use"}}
-{"error": {"class": "GenericError", "desc": "The snapshot is already in use"}}
+{"error": {"class": "GenericError", "desc": "The overlay is already in use"}}
+{"error": {"class": "GenericError", "desc": "The overlay is already in use"}}
+{"error": {"class": "GenericError", "desc": "The overlay is already in use"}}
 
 === Invalid command - snapshot node used as backing hd ===
 
@@ -XXX,XX +XXX,XX @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/
 Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134217728
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/t.IMGFMT.base
 {"return": {}}
-{"error": {"class": "GenericError", "desc": "The snapshot already has a backing image"}}
+{"error": {"class": "GenericError", "desc": "The overlay already has a backing image"}}
 
 === Invalid command - The node does not exist ===
 
-- 
2.21.0

Move img_convert()'s quiet flag into the ImgConvertState so it is
accessible by nested functions.  -q dictates that it suppresses anything
but errors, so if those functions want to emit warnings, they need to
query this flag first.  (There currently are no such warnings, but there
will be as of the next patch.)

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20190507203508.18026-2-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qemu-img.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -XXX,XX +XXX,XX @@ typedef struct ImgConvertState {
     int64_t target_backing_sectors; /* negative if unknown */
     bool wr_in_order;
     bool copy_range;
+    bool quiet;
     int min_sparse;
     int alignment;
     size_t cluster_sectors;
@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
     QDict *open_opts = NULL;
     char *options = NULL;
     Error *local_err = NULL;
-    bool writethrough, src_writethrough, quiet = false, image_opts = false,
+    bool writethrough, src_writethrough, image_opts = false,
          skip_create = false, progress = false, tgt_image_opts = false;
     int64_t ret = -EINVAL;
     bool force_share = false;
@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
             src_cache = optarg;
             break;
         case 'q':
-            quiet = true;
+            s.quiet = true;
             break;
         case 'n':
             skip_create = true;
@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
     }
 
     /* Initialize before goto out */
-    if (quiet) {
+    if (s.quiet) {
         progress = false;
     }
     qemu_progress_init(progress, 1.0);
@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
 
     for (bs_i = 0; bs_i < s.src_num; bs_i++) {
         s.src[bs_i] = img_open(image_opts, argv[optind + bs_i],
-                               fmt, src_flags, src_writethrough, quiet,
+                               fmt, src_flags, src_writethrough, s.quiet,
                                force_share);
         if (!s.src[bs_i]) {
             ret = -1;
@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
 
     if (skip_create) {
         s.target = img_open(tgt_image_opts, out_filename, out_fmt,
-                            flags, writethrough, quiet, false);
+                            flags, writethrough, s.quiet, false);
     } else {
         /* TODO ultimately we should allow --target-image-opts
          * to be used even when -n is not given.
@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
          * to allow filenames in option syntax
          */
         s.target = img_open_file(out_filename, open_opts, out_fmt,
-                                 flags, writethrough, quiet, false);
+                                 flags, writethrough, s.quiet, false);
         open_opts = NULL; /* blk_new_open will have freed it */
     }
     if (!s.target) {
-- 
2.21.0

This adds a salvaging mode (--salvage) to qemu-img convert which ignores
read errors and treats the respective areas as containing only zeroes.
This can be used for instance to at least partially recover the data
from terminally corrupted qcow2 images.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20190507203508.18026-3-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qemu-img.c       | 90 +++++++++++++++++++++++++++++++++++++-----------
 qemu-img-cmds.hx |  4 +--
 qemu-img.texi    |  4 +++
 3 files changed, 75 insertions(+), 23 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -XXX,XX +XXX,XX @@ enum {
     OPTION_SIZE = 264,
     OPTION_PREALLOCATION = 265,
     OPTION_SHRINK = 266,
+    OPTION_SALVAGE = 267,
 };
 
 typedef enum OutputFormat {
@@ -XXX,XX +XXX,XX @@ typedef struct ImgConvertState {
     int64_t target_backing_sectors; /* negative if unknown */
     bool wr_in_order;
     bool copy_range;
+    bool salvage;
     bool quiet;
     int min_sparse;
     int alignment;
@@ -XXX,XX +XXX,XX @@ static int convert_iteration_sectors(ImgConvertState *s, int64_t sector_num)
     }
 
     if (s->sector_next_status <= sector_num) {
-        int64_t count = n * BDRV_SECTOR_SIZE;
+        uint64_t offset = (sector_num - src_cur_offset) * BDRV_SECTOR_SIZE;
+        int64_t count;
 
-        if (s->target_has_backing) {
+        do {
+            count = n * BDRV_SECTOR_SIZE;
+
+            if (s->target_has_backing) {
+                ret = bdrv_block_status(blk_bs(s->src[src_cur]), offset,
+                                        count, &count, NULL, NULL);
+            } else {
+                ret = bdrv_block_status_above(blk_bs(s->src[src_cur]), NULL,
+                                              offset, count, &count, NULL,
+                                              NULL);
+            }
+
+            if (ret < 0) {
+                if (s->salvage) {
+                    if (n == 1) {
+                        if (!s->quiet) {
+                            warn_report("error while reading block status at "
+                                        "offset %" PRIu64 ": %s", offset,
+                                        strerror(-ret));
+                        }
+                        /* Just try to read the data, then */
+                        ret = BDRV_BLOCK_DATA;
+                        count = BDRV_SECTOR_SIZE;
+                    } else {
+                        /* Retry on a shorter range */
+                        n = DIV_ROUND_UP(n, 4);
+                    }
+                } else {
+                    error_report("error while reading block status at offset "
+                                 "%" PRIu64 ": %s", offset, strerror(-ret));
+                    return ret;
+                }
+            }
+        } while (ret < 0);
 
-            ret = bdrv_block_status(blk_bs(s->src[src_cur]),
-                                    (sector_num - src_cur_offset) *
-                                    BDRV_SECTOR_SIZE,
-                                    count, &count, NULL, NULL);
-        } else {
-            ret = bdrv_block_status_above(blk_bs(s->src[src_cur]), NULL,
-                                          (sector_num - src_cur_offset) *
-                                          BDRV_SECTOR_SIZE,
-                                          count, &count, NULL, NULL);
-        }
-        if (ret < 0) {
-            error_report("error while reading block status of sector %" PRId64
-                         ": %s", sector_num, strerror(-ret));
-            return ret;
-        }
         n = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
 
         if (ret & BDRV_BLOCK_ZERO) {
@@ -XXX,XX +XXX,XX @@ static int convert_iteration_sectors(ImgConvertState *s, int64_t sector_num)
 static int coroutine_fn convert_co_read(ImgConvertState *s, int64_t sector_num,
                                         int nb_sectors, uint8_t *buf)
 {
+    uint64_t single_read_until = 0;
     int n, ret;
 
     assert(nb_sectors <= s->buf_sectors);
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn convert_co_read(ImgConvertState *s, int64_t sector_num,
         BlockBackend *blk;
         int src_cur;
         int64_t bs_sectors, src_cur_offset;
+        uint64_t offset;
 
         /* In the case of compression with multiple source files, we can get a
          * nb_sectors that spreads into the next part. So we must be able to
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn convert_co_read(ImgConvertState *s, int64_t sector_num,
         blk = s->src[src_cur];
         bs_sectors = s->src_sectors[src_cur];
 
+        offset = (sector_num - src_cur_offset) << BDRV_SECTOR_BITS;
+
         n = MIN(nb_sectors, bs_sectors - (sector_num - src_cur_offset));
+        if (single_read_until > offset) {
+            n = 1;
+        }
 
-        ret = blk_co_pread(
-                blk, (sector_num - src_cur_offset) << BDRV_SECTOR_BITS,
-                n << BDRV_SECTOR_BITS, buf, 0);
+        ret = blk_co_pread(blk, offset, n << BDRV_SECTOR_BITS, buf, 0);
         if (ret < 0) {
-            return ret;
+            if (s->salvage) {
+                if (n > 1) {
+                    single_read_until = offset + (n << BDRV_SECTOR_BITS);
+                    continue;
+                } else {
+                    if (!s->quiet) {
+                        warn_report("error while reading offset %" PRIu64
+                                    ": %s", offset, strerror(-ret));
+                    }
+                    memset(buf, 0, BDRV_SECTOR_SIZE);
+                }
+            } else {
+                return ret;
+            }
         }
 
         sector_num += n;
@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
             {"image-opts", no_argument, 0, OPTION_IMAGE_OPTS},
             {"force-share", no_argument, 0, 'U'},
             {"target-image-opts", no_argument, 0, OPTION_TARGET_IMAGE_OPTS},
+            {"salvage", no_argument, 0, OPTION_SALVAGE},
             {0, 0, 0, 0}
         };
         c = getopt_long(argc, argv, ":hf:O:B:Cco:l:S:pt:T:qnm:WU",
@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
         case OPTION_IMAGE_OPTS:
             image_opts = true;
             break;
+        case OPTION_SALVAGE:
+            s.salvage = true;
+            break;
         case OPTION_TARGET_IMAGE_OPTS:
             tgt_image_opts = true;
             break;
@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
         goto fail_getopt;
     }
 
+    if (s.copy_range && s.salvage) {
+        error_report("Cannot use copy offloading in salvaging mode");
+        goto fail_getopt;
+    }
+
     if (tgt_image_opts && !skip_create) {
         error_report("--target-image-opts requires use of -n flag");
         goto fail_getopt;
diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index XXXXXXX..XXXXXXX 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -XXX,XX +XXX,XX @@ STEXI
 ETEXI
 
 DEF("convert", img_convert,
-    "convert [--object objectdef] [--image-opts] [--target-image-opts] [-U] [-C] [-c] [-p] [-q] [-n] [-f fmt] [-t cache] [-T src_cache] [-O output_fmt] [-B backing_file] [-o options] [-l snapshot_param] [-S sparse_size] [-m num_coroutines] [-W] filename [filename2 [...]] output_filename")
+    "convert [--object objectdef] [--image-opts] [--target-image-opts] [-U] [-C] [-c] [-p] [-q] [-n] [-f fmt] [-t cache] [-T src_cache] [-O output_fmt] [-B backing_file] [-o options] [-l snapshot_param] [-S sparse_size] [-m num_coroutines] [-W] [--salvage] filename [filename2 [...]] output_filename")
 STEXI
-@item convert [--object @var{objectdef}] [--image-opts] [--target-image-opts] [-U] [-C] [-c] [-p] [-q] [-n] [-f @var{fmt}] [-t @var{cache}] [-T @var{src_cache}] [-O @var{output_fmt}] [-B @var{backing_file}] [-o @var{options}] [-l @var{snapshot_param}] [-S @var{sparse_size}] [-m @var{num_coroutines}] [-W] @var{filename} [@var{filename2} [...]] @var{output_filename}
+@item convert [--object @var{objectdef}] [--image-opts] [--target-image-opts] [-U] [-C] [-c] [-p] [-q] [-n] [-f @var{fmt}] [-t @var{cache}] [-T @var{src_cache}] [-O @var{output_fmt}] [-B @var{backing_file}] [-o @var{options}] [-l @var{snapshot_param}] [-S @var{sparse_size}] [-m @var{num_coroutines}] [-W] [--salvage] @var{filename} [@var{filename2} [...]] @var{output_filename}
 ETEXI
 
 DEF("create", img_create,
diff --git a/qemu-img.texi b/qemu-img.texi
index XXXXXXX..XXXXXXX 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -XXX,XX +XXX,XX @@ improve performance if the data is remote, such as with NFS or iSCSI backends,
 but will not automatically sparsify zero sectors, and may result in a fully
 allocated target image depending on the host support for getting allocation
 information.
+@item --salvage
+Try to ignore I/O errors when reading.  Unless in quiet mode (@code{-q}), errors
+will still be printed.  Areas that cannot be read from the source will be
+treated as containing only zeroes.
 @end table
 
 Parameters to dd subcommand:
-- 
2.21.0

This new error option allows users of blkdebug to inject errors only on
certain kinds of I/O operations.  Users usually want to make a very
specific operation fail, not just any; but right now they simply hope
that the event that triggers the error injection is followed up with
that very operation.  That may not be true, however, because the block
layer is changing (including blkdebug, which may increase the number of
types of I/O operations on which to inject errors).

The new option's default has been chosen to keep backwards
compatibility.

Note that similar to the internal representation, we could choose to
expose this option as a list of I/O types.  But there is no practical
use for this, because as described above, users usually know exactly
which kind of operation they want to make fail, so there is no need to
specify multiple I/O types at once.  In addition, exposing this option
as a list would require non-trivial changes to qemu_opts_absorb_qdict().

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20190507203508.18026-4-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qapi/block-core.json | 26 +++++++++++++++++++++++
 block/blkdebug.c     | 50 ++++++++++++++++++++++++++++++++++++--------
 2 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -XXX,XX +XXX,XX @@
             'l1_shrink_write_table', 'l1_shrink_free_l2_clusters',
             'cor_write', 'cluster_alloc_space'] }
 
+##
+# @BlkdebugIOType:
+#
+# Kinds of I/O that blkdebug can inject errors in.
+#
+# @read: .bdrv_co_preadv()
+#
+# @write: .bdrv_co_pwritev()
+#
+# @write-zeroes: .bdrv_co_pwrite_zeroes()
+#
+# @discard: .bdrv_co_pdiscard()
+#
+# @flush: .bdrv_co_flush_to_disk()
+#
+# Since: 4.1
+##
+{ 'enum': 'BlkdebugIOType', 'prefix': 'BLKDEBUG_IO_TYPE',
+  'data': [ 'read', 'write', 'write-zeroes', 'discard', 'flush' ] }
+
 ##
 # @BlkdebugInjectErrorOptions:
 #
@@ -XXX,XX +XXX,XX @@
 # @state:       the state identifier blkdebug needs to be in to
 #               actually trigger the event; defaults to "any"
 #
+# @iotype:      the type of I/O operations on which this error should
+#               be injected; defaults to "all read, write,
+#               write-zeroes, discard, and flush operations"
+#               (since: 4.1)
+#
 # @errno:       error identifier (errno) to be returned; defaults to
 #               EIO
 #
@@ -XXX,XX +XXX,XX @@
 { 'struct': 'BlkdebugInjectErrorOptions',
   'data': { 'event': 'BlkdebugEvent',
             '*state': 'int',
+            '*iotype': 'BlkdebugIOType',
             '*errno': 'int',
             '*sector': 'int',
             '*once': 'bool',
diff --git a/block/blkdebug.c b/block/blkdebug.c
index XXXXXXX..XXXXXXX 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -XXX,XX +XXX,XX @@ typedef struct BlkdebugRule {
     int state;
     union {
         struct {
+            uint64_t iotype_mask;
             int error;
             int immediately;
             int once;
@@ -XXX,XX +XXX,XX @@ typedef struct BlkdebugRule {
     QSIMPLEQ_ENTRY(BlkdebugRule) active_next;
 } BlkdebugRule;
 
+QEMU_BUILD_BUG_MSG(BLKDEBUG_IO_TYPE__MAX > 64,
+                   "BlkdebugIOType mask does not fit into an uint64_t");
+
 static QemuOptsList inject_error_opts = {
     .name = "inject-error",
     .head = QTAILQ_HEAD_INITIALIZER(inject_error_opts.head),
@@ -XXX,XX +XXX,XX @@ static QemuOptsList inject_error_opts = {
             .name = "state",
             .type = QEMU_OPT_NUMBER,
         },
+        {
+            .name = "iotype",
+            .type = QEMU_OPT_STRING,
+        },
         {
             .name = "errno",
             .type = QEMU_OPT_NUMBER,
@@ -XXX,XX +XXX,XX @@ static int add_rule(void *opaque, QemuOpts *opts, Error **errp)
     int event;
     struct BlkdebugRule *rule;
     int64_t sector;
+    BlkdebugIOType iotype;
+    Error *local_error = NULL;
 
     /* Find the right event for the rule */
     event_name = qemu_opt_get(opts, "event");
@@ -XXX,XX +XXX,XX @@ static int add_rule(void *opaque, QemuOpts *opts, Error **errp)
         sector = qemu_opt_get_number(opts, "sector", -1);
         rule->options.inject.offset =
             sector == -1 ? -1 : sector * BDRV_SECTOR_SIZE;
+
+        iotype = qapi_enum_parse(&BlkdebugIOType_lookup,
+                                 qemu_opt_get(opts, "iotype"),
+                                 BLKDEBUG_IO_TYPE__MAX, &local_error);
+        if (local_error) {
+            error_propagate(errp, local_error);
+            return -1;
+        }
+        if (iotype != BLKDEBUG_IO_TYPE__MAX) {
+            rule->options.inject.iotype_mask = (1ull << iotype);
+        } else {
+            /* Apply the default */
+            rule->options.inject.iotype_mask =
+                (1ull << BLKDEBUG_IO_TYPE_READ)
+                | (1ull << BLKDEBUG_IO_TYPE_WRITE)
+                | (1ull << BLKDEBUG_IO_TYPE_WRITE_ZEROES)
+                | (1ull << BLKDEBUG_IO_TYPE_DISCARD)
+                | (1ull << BLKDEBUG_IO_TYPE_FLUSH);
+        }
+
         break;
 
     case ACTION_SET_STATE:
@@ -XXX,XX +XXX,XX @@ out:
     return ret;
 }
 
-static int rule_check(BlockDriverState *bs, uint64_t offset, uint64_t bytes)
+static int rule_check(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                      BlkdebugIOType iotype)
 {
     BDRVBlkdebugState *s = bs->opaque;
     BlkdebugRule *rule = NULL;
@@ -XXX,XX +XXX,XX @@ static int rule_check(BlockDriverState *bs, uint64_t offset, uint64_t bytes)
     QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
         uint64_t inject_offset = rule->options.inject.offset;
 
-        if (inject_offset == -1 ||
-            (bytes && inject_offset >= offset &&
-             inject_offset < offset + bytes))
+        if ((inject_offset == -1 ||
+             (bytes && inject_offset >= offset &&
+              inject_offset < offset + bytes)) &&
+            (rule->options.inject.iotype_mask & (1ull << iotype)))
         {
             break;
         }
@@ -XXX,XX +XXX,XX @@ blkdebug_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
         assert(bytes <= bs->bl.max_transfer);
     }
 
-    err = rule_check(bs, offset, bytes);
+    err = rule_check(bs, offset, bytes, BLKDEBUG_IO_TYPE_READ);
     if (err) {
         return err;
     }
@@ -XXX,XX +XXX,XX @@ blkdebug_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
         assert(bytes <= bs->bl.max_transfer);
     }
 
-    err = rule_check(bs, offset, bytes);
+    err = rule_check(bs, offset, bytes, BLKDEBUG_IO_TYPE_WRITE);
     if (err) {
         return err;
     }
@@ -XXX,XX +XXX,XX @@ blkdebug_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
 
 static int blkdebug_co_flush(BlockDriverState *bs)
 {
-    int err = rule_check(bs, 0, 0);
+    int err = rule_check(bs, 0, 0, BLKDEBUG_IO_TYPE_FLUSH);
 
     if (err) {
         return err;
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn blkdebug_co_pwrite_zeroes(BlockDriverState *bs,
         assert(bytes <= bs->bl.max_pwrite_zeroes);
     }
 
-    err = rule_check(bs, offset, bytes);
+    err = rule_check(bs, offset, bytes, BLKDEBUG_IO_TYPE_WRITE_ZEROES);
     if (err) {
         return err;
     }
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn blkdebug_co_pdiscard(BlockDriverState *bs,
         assert(bytes <= bs->bl.max_pdiscard);
     }
 
-    err = rule_check(bs, offset, bytes);
+    err = rule_check(bs, offset, bytes, BLKDEBUG_IO_TYPE_DISCARD);
     if (err) {
         return err;
     }
-- 
2.21.0

Together with @iotype and @sector, this can be used to trap e.g. the
first read or write access to a certain sector without having to know
what happens internally in the block layer, i.e. which "real" events
happen right before such an access.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20190507203508.18026-5-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qapi/block-core.json | 4 +++-
 block/blkdebug.c     | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -XXX,XX +XXX,XX @@
 #
 # @cluster_alloc_space: an allocation of file space for a cluster (since 4.1)
 #
+# @none: triggers once at creation of the blkdebug node (since 4.1)
+#
 # Since: 2.9
 ##
 { 'enum': 'BlkdebugEvent', 'prefix': 'BLKDBG',
@@ -XXX,XX +XXX,XX @@
             'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev',
             'pwritev_zero', 'pwritev_done', 'empty_image_prepare',
             'l1_shrink_write_table', 'l1_shrink_free_l2_clusters',
-            'cor_write', 'cluster_alloc_space'] }
+            'cor_write', 'cluster_alloc_space', 'none'] }
 
 ##
 # @BlkdebugIOType:
diff --git a/block/blkdebug.c b/block/blkdebug.c
index XXXXXXX..XXXXXXX 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -XXX,XX +XXX,XX @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
         goto out;
     }
 
+    bdrv_debug_event(bs, BLKDBG_NONE);
+
     ret = 0;
 out:
     if (ret < 0) {
-- 
2.21.0

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20190507203508.18026-6-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qapi/block-core.json | 5 ++++-
 block/blkdebug.c     | 8 ++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -XXX,XX +XXX,XX @@
 #
 # @flush: .bdrv_co_flush_to_disk()
 #
+# @block-status: .bdrv_co_block_status()
+#
 # Since: 4.1
 ##
 { 'enum': 'BlkdebugIOType', 'prefix': 'BLKDEBUG_IO_TYPE',
-  'data': [ 'read', 'write', 'write-zeroes', 'discard', 'flush' ] }
+  'data': [ 'read', 'write', 'write-zeroes', 'discard', 'flush',
+            'block-status' ] }
 
 ##
 # @BlkdebugInjectErrorOptions:
diff --git a/block/blkdebug.c b/block/blkdebug.c
index XXXXXXX..XXXXXXX 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn blkdebug_co_block_status(BlockDriverState *bs,
                                                  int64_t *map,
                                                  BlockDriverState **file)
 {
+    int err;
+
     assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
+
+    err = rule_check(bs, offset, bytes, BLKDEBUG_IO_TYPE_BLOCK_STATUS);
+    if (err) {
+        return err;
+    }
+
     return bdrv_co_block_status_from_file(bs, want_zero, offset, bytes,
                                           pnum, map, file);
 }
-- 
2.21.0

This test converts a simple image to another, but blkdebug injects
block_status and read faults at some offsets.  The resulting image
should be the same as the input image, except that sectors that could
not be read have to be 0.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190507203508.18026-7-mreitz@redhat.com
Tested-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
[mreitz: Dropped superfluous printf from _filter_offsets, as suggested
         by Vladimir; disable test for VDI and IMGOPTSSYNTAX]
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/251     | 170 +++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/251.out |  43 ++++++++++
 tests/qemu-iotests/group   |   1 +
 3 files changed, 214 insertions(+)
 create mode 100755 tests/qemu-iotests/251
 create mode 100644 tests/qemu-iotests/251.out

diff --git a/tests/qemu-iotests/251 b/tests/qemu-iotests/251
new file mode 100755
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/qemu-iotests/251
@@ -XXX,XX +XXX,XX @@
+#!/usr/bin/env bash
+#
+# Test qemu-img convert --salvage
+#
+# Copyright (C) 2019 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=mreitz@redhat.com
+
+seq=$(basename $0)
+echo "QA output created by $seq"
+
+status=1	# failure is the default!
+
+_cleanup()
+{
+    _cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+. ./common.qemu
+
+_supported_fmt generic
+_supported_proto file
+_supported_os Linux
+
+if [ "$IMGOPTSSYNTAX" = "true" ]; then
+    # We use json:{} filenames here, so we cannot work with additional options.
+    _unsupported_fmt $IMGFMT
+else
+    # With VDI, the output is ordered differently.  Just disable it.
+    _unsupported_fmt vdi
+fi
+
+
+TEST_IMG="$TEST_IMG.orig" _make_test_img 64M
+
+$QEMU_IO -c 'write -P 42 0 64M' "$TEST_IMG.orig" | _filter_qemu_io
+
+
+sector_size=512
+
+# Offsets on which to fail block-status.  Keep in ascending order so
+# the indexing done by _filter_offsets will appear in ascending order
+# in the output as well.
+status_fail_offsets="$((16 * 1024 * 1024 + 8192))
+                     $((33 * 1024 * 1024 + 512))"
+
+# Offsets on which to fail reads.  Keep in ascending order for the
+# same reason.
+# The second element is shared with $status_fail_offsets on purpose.
+# Starting with the third element, we test what happens when a
+# continuous range of sectors is inaccessible.
+read_fail_offsets="$((32 * 1024 * 1024 - 65536))
+                   $((33 * 1024 * 1024 + 512))
+                   $(seq $((34 * 1024 * 1024)) $sector_size \
+                         $((34 * 1024 * 1024 + 4096 - $sector_size)))"
+
+
+# blkdebug must be above the format layer so it can intercept all
+# block-status events
+source_img="json:{'driver': 'blkdebug',
+                  'image': {
+                      'driver': '$IMGFMT',
+                      'file': {
+                          'driver': 'file',
+                          'filename': '$TEST_IMG.orig'
+                      }
+                  },
+                  'inject-error': ["
+
+for ofs in $status_fail_offsets
+do
+    source_img+="{ 'event': 'none',
+                   'iotype': 'block-status',
+                   'errno': 5,
+                   'sector': $((ofs / sector_size)) },"
+done
+
+for ofs in $read_fail_offsets
+do
+    source_img+="{ 'event': 'none',
+                   'iotype': 'read',
+                   'errno': 5,
+                   'sector': $((ofs / sector_size)) },"
+done
+
+# Remove the trailing comma and terminate @inject-error and json:{}
+source_img="${source_img%,} ] }"
+
+
+echo
+
+
+_filter_offsets() {
+    filters=
+
+    index=0
+    for ofs in $1
+    do
+        filters+=" -e s/$ofs/status_fail_offset_$index/"
+        index=$((index + 1))
+    done
+
+    index=0
+    for ofs in $2
+    do
+        filters+=" -e s/$ofs/read_fail_offset_$index/"
+        index=$((index + 1))
+    done
+
+    sed $filters
+}
+
+# While determining the number of allocated sectors in the input
+# image, we should see one block status warning per element of
+# $status_fail_offsets.
+#
+# Then, the image is read.  Since the block status is queried in
+# basically the same way, the same warnings as in the previous step
+# should reappear.  Interleaved with those we should see a read
+# warning per element of $read_fail_offsets.
+# Note that $read_fail_offsets and $status_fail_offsets share an
+# element (read_fail_offset_1 == status_fail_offset_1), so
+# "status_fail_offset_1" in the output is the same as
+# "read_fail_offset_1".
+$QEMU_IMG convert --salvage "$source_img" "$TEST_IMG" 2>&1 \
+    | _filter_offsets "$status_fail_offsets" "$read_fail_offsets"
+
+echo
+
+# The offsets where the block status could not be determined should
+# have been treated as containing data and thus should be correct in
+# the output image.
+# The offsets where reading failed altogether should be 0.  Make them
+# 0 in the input image, too, so we can compare both images.
+for ofs in $read_fail_offsets
+do
+    $QEMU_IO -c "write -z $ofs $sector_size" "$TEST_IMG.orig" \
+        | _filter_qemu_io \
+        | _filter_offsets '' "$read_fail_offsets"
+done
+
+echo
+
+# These should be equal now.
+$QEMU_IMG compare "$TEST_IMG.orig" "$TEST_IMG"
+
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/251.out b/tests/qemu-iotests/251.out
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/qemu-iotests/251.out
@@ -XXX,XX +XXX,XX @@
+QA output created by 251
+Formatting 'TEST_DIR/t.IMGFMT.orig', fmt=IMGFMT size=67108864
+wrote 67108864/67108864 bytes at offset 0
+64 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+qemu-img: warning: error while reading block status at offset status_fail_offset_0: Input/output error
+qemu-img: warning: error while reading block status at offset status_fail_offset_1: Input/output error
+qemu-img: warning: error while reading block status at offset status_fail_offset_0: Input/output error
+qemu-img: warning: error while reading offset read_fail_offset_0: Input/output error
+qemu-img: warning: error while reading block status at offset status_fail_offset_1: Input/output error
+qemu-img: warning: error while reading offset status_fail_offset_1: Input/output error
+qemu-img: warning: error while reading offset read_fail_offset_2: Input/output error
+qemu-img: warning: error while reading offset read_fail_offset_3: Input/output error
+qemu-img: warning: error while reading offset read_fail_offset_4: Input/output error
+qemu-img: warning: error while reading offset read_fail_offset_5: Input/output error
+qemu-img: warning: error while reading offset read_fail_offset_6: Input/output error
+qemu-img: warning: error while reading offset read_fail_offset_7: Input/output error
+qemu-img: warning: error while reading offset read_fail_offset_8: Input/output error
+qemu-img: warning: error while reading offset read_fail_offset_9: Input/output error
+
+wrote 512/512 bytes at offset read_fail_offset_0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset read_fail_offset_1
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset read_fail_offset_2
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset read_fail_offset_3
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset read_fail_offset_4
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset read_fail_offset_5
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset read_fail_offset_6
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset read_fail_offset_7
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset read_fail_offset_8
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset read_fail_offset_9
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+Images are identical.
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -XXX,XX +XXX,XX @@
 248 rw quick
 249 rw auto quick
 250 rw auto quick
+251 rw auto quick
 252 rw auto backing quick
 253 rw auto quick
 254 rw auto backing quick
-- 
2.21.0

We do not support combining -C (copy offloading) with --salvage (yet),
so attempting it should yield an error message.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Tested-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20190507203508.18026-8-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/082     | 1 +
 tests/qemu-iotests/082.out | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/tests/qemu-iotests/082 b/tests/qemu-iotests/082
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/082
+++ b/tests/qemu-iotests/082
@@ -XXX,XX +XXX,XX @@ echo === convert: -C and other options ===
 run_qemu_img convert -C -S 4k -O $IMGFMT "$TEST_IMG" "$TEST_IMG".target
 run_qemu_img convert -C -S 8k -O $IMGFMT "$TEST_IMG" "$TEST_IMG".target
 run_qemu_img convert -C -c -O $IMGFMT "$TEST_IMG" "$TEST_IMG".target
+run_qemu_img convert -C --salvage -O $IMGFMT "$TEST_IMG" "$TEST_IMG".target
 
 echo
 echo === amend: Options specified more than once ===
diff --git a/tests/qemu-iotests/082.out b/tests/qemu-iotests/082.out
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/082.out
+++ b/tests/qemu-iotests/082.out
@@ -XXX,XX +XXX,XX @@ qemu-img: Cannot enable copy offloading when -S is used
 Testing: convert -C -c -O qcow2 TEST_DIR/t.qcow2 TEST_DIR/t.qcow2.target
 qemu-img: Cannot enable copy offloading when -c is used
 
+Testing: convert -C --salvage -O qcow2 TEST_DIR/t.qcow2 TEST_DIR/t.qcow2.target
+qemu-img: Cannot use copy offloading in salvaging mode
+
 === amend: Options specified more than once ===
 
 Testing: amend -f foo -f qcow2 -o lazy_refcounts=on TEST_DIR/t.qcow2
-- 
2.21.0

The following changes since commit 56f9e46b841c7be478ca038d8d4085d776ab4b0d:

Merge remote-tracking branch 'remotes/armbru/tags/pull-qapi-2017-02-20' into staging (2017-02-20 17:42:47 +0000)

are available in the git repository at:

git://github.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to a7b91d35bab97a2d3e779d0c64c9b837b52a6cf7:

coroutine-lock: make CoRwlock thread-safe and fair (2017-02-21 11:39:40 +0000)

----------------------------------------------------------------
Pull request

v2:
 * Rebased to resolve scsi conflicts

----------------------------------------------------------------

Paolo Bonzini (24):
  block: move AioContext, QEMUTimer, main-loop to libqemuutil
  aio: introduce aio_co_schedule and aio_co_wake
  block-backend: allow blk_prw from coroutine context
  test-thread-pool: use generic AioContext infrastructure
  io: add methods to set I/O handlers on AioContext
  io: make qio_channel_yield aware of AioContexts
  nbd: convert to use qio_channel_yield
  coroutine-lock: reschedule coroutine on the AioContext it was running
    on
  blkdebug: reschedule coroutine on the AioContext it is running on
  qed: introduce qed_aio_start_io and qed_aio_next_io_cb
  aio: push aio_context_acquire/release down to dispatching
  block: explicitly acquire aiocontext in timers that need it
  block: explicitly acquire aiocontext in callbacks that need it
  block: explicitly acquire aiocontext in bottom halves that need it
  block: explicitly acquire aiocontext in aio callbacks that need it
  aio-posix: partially inline aio_dispatch into aio_poll
  async: remove unnecessary inc/dec pairs
  block: document fields protected by AioContext lock
  coroutine-lock: make CoMutex thread-safe
  coroutine-lock: add limited spinning to CoMutex
  test-aio-multithread: add performance comparison with thread-based
    mutexes
  coroutine-lock: place CoMutex before CoQueue in header
  coroutine-lock: add mutex argument to CoQueue APIs
  coroutine-lock: make CoRwlock thread-safe and fair

-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

AioContext is fairly self-contained; the only dependency is QEMUTimer, but
that in turn doesn't need anything else.  So move them out of block-obj-y
to avoid introducing a dependency from io/ to block-obj-y.

main-loop and its dependency iohandler also need to be moved, because
later in this series io/ will call iohandler_get_aio_context.

[Changed copyright "the QEMU team" to "other QEMU contributors" as
suggested by Daniel Berrange and agreed by Paolo.
--Stefan]

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213135235.12274-2-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 Makefile.objs                       |  4 ---
 stubs/Makefile.objs                 |  1 +
 tests/Makefile.include              | 11 ++++----
 util/Makefile.objs                  |  6 +++-
 block/io.c                          | 29 -------------------
 stubs/linux-aio.c                   | 32 +++++++++++++++++++++
 stubs/set-fd-handler.c              | 11 --------
 aio-posix.c => util/aio-posix.c     |  2 +-
 aio-win32.c => util/aio-win32.c     |  0
 util/aiocb.c                        | 55 +++++++++++++++++++++++++++++++++++++
 async.c => util/async.c             |  3 +-
 iohandler.c => util/iohandler.c     |  0
 main-loop.c => util/main-loop.c     |  0
 qemu-timer.c => util/qemu-timer.c   |  0
 thread-pool.c => util/thread-pool.c |  2 +-
 trace-events                        | 11 --------
 util/trace-events                   | 11 ++++++++
 17 files changed, 114 insertions(+), 64 deletions(-)
 create mode 100644 stubs/linux-aio.c
 rename aio-posix.c => util/aio-posix.c (99%)
 rename aio-win32.c => util/aio-win32.c (100%)
 create mode 100644 util/aiocb.c
 rename async.c => util/async.c (99%)
 rename iohandler.c => util/iohandler.c (100%)
 rename main-loop.c => util/main-loop.c (100%)
 rename qemu-timer.c => util/qemu-timer.c (100%)
 rename thread-pool.c => util/thread-pool.c (99%)

diff --git a/Makefile.objs b/Makefile.objs
index XXXXXXX..XXXXXXX 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -XXX,XX +XXX,XX @@ chardev-obj-y = chardev/
 #######################################################################
 # block-obj-y is code used by both qemu system emulation and qemu-img
 
-block-obj-y = async.o thread-pool.o
 block-obj-y += nbd/
 block-obj-y += block.o blockjob.o
-block-obj-y += main-loop.o iohandler.o qemu-timer.o
-block-obj-$(CONFIG_POSIX) += aio-posix.o
-block-obj-$(CONFIG_WIN32) += aio-win32.o
 block-obj-y += block/
 block-obj-y += qemu-io-cmds.o
 block-obj-$(CONFIG_REPLICATION) += replication.o
diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
index XXXXXXX..XXXXXXX 100644
--- a/stubs/Makefile.objs
+++ b/stubs/Makefile.objs
@@ -XXX,XX +XXX,XX @@ stub-obj-y += get-vm-name.o
 stub-obj-y += iothread.o
 stub-obj-y += iothread-lock.o
 stub-obj-y += is-daemonized.o
+stub-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 stub-obj-y += machine-init-done.o
 stub-obj-y += migr-blocker.o
 stub-obj-y += monitor.o
diff --git a/tests/Makefile.include b/tests/Makefile.include
index XXXXXXX..XXXXXXX 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -XXX,XX +XXX,XX @@ check-unit-y += tests/test-visitor-serialization$(EXESUF)
 check-unit-y += tests/test-iov$(EXESUF)
 gcov-files-test-iov-y = util/iov.c
 check-unit-y += tests/test-aio$(EXESUF)
+gcov-files-test-aio-y = util/async.c util/qemu-timer.o
+gcov-files-test-aio-$(CONFIG_WIN32) += util/aio-win32.c
+gcov-files-test-aio-$(CONFIG_POSIX) += util/aio-posix.c
 check-unit-y += tests/test-throttle$(EXESUF)
 gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c
 gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c
@@ -XXX,XX +XXX,XX @@ tests/check-qjson$(EXESUF): tests/check-qjson.o $(test-util-obj-y)
 tests/check-qom-interface$(EXESUF): tests/check-qom-interface.o $(test-qom-obj-y)
 tests/check-qom-proplist$(EXESUF): tests/check-qom-proplist.o $(test-qom-obj-y)
 
-tests/test-char$(EXESUF): tests/test-char.o qemu-timer.o \
-	$(test-util-obj-y) $(qtest-obj-y) $(test-block-obj-y) $(chardev-obj-y)
+tests/test-char$(EXESUF): tests/test-char.o $(test-util-obj-y) $(qtest-obj-y) $(test-io-obj-y) $(chardev-obj-y)
 tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
 tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
 tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
@@ -XXX,XX +XXX,XX @@ tests/test-vmstate$(EXESUF): tests/test-vmstate.o \
 	migration/vmstate.o migration/qemu-file.o \
         migration/qemu-file-channel.o migration/qjson.o \
 	$(test-io-obj-y)
-tests/test-timed-average$(EXESUF): tests/test-timed-average.o qemu-timer.o \
-	$(test-util-obj-y)
+tests/test-timed-average$(EXESUF): tests/test-timed-average.o $(test-util-obj-y)
 tests/test-base64$(EXESUF): tests/test-base64.o \
 	libqemuutil.a libqemustub.a
 tests/ptimer-test$(EXESUF): tests/ptimer-test.o tests/ptimer-test-stubs.o hw/core/ptimer.o libqemustub.a
@@ -XXX,XX +XXX,XX @@ tests/usb-hcd-ehci-test$(EXESUF): tests/usb-hcd-ehci-test.o $(libqos-usb-obj-y)
 tests/usb-hcd-xhci-test$(EXESUF): tests/usb-hcd-xhci-test.o $(libqos-usb-obj-y)
 tests/pc-cpu-test$(EXESUF): tests/pc-cpu-test.o
 tests/postcopy-test$(EXESUF): tests/postcopy-test.o
-tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-timer.o \
+tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o $(test-util-obj-y) \
 	$(qtest-obj-y) $(test-io-obj-y) $(libqos-virtio-obj-y) $(libqos-pc-obj-y) \
 	$(chardev-obj-y)
 tests/qemu-iotests/socket_scm_helper$(EXESUF): tests/qemu-iotests/socket_scm_helper.o
diff --git a/util/Makefile.objs b/util/Makefile.objs
index XXXXXXX..XXXXXXX 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -XXX,XX +XXX,XX @@
 util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o
 util-obj-y += bufferiszero.o
 util-obj-y += lockcnt.o
+util-obj-y += aiocb.o async.o thread-pool.o qemu-timer.o
+util-obj-y += main-loop.o iohandler.o
+util-obj-$(CONFIG_POSIX) += aio-posix.o
 util-obj-$(CONFIG_POSIX) += compatfd.o
 util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
 util-obj-$(CONFIG_POSIX) += mmap-alloc.o
 util-obj-$(CONFIG_POSIX) += oslib-posix.o
 util-obj-$(CONFIG_POSIX) += qemu-openpty.o
 util-obj-$(CONFIG_POSIX) += qemu-thread-posix.o
-util-obj-$(CONFIG_WIN32) += event_notifier-win32.o
 util-obj-$(CONFIG_POSIX) += memfd.o
+util-obj-$(CONFIG_WIN32) += aio-win32.o
+util-obj-$(CONFIG_WIN32) += event_notifier-win32.o
 util-obj-$(CONFIG_WIN32) += oslib-win32.o
 util-obj-$(CONFIG_WIN32) += qemu-thread-win32.o
 util-obj-y += envlist.o path.o module.o
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
     return &acb->common;
 }
 
-void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
-                   BlockCompletionFunc *cb, void *opaque)
-{
-    BlockAIOCB *acb;
-
-    acb = g_malloc(aiocb_info->aiocb_size);
-    acb->aiocb_info = aiocb_info;
-    acb->bs = bs;
-    acb->cb = cb;
-    acb->opaque = opaque;
-    acb->refcnt = 1;
-    return acb;
-}
-
-void qemu_aio_ref(void *p)
-{
-    BlockAIOCB *acb = p;
-    acb->refcnt++;
-}
-
-void qemu_aio_unref(void *p)
-{
-    BlockAIOCB *acb = p;
-    assert(acb->refcnt > 0);
-    if (--acb->refcnt == 0) {
-        g_free(acb);
-    }
-}
-
 /**************************************************************/
 /* Coroutine block device emulation */
 
diff --git a/stubs/linux-aio.c b/stubs/linux-aio.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/stubs/linux-aio.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Linux native AIO support.
+ *
+ * Copyright (C) 2009 IBM, Corp.
+ * Copyright (C) 2009 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "block/aio.h"
+#include "block/raw-aio.h"
+
+void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
+{
+    abort();
+}
+
+void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
+{
+    abort();
+}
+
+LinuxAioState *laio_init(void)
+{
+    abort();
+}
+
+void laio_cleanup(LinuxAioState *s)
+{
+    abort();
+}
diff --git a/stubs/set-fd-handler.c b/stubs/set-fd-handler.c
index XXXXXXX..XXXXXXX 100644
--- a/stubs/set-fd-handler.c
+++ b/stubs/set-fd-handler.c
@@ -XXX,XX +XXX,XX @@ void qemu_set_fd_handler(int fd,
 {
     abort();
 }
-
-void aio_set_fd_handler(AioContext *ctx,
-                        int fd,
-                        bool is_external,
-                        IOHandler *io_read,
-                        IOHandler *io_write,
-                        AioPollFn *io_poll,
-                        void *opaque)
-{
-    abort();
-}
diff --git a/aio-posix.c b/util/aio-posix.c
similarity index 99%
rename from aio-posix.c
rename to util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/rcu_queue.h"
 #include "qemu/sockets.h"
 #include "qemu/cutils.h"
-#include "trace-root.h"
+#include "trace.h"
 #ifdef CONFIG_EPOLL_CREATE1
 #include <sys/epoll.h>
 #endif
diff --git a/aio-win32.c b/util/aio-win32.c
similarity index 100%
rename from aio-win32.c
rename to util/aio-win32.c
diff --git a/util/aiocb.c b/util/aiocb.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/util/aiocb.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * BlockAIOCB allocation
+ *
+ * Copyright (c) 2003-2017 Fabrice Bellard and other QEMU contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "block/aio.h"
+
+void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
+                   BlockCompletionFunc *cb, void *opaque)
+{
+    BlockAIOCB *acb;
+
+    acb = g_malloc(aiocb_info->aiocb_size);
+    acb->aiocb_info = aiocb_info;
+    acb->bs = bs;
+    acb->cb = cb;
+    acb->opaque = opaque;
+    acb->refcnt = 1;
+    return acb;
+}
+
+void qemu_aio_ref(void *p)
+{
+    BlockAIOCB *acb = p;
+    acb->refcnt++;
+}
+
+void qemu_aio_unref(void *p)
+{
+    BlockAIOCB *acb = p;
+    assert(acb->refcnt > 0);
+    if (--acb->refcnt == 0) {
+        g_free(acb);
+    }
+}
diff --git a/async.c b/util/async.c
similarity index 99%
rename from async.c
rename to util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@
 /*
- * QEMU System Emulator
+ * Data plane event loop
  *
  * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2009-2017 QEMU contributors
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/iohandler.c b/util/iohandler.c
similarity index 100%
rename from iohandler.c
rename to util/iohandler.c
diff --git a/main-loop.c b/util/main-loop.c
similarity index 100%
rename from main-loop.c
rename to util/main-loop.c
diff --git a/qemu-timer.c b/util/qemu-timer.c
similarity index 100%
rename from qemu-timer.c
rename to util/qemu-timer.c
diff --git a/thread-pool.c b/util/thread-pool.c
similarity index 99%
rename from thread-pool.c
rename to util/thread-pool.c
index XXXXXXX..XXXXXXX 100644
--- a/thread-pool.c
+++ b/util/thread-pool.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/queue.h"
 #include "qemu/thread.h"
 #include "qemu/coroutine.h"
-#include "trace-root.h"
+#include "trace.h"
 #include "block/thread-pool.h"
 #include "qemu/main-loop.h"
 
diff --git a/trace-events b/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/trace-events
+++ b/trace-events
@@ -XXX,XX +XXX,XX @@
 #
 # The <format-string> should be a sprintf()-compatible format string.
 
-# aio-posix.c
-run_poll_handlers_begin(void *ctx, int64_t max_ns) "ctx %p max_ns %"PRId64
-run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d"
-poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
-poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
-
-# thread-pool.c
-thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
-thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
-thread_pool_cancel(void *req, void *opaque) "req %p opaque %p"
-
 # ioport.c
 cpu_in(unsigned int addr, char size, unsigned int val) "addr %#x(%c) value %u"
 cpu_out(unsigned int addr, char size, unsigned int val) "addr %#x(%c) value %u"
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@
 # See docs/tracing.txt for syntax documentation.
 
+# util/aio-posix.c
+run_poll_handlers_begin(void *ctx, int64_t max_ns) "ctx %p max_ns %"PRId64
+run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d"
+poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
+poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
+
+# util/thread-pool.c
+thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
+thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
+thread_pool_cancel(void *req, void *opaque) "req %p opaque %p"
+
 # util/buffer.c
 buffer_resize(const char *buf, size_t olen, size_t len) "%s: old %zd, new %zd"
 buffer_move_empty(const char *buf, size_t len, const char *from) "%s: %zd bytes from %s"
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

aio_co_wake provides the infrastructure to start a coroutine on a "home"
AioContext.  It will be used by CoMutex and CoQueue, so that coroutines
don't jump from one context to another when they go to sleep on a
mutex or waitqueue.  However, it can also be used as a more efficient
alternative to one-shot bottom halves, and saves the effort of tracking
which AioContext a coroutine is running on.

aio_co_schedule is the part of aio_co_wake that starts a coroutine
on a remote AioContext, but it is also useful to implement e.g.
bdrv_set_aio_context callbacks.

The implementation of aio_co_schedule is based on a lock-free
multiple-producer, single-consumer queue.  The multiple producers use
cmpxchg to add to a LIFO stack.  The consumer (a per-AioContext bottom
half) grabs all items added so far, inverts the list to make it FIFO,
and goes through it one item at a time until it's empty.  The data
structure was inspired by OSv, which uses it in the very code we'll
"port" to QEMU for the thread-safe CoMutex.

Most of the new code is really tests.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213135235.12274-3-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/Makefile.include       |   8 +-
 include/block/aio.h          |  32 +++++++
 include/qemu/coroutine_int.h |  11 ++-
 tests/iothread.h             |  25 +++++
 tests/iothread.c             |  91 ++++++++++++++++++
 tests/test-aio-multithread.c | 213 +++++++++++++++++++++++++++++++++++++++++++
 util/async.c                 |  65 +++++++++++++
 util/qemu-coroutine.c        |   8 ++
 util/trace-events            |   4 +
 9 files changed, 453 insertions(+), 4 deletions(-)
 create mode 100644 tests/iothread.h
 create mode 100644 tests/iothread.c
 create mode 100644 tests/test-aio-multithread.c

diff --git a/tests/Makefile.include b/tests/Makefile.include
index XXXXXXX..XXXXXXX 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -XXX,XX +XXX,XX @@ check-unit-y += tests/test-aio$(EXESUF)
 gcov-files-test-aio-y = util/async.c util/qemu-timer.o
 gcov-files-test-aio-$(CONFIG_WIN32) += util/aio-win32.c
 gcov-files-test-aio-$(CONFIG_POSIX) += util/aio-posix.c
+check-unit-y += tests/test-aio-multithread$(EXESUF)
+gcov-files-test-aio-multithread-y = $(gcov-files-test-aio-y)
+gcov-files-test-aio-multithread-y += util/qemu-coroutine.c tests/iothread.c
 check-unit-y += tests/test-throttle$(EXESUF)
-gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c
-gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c
 check-unit-y += tests/test-thread-pool$(EXESUF)
 gcov-files-test-thread-pool-y = thread-pool.c
 gcov-files-test-hbitmap-y = util/hbitmap.c
@@ -XXX,XX +XXX,XX @@ test-qapi-obj-y = tests/test-qapi-visit.o tests/test-qapi-types.o \
 	$(test-qom-obj-y)
 test-crypto-obj-y = $(crypto-obj-y) $(test-qom-obj-y)
 test-io-obj-y = $(io-obj-y) $(test-crypto-obj-y)
-test-block-obj-y = $(block-obj-y) $(test-io-obj-y)
+test-block-obj-y = $(block-obj-y) $(test-io-obj-y) tests/iothread.o
 
 tests/check-qint$(EXESUF): tests/check-qint.o $(test-util-obj-y)
 tests/check-qstring$(EXESUF): tests/check-qstring.o $(test-util-obj-y)
@@ -XXX,XX +XXX,XX @@ tests/check-qom-proplist$(EXESUF): tests/check-qom-proplist.o $(test-qom-obj-y)
 tests/test-char$(EXESUF): tests/test-char.o $(test-util-obj-y) $(qtest-obj-y) $(test-io-obj-y) $(chardev-obj-y)
 tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
 tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
+tests/test-aio-multithread$(EXESUF): tests/test-aio-multithread.o $(test-block-obj-y)
 tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
 tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y)
diff --git a/include/block/aio.h b/include/block/aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ typedef void QEMUBHFunc(void *opaque);
 typedef bool AioPollFn(void *opaque);
 typedef void IOHandler(void *opaque);
 
+struct Coroutine;
 struct ThreadPool;
 struct LinuxAioState;
 
@@ -XXX,XX +XXX,XX @@ struct AioContext {
     bool notified;
     EventNotifier notifier;
 
+    QSLIST_HEAD(, Coroutine) scheduled_coroutines;
+    QEMUBH *co_schedule_bh;
+
     /* Thread pool for performing work and receiving completion callbacks.
      * Has its own locking.
      */
@@ -XXX,XX +XXX,XX @@ static inline bool aio_node_check(AioContext *ctx, bool is_external)
 }
 
 /**
+ * aio_co_schedule:
+ * @ctx: the aio context
+ * @co: the coroutine
+ *
+ * Start a coroutine on a remote AioContext.
+ *
+ * The coroutine must not be entered by anyone else while aio_co_schedule()
+ * is active.  In addition the coroutine must have yielded unless ctx
+ * is the context in which the coroutine is running (i.e. the value of
+ * qemu_get_current_aio_context() from the coroutine itself).
+ */
+void aio_co_schedule(AioContext *ctx, struct Coroutine *co);
+
+/**
+ * aio_co_wake:
+ * @co: the coroutine
+ *
+ * Restart a coroutine on the AioContext where it was running last, thus
+ * preventing coroutines from jumping from one context to another when they
+ * go to sleep.
+ *
+ * aio_co_wake may be executed either in coroutine or non-coroutine
+ * context.  The coroutine must not be entered by anyone else while
+ * aio_co_wake() is active.
+ */
+void aio_co_wake(struct Coroutine *co);
+
+/**
  * Return the AioContext whose event loop runs in the current thread.
  *
  * If called from an IOThread this will be the IOThread's AioContext.  If
diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine_int.h
+++ b/include/qemu/coroutine_int.h
@@ -XXX,XX +XXX,XX @@ struct Coroutine {
     CoroutineEntry *entry;
     void *entry_arg;
     Coroutine *caller;
+
+    /* Only used when the coroutine has terminated.  */
     QSLIST_ENTRY(Coroutine) pool_next;
+
     size_t locks_held;
 
-    /* Coroutines that should be woken up when we yield or terminate */
+    /* Coroutines that should be woken up when we yield or terminate.
+     * Only used when the coroutine is running.
+     */
     QSIMPLEQ_HEAD(, Coroutine) co_queue_wakeup;
+
+    /* Only used when the coroutine has yielded.  */
+    AioContext *ctx;
     QSIMPLEQ_ENTRY(Coroutine) co_queue_next;
+    QSLIST_ENTRY(Coroutine) co_scheduled_next;
 };
 
 Coroutine *qemu_coroutine_new(void);
diff --git a/tests/iothread.h b/tests/iothread.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/iothread.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * Event loop thread implementation for unit tests
+ *
+ * Copyright Red Hat Inc., 2013, 2016
+ *
+ * Authors:
+ *  Stefan Hajnoczi   <stefanha@redhat.com>
+ *  Paolo Bonzini     <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#ifndef TEST_IOTHREAD_H
+#define TEST_IOTHREAD_H
+
+#include "block/aio.h"
+#include "qemu/thread.h"
+
+typedef struct IOThread IOThread;
+
+IOThread *iothread_new(void);
+void iothread_join(IOThread *iothread);
+AioContext *iothread_get_aio_context(IOThread *iothread);
+
+#endif
diff --git a/tests/iothread.c b/tests/iothread.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/iothread.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Event loop thread implementation for unit tests
+ *
+ * Copyright Red Hat Inc., 2013, 2016
+ *
+ * Authors:
+ *  Stefan Hajnoczi   <stefanha@redhat.com>
+ *  Paolo Bonzini     <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "block/aio.h"
+#include "qemu/main-loop.h"
+#include "qemu/rcu.h"
+#include "iothread.h"
+
+struct IOThread {
+    AioContext *ctx;
+
+    QemuThread thread;
+    QemuMutex init_done_lock;
+    QemuCond init_done_cond;    /* is thread initialization done? */
+    bool stopping;
+};
+
+static __thread IOThread *my_iothread;
+
+AioContext *qemu_get_current_aio_context(void)
+{
+    return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
+}
+
+static void *iothread_run(void *opaque)
+{
+    IOThread *iothread = opaque;
+
+    rcu_register_thread();
+
+    my_iothread = iothread;
+    qemu_mutex_lock(&iothread->init_done_lock);
+    iothread->ctx = aio_context_new(&error_abort);
+    qemu_cond_signal(&iothread->init_done_cond);
+    qemu_mutex_unlock(&iothread->init_done_lock);
+
+    while (!atomic_read(&iothread->stopping)) {
+        aio_poll(iothread->ctx, true);
+    }
+
+    rcu_unregister_thread();
+    return NULL;
+}
+
+void iothread_join(IOThread *iothread)
+{
+    iothread->stopping = true;
+    aio_notify(iothread->ctx);
+    qemu_thread_join(&iothread->thread);
+    qemu_cond_destroy(&iothread->init_done_cond);
+    qemu_mutex_destroy(&iothread->init_done_lock);
+    aio_context_unref(iothread->ctx);
+    g_free(iothread);
+}
+
+IOThread *iothread_new(void)
+{
+    IOThread *iothread = g_new0(IOThread, 1);
+
+    qemu_mutex_init(&iothread->init_done_lock);
+    qemu_cond_init(&iothread->init_done_cond);
+    qemu_thread_create(&iothread->thread, NULL, iothread_run,
+                       iothread, QEMU_THREAD_JOINABLE);
+
+    /* Wait for initialization to complete */
+    qemu_mutex_lock(&iothread->init_done_lock);
+    while (iothread->ctx == NULL) {
+        qemu_cond_wait(&iothread->init_done_cond,
+                       &iothread->init_done_lock);
+    }
+    qemu_mutex_unlock(&iothread->init_done_lock);
+    return iothread;
+}
+
+AioContext *iothread_get_aio_context(IOThread *iothread)
+{
+    return iothread->ctx;
+}
diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/test-aio-multithread.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * AioContext multithreading tests
+ *
+ * Copyright Red Hat, Inc. 2016
+ *
+ * Authors:
+ *  Paolo Bonzini    <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include <glib.h>
+#include "block/aio.h"
+#include "qapi/error.h"
+#include "qemu/coroutine.h"
+#include "qemu/thread.h"
+#include "qemu/error-report.h"
+#include "iothread.h"
+
+/* AioContext management */
+
+#define NUM_CONTEXTS 5
+
+static IOThread *threads[NUM_CONTEXTS];
+static AioContext *ctx[NUM_CONTEXTS];
+static __thread int id = -1;
+
+static QemuEvent done_event;
+
+/* Run a function synchronously on a remote iothread. */
+
+typedef struct CtxRunData {
+    QEMUBHFunc *cb;
+    void *arg;
+} CtxRunData;
+
+static void ctx_run_bh_cb(void *opaque)
+{
+    CtxRunData *data = opaque;
+
+    data->cb(data->arg);
+    qemu_event_set(&done_event);
+}
+
+static void ctx_run(int i, QEMUBHFunc *cb, void *opaque)
+{
+    CtxRunData data = {
+        .cb = cb,
+        .arg = opaque
+    };
+
+    qemu_event_reset(&done_event);
+    aio_bh_schedule_oneshot(ctx[i], ctx_run_bh_cb, &data);
+    qemu_event_wait(&done_event);
+}
+
+/* Starting the iothreads. */
+
+static void set_id_cb(void *opaque)
+{
+    int *i = opaque;
+
+    id = *i;
+}
+
+static void create_aio_contexts(void)
+{
+    int i;
+
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        threads[i] = iothread_new();
+        ctx[i] = iothread_get_aio_context(threads[i]);
+    }
+
+    qemu_event_init(&done_event, false);
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        ctx_run(i, set_id_cb, &i);
+    }
+}
+
+/* Stopping the iothreads. */
+
+static void join_aio_contexts(void)
+{
+    int i;
+
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        aio_context_ref(ctx[i]);
+    }
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        iothread_join(threads[i]);
+    }
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        aio_context_unref(ctx[i]);
+    }
+    qemu_event_destroy(&done_event);
+}
+
+/* Basic test for the stuff above. */
+
+static void test_lifecycle(void)
+{
+    create_aio_contexts();
+    join_aio_contexts();
+}
+
+/* aio_co_schedule test.  */
+
+static Coroutine *to_schedule[NUM_CONTEXTS];
+
+static bool now_stopping;
+
+static int count_retry;
+static int count_here;
+static int count_other;
+
+static bool schedule_next(int n)
+{
+    Coroutine *co;
+
+    co = atomic_xchg(&to_schedule[n], NULL);
+    if (!co) {
+        atomic_inc(&count_retry);
+        return false;
+    }
+
+    if (n == id) {
+        atomic_inc(&count_here);
+    } else {
+        atomic_inc(&count_other);
+    }
+
+    aio_co_schedule(ctx[n], co);
+    return true;
+}
+
+static void finish_cb(void *opaque)
+{
+    schedule_next(id);
+}
+
+static coroutine_fn void test_multi_co_schedule_entry(void *opaque)
+{
+    g_assert(to_schedule[id] == NULL);
+    atomic_mb_set(&to_schedule[id], qemu_coroutine_self());
+
+    while (!atomic_mb_read(&now_stopping)) {
+        int n;
+
+        n = g_test_rand_int_range(0, NUM_CONTEXTS);
+        schedule_next(n);
+        qemu_coroutine_yield();
+
+        g_assert(to_schedule[id] == NULL);
+        atomic_mb_set(&to_schedule[id], qemu_coroutine_self());
+    }
+}
+
+
+static void test_multi_co_schedule(int seconds)
+{
+    int i;
+
+    count_here = count_other = count_retry = 0;
+    now_stopping = false;
+
+    create_aio_contexts();
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        Coroutine *co1 = qemu_coroutine_create(test_multi_co_schedule_entry, NULL);
+        aio_co_schedule(ctx[i], co1);
+    }
+
+    g_usleep(seconds * 1000000);
+
+    atomic_mb_set(&now_stopping, true);
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        ctx_run(i, finish_cb, NULL);
+        to_schedule[i] = NULL;
+    }
+
+    join_aio_contexts();
+    g_test_message("scheduled %d, queued %d, retry %d, total %d\n",
+                  count_other, count_here, count_retry,
+                  count_here + count_other + count_retry);
+}
+
+static void test_multi_co_schedule_1(void)
+{
+    test_multi_co_schedule(1);
+}
+
+static void test_multi_co_schedule_10(void)
+{
+    test_multi_co_schedule(10);
+}
+
+/* End of tests.  */
+
+int main(int argc, char **argv)
+{
+    init_clocks();
+
+    g_test_init(&argc, &argv, NULL);
+    g_test_add_func("/aio/multi/lifecycle", test_lifecycle);
+    if (g_test_quick()) {
+        g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1);
+    } else {
+        g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10);
+    }
+    return g_test_run();
+}
diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/main-loop.h"
 #include "qemu/atomic.h"
 #include "block/raw-aio.h"
+#include "qemu/coroutine_int.h"
+#include "trace.h"
 
 /***********************************************************/
 /* bottom halves (can be seen as timers which expire ASAP) */
@@ -XXX,XX +XXX,XX @@ aio_ctx_finalize(GSource     *source)
     }
 #endif
 
+    assert(QSLIST_EMPTY(&ctx->scheduled_coroutines));
+    qemu_bh_delete(ctx->co_schedule_bh);
+
     qemu_lockcnt_lock(&ctx->list_lock);
     assert(!qemu_lockcnt_count(&ctx->list_lock));
     while (ctx->first_bh) {
@@ -XXX,XX +XXX,XX @@ static bool event_notifier_poll(void *opaque)
     return atomic_read(&ctx->notified);
 }
 
+static void co_schedule_bh_cb(void *opaque)
+{
+    AioContext *ctx = opaque;
+    QSLIST_HEAD(, Coroutine) straight, reversed;
+
+    QSLIST_MOVE_ATOMIC(&reversed, &ctx->scheduled_coroutines);
+    QSLIST_INIT(&straight);
+
+    while (!QSLIST_EMPTY(&reversed)) {
+        Coroutine *co = QSLIST_FIRST(&reversed);
+        QSLIST_REMOVE_HEAD(&reversed, co_scheduled_next);
+        QSLIST_INSERT_HEAD(&straight, co, co_scheduled_next);
+    }
+
+    while (!QSLIST_EMPTY(&straight)) {
+        Coroutine *co = QSLIST_FIRST(&straight);
+        QSLIST_REMOVE_HEAD(&straight, co_scheduled_next);
+        trace_aio_co_schedule_bh_cb(ctx, co);
+        qemu_coroutine_enter(co);
+    }
+}
+
 AioContext *aio_context_new(Error **errp)
 {
     int ret;
@@ -XXX,XX +XXX,XX @@ AioContext *aio_context_new(Error **errp)
     }
     g_source_set_can_recurse(&ctx->source, true);
     qemu_lockcnt_init(&ctx->list_lock);
+
+    ctx->co_schedule_bh = aio_bh_new(ctx, co_schedule_bh_cb, ctx);
+    QSLIST_INIT(&ctx->scheduled_coroutines);
+
     aio_set_event_notifier(ctx, &ctx->notifier,
                            false,
                            (EventNotifierHandler *)
@@ -XXX,XX +XXX,XX @@ fail:
     return NULL;
 }
 
+void aio_co_schedule(AioContext *ctx, Coroutine *co)
+{
+    trace_aio_co_schedule(ctx, co);
+    QSLIST_INSERT_HEAD_ATOMIC(&ctx->scheduled_coroutines,
+                              co, co_scheduled_next);
+    qemu_bh_schedule(ctx->co_schedule_bh);
+}
+
+void aio_co_wake(struct Coroutine *co)
+{
+    AioContext *ctx;
+
+    /* Read coroutine before co->ctx.  Matches smp_wmb in
+     * qemu_coroutine_enter.
+     */
+    smp_read_barrier_depends();
+    ctx = atomic_read(&co->ctx);
+
+    if (ctx != qemu_get_current_aio_context()) {
+        aio_co_schedule(ctx, co);
+        return;
+    }
+
+    if (qemu_in_coroutine()) {
+        Coroutine *self = qemu_coroutine_self();
+        assert(self != co);
+        QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, co, co_queue_next);
+    } else {
+        aio_context_acquire(ctx);
+        qemu_coroutine_enter(co);
+        aio_context_release(ctx);
+    }
+}
+
 void aio_context_ref(AioContext *ctx)
 {
     g_source_ref(&ctx->source);
diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine.c
+++ b/util/qemu-coroutine.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/atomic.h"
 #include "qemu/coroutine.h"
 #include "qemu/coroutine_int.h"
+#include "block/aio.h"
 
 enum {
     POOL_BATCH_SIZE = 64,
@@ -XXX,XX +XXX,XX @@ void qemu_coroutine_enter(Coroutine *co)
     }
 
     co->caller = self;
+    co->ctx = qemu_get_current_aio_context();
+
+    /* Store co->ctx before anything that stores co.  Matches
+     * barrier in aio_co_wake.
+     */
+    smp_wmb();
+
     ret = qemu_coroutine_switch(self, co, COROUTINE_ENTER);
 
     qemu_co_queue_run_restart(co);
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d"
 poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
 poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
 
+# util/async.c
+aio_co_schedule(void *ctx, void *co) "ctx %p co %p"
+aio_co_schedule_bh_cb(void *ctx, void *co) "ctx %p co %p"
+
 # util/thread-pool.c
 thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
 thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

qcow2_create2 calls blk_prw() from coroutine context.  Do not run a nested
event loop in that case, as that breaks when aio_co_wake tries to queue the
coroutine on the co_queue_wakeup list of the currently running one.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213135235.12274-4-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/block-backend.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
 {
     QEMUIOVector qiov;
     struct iovec iov;
-    Coroutine *co;
     BlkRwCo rwco;
 
     iov = (struct iovec) {
@@ -XXX,XX +XXX,XX @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
         .ret    = NOT_DONE,
     };
 
-    co = qemu_coroutine_create(co_entry, &rwco);
-    qemu_coroutine_enter(co);
-    BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
+    if (qemu_in_coroutine()) {
+        /* Fast-path if already in coroutine context */
+        co_entry(&rwco);
+    } else {
+        Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
+        qemu_coroutine_enter(co);
+        BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
+    }
 
     return rwco.ret;
 }
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Once the thread pool starts using aio_co_wake, it will also need
qemu_get_current_aio_context().  Make test-thread-pool create
an AioContext with qemu_init_main_loop, so that stubs/iothread.c
and tests/iothread.c can provide the rest.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213135235.12274-5-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/test-thread-pool.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/tests/test-thread-pool.c b/tests/test-thread-pool.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-thread-pool.c
+++ b/tests/test-thread-pool.c
@@ -XXX,XX +XXX,XX @@
 #include "qapi/error.h"
 #include "qemu/timer.h"
 #include "qemu/error-report.h"
+#include "qemu/main-loop.h"
 
 static AioContext *ctx;
 static ThreadPool *pool;
@@ -XXX,XX +XXX,XX @@ static void test_cancel_async(void)
 int main(int argc, char **argv)
 {
     int ret;
-    Error *local_error = NULL;
 
-    init_clocks();
-
-    ctx = aio_context_new(&local_error);
-    if (!ctx) {
-        error_reportf_err(local_error, "Failed to create AIO Context: ");
-        exit(1);
-    }
+    qemu_init_main_loop(&error_abort);
+    ctx = qemu_get_current_aio_context();
     pool = aio_get_thread_pool(ctx);
 
     g_test_init(&argc, &argv, NULL);
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
 
     ret = g_test_run();
 
-    aio_context_unref(ctx);
     return ret;
 }
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

This is in preparation for making qio_channel_yield work on
AioContexts other than the main one.

Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213135235.12274-6-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/io/channel.h | 25 +++++++++++++++++++++++++
 io/channel-command.c | 13 +++++++++++++
 io/channel-file.c    | 11 +++++++++++
 io/channel-socket.c  | 16 +++++++++++-----
 io/channel-tls.c     | 12 ++++++++++++
 io/channel-watch.c   |  6 ++++++
 io/channel.c         | 11 +++++++++++
 7 files changed, 89 insertions(+), 5 deletions(-)

diff --git a/include/io/channel.h b/include/io/channel.h
index XXXXXXX..XXXXXXX 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu-common.h"
 #include "qom/object.h"
+#include "block/aio.h"
 
 #define TYPE_QIO_CHANNEL "qio-channel"
 #define QIO_CHANNEL(obj)                                    \
@@ -XXX,XX +XXX,XX @@ struct QIOChannelClass {
                      off_t offset,
                      int whence,
                      Error **errp);
+    void (*io_set_aio_fd_handler)(QIOChannel *ioc,
+                                  AioContext *ctx,
+                                  IOHandler *io_read,
+                                  IOHandler *io_write,
+                                  void *opaque);
 };
 
 /* General I/O handling functions */
@@ -XXX,XX +XXX,XX @@ void qio_channel_yield(QIOChannel *ioc,
 void qio_channel_wait(QIOChannel *ioc,
                       GIOCondition condition);
 
+/**
+ * qio_channel_set_aio_fd_handler:
+ * @ioc: the channel object
+ * @ctx: the AioContext to set the handlers on
+ * @io_read: the read handler
+ * @io_write: the write handler
+ * @opaque: the opaque value passed to the handler
+ *
+ * This is used internally by qio_channel_yield().  It can
+ * be used by channel implementations to forward the handlers
+ * to another channel (e.g. from #QIOChannelTLS to the
+ * underlying socket).
+ */
+void qio_channel_set_aio_fd_handler(QIOChannel *ioc,
+                                    AioContext *ctx,
+                                    IOHandler *io_read,
+                                    IOHandler *io_write,
+                                    void *opaque);
+
 #endif /* QIO_CHANNEL_H */
diff --git a/io/channel-command.c b/io/channel-command.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel-command.c
+++ b/io/channel-command.c
@@ -XXX,XX +XXX,XX @@ static int qio_channel_command_close(QIOChannel *ioc,
 }
 
 
+static void qio_channel_command_set_aio_fd_handler(QIOChannel *ioc,
+                                                   AioContext *ctx,
+                                                   IOHandler *io_read,
+                                                   IOHandler *io_write,
+                                                   void *opaque)
+{
+    QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc);
+    aio_set_fd_handler(ctx, cioc->readfd, false, io_read, NULL, NULL, opaque);
+    aio_set_fd_handler(ctx, cioc->writefd, false, NULL, io_write, NULL, opaque);
+}
+
+
 static GSource *qio_channel_command_create_watch(QIOChannel *ioc,
                                                  GIOCondition condition)
 {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_command_class_init(ObjectClass *klass,
     ioc_klass->io_set_blocking = qio_channel_command_set_blocking;
     ioc_klass->io_close = qio_channel_command_close;
     ioc_klass->io_create_watch = qio_channel_command_create_watch;
+    ioc_klass->io_set_aio_fd_handler = qio_channel_command_set_aio_fd_handler;
 }
 
 static const TypeInfo qio_channel_command_info = {
diff --git a/io/channel-file.c b/io/channel-file.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel-file.c
+++ b/io/channel-file.c
@@ -XXX,XX +XXX,XX @@ static int qio_channel_file_close(QIOChannel *ioc,
 }
 
 
+static void qio_channel_file_set_aio_fd_handler(QIOChannel *ioc,
+                                                AioContext *ctx,
+                                                IOHandler *io_read,
+                                                IOHandler *io_write,
+                                                void *opaque)
+{
+    QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc);
+    aio_set_fd_handler(ctx, fioc->fd, false, io_read, io_write, NULL, opaque);
+}
+
 static GSource *qio_channel_file_create_watch(QIOChannel *ioc,
                                               GIOCondition condition)
 {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_file_class_init(ObjectClass *klass,
     ioc_klass->io_seek = qio_channel_file_seek;
     ioc_klass->io_close = qio_channel_file_close;
     ioc_klass->io_create_watch = qio_channel_file_create_watch;
+    ioc_klass->io_set_aio_fd_handler = qio_channel_file_set_aio_fd_handler;
 }
 
 static const TypeInfo qio_channel_file_info = {
diff --git a/io/channel-socket.c b/io/channel-socket.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -XXX,XX +XXX,XX @@ qio_channel_socket_set_blocking(QIOChannel *ioc,
         qemu_set_block(sioc->fd);
     } else {
         qemu_set_nonblock(sioc->fd);
-#ifdef WIN32
-        WSAEventSelect(sioc->fd, ioc->event,
-                       FD_READ | FD_ACCEPT | FD_CLOSE |
-                       FD_CONNECT | FD_WRITE | FD_OOB);
-#endif
     }
     return 0;
 }
@@ -XXX,XX +XXX,XX @@ qio_channel_socket_shutdown(QIOChannel *ioc,
     return 0;
 }
 
+static void qio_channel_socket_set_aio_fd_handler(QIOChannel *ioc,
+                                                  AioContext *ctx,
+                                                  IOHandler *io_read,
+                                                  IOHandler *io_write,
+                                                  void *opaque)
+{
+    QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
+    aio_set_fd_handler(ctx, sioc->fd, false, io_read, io_write, NULL, opaque);
+}
+
 static GSource *qio_channel_socket_create_watch(QIOChannel *ioc,
                                                 GIOCondition condition)
 {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_socket_class_init(ObjectClass *klass,
     ioc_klass->io_set_cork = qio_channel_socket_set_cork;
     ioc_klass->io_set_delay = qio_channel_socket_set_delay;
     ioc_klass->io_create_watch = qio_channel_socket_create_watch;
+    ioc_klass->io_set_aio_fd_handler = qio_channel_socket_set_aio_fd_handler;
 }
 
 static const TypeInfo qio_channel_socket_info = {
diff --git a/io/channel-tls.c b/io/channel-tls.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel-tls.c
+++ b/io/channel-tls.c
@@ -XXX,XX +XXX,XX @@ static int qio_channel_tls_close(QIOChannel *ioc,
     return qio_channel_close(tioc->master, errp);
 }
 
+static void qio_channel_tls_set_aio_fd_handler(QIOChannel *ioc,
+                                               AioContext *ctx,
+                                               IOHandler *io_read,
+                                               IOHandler *io_write,
+                                               void *opaque)
+{
+    QIOChannelTLS *tioc = QIO_CHANNEL_TLS(ioc);
+
+    qio_channel_set_aio_fd_handler(tioc->master, ctx, io_read, io_write, opaque);
+}
+
 static GSource *qio_channel_tls_create_watch(QIOChannel *ioc,
                                              GIOCondition condition)
 {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_tls_class_init(ObjectClass *klass,
     ioc_klass->io_close = qio_channel_tls_close;
     ioc_klass->io_shutdown = qio_channel_tls_shutdown;
     ioc_klass->io_create_watch = qio_channel_tls_create_watch;
+    ioc_klass->io_set_aio_fd_handler = qio_channel_tls_set_aio_fd_handler;
 }
 
 static const TypeInfo qio_channel_tls_info = {
diff --git a/io/channel-watch.c b/io/channel-watch.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel-watch.c
+++ b/io/channel-watch.c
@@ -XXX,XX +XXX,XX @@ GSource *qio_channel_create_socket_watch(QIOChannel *ioc,
     GSource *source;
     QIOChannelSocketSource *ssource;
 
+#ifdef WIN32
+    WSAEventSelect(socket, ioc->event,
+                   FD_READ | FD_ACCEPT | FD_CLOSE |
+                   FD_CONNECT | FD_WRITE | FD_OOB);
+#endif
+
     source = g_source_new(&qio_channel_socket_source_funcs,
                           sizeof(QIOChannelSocketSource));
     ssource = (QIOChannelSocketSource *)source;
diff --git a/io/channel.c b/io/channel.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel.c
+++ b/io/channel.c
@@ -XXX,XX +XXX,XX @@ GSource *qio_channel_create_watch(QIOChannel *ioc,
 }
 
 
+void qio_channel_set_aio_fd_handler(QIOChannel *ioc,
+                                    AioContext *ctx,
+                                    IOHandler *io_read,
+                                    IOHandler *io_write,
+                                    void *opaque)
+{
+    QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
+
+    klass->io_set_aio_fd_handler(ioc, ctx, io_read, io_write, opaque);
+}
+
 guint qio_channel_add_watch(QIOChannel *ioc,
                             GIOCondition condition,
                             QIOChannelFunc func,
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Support separate coroutines for reading and writing, and place the
read/write handlers on the AioContext that the QIOChannel is registered
with.

Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213135235.12274-7-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/io/channel.h | 47 ++++++++++++++++++++++++++--
 io/channel.c         | 86 +++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 109 insertions(+), 24 deletions(-)

diff --git a/include/io/channel.h b/include/io/channel.h
index XXXXXXX..XXXXXXX 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu-common.h"
 #include "qom/object.h"
+#include "qemu/coroutine.h"
 #include "block/aio.h"
 
 #define TYPE_QIO_CHANNEL "qio-channel"
@@ -XXX,XX +XXX,XX @@ struct QIOChannel {
     Object parent;
     unsigned int features; /* bitmask of QIOChannelFeatures */
     char *name;
+    AioContext *ctx;
+    Coroutine *read_coroutine;
+    Coroutine *write_coroutine;
 #ifdef _WIN32
     HANDLE event; /* For use with GSource on Win32 */
 #endif
@@ -XXX,XX +XXX,XX @@ guint qio_channel_add_watch(QIOChannel *ioc,
 
 
 /**
+ * qio_channel_attach_aio_context:
+ * @ioc: the channel object
+ * @ctx: the #AioContext to set the handlers on
+ *
+ * Request that qio_channel_yield() sets I/O handlers on
+ * the given #AioContext.  If @ctx is %NULL, qio_channel_yield()
+ * uses QEMU's main thread event loop.
+ *
+ * You can move a #QIOChannel from one #AioContext to another even if
+ * I/O handlers are set for a coroutine.  However, #QIOChannel provides
+ * no synchronization between the calls to qio_channel_yield() and
+ * qio_channel_attach_aio_context().
+ *
+ * Therefore you should first call qio_channel_detach_aio_context()
+ * to ensure that the coroutine is not entered concurrently.  Then,
+ * while the coroutine has yielded, call qio_channel_attach_aio_context(),
+ * and then aio_co_schedule() to place the coroutine on the new
+ * #AioContext.  The calls to qio_channel_detach_aio_context()
+ * and qio_channel_attach_aio_context() should be protected with
+ * aio_context_acquire() and aio_context_release().
+ */
+void qio_channel_attach_aio_context(QIOChannel *ioc,
+                                    AioContext *ctx);
+
+/**
+ * qio_channel_detach_aio_context:
+ * @ioc: the channel object
+ *
+ * Disable any I/O handlers set by qio_channel_yield().  With the
+ * help of aio_co_schedule(), this allows moving a coroutine that was
+ * paused by qio_channel_yield() to another context.
+ */
+void qio_channel_detach_aio_context(QIOChannel *ioc);
+
+/**
  * qio_channel_yield:
  * @ioc: the channel object
  * @condition: the I/O condition to wait for
  *
- * Yields execution from the current coroutine until
- * the condition indicated by @condition becomes
- * available.
+ * Yields execution from the current coroutine until the condition
+ * indicated by @condition becomes available.  @condition must
+ * be either %G_IO_IN or %G_IO_OUT; it cannot contain both.  In
+ * addition, no two coroutines can be waiting on the same condition
+ * and channel at the same time.
  *
  * This must only be called from coroutine context
  */
diff --git a/io/channel.c b/io/channel.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel.c
+++ b/io/channel.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "io/channel.h"
 #include "qapi/error.h"
-#include "qemu/coroutine.h"
+#include "qemu/main-loop.h"
 
 bool qio_channel_has_feature(QIOChannel *ioc,
                              QIOChannelFeature feature)
@@ -XXX,XX +XXX,XX @@ off_t qio_channel_io_seek(QIOChannel *ioc,
 }
 
 
-typedef struct QIOChannelYieldData QIOChannelYieldData;
-struct QIOChannelYieldData {
-    QIOChannel *ioc;
-    Coroutine *co;
-};
+static void qio_channel_set_aio_fd_handlers(QIOChannel *ioc);
 
+static void qio_channel_restart_read(void *opaque)
+{
+    QIOChannel *ioc = opaque;
+    Coroutine *co = ioc->read_coroutine;
+
+    ioc->read_coroutine = NULL;
+    qio_channel_set_aio_fd_handlers(ioc);
+    aio_co_wake(co);
+}
 
-static gboolean qio_channel_yield_enter(QIOChannel *ioc,
-                                        GIOCondition condition,
-                                        gpointer opaque)
+static void qio_channel_restart_write(void *opaque)
 {
-    QIOChannelYieldData *data = opaque;
-    qemu_coroutine_enter(data->co);
-    return FALSE;
+    QIOChannel *ioc = opaque;
+    Coroutine *co = ioc->write_coroutine;
+
+    ioc->write_coroutine = NULL;
+    qio_channel_set_aio_fd_handlers(ioc);
+    aio_co_wake(co);
 }
 
+static void qio_channel_set_aio_fd_handlers(QIOChannel *ioc)
+{
+    IOHandler *rd_handler = NULL, *wr_handler = NULL;
+    AioContext *ctx;
+
+    if (ioc->read_coroutine) {
+        rd_handler = qio_channel_restart_read;
+    }
+    if (ioc->write_coroutine) {
+        wr_handler = qio_channel_restart_write;
+    }
+
+    ctx = ioc->ctx ? ioc->ctx : iohandler_get_aio_context();
+    qio_channel_set_aio_fd_handler(ioc, ctx, rd_handler, wr_handler, ioc);
+}
+
+void qio_channel_attach_aio_context(QIOChannel *ioc,
+                                    AioContext *ctx)
+{
+    AioContext *old_ctx;
+    if (ioc->ctx == ctx) {
+        return;
+    }
+
+    old_ctx = ioc->ctx ? ioc->ctx : iohandler_get_aio_context();
+    qio_channel_set_aio_fd_handler(ioc, old_ctx, NULL, NULL, NULL);
+    ioc->ctx = ctx;
+    qio_channel_set_aio_fd_handlers(ioc);
+}
+
+void qio_channel_detach_aio_context(QIOChannel *ioc)
+{
+    ioc->read_coroutine = NULL;
+    ioc->write_coroutine = NULL;
+    qio_channel_set_aio_fd_handlers(ioc);
+    ioc->ctx = NULL;
+}
 
 void coroutine_fn qio_channel_yield(QIOChannel *ioc,
                                     GIOCondition condition)
 {
-    QIOChannelYieldData data;
-
     assert(qemu_in_coroutine());
-    data.ioc = ioc;
-    data.co = qemu_coroutine_self();
-    qio_channel_add_watch(ioc,
-                          condition,
-                          qio_channel_yield_enter,
-                          &data,
-                          NULL);
+    if (condition == G_IO_IN) {
+        assert(!ioc->read_coroutine);
+        ioc->read_coroutine = qemu_coroutine_self();
+    } else if (condition == G_IO_OUT) {
+        assert(!ioc->write_coroutine);
+        ioc->write_coroutine = qemu_coroutine_self();
+    } else {
+        abort();
+    }
+    qio_channel_set_aio_fd_handlers(ioc);
     qemu_coroutine_yield();
 }
 
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

In the client, read the reply headers from a coroutine, switching the
read side between the "read header" coroutine and the I/O coroutine that
reads the body of the reply.

In the server, if the server can read more requests it will create a new
"read request" coroutine as soon as a request has been read.  Otherwise,
the new coroutine is created in nbd_request_put.

diff --git a/block/nbd-client.h b/block/nbd-client.h
index XXXXXXX..XXXXXXX 100644
--- a/block/nbd-client.h
+++ b/block/nbd-client.h
@@ -XXX,XX +XXX,XX @@ typedef struct NBDClientSession {
 
     CoMutex send_mutex;
     CoQueue free_sema;
-    Coroutine *send_coroutine;
+    Coroutine *read_reply_co;
     int in_flight;
 
     Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
diff --git a/block/nbd-client.c b/block/nbd-client.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -XXX,XX +XXX,XX @@
 #define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs))
 #define INDEX_TO_HANDLE(bs, index)  ((index)  ^ ((uint64_t)(intptr_t)bs))
 
-static void nbd_recv_coroutines_enter_all(NBDClientSession *s)
+static void nbd_recv_coroutines_enter_all(BlockDriverState *bs)
 {
+    NBDClientSession *s = nbd_get_client_session(bs);
     int i;
 
     for (i = 0; i < MAX_NBD_REQUESTS; i++) {
@@ -XXX,XX +XXX,XX @@ static void nbd_recv_coroutines_enter_all(NBDClientSession *s)
             qemu_coroutine_enter(s->recv_coroutine[i]);
         }
     }
+    BDRV_POLL_WHILE(bs, s->read_reply_co);
 }
 
 static void nbd_teardown_connection(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ static void nbd_teardown_connection(BlockDriverState *bs)
     qio_channel_shutdown(client->ioc,
                          QIO_CHANNEL_SHUTDOWN_BOTH,
                          NULL);
-    nbd_recv_coroutines_enter_all(client);
+    nbd_recv_coroutines_enter_all(bs);
 
     nbd_client_detach_aio_context(bs);
     object_unref(OBJECT(client->sioc));
@@ -XXX,XX +XXX,XX @@ static void nbd_teardown_connection(BlockDriverState *bs)
     client->ioc = NULL;
 }
 
-static void nbd_reply_ready(void *opaque)
+static coroutine_fn void nbd_read_reply_entry(void *opaque)
 {
-    BlockDriverState *bs = opaque;
-    NBDClientSession *s = nbd_get_client_session(bs);
+    NBDClientSession *s = opaque;
     uint64_t i;
     int ret;
 
-    if (!s->ioc) { /* Already closed */
-        return;
-    }
-
-    if (s->reply.handle == 0) {
-        /* No reply already in flight.  Fetch a header.  It is possible
-         * that another thread has done the same thing in parallel, so
-         * the socket is not readable anymore.
-         */
+    for (;;) {
+        assert(s->reply.handle == 0);
         ret = nbd_receive_reply(s->ioc, &s->reply);
-        if (ret == -EAGAIN) {
-            return;
-        }
         if (ret < 0) {
-            s->reply.handle = 0;
-            goto fail;
+            break;
         }
-    }
 
-    /* There's no need for a mutex on the receive side, because the
-     * handler acts as a synchronization point and ensures that only
-     * one coroutine is called until the reply finishes.  */
-    i = HANDLE_TO_INDEX(s, s->reply.handle);
-    if (i >= MAX_NBD_REQUESTS) {
-        goto fail;
-    }
+        /* There's no need for a mutex on the receive side, because the
+         * handler acts as a synchronization point and ensures that only
+         * one coroutine is called until the reply finishes.
+         */
+        i = HANDLE_TO_INDEX(s, s->reply.handle);
+        if (i >= MAX_NBD_REQUESTS || !s->recv_coroutine[i]) {
+            break;
+        }
 
-    if (s->recv_coroutine[i]) {
-        qemu_coroutine_enter(s->recv_coroutine[i]);
-        return;
+        /* We're woken up by the recv_coroutine itself.  Note that there
+         * is no race between yielding and reentering read_reply_co.  This
+         * is because:
+         *
+         * - if recv_coroutine[i] runs on the same AioContext, it is only
+         *   entered after we yield
+         *
+         * - if recv_coroutine[i] runs on a different AioContext, reentering
+         *   read_reply_co happens through a bottom half, which can only
+         *   run after we yield.
+         */
+        aio_co_wake(s->recv_coroutine[i]);
+        qemu_coroutine_yield();
     }
-
-fail:
-    nbd_teardown_connection(bs);
-}
-
-static void nbd_restart_write(void *opaque)
-{
-    BlockDriverState *bs = opaque;
-
-    qemu_coroutine_enter(nbd_get_client_session(bs)->send_coroutine);
+    s->read_reply_co = NULL;
 }
 
 static int nbd_co_send_request(BlockDriverState *bs,
@@ -XXX,XX +XXX,XX @@ static int nbd_co_send_request(BlockDriverState *bs,
                                QEMUIOVector *qiov)
 {
     NBDClientSession *s = nbd_get_client_session(bs);
-    AioContext *aio_context;
     int rc, ret, i;
 
     qemu_co_mutex_lock(&s->send_mutex);
@@ -XXX,XX +XXX,XX @@ static int nbd_co_send_request(BlockDriverState *bs,
         return -EPIPE;
     }
 
-    s->send_coroutine = qemu_coroutine_self();
-    aio_context = bdrv_get_aio_context(bs);
-
-    aio_set_fd_handler(aio_context, s->sioc->fd, false,
-                       nbd_reply_ready, nbd_restart_write, NULL, bs);
     if (qiov) {
         qio_channel_set_cork(s->ioc, true);
         rc = nbd_send_request(s->ioc, request);
@@ -XXX,XX +XXX,XX @@ static int nbd_co_send_request(BlockDriverState *bs,
     } else {
         rc = nbd_send_request(s->ioc, request);
     }
-    aio_set_fd_handler(aio_context, s->sioc->fd, false,
-                       nbd_reply_ready, NULL, NULL, bs);
-    s->send_coroutine = NULL;
     qemu_co_mutex_unlock(&s->send_mutex);
     return rc;
 }
@@ -XXX,XX +XXX,XX @@ static void nbd_co_receive_reply(NBDClientSession *s,
 {
     int ret;
 
-    /* Wait until we're woken up by the read handler.  TODO: perhaps
-     * peek at the next reply and avoid yielding if it's ours?  */
+    /* Wait until we're woken up by nbd_read_reply_entry.  */
     qemu_coroutine_yield();
     *reply = s->reply;
     if (reply->handle != request->handle ||
@@ -XXX,XX +XXX,XX @@ static void nbd_coroutine_start(NBDClientSession *s,
     /* s->recv_coroutine[i] is set as soon as we get the send_lock.  */
 }
 
-static void nbd_coroutine_end(NBDClientSession *s,
+static void nbd_coroutine_end(BlockDriverState *bs,
                               NBDRequest *request)
 {
+    NBDClientSession *s = nbd_get_client_session(bs);
     int i = HANDLE_TO_INDEX(s, request->handle);
+
     s->recv_coroutine[i] = NULL;
-    if (s->in_flight-- == MAX_NBD_REQUESTS) {
-        qemu_co_queue_next(&s->free_sema);
+    s->in_flight--;
+    qemu_co_queue_next(&s->free_sema);
+
+    /* Kick the read_reply_co to get the next reply.  */
+    if (s->read_reply_co) {
+        aio_co_wake(s->read_reply_co);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
     } else {
         nbd_co_receive_reply(client, &request, &reply, qiov);
     }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
     return -reply.error;
 }
 
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
     } else {
         nbd_co_receive_reply(client, &request, &reply, NULL);
     }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
     return -reply.error;
 }
 
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
     } else {
         nbd_co_receive_reply(client, &request, &reply, NULL);
     }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
     return -reply.error;
 }
 
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_flush(BlockDriverState *bs)
     } else {
         nbd_co_receive_reply(client, &request, &reply, NULL);
     }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
     return -reply.error;
 }
 
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count)
     } else {
         nbd_co_receive_reply(client, &request, &reply, NULL);
     }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
     return -reply.error;
 
 }
 
 void nbd_client_detach_aio_context(BlockDriverState *bs)
 {
-    aio_set_fd_handler(bdrv_get_aio_context(bs),
-                       nbd_get_client_session(bs)->sioc->fd,
-                       false, NULL, NULL, NULL, NULL);
+    NBDClientSession *client = nbd_get_client_session(bs);
+    qio_channel_detach_aio_context(QIO_CHANNEL(client->sioc));
 }
 
 void nbd_client_attach_aio_context(BlockDriverState *bs,
                                    AioContext *new_context)
 {
-    aio_set_fd_handler(new_context, nbd_get_client_session(bs)->sioc->fd,
-                       false, nbd_reply_ready, NULL, NULL, bs);
+    NBDClientSession *client = nbd_get_client_session(bs);
+    qio_channel_attach_aio_context(QIO_CHANNEL(client->sioc), new_context);
+    aio_co_schedule(new_context, client->read_reply_co);
 }
 
 void nbd_client_close(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ int nbd_client_init(BlockDriverState *bs,
     /* Now that we're connected, set the socket to be non-blocking and
      * kick the reply mechanism.  */
     qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL);
-
+    client->read_reply_co = qemu_coroutine_create(nbd_read_reply_entry, client);
     nbd_client_attach_aio_context(bs, bdrv_get_aio_context(bs));
 
     logout("Established connection with NBD server\n");
diff --git a/nbd/client.c b/nbd/client.c
index XXXXXXX..XXXXXXX 100644
--- a/nbd/client.c
+++ b/nbd/client.c
@@ -XXX,XX +XXX,XX @@ ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply)
     ssize_t ret;
 
     ret = read_sync(ioc, buf, sizeof(buf));
-    if (ret < 0) {
+    if (ret <= 0) {
         return ret;
     }
 
diff --git a/nbd/common.c b/nbd/common.c
index XXXXXXX..XXXXXXX 100644
--- a/nbd/common.c
+++ b/nbd/common.c
@@ -XXX,XX +XXX,XX @@ ssize_t nbd_wr_syncv(QIOChannel *ioc,
         }
         if (len == QIO_CHANNEL_ERR_BLOCK) {
             if (qemu_in_coroutine()) {
-                /* XXX figure out if we can create a variant on
-                 * qio_channel_yield() that works with AIO contexts
-                 * and consider using that in this branch */
-                qemu_coroutine_yield();
-            } else if (done) {
-                /* XXX this is needed by nbd_reply_ready.  */
-                qio_channel_wait(ioc,
-                                 do_read ? G_IO_IN : G_IO_OUT);
+                qio_channel_yield(ioc, do_read ? G_IO_IN : G_IO_OUT);
             } else {
                 return -EAGAIN;
             }
diff --git a/nbd/server.c b/nbd/server.c
index XXXXXXX..XXXXXXX 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -XXX,XX +XXX,XX @@ struct NBDClient {
     CoMutex send_lock;
     Coroutine *send_coroutine;
 
-    bool can_read;
-
     QTAILQ_ENTRY(NBDClient) next;
     int nb_requests;
     bool closing;
@@ -XXX,XX +XXX,XX @@ struct NBDClient {
 
 /* That's all folks */
 
-static void nbd_set_handlers(NBDClient *client);
-static void nbd_unset_handlers(NBDClient *client);
-static void nbd_update_can_read(NBDClient *client);
+static void nbd_client_receive_next_request(NBDClient *client);
 
 static gboolean nbd_negotiate_continue(QIOChannel *ioc,
                                        GIOCondition condition,
@@ -XXX,XX +XXX,XX @@ void nbd_client_put(NBDClient *client)
          */
         assert(client->closing);
 
-        nbd_unset_handlers(client);
+        qio_channel_detach_aio_context(client->ioc);
         object_unref(OBJECT(client->sioc));
         object_unref(OBJECT(client->ioc));
         if (client->tlscreds) {
@@ -XXX,XX +XXX,XX @@ static NBDRequestData *nbd_request_get(NBDClient *client)
 
     assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
     client->nb_requests++;
-    nbd_update_can_read(client);
 
     req = g_new0(NBDRequestData, 1);
     nbd_client_get(client);
@@ -XXX,XX +XXX,XX @@ static void nbd_request_put(NBDRequestData *req)
     g_free(req);
 
     client->nb_requests--;
-    nbd_update_can_read(client);
+    nbd_client_receive_next_request(client);
+
     nbd_client_put(client);
 }
 
@@ -XXX,XX +XXX,XX @@ static void blk_aio_attached(AioContext *ctx, void *opaque)
     exp->ctx = ctx;
 
     QTAILQ_FOREACH(client, &exp->clients, next) {
-        nbd_set_handlers(client);
+        qio_channel_attach_aio_context(client->ioc, ctx);
+        if (client->recv_coroutine) {
+            aio_co_schedule(ctx, client->recv_coroutine);
+        }
+        if (client->send_coroutine) {
+            aio_co_schedule(ctx, client->send_coroutine);
+        }
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void blk_aio_detach(void *opaque)
     TRACE("Export %s: Detaching clients from AIO context %p\n", exp->name, exp->ctx);
 
     QTAILQ_FOREACH(client, &exp->clients, next) {
-        nbd_unset_handlers(client);
+        qio_channel_detach_aio_context(client->ioc);
     }
 
     exp->ctx = NULL;
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply,
     g_assert(qemu_in_coroutine());
     qemu_co_mutex_lock(&client->send_lock);
     client->send_coroutine = qemu_coroutine_self();
-    nbd_set_handlers(client);
 
     if (!len) {
         rc = nbd_send_reply(client->ioc, reply);
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply,
     }
 
     client->send_coroutine = NULL;
-    nbd_set_handlers(client);
     qemu_co_mutex_unlock(&client->send_lock);
     return rc;
 }
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_receive_request(NBDRequestData *req,
     ssize_t rc;
 
     g_assert(qemu_in_coroutine());
-    client->recv_coroutine = qemu_coroutine_self();
-    nbd_update_can_read(client);
-
+    assert(client->recv_coroutine == qemu_coroutine_self());
     rc = nbd_receive_request(client->ioc, request);
     if (rc < 0) {
         if (rc != -EAGAIN) {
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_receive_request(NBDRequestData *req,
 
 out:
     client->recv_coroutine = NULL;
-    nbd_update_can_read(client);
+    nbd_client_receive_next_request(client);
 
     return rc;
 }
 
-static void nbd_trip(void *opaque)
+/* Owns a reference to the NBDClient passed as opaque.  */
+static coroutine_fn void nbd_trip(void *opaque)
 {
     NBDClient *client = opaque;
     NBDExport *exp = client->exp;
     NBDRequestData *req;
-    NBDRequest request;
+    NBDRequest request = { 0 };    /* GCC thinks it can be used uninitialized */
     NBDReply reply;
     ssize_t ret;
     int flags;
 
     TRACE("Reading request.");
     if (client->closing) {
+        nbd_client_put(client);
         return;
     }
 
@@ -XXX,XX +XXX,XX @@ static void nbd_trip(void *opaque)
 
 done:
     nbd_request_put(req);
+    nbd_client_put(client);
     return;
 
 out:
     nbd_request_put(req);
     client_close(client);
+    nbd_client_put(client);
 }
 
-static void nbd_read(void *opaque)
+static void nbd_client_receive_next_request(NBDClient *client)
 {
-    NBDClient *client = opaque;
-
-    if (client->recv_coroutine) {
-        qemu_coroutine_enter(client->recv_coroutine);
-    } else {
-        qemu_coroutine_enter(qemu_coroutine_create(nbd_trip, client));
-    }
-}
-
-static void nbd_restart_write(void *opaque)
-{
-    NBDClient *client = opaque;
-
-    qemu_coroutine_enter(client->send_coroutine);
-}
-
-static void nbd_set_handlers(NBDClient *client)
-{
-    if (client->exp && client->exp->ctx) {
-        aio_set_fd_handler(client->exp->ctx, client->sioc->fd, true,
-                           client->can_read ? nbd_read : NULL,
-                           client->send_coroutine ? nbd_restart_write : NULL,
-                           NULL, client);
-    }
-}
-
-static void nbd_unset_handlers(NBDClient *client)
-{
-    if (client->exp && client->exp->ctx) {
-        aio_set_fd_handler(client->exp->ctx, client->sioc->fd, true, NULL,
-                           NULL, NULL, NULL);
-    }
-}
-
-static void nbd_update_can_read(NBDClient *client)
-{
-    bool can_read = client->recv_coroutine ||
-                    client->nb_requests < MAX_NBD_REQUESTS;
-
-    if (can_read != client->can_read) {
-        client->can_read = can_read;
-        nbd_set_handlers(client);
-
-        /* There is no need to invoke aio_notify(), since aio_set_fd_handler()
-         * in nbd_set_handlers() will have taken care of that */
+    if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS) {
+        nbd_client_get(client);
+        client->recv_coroutine = qemu_coroutine_create(nbd_trip, client);
+        aio_co_schedule(client->exp->ctx, client->recv_coroutine);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void nbd_co_client_start(void *opaque)
         goto out;
     }
     qemu_co_mutex_init(&client->send_lock);
-    nbd_set_handlers(client);
 
     if (exp) {
         QTAILQ_INSERT_TAIL(&exp->clients, client, next);
     }
+
+    nbd_client_receive_next_request(client);
+
 out:
     g_free(data);
 }
@@ -XXX,XX +XXX,XX @@ void nbd_client_new(NBDExport *exp,
     object_ref(OBJECT(client->sioc));
     client->ioc = QIO_CHANNEL(sioc);
     object_ref(OBJECT(client->ioc));
-    client->can_read = true;
     client->close = close_fn;
 
     data->client = client;
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

As a small step towards the introduction of multiqueue, we want
coroutines to remain on the same AioContext that started them,
unless they are moved explicitly with e.g. aio_co_schedule.  This patch
prevents coroutines from switching AioContext when they use a CoMutex.
For now it does not make much of a difference, because the CoMutex
is not thread-safe and the AioContext itself is used to protect the
CoMutex from concurrent access.  However, this is going to change.

diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/coroutine.h"
 #include "qemu/coroutine_int.h"
 #include "qemu/queue.h"
+#include "block/aio.h"
 #include "trace.h"
 
 void qemu_co_queue_init(CoQueue *queue)
@@ -XXX,XX +XXX,XX @@ void qemu_co_queue_run_restart(Coroutine *co)
 
 static bool qemu_co_queue_do_restart(CoQueue *queue, bool single)
 {
-    Coroutine *self = qemu_coroutine_self();
     Coroutine *next;
 
     if (QSIMPLEQ_EMPTY(&queue->entries)) {
@@ -XXX,XX +XXX,XX @@ static bool qemu_co_queue_do_restart(CoQueue *queue, bool single)
 
     while ((next = QSIMPLEQ_FIRST(&queue->entries)) != NULL) {
         QSIMPLEQ_REMOVE_HEAD(&queue->entries, co_queue_next);
-        QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, next, co_queue_next);
-        trace_qemu_co_queue_next(next);
+        aio_co_wake(next);
         if (single) {
             break;
         }
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ qemu_coroutine_terminate(void *co) "self %p"
 
 # util/qemu-coroutine-lock.c
 qemu_co_queue_run_restart(void *co) "co %p"
-qemu_co_queue_next(void *nxt) "next %p"
 qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p"
 qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p"
 qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p"
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Keep the coroutine on the same AioContext.  Without this change,
there would be a race between yielding the coroutine and reentering it.
While the race cannot happen now, because the code only runs from a single
AioContext, this will change with multiqueue support in the block layer.

While making this change, replace the custom bottom half with
aio_co_schedule.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-10-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/blkdebug.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index XXXXXXX..XXXXXXX 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -XXX,XX +XXX,XX @@ out:
     return ret;
 }
 
-static void error_callback_bh(void *opaque)
-{
-    Coroutine *co = opaque;
-    qemu_coroutine_enter(co);
-}
-
 static int inject_error(BlockDriverState *bs, BlkdebugRule *rule)
 {
     BDRVBlkdebugState *s = bs->opaque;
@@ -XXX,XX +XXX,XX @@ static int inject_error(BlockDriverState *bs, BlkdebugRule *rule)
     }
 
     if (!immediately) {
-        aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), error_callback_bh,
-                                qemu_coroutine_self());
+        aio_co_schedule(qemu_get_current_aio_context(), qemu_coroutine_self());
         qemu_coroutine_yield();
     }
 
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

qed_aio_start_io and qed_aio_next_io will not have to acquire/release
the AioContext, while qed_aio_next_io_cb will.  Split the functionality
and gain a little type-safety in the process.

diff --git a/block/qed.c b/block/qed.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
     return l2_table;
 }
 
-static void qed_aio_next_io(void *opaque, int ret);
+static void qed_aio_next_io(QEDAIOCB *acb, int ret);
+
+static void qed_aio_start_io(QEDAIOCB *acb)
+{
+    qed_aio_next_io(acb, 0);
+}
+
+static void qed_aio_next_io_cb(void *opaque, int ret)
+{
+    QEDAIOCB *acb = opaque;
+
+    qed_aio_next_io(acb, ret);
+}
 
 static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
 {
@@ -XXX,XX +XXX,XX @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
 
     acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
     if (acb) {
-        qed_aio_next_io(acb, 0);
+        qed_aio_start_io(acb);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
         QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next);
         acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
         if (acb) {
-            qed_aio_next_io(acb, 0);
+            qed_aio_start_io(acb);
         } else if (s->header.features & QED_F_NEED_CHECK) {
             qed_start_need_check_timer(s);
         }
@@ -XXX,XX +XXX,XX @@ static void qed_commit_l2_update(void *opaque, int ret)
     acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
     assert(acb->request.l2_table != NULL);
 
-    qed_aio_next_io(opaque, ret);
+    qed_aio_next_io(acb, ret);
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
     if (need_alloc) {
         /* Write out the whole new L2 table */
         qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true,
-                            qed_aio_write_l1_update, acb);
+                           qed_aio_write_l1_update, acb);
     } else {
         /* Write out only the updated part of the L2 table */
         qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false,
-                            qed_aio_next_io, acb);
+                           qed_aio_next_io_cb, acb);
     }
     return;
 
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_main(void *opaque, int ret)
     }
 
     if (acb->find_cluster_ret == QED_CLUSTER_FOUND) {
-        next_fn = qed_aio_next_io;
+        next_fn = qed_aio_next_io_cb;
     } else {
         if (s->bs->backing) {
             next_fn = qed_aio_write_flush_before_l2_update;
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
     if (acb->flags & QED_AIOCB_ZERO) {
         /* Skip ahead if the clusters are already zero */
         if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
-            qed_aio_next_io(acb, 0);
+            qed_aio_start_io(acb);
             return;
         }
 
@@ -XXX,XX +XXX,XX @@ static void qed_aio_read_data(void *opaque, int ret,
     /* Handle zero cluster and backing file reads */
     if (ret == QED_CLUSTER_ZERO) {
         qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
-        qed_aio_next_io(acb, 0);
+        qed_aio_start_io(acb);
         return;
     } else if (ret != QED_CLUSTER_FOUND) {
         qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
-                              &acb->backing_qiov, qed_aio_next_io, acb);
+                              &acb->backing_qiov, qed_aio_next_io_cb, acb);
         return;
     }
 
     BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
     bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE,
                    &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
-                   qed_aio_next_io, acb);
+                   qed_aio_next_io_cb, acb);
     return;
 
 err:
@@ -XXX,XX +XXX,XX @@ err:
 /**
  * Begin next I/O or complete the request
  */
-static void qed_aio_next_io(void *opaque, int ret)
+static void qed_aio_next_io(QEDAIOCB *acb, int ret)
 {
-    QEDAIOCB *acb = opaque;
     BDRVQEDState *s = acb_to_s(acb);
     QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
                                 qed_aio_write_data : qed_aio_read_data;
@@ -XXX,XX +XXX,XX @@ static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
     qemu_iovec_init(&acb->cur_qiov, qiov->niov);
 
     /* Start request */
-    qed_aio_next_io(acb, 0);
+    qed_aio_start_io(acb);
     return &acb->common;
 }
 
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

The AioContext data structures are now protected by list_lock and/or
they are walked with FOREACH_RCU primitives.  There is no need anymore
to acquire the AioContext for the entire duration of aio_dispatch.
Instead, just acquire it right before invoking each callback and release
it right after.  The next step is then to push the acquire/release pairs
further down.

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
             (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
             aio_node_check(ctx, node->is_external) &&
             node->io_read) {
+            aio_context_acquire(ctx);
             node->io_read(node->opaque);
+            aio_context_release(ctx);
 
             /* aio_notify() does not count as progress */
             if (node->opaque != &ctx->notifier) {
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
             (revents & (G_IO_OUT | G_IO_ERR)) &&
             aio_node_check(ctx, node->is_external) &&
             node->io_write) {
+            aio_context_acquire(ctx);
             node->io_write(node->opaque);
+            aio_context_release(ctx);
             progress = true;
         }
 
@@ -XXX,XX +XXX,XX @@ bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
     }
 
     /* Run our timers */
+    aio_context_acquire(ctx);
     progress |= timerlistgroup_run_timers(&ctx->tlg);
+    aio_context_release(ctx);
 
     return progress;
 }
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
     int64_t timeout;
     int64_t start = 0;
 
-    aio_context_acquire(ctx);
-    progress = false;
-
     /* aio_notify can avoid the expensive event_notifier_set if
      * everything (file descriptors, bottom halves, timers) will
      * be re-evaluated before the next blocking poll().  This is
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     }
 
-    if (try_poll_mode(ctx, blocking)) {
-        progress = true;
-    } else {
+    aio_context_acquire(ctx);
+    progress = try_poll_mode(ctx, blocking);
+    aio_context_release(ctx);
+
+    if (!progress) {
         assert(npfd == 0);
 
         /* fill pollfds */
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         timeout = blocking ? aio_compute_timeout(ctx) : 0;
 
         /* wait until next event */
-        if (timeout) {
-            aio_context_release(ctx);
-        }
         if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
             AioHandler epoll_handler;
 
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         } else  {
             ret = qemu_poll_ns(pollfds, npfd, timeout);
         }
-        if (timeout) {
-            aio_context_acquire(ctx);
-        }
     }
 
     if (blocking) {
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         progress = true;
     }
 
-    aio_context_release(ctx);
-
     return progress;
 }
 
diff --git a/util/aio-win32.c b/util/aio-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
             (revents || event_notifier_get_handle(node->e) == event) &&
             node->io_notify) {
             node->pfd.revents = 0;
+            aio_context_acquire(ctx);
             node->io_notify(node->e);
+            aio_context_release(ctx);
 
             /* aio_notify() does not count as progress */
             if (node->e != &ctx->notifier) {
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
             (node->io_read || node->io_write)) {
             node->pfd.revents = 0;
             if ((revents & G_IO_IN) && node->io_read) {
+                aio_context_acquire(ctx);
                 node->io_read(node->opaque);
+                aio_context_release(ctx);
                 progress = true;
             }
             if ((revents & G_IO_OUT) && node->io_write) {
+                aio_context_acquire(ctx);
                 node->io_write(node->opaque);
+                aio_context_release(ctx);
                 progress = true;
             }
 
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
     int count;
     int timeout;
 
-    aio_context_acquire(ctx);
     progress = false;
 
     /* aio_notify can avoid the expensive event_notifier_set if
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
 
         timeout = blocking && !have_select_revents
             ? qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)) : 0;
-        if (timeout) {
-            aio_context_release(ctx);
-        }
         ret = WaitForMultipleObjects(count, events, FALSE, timeout);
         if (blocking) {
             assert(first);
             atomic_sub(&ctx->notify_me, 2);
         }
-        if (timeout) {
-            aio_context_acquire(ctx);
-        }
 
         if (first) {
             aio_notify_accept(ctx);
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         progress |= aio_dispatch_handlers(ctx, event);
     } while (count > 0);
 
+    aio_context_acquire(ctx);
     progress |= timerlistgroup_run_timers(&ctx->tlg);
-
     aio_context_release(ctx);
     return progress;
 }
diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
                 ret = 1;
             }
             bh->idle = 0;
+            aio_context_acquire(ctx);
             aio_bh_call(bh);
+            aio_context_release(ctx);
         }
         if (bh->deleted) {
             deleted = true;
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-13-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qed.h                 |  3 +++
 block/curl.c                |  2 ++
 block/io.c                  |  5 +++++
 block/iscsi.c               |  8 ++++++--
 block/null.c                |  4 ++++
 block/qed.c                 | 12 ++++++++++++
 block/throttle-groups.c     |  2 ++
 util/aio-posix.c            |  2 --
 util/aio-win32.c            |  2 --
 util/qemu-coroutine-sleep.c |  2 +-
 10 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/block/qed.h b/block/qed.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.h
+++ b/block/qed.h
@@ -XXX,XX +XXX,XX @@ enum {
  */
 typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len);
 
+void qed_acquire(BDRVQEDState *s);
+void qed_release(BDRVQEDState *s);
+
 /**
  * Generic callback for chaining async callbacks
  */
diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static void curl_multi_timeout_do(void *arg)
         return;
     }
 
+    aio_context_acquire(s->aio_context);
     curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
 
     curl_multi_check_completion(s);
+    aio_context_release(s->aio_context);
 #else
     abort();
 #endif
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_aio_cancel(BlockAIOCB *acb)
         if (acb->aiocb_info->get_aio_context) {
             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
         } else if (acb->bs) {
+            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
+             * assert that we're not using an I/O thread.  Thread-safe
+             * code should use bdrv_aio_cancel_async exclusively.
+             */
+            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
             aio_poll(bdrv_get_aio_context(acb->bs), true);
         } else {
             abort();
diff --git a/block/iscsi.c b/block/iscsi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -XXX,XX +XXX,XX @@ static void iscsi_retry_timer_expired(void *opaque)
     struct IscsiTask *iTask = opaque;
     iTask->complete = 1;
     if (iTask->co) {
-        qemu_coroutine_enter(iTask->co);
+        aio_co_wake(iTask->co);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void iscsi_nop_timed_event(void *opaque)
 {
     IscsiLun *iscsilun = opaque;
 
+    aio_context_acquire(iscsilun->aio_context);
     if (iscsi_get_nops_in_flight(iscsilun->iscsi) >= MAX_NOP_FAILURES) {
         error_report("iSCSI: NOP timeout. Reconnecting...");
         iscsilun->request_timed_out = true;
     } else if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) {
         error_report("iSCSI: failed to sent NOP-Out. Disabling NOP messages.");
-        return;
+        goto out;
     }
 
     timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL);
     iscsi_set_events(iscsilun);
+
+out:
+    aio_context_release(iscsilun->aio_context);
 }
 
 static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp)
diff --git a/block/null.c b/block/null.c
index XXXXXXX..XXXXXXX 100644
--- a/block/null.c
+++ b/block/null.c
@@ -XXX,XX +XXX,XX @@ static void null_bh_cb(void *opaque)
 static void null_timer_cb(void *opaque)
 {
     NullAIOCB *acb = opaque;
+    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
+
+    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, 0);
+    aio_context_release(ctx);
     timer_deinit(&acb->timer);
     qemu_aio_unref(acb);
 }
diff --git a/block/qed.c b/block/qed.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static void qed_need_check_timer_cb(void *opaque)
 
     trace_qed_need_check_timer_cb(s);
 
+    qed_acquire(s);
     qed_plug_allocating_write_reqs(s);
 
     /* Ensure writes are on disk before clearing flag */
     bdrv_aio_flush(s->bs->file->bs, qed_clear_need_check, s);
+    qed_release(s);
+}
+
+void qed_acquire(BDRVQEDState *s)
+{
+    aio_context_acquire(bdrv_get_aio_context(s->bs));
+}
+
+void qed_release(BDRVQEDState *s)
+{
+    aio_context_release(bdrv_get_aio_context(s->bs));
 }
 
 static void qed_start_need_check_timer(BDRVQEDState *s)
diff --git a/block/throttle-groups.c b/block/throttle-groups.c
index XXXXXXX..XXXXXXX 100644
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -XXX,XX +XXX,XX @@ static void timer_cb(BlockBackend *blk, bool is_write)
     qemu_mutex_unlock(&tg->lock);
 
     /* Run the request that was waiting for this timer */
+    aio_context_acquire(blk_get_aio_context(blk));
     empty_queue = !qemu_co_enter_next(&blkp->throttled_reqs[is_write]);
+    aio_context_release(blk_get_aio_context(blk));
 
     /* If the request queue was empty then we have to take care of
      * scheduling the next one */
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
     }
 
     /* Run our timers */
-    aio_context_acquire(ctx);
     progress |= timerlistgroup_run_timers(&ctx->tlg);
-    aio_context_release(ctx);
 
     return progress;
 }
diff --git a/util/aio-win32.c b/util/aio-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         progress |= aio_dispatch_handlers(ctx, event);
     } while (count > 0);
 
-    aio_context_acquire(ctx);
     progress |= timerlistgroup_run_timers(&ctx->tlg);
-    aio_context_release(ctx);
     return progress;
 }
 
diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-sleep.c
+++ b/util/qemu-coroutine-sleep.c
@@ -XXX,XX +XXX,XX @@ static void co_sleep_cb(void *opaque)
 {
     CoSleepCB *sleep_cb = opaque;
 
-    qemu_coroutine_enter(sleep_cb->co);
+    aio_co_wake(sleep_cb->co);
 }
 
 void coroutine_fn co_aio_sleep_ns(AioContext *ctx, QEMUClockType type,
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

This covers both file descriptor callbacks and polling callbacks,
since they execute related code.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-14-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/curl.c          | 16 +++++++++++++---
 block/iscsi.c         |  4 ++++
 block/linux-aio.c     |  4 ++++
 block/nfs.c           |  6 ++++++
 block/sheepdog.c      | 29 +++++++++++++++--------------
 block/ssh.c           | 29 +++++++++--------------------
 block/win32-aio.c     | 10 ++++++----
 hw/block/virtio-blk.c |  5 ++++-
 hw/scsi/virtio-scsi.c |  7 +++++++
 util/aio-posix.c      |  7 -------
 util/aio-win32.c      |  6 ------
 11 files changed, 68 insertions(+), 55 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s)
     }
 }
 
-static void curl_multi_do(void *arg)
+static void curl_multi_do_locked(CURLState *s)
 {
-    CURLState *s = (CURLState *)arg;
     CURLSocket *socket, *next_socket;
     int running;
     int r;
@@ -XXX,XX +XXX,XX @@ static void curl_multi_do(void *arg)
     }
 }
 
+static void curl_multi_do(void *arg)
+{
+    CURLState *s = (CURLState *)arg;
+
+    aio_context_acquire(s->s->aio_context);
+    curl_multi_do_locked(s);
+    aio_context_release(s->s->aio_context);
+}
+
 static void curl_multi_read(void *arg)
 {
     CURLState *s = (CURLState *)arg;
 
-    curl_multi_do(arg);
+    aio_context_acquire(s->s->aio_context);
+    curl_multi_do_locked(s);
     curl_multi_check_completion(s->s);
+    aio_context_release(s->s->aio_context);
 }
 
 static void curl_multi_timeout_do(void *arg)
diff --git a/block/iscsi.c b/block/iscsi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -XXX,XX +XXX,XX @@ iscsi_process_read(void *arg)
     IscsiLun *iscsilun = arg;
     struct iscsi_context *iscsi = iscsilun->iscsi;
 
+    aio_context_acquire(iscsilun->aio_context);
     iscsi_service(iscsi, POLLIN);
     iscsi_set_events(iscsilun);
+    aio_context_release(iscsilun->aio_context);
 }
 
 static void
@@ -XXX,XX +XXX,XX @@ iscsi_process_write(void *arg)
     IscsiLun *iscsilun = arg;
     struct iscsi_context *iscsi = iscsilun->iscsi;
 
+    aio_context_acquire(iscsilun->aio_context);
     iscsi_service(iscsi, POLLOUT);
     iscsi_set_events(iscsilun);
+    aio_context_release(iscsilun->aio_context);
 }
 
 static int64_t sector_lun2qemu(int64_t sector, IscsiLun *iscsilun)
diff --git a/block/linux-aio.c b/block/linux-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_completion_cb(EventNotifier *e)
     LinuxAioState *s = container_of(e, LinuxAioState, e);
 
     if (event_notifier_test_and_clear(&s->e)) {
+        aio_context_acquire(s->aio_context);
         qemu_laio_process_completions_and_submit(s);
+        aio_context_release(s->aio_context);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static bool qemu_laio_poll_cb(void *opaque)
         return false;
     }
 
+    aio_context_acquire(s->aio_context);
     qemu_laio_process_completions_and_submit(s);
+    aio_context_release(s->aio_context);
     return true;
 }
 
diff --git a/block/nfs.c b/block/nfs.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -XXX,XX +XXX,XX @@ static void nfs_set_events(NFSClient *client)
 static void nfs_process_read(void *arg)
 {
     NFSClient *client = arg;
+
+    aio_context_acquire(client->aio_context);
     nfs_service(client->context, POLLIN);
     nfs_set_events(client);
+    aio_context_release(client->aio_context);
 }
 
 static void nfs_process_write(void *arg)
 {
     NFSClient *client = arg;
+
+    aio_context_acquire(client->aio_context);
     nfs_service(client->context, POLLOUT);
     nfs_set_events(client);
+    aio_context_release(client->aio_context);
 }
 
 static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
diff --git a/block/sheepdog.c b/block/sheepdog.c
index XXXXXXX..XXXXXXX 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
     return ret;
 }
 
-static void restart_co_req(void *opaque)
-{
-    Coroutine *co = opaque;
-
-    qemu_coroutine_enter(co);
-}
-
 typedef struct SheepdogReqCo {
     int sockfd;
     BlockDriverState *bs;
@@ -XXX,XX +XXX,XX @@ typedef struct SheepdogReqCo {
     unsigned int *rlen;
     int ret;
     bool finished;
+    Coroutine *co;
 } SheepdogReqCo;
 
+static void restart_co_req(void *opaque)
+{
+    SheepdogReqCo *srco = opaque;
+
+    aio_co_wake(srco->co);
+}
+
 static coroutine_fn void do_co_req(void *opaque)
 {
     int ret;
-    Coroutine *co;
     SheepdogReqCo *srco = opaque;
     int sockfd = srco->sockfd;
     SheepdogReq *hdr = srco->hdr;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void do_co_req(void *opaque)
     unsigned int *wlen = srco->wlen;
     unsigned int *rlen = srco->rlen;
 
-    co = qemu_coroutine_self();
+    srco->co = qemu_coroutine_self();
     aio_set_fd_handler(srco->aio_context, sockfd, false,
-                       NULL, restart_co_req, NULL, co);
+                       NULL, restart_co_req, NULL, srco);
 
     ret = send_co_req(sockfd, hdr, data, wlen);
     if (ret < 0) {
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void do_co_req(void *opaque)
     }
 
     aio_set_fd_handler(srco->aio_context, sockfd, false,
-                       restart_co_req, NULL, NULL, co);
+                       restart_co_req, NULL, NULL, srco);
 
     ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
     if (ret != sizeof(*hdr)) {
@@ -XXX,XX +XXX,XX @@ out:
     aio_set_fd_handler(srco->aio_context, sockfd, false,
                        NULL, NULL, NULL, NULL);
 
+    srco->co = NULL;
     srco->ret = ret;
     srco->finished = true;
     if (srco->bs) {
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn aio_read_response(void *opaque)
          * We've finished all requests which belong to the AIOCB, so
          * we can switch back to sd_co_readv/writev now.
          */
-        qemu_coroutine_enter(acb->coroutine);
+        aio_co_wake(acb->coroutine);
     }
 
     return;
@@ -XXX,XX +XXX,XX @@ static void co_read_response(void *opaque)
         s->co_recv = qemu_coroutine_create(aio_read_response, opaque);
     }
 
-    qemu_coroutine_enter(s->co_recv);
+    aio_co_wake(s->co_recv);
 }
 
 static void co_write_request(void *opaque)
 {
     BDRVSheepdogState *s = opaque;
 
-    qemu_coroutine_enter(s->co_send);
+    aio_co_wake(s->co_send);
 }
 
 /*
diff --git a/block/ssh.c b/block/ssh.c
index XXXXXXX..XXXXXXX 100644
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -XXX,XX +XXX,XX @@ static void restart_coroutine(void *opaque)
 
     DPRINTF("co=%p", co);
 
-    qemu_coroutine_enter(co);
+    aio_co_wake(co);
 }
 
-static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs)
+/* A non-blocking call returned EAGAIN, so yield, ensuring the
+ * handlers are set up so that we'll be rescheduled when there is an
+ * interesting event on the socket.
+ */
+static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs)
 {
     int r;
     IOHandler *rd_handler = NULL, *wr_handler = NULL;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs)
 
     aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock,
                        false, rd_handler, wr_handler, NULL, co);
-}
-
-static coroutine_fn void clear_fd_handler(BDRVSSHState *s,
-                                          BlockDriverState *bs)
-{
-    DPRINTF("s->sock=%d", s->sock);
-    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock,
-                       false, NULL, NULL, NULL, NULL);
-}
-
-/* A non-blocking call returned EAGAIN, so yield, ensuring the
- * handlers are set up so that we'll be rescheduled when there is an
- * interesting event on the socket.
- */
-static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs)
-{
-    set_fd_handler(s, bs);
     qemu_coroutine_yield();
-    clear_fd_handler(s, bs);
+    DPRINTF("s->sock=%d - back", s->sock);
+    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, false,
+                       NULL, NULL, NULL, NULL);
 }
 
 /* SFTP has a function `libssh2_sftp_seek64' which seeks to a position
diff --git a/block/win32-aio.c b/block/win32-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/win32-aio.c
+++ b/block/win32-aio.c
@@ -XXX,XX +XXX,XX @@ struct QEMUWin32AIOState {
     HANDLE hIOCP;
     EventNotifier e;
     int count;
-    bool is_aio_context_attached;
+    AioContext *aio_ctx;
 };
 
 typedef struct QEMUWin32AIOCB {
@@ -XXX,XX +XXX,XX @@ static void win32_aio_process_completion(QEMUWin32AIOState *s,
     }
 
 
+    aio_context_acquire(s->aio_ctx);
     waiocb->common.cb(waiocb->common.opaque, ret);
+    aio_context_release(s->aio_ctx);
     qemu_aio_unref(waiocb);
 }
 
@@ -XXX,XX +XXX,XX @@ void win32_aio_detach_aio_context(QEMUWin32AIOState *aio,
                                   AioContext *old_context)
 {
     aio_set_event_notifier(old_context, &aio->e, false, NULL, NULL);
-    aio->is_aio_context_attached = false;
+    aio->aio_ctx = NULL;
 }
 
 void win32_aio_attach_aio_context(QEMUWin32AIOState *aio,
                                   AioContext *new_context)
 {
-    aio->is_aio_context_attached = true;
+    aio->aio_ctx = new_context;
     aio_set_event_notifier(new_context, &aio->e, false,
                            win32_aio_completion_cb, NULL);
 }
@@ -XXX,XX +XXX,XX @@ out_free_state:
 
 void win32_aio_cleanup(QEMUWin32AIOState *aio)
 {
-    assert(!aio->is_aio_context_attached);
+    assert(!aio->aio_ctx);
     CloseHandle(aio->hIOCP);
     event_notifier_cleanup(&aio->e);
     g_free(aio);
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_ioctl_complete(void *opaque, int status)
 {
     VirtIOBlockIoctlReq *ioctl_req = opaque;
     VirtIOBlockReq *req = ioctl_req->req;
-    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
     struct virtio_scsi_inhdr *scsi;
     struct sg_io_hdr *hdr;
 
@@ -XXX,XX +XXX,XX @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
     MultiReqBuffer mrb = {};
     bool progress = false;
 
+    aio_context_acquire(blk_get_aio_context(s->blk));
     blk_io_plug(s->blk);
 
     do {
@@ -XXX,XX +XXX,XX @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
     }
 
     blk_io_unplug(s->blk);
+    aio_context_release(blk_get_aio_context(s->blk));
     return progress;
 }
 
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq)
     VirtIOSCSIReq *req;
     bool progress = false;
 
+    virtio_scsi_acquire(s);
     while ((req = virtio_scsi_pop_req(s, vq))) {
         progress = true;
         virtio_scsi_handle_ctrl_req(s, req);
     }
+    virtio_scsi_release(s);
     return progress;
 }
 
@@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
 
     QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs);
 
+    virtio_scsi_acquire(s);
     do {
         virtio_queue_set_notification(vq, 0);
 
@@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
     QTAILQ_FOREACH_SAFE(req, &reqs, next, next) {
         virtio_scsi_handle_cmd_req_submit(s, req);
     }
+    virtio_scsi_release(s);
     return progress;
 }
 
@@ -XXX,XX +XXX,XX @@ out:
 
 bool virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq)
 {
+    virtio_scsi_acquire(s);
     if (s->events_dropped) {
         virtio_scsi_push_event(s, NULL, VIRTIO_SCSI_T_NO_EVENT, 0);
+        virtio_scsi_release(s);
         return true;
     }
+    virtio_scsi_release(s);
     return false;
 }
 
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
             (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
             aio_node_check(ctx, node->is_external) &&
             node->io_read) {
-            aio_context_acquire(ctx);
             node->io_read(node->opaque);
-            aio_context_release(ctx);
 
             /* aio_notify() does not count as progress */
             if (node->opaque != &ctx->notifier) {
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
             (revents & (G_IO_OUT | G_IO_ERR)) &&
             aio_node_check(ctx, node->is_external) &&
             node->io_write) {
-            aio_context_acquire(ctx);
             node->io_write(node->opaque);
-            aio_context_release(ctx);
             progress = true;
         }
 
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     }
 
-    aio_context_acquire(ctx);
     progress = try_poll_mode(ctx, blocking);
-    aio_context_release(ctx);
-
     if (!progress) {
         assert(npfd == 0);
 
diff --git a/util/aio-win32.c b/util/aio-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
             (revents || event_notifier_get_handle(node->e) == event) &&
             node->io_notify) {
             node->pfd.revents = 0;
-            aio_context_acquire(ctx);
             node->io_notify(node->e);
-            aio_context_release(ctx);
 
             /* aio_notify() does not count as progress */
             if (node->e != &ctx->notifier) {
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
             (node->io_read || node->io_write)) {
             node->pfd.revents = 0;
             if ((revents & G_IO_IN) && node->io_read) {
-                aio_context_acquire(ctx);
                 node->io_read(node->opaque);
-                aio_context_release(ctx);
                 progress = true;
             }
             if ((revents & G_IO_OUT) && node->io_write) {
-                aio_context_acquire(ctx);
                 node->io_write(node->opaque);
-                aio_context_release(ctx);
                 progress = true;
             }
 
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-15-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/archipelago.c   |  3 +++
 block/blkreplay.c     |  2 +-
 block/block-backend.c |  6 ++++++
 block/curl.c          | 26 ++++++++++++++++++--------
 block/gluster.c       |  9 +--------
 block/io.c            |  6 +++++-
 block/iscsi.c         |  6 +++++-
 block/linux-aio.c     | 15 +++++++++------
 block/nfs.c           |  3 ++-
 block/null.c          |  4 ++++
 block/qed.c           |  3 +++
 block/rbd.c           |  4 ++++
 dma-helpers.c         |  2 ++
 hw/block/virtio-blk.c |  2 ++
 hw/scsi/scsi-bus.c    |  2 ++
 util/async.c          |  4 ++--
 util/thread-pool.c    |  2 ++
 17 files changed, 71 insertions(+), 28 deletions(-)

diff --git a/block/archipelago.c b/block/archipelago.c
index XXXXXXX..XXXXXXX 100644
--- a/block/archipelago.c
+++ b/block/archipelago.c
@@ -XXX,XX +XXX,XX @@ static void qemu_archipelago_complete_aio(void *opaque)
 {
     AIORequestData *reqdata = (AIORequestData *) opaque;
     ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb;
+    AioContext *ctx = bdrv_get_aio_context(aio_cb->common.bs);
 
+    aio_context_acquire(ctx);
     aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret);
+    aio_context_release(ctx);
     aio_cb->status = 0;
 
     qemu_aio_unref(aio_cb);
diff --git a/block/blkreplay.c b/block/blkreplay.c
index XXXXXXX..XXXXXXX 100755
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -XXX,XX +XXX,XX @@ static int64_t blkreplay_getlength(BlockDriverState *bs)
 static void blkreplay_bh_cb(void *opaque)
 {
     Request *req = opaque;
-    qemu_coroutine_enter(req->co);
+    aio_co_wake(req->co);
     qemu_bh_delete(req->bh);
     g_free(req);
 }
diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
 static void error_callback_bh(void *opaque)
 {
     struct BlockBackendAIOCB *acb = opaque;
+    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
 
     bdrv_dec_in_flight(acb->common.bs);
+    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, acb->ret);
+    aio_context_release(ctx);
     qemu_aio_unref(acb);
 }
 
@@ -XXX,XX +XXX,XX @@ static void blk_aio_complete(BlkAioEmAIOCB *acb)
 static void blk_aio_complete_bh(void *opaque)
 {
     BlkAioEmAIOCB *acb = opaque;
+    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
 
     assert(acb->has_returned);
+    aio_context_acquire(ctx);
     blk_aio_complete(acb);
+    aio_context_release(ctx);
 }
 
 static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
 {
     CURLState *state;
     int running;
+    int ret = -EINPROGRESS;
 
     CURLAIOCB *acb = p;
-    BDRVCURLState *s = acb->common.bs->opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVCURLState *s = bs->opaque;
+    AioContext *ctx = bdrv_get_aio_context(bs);
 
     size_t start = acb->sector_num * BDRV_SECTOR_SIZE;
     size_t end;
 
+    aio_context_acquire(ctx);
+
     // In case we have the requested data already (e.g. read-ahead),
     // we can just call the callback and be done.
     switch (curl_find_buf(s, start, acb->nb_sectors * BDRV_SECTOR_SIZE, acb)) {
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
             qemu_aio_unref(acb);
             // fall through
         case FIND_RET_WAIT:
-            return;
+            goto out;
         default:
             break;
     }
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
     // No cache found, so let's start a new request
     state = curl_init_state(acb->common.bs, s);
     if (!state) {
-        acb->common.cb(acb->common.opaque, -EIO);
-        qemu_aio_unref(acb);
-        return;
+        ret = -EIO;
+        goto out;
     }
 
     acb->start = 0;
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
     state->orig_buf = g_try_malloc(state->buf_len);
     if (state->buf_len && state->orig_buf == NULL) {
         curl_clean_state(state);
-        acb->common.cb(acb->common.opaque, -ENOMEM);
-        qemu_aio_unref(acb);
-        return;
+        ret = -ENOMEM;
+        goto out;
     }
     state->acb[0] = acb;
 
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
 
     /* Tell curl it needs to kick things off */
     curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
+
+out:
+    if (ret != -EINPROGRESS) {
+        acb->common.cb(acb->common.opaque, ret);
+        qemu_aio_unref(acb);
+    }
+    aio_context_release(ctx);
 }
 
 static BlockAIOCB *curl_aio_readv(BlockDriverState *bs,
diff --git a/block/gluster.c b/block/gluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -XXX,XX +XXX,XX @@ static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf,
     return qemu_gluster_glfs_init(gconf, errp);
 }
 
-static void qemu_gluster_complete_aio(void *opaque)
-{
-    GlusterAIOCB *acb = (GlusterAIOCB *)opaque;
-
-    qemu_coroutine_enter(acb->coroutine);
-}
-
 /*
  * AIO callback routine called from GlusterFS thread.
  */
@@ -XXX,XX +XXX,XX @@ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
         acb->ret = -EIO; /* Partial read/write - fail it */
     }
 
-    aio_bh_schedule_oneshot(acb->aio_context, qemu_gluster_complete_aio, acb);
+    aio_co_schedule(acb->aio_context, acb->coroutine);
 }
 
 static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags)
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
     bdrv_dec_in_flight(bs);
     bdrv_drained_begin(bs);
     data->done = true;
-    qemu_coroutine_enter(co);
+    aio_co_wake(co);
 }
 
 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
 static void bdrv_co_em_bh(void *opaque)
 {
     BlockAIOCBCoroutine *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    AioContext *ctx = bdrv_get_aio_context(bs);
 
     assert(!acb->need_bh);
+    aio_context_acquire(ctx);
     bdrv_co_complete(acb);
+    aio_context_release(ctx);
 }
 
 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
diff --git a/block/iscsi.c b/block/iscsi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -XXX,XX +XXX,XX @@ static void
 iscsi_bh_cb(void *p)
 {
     IscsiAIOCB *acb = p;
+    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
 
     qemu_bh_delete(acb->bh);
 
     g_free(acb->buf);
     acb->buf = NULL;
 
+    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, acb->status);
+    aio_context_release(ctx);
 
     if (acb->task != NULL) {
         scsi_free_scsi_task(acb->task);
@@ -XXX,XX +XXX,XX @@ iscsi_schedule_bh(IscsiAIOCB *acb)
 static void iscsi_co_generic_bh_cb(void *opaque)
 {
     struct IscsiTask *iTask = opaque;
+
     iTask->complete = 1;
-    qemu_coroutine_enter(iTask->co);
+    aio_co_wake(iTask->co);
 }
 
 static void iscsi_retry_timer_expired(void *opaque)
diff --git a/block/linux-aio.c b/block/linux-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -XXX,XX +XXX,XX @@ struct LinuxAioState {
     io_context_t ctx;
     EventNotifier e;
 
-    /* io queue for submit at batch */
+    /* I/O queue for batched submission.  Protected by AioContext lock. */
     LaioQueue io_q;
 
-    /* I/O completion processing */
+    /* I/O completion processing.  Only runs in I/O thread.  */
     QEMUBH *completion_bh;
     int event_idx;
     int event_max;
@@ -XXX,XX +XXX,XX @@ static inline ssize_t io_event_ret(struct io_event *ev)
  */
 static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
 {
+    LinuxAioState *s = laiocb->ctx;
     int ret;
 
     ret = laiocb->ret;
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
     }
 
     laiocb->ret = ret;
+    aio_context_acquire(s->aio_context);
     if (laiocb->co) {
         /* If the coroutine is already entered it must be in ioq_submit() and
          * will notice laio->ret has been filled in when it eventually runs
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
         laiocb->common.cb(laiocb->common.opaque, ret);
         qemu_aio_unref(laiocb);
     }
+    aio_context_release(s->aio_context);
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completions(LinuxAioState *s)
 static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
 {
     qemu_laio_process_completions(s);
+
+    aio_context_acquire(s->aio_context);
     if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
         ioq_submit(s);
     }
+    aio_context_release(s->aio_context);
 }
 
 static void qemu_laio_completion_bh(void *opaque)
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_completion_cb(EventNotifier *e)
     LinuxAioState *s = container_of(e, LinuxAioState, e);
 
     if (event_notifier_test_and_clear(&s->e)) {
-        aio_context_acquire(s->aio_context);
         qemu_laio_process_completions_and_submit(s);
-        aio_context_release(s->aio_context);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static bool qemu_laio_poll_cb(void *opaque)
         return false;
     }
 
-    aio_context_acquire(s->aio_context);
     qemu_laio_process_completions_and_submit(s);
-    aio_context_release(s->aio_context);
     return true;
 }
 
@@ -XXX,XX +XXX,XX @@ void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
 {
     aio_set_event_notifier(old_context, &s->e, false, NULL, NULL);
     qemu_bh_delete(s->completion_bh);
+    s->aio_context = NULL;
 }
 
 void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
diff --git a/block/nfs.c b/block/nfs.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -XXX,XX +XXX,XX @@ static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
 static void nfs_co_generic_bh_cb(void *opaque)
 {
     NFSRPC *task = opaque;
+
     task->complete = 1;
-    qemu_coroutine_enter(task->co);
+    aio_co_wake(task->co);
 }
 
 static void
diff --git a/block/null.c b/block/null.c
index XXXXXXX..XXXXXXX 100644
--- a/block/null.c
+++ b/block/null.c
@@ -XXX,XX +XXX,XX @@ static const AIOCBInfo null_aiocb_info = {
 static void null_bh_cb(void *opaque)
 {
     NullAIOCB *acb = opaque;
+    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
+
+    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, 0);
+    aio_context_release(ctx);
     qemu_aio_unref(acb);
 }
 
diff --git a/block/qed.c b/block/qed.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
 static void qed_aio_complete_bh(void *opaque)
 {
     QEDAIOCB *acb = opaque;
+    BDRVQEDState *s = acb_to_s(acb);
     BlockCompletionFunc *cb = acb->common.cb;
     void *user_opaque = acb->common.opaque;
     int ret = acb->bh_ret;
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete_bh(void *opaque)
     qemu_aio_unref(acb);
 
     /* Invoke callback */
+    qed_acquire(s);
     cb(user_opaque, ret);
+    qed_release(s);
 }
 
 static void qed_aio_complete(QEDAIOCB *acb, int ret)
diff --git a/block/rbd.c b/block/rbd.c
index XXXXXXX..XXXXXXX 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -XXX,XX +XXX,XX @@ shutdown:
 static void qemu_rbd_complete_aio(RADOSCB *rcb)
 {
     RBDAIOCB *acb = rcb->acb;
+    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
     int64_t r;
 
     r = rcb->ret;
@@ -XXX,XX +XXX,XX @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
     }
     qemu_vfree(acb->bounce);
+
+    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
+    aio_context_release(ctx);
 
     qemu_aio_unref(acb);
 }
diff --git a/dma-helpers.c b/dma-helpers.c
index XXXXXXX..XXXXXXX 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -XXX,XX +XXX,XX @@ static void dma_blk_cb(void *opaque, int ret)
                                 QEMU_ALIGN_DOWN(dbs->iov.size, dbs->align));
     }
 
+    aio_context_acquire(dbs->ctx);
     dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
                             dma_blk_cb, dbs, dbs->io_func_opaque);
+    aio_context_release(dbs->ctx);
     assert(dbs->acb);
 }
 
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_dma_restart_bh(void *opaque)
 
     s->rq = NULL;
 
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
     while (req) {
         VirtIOBlockReq *next = req->next;
         if (virtio_blk_handle_request(req, &mrb)) {
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_dma_restart_bh(void *opaque)
     if (mrb.num_reqs) {
         virtio_blk_submit_multireq(s->blk, &mrb);
     }
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
 }
 
 static void virtio_blk_dma_restart_cb(void *opaque, int running,
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -XXX,XX +XXX,XX @@ static void scsi_dma_restart_bh(void *opaque)
     qemu_bh_delete(s->bh);
     s->bh = NULL;
 
+    aio_context_acquire(blk_get_aio_context(s->conf.blk));
     QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) {
         scsi_req_ref(req);
         if (req->retry) {
@@ -XXX,XX +XXX,XX @@ static void scsi_dma_restart_bh(void *opaque)
         }
         scsi_req_unref(req);
     }
+    aio_context_release(blk_get_aio_context(s->conf.blk));
 }
 
 void scsi_req_retry(SCSIRequest *req)
diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
                 ret = 1;
             }
             bh->idle = 0;
-            aio_context_acquire(ctx);
             aio_bh_call(bh);
-            aio_context_release(ctx);
         }
         if (bh->deleted) {
             deleted = true;
@@ -XXX,XX +XXX,XX @@ static void co_schedule_bh_cb(void *opaque)
         Coroutine *co = QSLIST_FIRST(&straight);
         QSLIST_REMOVE_HEAD(&straight, co_scheduled_next);
         trace_aio_co_schedule_bh_cb(ctx, co);
+        aio_context_acquire(ctx);
         qemu_coroutine_enter(co);
+        aio_context_release(ctx);
     }
 }
 
diff --git a/util/thread-pool.c b/util/thread-pool.c
index XXXXXXX..XXXXXXX 100644
--- a/util/thread-pool.c
+++ b/util/thread-pool.c
@@ -XXX,XX +XXX,XX @@ static void thread_pool_completion_bh(void *opaque)
     ThreadPool *pool = opaque;
     ThreadPoolElement *elem, *next;
 
+    aio_context_acquire(pool->ctx);
 restart:
     QLIST_FOREACH_SAFE(elem, &pool->head, all, next) {
         if (elem->state != THREAD_DONE) {
@@ -XXX,XX +XXX,XX @@ restart:
             qemu_aio_unref(elem);
         }
     }
+    aio_context_release(pool->ctx);
 }
 
 static void thread_pool_cancel(BlockAIOCB *acb)
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-16-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/archipelago.c    |  3 ---
 block/block-backend.c  |  7 -------
 block/curl.c           |  2 +-
 block/io.c             |  6 +-----
 block/iscsi.c          |  3 ---
 block/linux-aio.c      |  5 +----
 block/mirror.c         | 12 +++++++++---
 block/null.c           |  8 --------
 block/qed-cluster.c    |  2 ++
 block/qed-table.c      | 12 ++++++++++--
 block/qed.c            |  4 ++--
 block/rbd.c            |  4 ----
 block/win32-aio.c      |  3 ---
 hw/block/virtio-blk.c  | 12 +++++++++++-
 hw/scsi/scsi-disk.c    | 15 +++++++++++++++
 hw/scsi/scsi-generic.c | 20 +++++++++++++++++---
 util/thread-pool.c     |  4 +++-
 17 files changed, 72 insertions(+), 50 deletions(-)

diff --git a/block/archipelago.c b/block/archipelago.c
index XXXXXXX..XXXXXXX 100644
--- a/block/archipelago.c
+++ b/block/archipelago.c
@@ -XXX,XX +XXX,XX @@ static void qemu_archipelago_complete_aio(void *opaque)
 {
     AIORequestData *reqdata = (AIORequestData *) opaque;
     ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb;
-    AioContext *ctx = bdrv_get_aio_context(aio_cb->common.bs);
 
-    aio_context_acquire(ctx);
     aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret);
-    aio_context_release(ctx);
     aio_cb->status = 0;
 
     qemu_aio_unref(aio_cb);
diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
 static void error_callback_bh(void *opaque)
 {
     struct BlockBackendAIOCB *acb = opaque;
-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
 
     bdrv_dec_in_flight(acb->common.bs);
-    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, acb->ret);
-    aio_context_release(ctx);
     qemu_aio_unref(acb);
 }
 
@@ -XXX,XX +XXX,XX @@ static void blk_aio_complete(BlkAioEmAIOCB *acb)
 static void blk_aio_complete_bh(void *opaque)
 {
     BlkAioEmAIOCB *acb = opaque;
-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
-
     assert(acb->has_returned);
-    aio_context_acquire(ctx);
     blk_aio_complete(acb);
-    aio_context_release(ctx);
 }
 
 static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
     curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
 
 out:
+    aio_context_release(ctx);
     if (ret != -EINPROGRESS) {
         acb->common.cb(acb->common.opaque, ret);
         qemu_aio_unref(acb);
     }
-    aio_context_release(ctx);
 }
 
 static BlockAIOCB *curl_aio_readv(BlockDriverState *bs,
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_io_em_complete(void *opaque, int ret)
     CoroutineIOCompletion *co = opaque;
 
     co->ret = ret;
-    qemu_coroutine_enter(co->coroutine);
+    aio_co_wake(co->coroutine);
 }
 
 static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
 static void bdrv_co_em_bh(void *opaque)
 {
     BlockAIOCBCoroutine *acb = opaque;
-    BlockDriverState *bs = acb->common.bs;
-    AioContext *ctx = bdrv_get_aio_context(bs);
 
     assert(!acb->need_bh);
-    aio_context_acquire(ctx);
     bdrv_co_complete(acb);
-    aio_context_release(ctx);
 }
 
 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
diff --git a/block/iscsi.c b/block/iscsi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -XXX,XX +XXX,XX @@ static void
 iscsi_bh_cb(void *p)
 {
     IscsiAIOCB *acb = p;
-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
 
     qemu_bh_delete(acb->bh);
 
     g_free(acb->buf);
     acb->buf = NULL;
 
-    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, acb->status);
-    aio_context_release(ctx);
 
     if (acb->task != NULL) {
         scsi_free_scsi_task(acb->task);
diff --git a/block/linux-aio.c b/block/linux-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -XXX,XX +XXX,XX @@ static inline ssize_t io_event_ret(struct io_event *ev)
  */
 static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
 {
-    LinuxAioState *s = laiocb->ctx;
     int ret;
 
     ret = laiocb->ret;
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
     }
 
     laiocb->ret = ret;
-    aio_context_acquire(s->aio_context);
     if (laiocb->co) {
         /* If the coroutine is already entered it must be in ioq_submit() and
          * will notice laio->ret has been filled in when it eventually runs
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
          * that!
          */
         if (!qemu_coroutine_entered(laiocb->co)) {
-            qemu_coroutine_enter(laiocb->co);
+            aio_co_wake(laiocb->co);
         }
     } else {
         laiocb->common.cb(laiocb->common.opaque, ret);
         qemu_aio_unref(laiocb);
     }
-    aio_context_release(s->aio_context);
 }
 
 /**
diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ static void mirror_write_complete(void *opaque, int ret)
 {
     MirrorOp *op = opaque;
     MirrorBlockJob *s = op->s;
+
+    aio_context_acquire(blk_get_aio_context(s->common.blk));
     if (ret < 0) {
         BlockErrorAction action;
 
@@ -XXX,XX +XXX,XX @@ static void mirror_write_complete(void *opaque, int ret)
         }
     }
     mirror_iteration_done(op, ret);
+    aio_context_release(blk_get_aio_context(s->common.blk));
 }
 
 static void mirror_read_complete(void *opaque, int ret)
 {
     MirrorOp *op = opaque;
     MirrorBlockJob *s = op->s;
+
+    aio_context_acquire(blk_get_aio_context(s->common.blk));
     if (ret < 0) {
         BlockErrorAction action;
 
@@ -XXX,XX +XXX,XX @@ static void mirror_read_complete(void *opaque, int ret)
         }
 
         mirror_iteration_done(op, ret);
-        return;
+    } else {
+        blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
+                        0, mirror_write_complete, op);
     }
-    blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
-                    0, mirror_write_complete, op);
+    aio_context_release(blk_get_aio_context(s->common.blk));
 }
 
 static inline void mirror_clip_sectors(MirrorBlockJob *s,
diff --git a/block/null.c b/block/null.c
index XXXXXXX..XXXXXXX 100644
--- a/block/null.c
+++ b/block/null.c
@@ -XXX,XX +XXX,XX @@ static const AIOCBInfo null_aiocb_info = {
 static void null_bh_cb(void *opaque)
 {
     NullAIOCB *acb = opaque;
-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
-
-    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, 0);
-    aio_context_release(ctx);
     qemu_aio_unref(acb);
 }
 
 static void null_timer_cb(void *opaque)
 {
     NullAIOCB *acb = opaque;
-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
-
-    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, 0);
-    aio_context_release(ctx);
     timer_deinit(&acb->timer);
     qemu_aio_unref(acb);
 }
diff --git a/block/qed-cluster.c b/block/qed-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed-cluster.c
+++ b/block/qed-cluster.c
@@ -XXX,XX +XXX,XX @@ static void qed_find_cluster_cb(void *opaque, int ret)
     unsigned int index;
     unsigned int n;
 
+    qed_acquire(s);
     if (ret) {
         goto out;
     }
@@ -XXX,XX +XXX,XX @@ static void qed_find_cluster_cb(void *opaque, int ret)
 
 out:
     find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len);
+    qed_release(s);
     g_free(find_cluster_cb);
 }
 
diff --git a/block/qed-table.c b/block/qed-table.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -XXX,XX +XXX,XX @@ static void qed_read_table_cb(void *opaque, int ret)
 {
     QEDReadTableCB *read_table_cb = opaque;
     QEDTable *table = read_table_cb->table;
+    BDRVQEDState *s = read_table_cb->s;
     int noffsets = read_table_cb->qiov.size / sizeof(uint64_t);
     int i;
 
@@ -XXX,XX +XXX,XX @@ static void qed_read_table_cb(void *opaque, int ret)
     }
 
     /* Byteswap offsets */
+    qed_acquire(s);
     for (i = 0; i < noffsets; i++) {
         table->offsets[i] = le64_to_cpu(table->offsets[i]);
     }
+    qed_release(s);
 
 out:
     /* Completion */
-    trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret);
+    trace_qed_read_table_cb(s, read_table_cb->table, ret);
     gencb_complete(&read_table_cb->gencb, ret);
 }
 
@@ -XXX,XX +XXX,XX @@ typedef struct {
 static void qed_write_table_cb(void *opaque, int ret)
 {
     QEDWriteTableCB *write_table_cb = opaque;
+    BDRVQEDState *s = write_table_cb->s;
 
-    trace_qed_write_table_cb(write_table_cb->s,
+    trace_qed_write_table_cb(s,
                              write_table_cb->orig_table,
                              write_table_cb->flush,
                              ret);
@@ -XXX,XX +XXX,XX @@ static void qed_write_table_cb(void *opaque, int ret)
     if (write_table_cb->flush) {
         /* We still need to flush first */
         write_table_cb->flush = false;
+        qed_acquire(s);
         bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb,
                        write_table_cb);
+        qed_release(s);
         return;
     }
 
@@ -XXX,XX +XXX,XX @@ static void qed_read_l2_table_cb(void *opaque, int ret)
     CachedL2Table *l2_table = request->l2_table;
     uint64_t l2_offset = read_l2_table_cb->l2_offset;
 
+    qed_acquire(s);
     if (ret) {
         /* can't trust loaded L2 table anymore */
         qed_unref_l2_cache_entry(l2_table);
@@ -XXX,XX +XXX,XX @@ static void qed_read_l2_table_cb(void *opaque, int ret)
         request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
         assert(request->l2_table != NULL);
     }
+    qed_release(s);
 
     gencb_complete(&read_l2_table_cb->gencb, ret);
 }
diff --git a/block/qed.c b/block/qed.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t l
     }
 
     if (cb->co) {
-        qemu_coroutine_enter(cb->co);
+        aio_co_wake(cb->co);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret)
     cb->done = true;
     cb->ret = ret;
     if (cb->co) {
-        qemu_coroutine_enter(cb->co);
+        aio_co_wake(cb->co);
     }
 }
 
diff --git a/block/rbd.c b/block/rbd.c
index XXXXXXX..XXXXXXX 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -XXX,XX +XXX,XX @@ shutdown:
 static void qemu_rbd_complete_aio(RADOSCB *rcb)
 {
     RBDAIOCB *acb = rcb->acb;
-    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
     int64_t r;
 
     r = rcb->ret;
@@ -XXX,XX +XXX,XX @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
     }
     qemu_vfree(acb->bounce);
-
-    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
-    aio_context_release(ctx);
 
     qemu_aio_unref(acb);
 }
diff --git a/block/win32-aio.c b/block/win32-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/win32-aio.c
+++ b/block/win32-aio.c
@@ -XXX,XX +XXX,XX @@ static void win32_aio_process_completion(QEMUWin32AIOState *s,
         qemu_vfree(waiocb->buf);
     }
 
-
-    aio_context_acquire(s->aio_ctx);
     waiocb->common.cb(waiocb->common.opaque, ret);
-    aio_context_release(s->aio_ctx);
     qemu_aio_unref(waiocb);
 }
 
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
 static void virtio_blk_rw_complete(void *opaque, int ret)
 {
     VirtIOBlockReq *next = opaque;
+    VirtIOBlock *s = next->dev;
 
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
     while (next) {
         VirtIOBlockReq *req = next;
         next = req->mr_next;
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_rw_complete(void *opaque, int ret)
         block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
         virtio_blk_free_request(req);
     }
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
 }
 
 static void virtio_blk_flush_complete(void *opaque, int ret)
 {
     VirtIOBlockReq *req = opaque;
+    VirtIOBlock *s = req->dev;
 
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
     if (ret) {
         if (virtio_blk_handle_rw_error(req, -ret, 0)) {
-            return;
+            goto out;
         }
     }
 
     virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
     block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
     virtio_blk_free_request(req);
+
+out:
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
 }
 
 #ifdef __linux__
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_ioctl_complete(void *opaque, int status)
     virtio_stl_p(vdev, &scsi->data_len, hdr->dxfer_len);
 
 out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
     virtio_blk_req_complete(req, status);
     virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
     g_free(ioctl_req);
 }
 
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -XXX,XX +XXX,XX @@ static void scsi_aio_complete(void *opaque, int ret)
 
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
+    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
     if (scsi_disk_req_check_error(r, ret, true)) {
         goto done;
     }
@@ -XXX,XX +XXX,XX @@ static void scsi_aio_complete(void *opaque, int ret)
     scsi_req_complete(&r->req, GOOD);
 
 done:
+    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
     scsi_req_unref(&r->req);
 }
 
@@ -XXX,XX +XXX,XX @@ static void scsi_dma_complete(void *opaque, int ret)
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
 
+    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
     if (ret < 0) {
         block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
     } else {
         block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
     }
     scsi_dma_complete_noio(r, ret);
+    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
 }
 
 static void scsi_read_complete(void * opaque, int ret)
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
 
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
+    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
     if (scsi_disk_req_check_error(r, ret, true)) {
         goto done;
     }
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
 
 done:
     scsi_req_unref(&r->req);
+    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
 }
 
 /* Actually issue a read to the block device.  */
@@ -XXX,XX +XXX,XX @@ static void scsi_do_read_cb(void *opaque, int ret)
     assert (r->req.aiocb != NULL);
     r->req.aiocb = NULL;
 
+    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
     if (ret < 0) {
         block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
     } else {
         block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
     }
     scsi_do_read(opaque, ret);
+    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
 }
 
 /* Read more data from scsi device into buffer.  */
@@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret)
     assert (r->req.aiocb != NULL);
     r->req.aiocb = NULL;
 
+    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
     if (ret < 0) {
         block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
     } else {
         block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
     }
     scsi_write_complete_noio(r, ret);
+    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
 }
 
 static void scsi_write_data(SCSIRequest *req)
@@ -XXX,XX +XXX,XX @@ static void scsi_unmap_complete(void *opaque, int ret)
 {
     UnmapCBData *data = opaque;
     SCSIDiskReq *r = data->r;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
 
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
 
+    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
     scsi_unmap_complete_noio(data, ret);
+    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
 }
 
 static void scsi_disk_emulate_unmap(SCSIDiskReq *r, uint8_t *inbuf)
@@ -XXX,XX +XXX,XX @@ static void scsi_write_same_complete(void *opaque, int ret)
 
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
+    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
     if (scsi_disk_req_check_error(r, ret, true)) {
         goto done;
     }
@@ -XXX,XX +XXX,XX @@ done:
     scsi_req_unref(&r->req);
     qemu_vfree(data->iov.iov_base);
     g_free(data);
+    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
 }
 
 static void scsi_disk_emulate_write_same(SCSIDiskReq *r, uint8_t *inbuf)
diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -XXX,XX +XXX,XX @@ done:
 static void scsi_command_complete(void *opaque, int ret)
 {
     SCSIGenericReq *r = (SCSIGenericReq *)opaque;
+    SCSIDevice *s = r->req.dev;
 
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
+
+    aio_context_acquire(blk_get_aio_context(s->conf.blk));
     scsi_command_complete_noio(r, ret);
+    aio_context_release(blk_get_aio_context(s->conf.blk));
 }
 
 static int execute_command(BlockBackend *blk,
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
 
+    aio_context_acquire(blk_get_aio_context(s->conf.blk));
+
     if (ret || r->req.io_canceled) {
         scsi_command_complete_noio(r, ret);
-        return;
+        goto done;
     }
 
     len = r->io_header.dxfer_len - r->io_header.resid;
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
     r->len = -1;
     if (len == 0) {
         scsi_command_complete_noio(r, 0);
-        return;
+        goto done;
     }
 
     /* Snoop READ CAPACITY output to set the blocksize.  */
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
     }
     scsi_req_data(&r->req, len);
     scsi_req_unref(&r->req);
+
+done:
+    aio_context_release(blk_get_aio_context(s->conf.blk));
 }
 
 /* Read more data from scsi device into buffer.  */
@@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret)
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
 
+    aio_context_acquire(blk_get_aio_context(s->conf.blk));
+
     if (ret || r->req.io_canceled) {
         scsi_command_complete_noio(r, ret);
-        return;
+        goto done;
     }
 
     if (r->req.cmd.buf[0] == MODE_SELECT && r->req.cmd.buf[4] == 12 &&
@@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret)
     }
 
     scsi_command_complete_noio(r, ret);
+
+done:
+    aio_context_release(blk_get_aio_context(s->conf.blk));
 }
 
 /* Write data to a scsi device.  Returns nonzero on failure.
diff --git a/util/thread-pool.c b/util/thread-pool.c
index XXXXXXX..XXXXXXX 100644
--- a/util/thread-pool.c
+++ b/util/thread-pool.c
@@ -XXX,XX +XXX,XX @@ restart:
              */
             qemu_bh_schedule(pool->completion_bh);
 
+            aio_context_release(pool->ctx);
             elem->common.cb(elem->common.opaque, elem->ret);
+            aio_context_acquire(pool->ctx);
             qemu_aio_unref(elem);
             goto restart;
         } else {
@@ -XXX,XX +XXX,XX @@ static void thread_pool_co_cb(void *opaque, int ret)
     ThreadPoolCo *co = opaque;
 
     co->ret = ret;
-    qemu_coroutine_enter(co->co);
+    aio_co_wake(co->co);
 }
 
 int coroutine_fn thread_pool_submit_co(ThreadPool *pool, ThreadPoolFunc *func,
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

This patch prepares for the removal of unnecessary lockcnt inc/dec pairs.
Extract the dispatching loop for file descriptor handlers into a new
function aio_dispatch_handlers, and then inline aio_dispatch into
aio_poll.

aio_dispatch can now become void.

diff --git a/include/block/aio.h b/include/block/aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ bool aio_pending(AioContext *ctx);
 /* Dispatch any pending callbacks from the GSource attached to the AioContext.
  *
  * This is used internally in the implementation of the GSource.
- *
- * @dispatch_fds: true to process fds, false to skip them
- *                (can be used as an optimization by callers that know there
- *                are no fds ready)
  */
-bool aio_dispatch(AioContext *ctx, bool dispatch_fds);
+void aio_dispatch(AioContext *ctx);
 
 /* Progress in completing AIO work to occur.  This can issue new pending
  * aio as a result of executing I/O completion or bh callbacks.
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
     AioHandler *node, *tmp;
     bool progress = false;
 
-    /*
-     * We have to walk very carefully in case aio_set_fd_handler is
-     * called while we're walking.
-     */
-    qemu_lockcnt_inc(&ctx->list_lock);
-
     QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
         int revents;
 
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
         }
     }
 
-    qemu_lockcnt_dec(&ctx->list_lock);
     return progress;
 }
 
-/*
- * Note that dispatch_fds == false has the side-effect of post-poning the
- * freeing of deleted handlers.
- */
-bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
+void aio_dispatch(AioContext *ctx)
 {
-    bool progress;
+    aio_bh_poll(ctx);
 
-    /*
-     * If there are callbacks left that have been queued, we need to call them.
-     * Do not call select in this case, because it is possible that the caller
-     * does not need a complete flush (as is the case for aio_poll loops).
-     */
-    progress = aio_bh_poll(ctx);
+    qemu_lockcnt_inc(&ctx->list_lock);
+    aio_dispatch_handlers(ctx);
+    qemu_lockcnt_dec(&ctx->list_lock);
 
-    if (dispatch_fds) {
-        progress |= aio_dispatch_handlers(ctx);
-    }
-
-    /* Run our timers */
-    progress |= timerlistgroup_run_timers(&ctx->tlg);
-
-    return progress;
+    timerlistgroup_run_timers(&ctx->tlg);
 }
 
 /* These thread-local variables are used only in a small part of aio_poll
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
     npfd = 0;
     qemu_lockcnt_dec(&ctx->list_lock);
 
-    /* Run dispatch even if there were no readable fds to run timers */
-    if (aio_dispatch(ctx, ret > 0)) {
-        progress = true;
+    progress |= aio_bh_poll(ctx);
+
+    if (ret > 0) {
+        qemu_lockcnt_inc(&ctx->list_lock);
+        progress |= aio_dispatch_handlers(ctx);
+        qemu_lockcnt_dec(&ctx->list_lock);
     }
 
+    progress |= timerlistgroup_run_timers(&ctx->tlg);
+
     return progress;
 }
 
diff --git a/util/aio-win32.c b/util/aio-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
     return progress;
 }
 
-bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
+void aio_dispatch(AioContext *ctx)
 {
-    bool progress;
-
-    progress = aio_bh_poll(ctx);
-    if (dispatch_fds) {
-        progress |= aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
-    }
-    progress |= timerlistgroup_run_timers(&ctx->tlg);
-    return progress;
+    aio_bh_poll(ctx);
+    aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
+    timerlistgroup_run_timers(&ctx->tlg);
 }
 
 bool aio_poll(AioContext *ctx, bool blocking)
diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ aio_ctx_dispatch(GSource     *source,
     AioContext *ctx = (AioContext *) source;
 
     assert(callback == NULL);
-    aio_dispatch(ctx, true);
+    aio_dispatch(ctx);
     return true;
 }
 
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Pull the increment/decrement pair out of aio_bh_poll and into the
callers.

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
 
 void aio_dispatch(AioContext *ctx)
 {
+    qemu_lockcnt_inc(&ctx->list_lock);
     aio_bh_poll(ctx);
-
-    qemu_lockcnt_inc(&ctx->list_lock);
     aio_dispatch_handlers(ctx);
     qemu_lockcnt_dec(&ctx->list_lock);
 
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
     }
 
     npfd = 0;
-    qemu_lockcnt_dec(&ctx->list_lock);
 
     progress |= aio_bh_poll(ctx);
 
     if (ret > 0) {
-        qemu_lockcnt_inc(&ctx->list_lock);
         progress |= aio_dispatch_handlers(ctx);
-        qemu_lockcnt_dec(&ctx->list_lock);
     }
 
+    qemu_lockcnt_dec(&ctx->list_lock);
+
     progress |= timerlistgroup_run_timers(&ctx->tlg);
 
     return progress;
diff --git a/util/aio-win32.c b/util/aio-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
     bool progress = false;
     AioHandler *tmp;
 
-    qemu_lockcnt_inc(&ctx->list_lock);
-
     /*
      * We have to walk very carefully in case aio_set_fd_handler is
      * called while we're walking.
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
         }
     }
 
-    qemu_lockcnt_dec(&ctx->list_lock);
     return progress;
 }
 
 void aio_dispatch(AioContext *ctx)
 {
+    qemu_lockcnt_inc(&ctx->list_lock);
     aio_bh_poll(ctx);
     aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
+    qemu_lockcnt_dec(&ctx->list_lock);
     timerlistgroup_run_timers(&ctx->tlg);
 }
 
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         }
     }
 
-    qemu_lockcnt_dec(&ctx->list_lock);
     first = true;
 
     /* ctx->notifier is always registered.  */
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         progress |= aio_dispatch_handlers(ctx, event);
     } while (count > 0);
 
+    qemu_lockcnt_dec(&ctx->list_lock);
+
     progress |= timerlistgroup_run_timers(&ctx->tlg);
     return progress;
 }
diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ void aio_bh_call(QEMUBH *bh)
     bh->cb(bh->opaque);
 }
 
-/* Multiple occurrences of aio_bh_poll cannot be called concurrently */
+/* Multiple occurrences of aio_bh_poll cannot be called concurrently.
+ * The count in ctx->list_lock is incremented before the call, and is
+ * not affected by the call.
+ */
 int aio_bh_poll(AioContext *ctx)
 {
     QEMUBH *bh, **bhp, *next;
     int ret;
     bool deleted = false;
 
-    qemu_lockcnt_inc(&ctx->list_lock);
-
     ret = 0;
     for (bh = atomic_rcu_read(&ctx->first_bh); bh; bh = next) {
         next = atomic_rcu_read(&bh->next);
@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
 
     /* remove deleted bhs */
     if (!deleted) {
-        qemu_lockcnt_dec(&ctx->list_lock);
         return ret;
     }
 
-    if (qemu_lockcnt_dec_and_lock(&ctx->list_lock)) {
+    if (qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
         bhp = &ctx->first_bh;
         while (*bhp) {
             bh = *bhp;
@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
                 bhp = &bh->next;
             }
         }
-        qemu_lockcnt_unlock(&ctx->list_lock);
+        qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
     }
     return ret;
 }
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

diff --git a/include/block/block_int.h b/include/block/block_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ struct BdrvChild {
  * copied as well.
  */
 struct BlockDriverState {
-    int64_t total_sectors; /* if we are reading a disk image, give its
-                              size in sectors */
+    /* Protected by big QEMU lock or read-only after opening.  No special
+     * locking needed during I/O...
+     */
     int open_flags; /* flags used to open the file, re-used for re-open */
     bool read_only; /* if true, the media is read only */
     bool encrypted; /* if true, the media is encrypted */
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
     bool sg;        /* if true, the device is a /dev/sg* */
     bool probed;    /* if true, format was probed rather than specified */
 
-    int copy_on_read; /* if nonzero, copy read backing sectors into image.
-                         note this is a reference count */
-
-    CoQueue flush_queue;            /* Serializing flush queue */
-    bool active_flush_req;          /* Flush request in flight? */
-    unsigned int write_gen;         /* Current data generation */
-    unsigned int flushed_gen;       /* Flushed write generation */
-
     BlockDriver *drv; /* NULL means no media */
     void *opaque;
 
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
     BdrvChild *backing;
     BdrvChild *file;
 
-    /* Callback before write request is processed */
-    NotifierWithReturnList before_write_notifiers;
-
-    /* number of in-flight requests; overall and serialising */
-    unsigned int in_flight;
-    unsigned int serialising_in_flight;
-
-    bool wakeup;
-
-    /* Offset after the highest byte written to */
-    uint64_t wr_highest_offset;
-
     /* I/O Limits */
     BlockLimits bl;
 
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
     QTAILQ_ENTRY(BlockDriverState) bs_list;
     /* element of the list of monitor-owned BDS */
     QTAILQ_ENTRY(BlockDriverState) monitor_list;
-    QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
     int refcnt;
 
-    QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
-
     /* operation blockers */
     QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX];
 
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
     /* The error object in use for blocking operations on backing_hd */
     Error *backing_blocker;
 
+    /* Protected by AioContext lock */
+
+    /* If nonzero, copy read backing sectors into image.  This is a
+     * reference count: it can be >1 if more than one client has requested
+     * copy-on-read.
+     */
+    int copy_on_read;
+
+    /* If we are reading a disk image, give its size in sectors.
+     * Generally read-only; it is written to by load_vmstate and save_vmstate,
+     * but the block layer is quiescent during those.
+     */
+    int64_t total_sectors;
+
+    /* Callback before write request is processed */
+    NotifierWithReturnList before_write_notifiers;
+
+    /* number of in-flight requests; overall and serialising */
+    unsigned int in_flight;
+    unsigned int serialising_in_flight;
+
+    bool wakeup;
+
+    /* Offset after the highest byte written to */
+    uint64_t wr_highest_offset;
+
     /* threshold limit for writes, in bytes. "High water mark". */
     uint64_t write_threshold_offset;
     NotifierWithReturn write_threshold_notifier;
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
     /* counter for nested bdrv_io_plug */
     unsigned io_plugged;
 
+    QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
+    CoQueue flush_queue;                  /* Serializing flush queue */
+    bool active_flush_req;                /* Flush request in flight? */
+    unsigned int write_gen;               /* Current data generation */
+    unsigned int flushed_gen;             /* Flushed write generation */
+
+    QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
+
+    /* do we need to tell the guest if we have a volatile write cache? */
+    int enable_write_cache;
+
     int quiesce_counter;
 };
 
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index XXXXXXX..XXXXXXX 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -XXX,XX +XXX,XX @@ typedef struct BlockDevOps {
  * fields that must be public. This is in particular for QLIST_ENTRY() and
  * friends so that BlockBackends can be kept in lists outside block-backend.c */
 typedef struct BlockBackendPublic {
-    /* I/O throttling.
-     * throttle_state tells us if this BlockBackend has I/O limits configured.
-     * io_limits_disabled tells us if they are currently being enforced */
+    /* I/O throttling has its own locking, but also some fields are
+     * protected by the AioContext lock.
+     */
+
+    /* Protected by AioContext lock.  */
     CoQueue      throttled_reqs[2];
+
+    /* Nonzero if the I/O limits are currently being ignored; generally
+     * it is zero.  */
     unsigned int io_limits_disabled;
 
     /* The following fields are protected by the ThrottleGroup lock.
-     * See the ThrottleGroup documentation for details. */
+     * See the ThrottleGroup documentation for details.
+     * throttle_state tells us if I/O limits are configured. */
     ThrottleState *throttle_state;
     ThrottleTimers throttle_timers;
     unsigned       pending_reqs[2];
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

This uses the lock-free mutex described in the paper '"Blocking without
Locking", or LFTHREADS: A lock-free thread library' by Gidenstam and
Papatriantafilou.  The same technique is used in OSv, and in fact
the code is essentially a conversion to C of OSv's code.

[Added missing coroutine_fn in tests/test-aio-multithread.c.
--Stefan]

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213181244.16297-2-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/coroutine.h     |  17 ++++-
 tests/test-aio-multithread.c |  86 ++++++++++++++++++++++++
 util/qemu-coroutine-lock.c   | 155 ++++++++++++++++++++++++++++++++++++++++---
 util/trace-events            |   1 +
 4 files changed, 246 insertions(+), 13 deletions(-)

diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine.h
+++ b/include/qemu/coroutine.h
@@ -XXX,XX +XXX,XX @@ bool qemu_co_queue_empty(CoQueue *queue);
 /**
  * Provides a mutex that can be used to synchronise coroutines
  */
+struct CoWaitRecord;
 typedef struct CoMutex {
-    bool locked;
+    /* Count of pending lockers; 0 for a free mutex, 1 for an
+     * uncontended mutex.
+     */
+    unsigned locked;
+
+    /* A queue of waiters.  Elements are added atomically in front of
+     * from_push.  to_pop is only populated, and popped from, by whoever
+     * is in charge of the next wakeup.  This can be an unlocker or,
+     * through the handoff protocol, a locker that is about to go to sleep.
+     */
+    QSLIST_HEAD(, CoWaitRecord) from_push, to_pop;
+
+    unsigned handoff, sequence;
+
     Coroutine *holder;
-    CoQueue queue;
 } CoMutex;
 
 /**
diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-aio-multithread.c
+++ b/tests/test-aio-multithread.c
@@ -XXX,XX +XXX,XX @@ static void test_multi_co_schedule_10(void)
     test_multi_co_schedule(10);
 }
 
+/* CoMutex thread-safety.  */
+
+static uint32_t atomic_counter;
+static uint32_t running;
+static uint32_t counter;
+static CoMutex comutex;
+
+static void coroutine_fn test_multi_co_mutex_entry(void *opaque)
+{
+    while (!atomic_mb_read(&now_stopping)) {
+        qemu_co_mutex_lock(&comutex);
+        counter++;
+        qemu_co_mutex_unlock(&comutex);
+
+        /* Increase atomic_counter *after* releasing the mutex.  Otherwise
+         * there is a chance (it happens about 1 in 3 runs) that the iothread
+         * exits before the coroutine is woken up, causing a spurious
+         * assertion failure.
+         */
+        atomic_inc(&atomic_counter);
+    }
+    atomic_dec(&running);
+}
+
+static void test_multi_co_mutex(int threads, int seconds)
+{
+    int i;
+
+    qemu_co_mutex_init(&comutex);
+    counter = 0;
+    atomic_counter = 0;
+    now_stopping = false;
+
+    create_aio_contexts();
+    assert(threads <= NUM_CONTEXTS);
+    running = threads;
+    for (i = 0; i < threads; i++) {
+        Coroutine *co1 = qemu_coroutine_create(test_multi_co_mutex_entry, NULL);
+        aio_co_schedule(ctx[i], co1);
+    }
+
+    g_usleep(seconds * 1000000);
+
+    atomic_mb_set(&now_stopping, true);
+    while (running > 0) {
+        g_usleep(100000);
+    }
+
+    join_aio_contexts();
+    g_test_message("%d iterations/second\n", counter / seconds);
+    g_assert_cmpint(counter, ==, atomic_counter);
+}
+
+/* Testing with NUM_CONTEXTS threads focuses on the queue.  The mutex however
+ * is too contended (and the threads spend too much time in aio_poll)
+ * to actually stress the handoff protocol.
+ */
+static void test_multi_co_mutex_1(void)
+{
+    test_multi_co_mutex(NUM_CONTEXTS, 1);
+}
+
+static void test_multi_co_mutex_10(void)
+{
+    test_multi_co_mutex(NUM_CONTEXTS, 10);
+}
+
+/* Testing with fewer threads stresses the handoff protocol too.  Still, the
+ * case where the locker _can_ pick up a handoff is very rare, happening
+ * about 10 times in 1 million, so increase the runtime a bit compared to
+ * other "quick" testcases that only run for 1 second.
+ */
+static void test_multi_co_mutex_2_3(void)
+{
+    test_multi_co_mutex(2, 3);
+}
+
+static void test_multi_co_mutex_2_30(void)
+{
+    test_multi_co_mutex(2, 30);
+}
+
 /* End of tests.  */
 
 int main(int argc, char **argv)
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/aio/multi/lifecycle", test_lifecycle);
     if (g_test_quick()) {
         g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1);
+        g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_1);
+        g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_3);
     } else {
         g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10);
+        g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_10);
+        g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_30);
     }
     return g_test_run();
 }
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
+ *
+ * The lock-free mutex implementation is based on OSv
+ * (core/lfmutex.cc, include/lockfree/mutex.hh).
+ * Copyright (C) 2013 Cloudius Systems, Ltd.
  */
 
 #include "qemu/osdep.h"
@@ -XXX,XX +XXX,XX @@ bool qemu_co_queue_empty(CoQueue *queue)
     return QSIMPLEQ_FIRST(&queue->entries) == NULL;
 }
 
+/* The wait records are handled with a multiple-producer, single-consumer
+ * lock-free queue.  There cannot be two concurrent pop_waiter() calls
+ * because pop_waiter() can only be called while mutex->handoff is zero.
+ * This can happen in three cases:
+ * - in qemu_co_mutex_unlock, before the hand-off protocol has started.
+ *   In this case, qemu_co_mutex_lock will see mutex->handoff == 0 and
+ *   not take part in the handoff.
+ * - in qemu_co_mutex_lock, if it steals the hand-off responsibility from
+ *   qemu_co_mutex_unlock.  In this case, qemu_co_mutex_unlock will fail
+ *   the cmpxchg (it will see either 0 or the next sequence value) and
+ *   exit.  The next hand-off cannot begin until qemu_co_mutex_lock has
+ *   woken up someone.
+ * - in qemu_co_mutex_unlock, if it takes the hand-off token itself.
+ *   In this case another iteration starts with mutex->handoff == 0;
+ *   a concurrent qemu_co_mutex_lock will fail the cmpxchg, and
+ *   qemu_co_mutex_unlock will go back to case (1).
+ *
+ * The following functions manage this queue.
+ */
+typedef struct CoWaitRecord {
+    Coroutine *co;
+    QSLIST_ENTRY(CoWaitRecord) next;
+} CoWaitRecord;
+
+static void push_waiter(CoMutex *mutex, CoWaitRecord *w)
+{
+    w->co = qemu_coroutine_self();
+    QSLIST_INSERT_HEAD_ATOMIC(&mutex->from_push, w, next);
+}
+
+static void move_waiters(CoMutex *mutex)
+{
+    QSLIST_HEAD(, CoWaitRecord) reversed;
+    QSLIST_MOVE_ATOMIC(&reversed, &mutex->from_push);
+    while (!QSLIST_EMPTY(&reversed)) {
+        CoWaitRecord *w = QSLIST_FIRST(&reversed);
+        QSLIST_REMOVE_HEAD(&reversed, next);
+        QSLIST_INSERT_HEAD(&mutex->to_pop, w, next);
+    }
+}
+
+static CoWaitRecord *pop_waiter(CoMutex *mutex)
+{
+    CoWaitRecord *w;
+
+    if (QSLIST_EMPTY(&mutex->to_pop)) {
+        move_waiters(mutex);
+        if (QSLIST_EMPTY(&mutex->to_pop)) {
+            return NULL;
+        }
+    }
+    w = QSLIST_FIRST(&mutex->to_pop);
+    QSLIST_REMOVE_HEAD(&mutex->to_pop, next);
+    return w;
+}
+
+/* Return true if at least one waiter is queued, either still on the
+ * lock-free from_push list or already moved to to_pop.  The hand-off
+ * protocol relies on this: unlock() may leave the wakeup to a concurrent
+ * lock() only when no waiter has been queued yet.
+ */
+static bool has_waiters(CoMutex *mutex)
+{
+    return !QSLIST_EMPTY(&mutex->to_pop) || !QSLIST_EMPTY(&mutex->from_push);
+}
+
 void qemu_co_mutex_init(CoMutex *mutex)
 {
     memset(mutex, 0, sizeof(*mutex));
-    qemu_co_queue_init(&mutex->queue);
 }
 
-void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
+static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
 {
     Coroutine *self = qemu_coroutine_self();
+    CoWaitRecord w;
+    unsigned old_handoff;
 
     trace_qemu_co_mutex_lock_entry(mutex, self);
+    w.co = self;
+    push_waiter(mutex, &w);
 
-    while (mutex->locked) {
-        qemu_co_queue_wait(&mutex->queue);
+    /* This is the "Responsibility Hand-Off" protocol; a lock() picks from
+     * a concurrent unlock() the responsibility of waking somebody up.
+     */
+    old_handoff = atomic_mb_read(&mutex->handoff);
+    if (old_handoff &&
+        has_waiters(mutex) &&
+        atomic_cmpxchg(&mutex->handoff, old_handoff, 0) == old_handoff) {
+        /* There can be no concurrent pops, because there can be only
+         * one active handoff at a time.
+         */
+        CoWaitRecord *to_wake = pop_waiter(mutex);
+        Coroutine *co = to_wake->co;
+        if (co == self) {
+            /* We got the lock ourselves!  */
+            assert(to_wake == &w);
+            return;
+        }
+
+        aio_co_wake(co);
     }
 
-    mutex->locked = true;
-    mutex->holder = self;
-    self->locks_held++;
-
+    qemu_coroutine_yield();
     trace_qemu_co_mutex_lock_return(mutex, self);
 }
 
+void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
+{
+    Coroutine *self = qemu_coroutine_self();
+
+    if (atomic_fetch_inc(&mutex->locked) == 0) {
+        /* Uncontended.  */
+        trace_qemu_co_mutex_lock_uncontended(mutex, self);
+    } else {
+        qemu_co_mutex_lock_slowpath(mutex);
+    }
+    mutex->holder = self;
+    self->locks_held++;
+}
+
 void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
 {
     Coroutine *self = qemu_coroutine_self();
 
     trace_qemu_co_mutex_unlock_entry(mutex, self);
 
-    assert(mutex->locked == true);
+    assert(mutex->locked);
     assert(mutex->holder == self);
     assert(qemu_in_coroutine());
 
-    mutex->locked = false;
     mutex->holder = NULL;
     self->locks_held--;
-    qemu_co_queue_next(&mutex->queue);
+    if (atomic_fetch_dec(&mutex->locked) == 1) {
+        /* No waiting qemu_co_mutex_lock().  Pfew, that was easy!  */
+        return;
+    }
+
+    for (;;) {
+        CoWaitRecord *to_wake = pop_waiter(mutex);
+        unsigned our_handoff;
+
+        if (to_wake) {
+            Coroutine *co = to_wake->co;
+            aio_co_wake(co);
+            break;
+        }
+
+        /* Some concurrent lock() is in progress (we know this because
+         * mutex->locked was >1) but it hasn't yet put itself on the wait
+         * queue.  Pick a sequence number for the handoff protocol (not 0).
+         */
+        if (++mutex->sequence == 0) {
+            mutex->sequence = 1;
+        }
+
+        our_handoff = mutex->sequence;
+        atomic_mb_set(&mutex->handoff, our_handoff);
+        if (!has_waiters(mutex)) {
+            /* The concurrent lock has not added itself yet, so it
+             * will be able to pick our handoff.
+             */
+            break;
+        }
+
+        /* Try to do the handoff protocol ourselves; if somebody else has
+         * already taken it, however, we're done and they're responsible.
+         */
+        if (atomic_cmpxchg(&mutex->handoff, our_handoff, 0) != our_handoff) {
+            break;
+        }
+    }
 
     trace_qemu_co_mutex_unlock_return(mutex, self);
 }
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ qemu_coroutine_terminate(void *co) "self %p"
 
 # util/qemu-coroutine-lock.c
 qemu_co_queue_run_restart(void *co) "co %p"
+qemu_co_mutex_lock_uncontended(void *mutex, void *self) "mutex %p self %p"
 qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p"
 qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p"
 qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p"
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Running a very small critical section on pthread_mutex_t and CoMutex
shows that pthread_mutex_t is much faster because it doesn't actually
go to sleep.  What happens is that the critical section is shorter
than the latency of entering the kernel and thus FUTEX_WAIT always
fails.  With CoMutex there is no such latency but you still want to
avoid wait and wakeup.  So introduce it artificially.

This only works with one waiter; because CoMutex is fair, it will
always have more waits and wakeups than a pthread_mutex_t.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213181244.16297-3-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/coroutine.h   |  5 +++++
 util/qemu-coroutine-lock.c | 51 ++++++++++++++++++++++++++++++++++++++++------
 util/qemu-coroutine.c      |  2 +-
 3 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine.h
+++ b/include/qemu/coroutine.h
@@ -XXX,XX +XXX,XX @@ typedef struct CoMutex {
      */
     unsigned locked;
 
+    /* Context that is holding the lock.  Useful to avoid spinning
+     * when two coroutines on the same AioContext try to get the lock. :)
+     */
+    AioContext *ctx;
+
     /* A queue of waiters.  Elements are added atomically in front of
      * from_push.  to_pop is only populated, and popped from, by whoever
      * is in charge of the next wakeup.  This can be an unlocker or,
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu-common.h"
 #include "qemu/coroutine.h"
 #include "qemu/coroutine_int.h"
+#include "qemu/processor.h"
 #include "qemu/queue.h"
 #include "block/aio.h"
 #include "trace.h"
@@ -XXX,XX +XXX,XX @@ void qemu_co_mutex_init(CoMutex *mutex)
     memset(mutex, 0, sizeof(*mutex));
 }
 
-static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
+static void coroutine_fn qemu_co_mutex_wake(CoMutex *mutex, Coroutine *co)
+{
+    /* Read co before co->ctx; pairs with smp_wmb() in
+     * qemu_coroutine_enter().
+     */
+    smp_read_barrier_depends();
+    mutex->ctx = co->ctx;
+    aio_co_wake(co);
+}
+
+static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx,
+                                                     CoMutex *mutex)
 {
     Coroutine *self = qemu_coroutine_self();
     CoWaitRecord w;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
         if (co == self) {
             /* We got the lock ourselves!  */
             assert(to_wake == &w);
+            mutex->ctx = ctx;
             return;
         }
 
-        aio_co_wake(co);
+        qemu_co_mutex_wake(mutex, co);
     }
 
     qemu_coroutine_yield();
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
 
 void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
 {
+    AioContext *ctx = qemu_get_current_aio_context();
     Coroutine *self = qemu_coroutine_self();
+    int waiters, i;
 
-    if (atomic_fetch_inc(&mutex->locked) == 0) {
+    /* Running a very small critical section on pthread_mutex_t and CoMutex
+     * shows that pthread_mutex_t is much faster because it doesn't actually
+     * go to sleep.  What happens is that the critical section is shorter
+     * than the latency of entering the kernel and thus FUTEX_WAIT always
+     * fails.  With CoMutex there is no such latency but you still want to
+     * avoid wait and wakeup.  So introduce it artificially.
+     */
+    i = 0;
+retry_fast_path:
+    waiters = atomic_cmpxchg(&mutex->locked, 0, 1);
+    if (waiters != 0) {
+        while (waiters == 1 && ++i < 1000) {
+            if (atomic_read(&mutex->ctx) == ctx) {
+                break;
+            }
+            if (atomic_read(&mutex->locked) == 0) {
+                goto retry_fast_path;
+            }
+            cpu_relax();
+        }
+        waiters = atomic_fetch_inc(&mutex->locked);
+    }
+
+    if (waiters == 0) {
         /* Uncontended.  */
         trace_qemu_co_mutex_lock_uncontended(mutex, self);
+        mutex->ctx = ctx;
     } else {
-        qemu_co_mutex_lock_slowpath(mutex);
+        qemu_co_mutex_lock_slowpath(ctx, mutex);
     }
     mutex->holder = self;
     self->locks_held++;
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
     assert(mutex->holder == self);
     assert(qemu_in_coroutine());
 
+    mutex->ctx = NULL;
     mutex->holder = NULL;
     self->locks_held--;
     if (atomic_fetch_dec(&mutex->locked) == 1) {
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
         unsigned our_handoff;
 
         if (to_wake) {
-            Coroutine *co = to_wake->co;
-            aio_co_wake(co);
+            qemu_co_mutex_wake(mutex, to_wake->co);
             break;
         }
 
diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine.c
+++ b/util/qemu-coroutine.c
@@ -XXX,XX +XXX,XX @@ void qemu_coroutine_enter(Coroutine *co)
     co->ctx = qemu_get_current_aio_context();
 
     /* Store co->ctx before anything that stores co.  Matches
-     * barrier in aio_co_wake.
+     * barrier in aio_co_wake and qemu_co_mutex_wake.
      */
     smp_wmb();
 
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

Add two implementations of the same benchmark as the previous patch,
but using pthreads.  One uses a normal QemuMutex, the other is Linux
only and implements a fair mutex based on MCS locks and futexes.
This shows that the slower performance of the 5-thread case is due to
the fairness of CoMutex, rather than to coroutines.  If fairness does
not matter, as is the case with two threads, CoMutex can actually be
faster than pthreads.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213181244.16297-4-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/test-aio-multithread.c | 164 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)

diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-aio-multithread.c
+++ b/tests/test-aio-multithread.c
@@ -XXX,XX +XXX,XX @@ static void test_multi_co_mutex_2_30(void)
     test_multi_co_mutex(2, 30);
 }
 
+/* Same test with fair mutexes, for performance comparison.  */
+
+#ifdef CONFIG_LINUX
+#include "qemu/futex.h"
+
+/* The nodes for the mutex reside in this structure (on which we try to avoid
+ * false sharing).  The head of the mutex is in the "mutex_head" variable.
+ */
+static struct {
+    int next, locked;
+    int padding[14];
+} nodes[NUM_CONTEXTS] __attribute__((__aligned__(64)));
+
+static int mutex_head = -1;
+
+static void mcs_mutex_lock(void)
+{
+    int prev;
+
+    nodes[id].next = -1;
+    nodes[id].locked = 1;
+    prev = atomic_xchg(&mutex_head, id);
+    if (prev != -1) {
+        atomic_set(&nodes[prev].next, id);
+        qemu_futex_wait(&nodes[id].locked, 1);
+    }
+}
+
+static void mcs_mutex_unlock(void)
+{
+    int next;
+    if (nodes[id].next == -1) {
+        if (atomic_read(&mutex_head) == id &&
+            atomic_cmpxchg(&mutex_head, id, -1) == id) {
+            /* Last item in the list, exit.  */
+            return;
+        }
+        while (atomic_read(&nodes[id].next) == -1) {
+            /* mcs_mutex_lock did the xchg, but has not updated
+             * nodes[prev].next yet.
+             */
+        }
+    }
+
+    /* Wake up the next in line.  */
+    next = nodes[id].next;
+    nodes[next].locked = 0;
+    qemu_futex_wake(&nodes[next].locked, 1);
+}
+
+static void test_multi_fair_mutex_entry(void *opaque)
+{
+    while (!atomic_mb_read(&now_stopping)) {
+        mcs_mutex_lock();
+        counter++;
+        mcs_mutex_unlock();
+        atomic_inc(&atomic_counter);
+    }
+    atomic_dec(&running);
+}
+
+static void test_multi_fair_mutex(int threads, int seconds)
+{
+    int i;
+
+    assert(mutex_head == -1);
+    counter = 0;
+    atomic_counter = 0;
+    now_stopping = false;
+
+    create_aio_contexts();
+    assert(threads <= NUM_CONTEXTS);
+    running = threads;
+    for (i = 0; i < threads; i++) {
+        Coroutine *co1 = qemu_coroutine_create(test_multi_fair_mutex_entry, NULL);
+        aio_co_schedule(ctx[i], co1);
+    }
+
+    g_usleep(seconds * 1000000);
+
+    atomic_mb_set(&now_stopping, true);
+    while (running > 0) {
+        g_usleep(100000);
+    }
+
+    join_aio_contexts();
+    g_test_message("%d iterations/second\n", counter / seconds);
+    g_assert_cmpint(counter, ==, atomic_counter);
+}
+
+static void test_multi_fair_mutex_1(void)
+{
+    test_multi_fair_mutex(NUM_CONTEXTS, 1);
+}
+
+static void test_multi_fair_mutex_10(void)
+{
+    test_multi_fair_mutex(NUM_CONTEXTS, 10);
+}
+#endif
+
+/* Same test with pthread mutexes, for performance comparison and
+ * portability.  */
+
+static QemuMutex mutex;
+
+static void test_multi_mutex_entry(void *opaque)
+{
+    while (!atomic_mb_read(&now_stopping)) {
+        qemu_mutex_lock(&mutex);
+        counter++;
+        qemu_mutex_unlock(&mutex);
+        atomic_inc(&atomic_counter);
+    }
+    atomic_dec(&running);
+}
+
+static void test_multi_mutex(int threads, int seconds)
+{
+    int i;
+
+    qemu_mutex_init(&mutex);
+    counter = 0;
+    atomic_counter = 0;
+    now_stopping = false;
+
+    create_aio_contexts();
+    assert(threads <= NUM_CONTEXTS);
+    running = threads;
+    for (i = 0; i < threads; i++) {
+        Coroutine *co1 = qemu_coroutine_create(test_multi_mutex_entry, NULL);
+        aio_co_schedule(ctx[i], co1);
+    }
+
+    g_usleep(seconds * 1000000);
+
+    atomic_mb_set(&now_stopping, true);
+    while (running > 0) {
+        g_usleep(100000);
+    }
+
+    join_aio_contexts();
+    g_test_message("%d iterations/second\n", counter / seconds);
+    g_assert_cmpint(counter, ==, atomic_counter);
+}
+
+static void test_multi_mutex_1(void)
+{
+    test_multi_mutex(NUM_CONTEXTS, 1);
+}
+
+static void test_multi_mutex_10(void)
+{
+    test_multi_mutex(NUM_CONTEXTS, 10);
+}
+
 /* End of tests.  */
 
 int main(int argc, char **argv)
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
         g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1);
         g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_1);
         g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_3);
+#ifdef CONFIG_LINUX
+        g_test_add_func("/aio/multi/mutex/mcs", test_multi_fair_mutex_1);
+#endif
+        g_test_add_func("/aio/multi/mutex/pthread", test_multi_mutex_1);
     } else {
         g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10);
         g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_10);
         g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_30);
+#ifdef CONFIG_LINUX
+        g_test_add_func("/aio/multi/mutex/mcs", test_multi_fair_mutex_10);
+#endif
+        g_test_add_func("/aio/multi/mutex/pthread", test_multi_mutex_10);
     }
     return g_test_run();
 }
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

This will avoid forward references in the next patch.  It is also
more logical because CoQueue is no longer the basic primitive.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213181244.16297-5-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/coroutine.h | 89 ++++++++++++++++++++++++------------------------
 1 file changed, 44 insertions(+), 45 deletions(-)

diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine.h
+++ b/include/qemu/coroutine.h
@@ -XXX,XX +XXX,XX @@ bool qemu_in_coroutine(void);
  */
 bool qemu_coroutine_entered(Coroutine *co);
 
-
-/**
- * CoQueues are a mechanism to queue coroutines in order to continue executing
- * them later. They provide the fundamental primitives on which coroutine locks
- * are built.
- */
-typedef struct CoQueue {
-    QSIMPLEQ_HEAD(, Coroutine) entries;
-} CoQueue;
-
-/**
- * Initialise a CoQueue. This must be called before any other operation is used
- * on the CoQueue.
- */
-void qemu_co_queue_init(CoQueue *queue);
-
-/**
- * Adds the current coroutine to the CoQueue and transfers control to the
- * caller of the coroutine.
- */
-void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
-
-/**
- * Restarts the next coroutine in the CoQueue and removes it from the queue.
- *
- * Returns true if a coroutine was restarted, false if the queue is empty.
- */
-bool coroutine_fn qemu_co_queue_next(CoQueue *queue);
-
-/**
- * Restarts all coroutines in the CoQueue and leaves the queue empty.
- */
-void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue);
-
-/**
- * Enter the next coroutine in the queue
- */
-bool qemu_co_enter_next(CoQueue *queue);
-
-/**
- * Checks if the CoQueue is empty.
- */
-bool qemu_co_queue_empty(CoQueue *queue);
-
-
 /**
  * Provides a mutex that can be used to synchronise coroutines
  */
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex);
  */
 void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex);
 
+
+/**
+ * CoQueues are a mechanism to queue coroutines in order to continue executing
+ * them later.
+ */
+typedef struct CoQueue {
+    QSIMPLEQ_HEAD(, Coroutine) entries;
+} CoQueue;
+
+/**
+ * Initialise a CoQueue. This must be called before any other operation is used
+ * on the CoQueue.
+ */
+void qemu_co_queue_init(CoQueue *queue);
+
+/**
+ * Adds the current coroutine to the CoQueue and transfers control to the
+ * caller of the coroutine.
+ */
+void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
+
+/**
+ * Restarts the next coroutine in the CoQueue and removes it from the queue.
+ *
+ * Returns true if a coroutine was restarted, false if the queue is empty.
+ */
+bool coroutine_fn qemu_co_queue_next(CoQueue *queue);
+
+/**
+ * Restarts all coroutines in the CoQueue and leaves the queue empty.
+ */
+void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue);
+
+/**
+ * Enter the next coroutine in the queue
+ */
+bool qemu_co_enter_next(CoQueue *queue);
+
+/**
+ * Checks if the CoQueue is empty.
+ */
+bool qemu_co_queue_empty(CoQueue *queue);
+
+
 typedef struct CoRwlock {
     bool writer;
     int reader;
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

All that CoQueue needs in order to become thread-safe is help
from an external mutex.  Add this to the API.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213181244.16297-6-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/coroutine.h   |  8 +++++---
 block/backup.c             |  2 +-
 block/io.c                 |  4 ++--
 block/nbd-client.c         |  2 +-
 block/qcow2-cluster.c      |  4 +---
 block/sheepdog.c           |  2 +-
 block/throttle-groups.c    |  2 +-
 hw/9pfs/9p.c               |  2 +-
 util/qemu-coroutine-lock.c | 24 +++++++++++++++++++++---
 9 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine.h
+++ b/include/qemu/coroutine.h
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex);
 
 /**
  * CoQueues are a mechanism to queue coroutines in order to continue executing
- * them later.
+ * them later.  They are similar to condition variables, but they need help
+ * from an external mutex in order to maintain thread-safety.
  */
 typedef struct CoQueue {
     QSIMPLEQ_HEAD(, Coroutine) entries;
@@ -XXX,XX +XXX,XX @@ void qemu_co_queue_init(CoQueue *queue);
 
 /**
  * Adds the current coroutine to the CoQueue and transfers control to the
- * caller of the coroutine.
+ * caller of the coroutine.  The mutex is unlocked during the wait and
+ * locked again afterwards.
  */
-void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
+void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex);
 
 /**
  * Restarts the next coroutine in the CoQueue and removes it from the queue.
diff --git a/block/backup.c b/block/backup.c
index XXXXXXX..XXXXXXX 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
         retry = false;
         QLIST_FOREACH(req, &job->inflight_reqs, list) {
             if (end > req->start && start < req->end) {
-                qemu_co_queue_wait(&req->wait_queue);
+                qemu_co_queue_wait(&req->wait_queue, NULL);
                 retry = true;
                 break;
             }
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
                  * (instead of producing a deadlock in the former case). */
                 if (!req->waiting_for) {
                     self->waiting_for = req;
-                    qemu_co_queue_wait(&req->wait_queue);
+                    qemu_co_queue_wait(&req->wait_queue, NULL);
                     self->waiting_for = NULL;
                     retry = true;
                     waited = true;
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 
     /* Wait until any previous flushes are completed */
     while (bs->active_flush_req) {
-        qemu_co_queue_wait(&bs->flush_queue);
+        qemu_co_queue_wait(&bs->flush_queue, NULL);
     }
 
     bs->active_flush_req = true;
diff --git a/block/nbd-client.c b/block/nbd-client.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -XXX,XX +XXX,XX @@ static void nbd_coroutine_start(NBDClientSession *s,
     /* Poor man semaphore.  The free_sema is locked when no other request
      * can be accepted, and unlocked after receiving one reply.  */
     if (s->in_flight == MAX_NBD_REQUESTS) {
-        qemu_co_queue_wait(&s->free_sema);
+        qemu_co_queue_wait(&s->free_sema, NULL);
         assert(s->in_flight < MAX_NBD_REQUESTS);
     }
     s->in_flight++;
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
             if (bytes == 0) {
                 /* Wait for the dependency to complete. We need to recheck
                  * the free/allocated clusters when we continue. */
-                qemu_co_mutex_unlock(&s->lock);
-                qemu_co_queue_wait(&old_alloc->dependent_requests);
-                qemu_co_mutex_lock(&s->lock);
+                qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
                 return -EAGAIN;
             }
         }
diff --git a/block/sheepdog.c b/block/sheepdog.c
index XXXXXXX..XXXXXXX 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *acb)
 retry:
     QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
         if (AIOCBOverlapping(acb, cb)) {
-            qemu_co_queue_wait(&s->overlapping_queue);
+            qemu_co_queue_wait(&s->overlapping_queue, NULL);
             goto retry;
         }
     }
diff --git a/block/throttle-groups.c b/block/throttle-groups.c
index XXXXXXX..XXXXXXX 100644
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -XXX,XX +XXX,XX @@ void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk,
     if (must_wait || blkp->pending_reqs[is_write]) {
         blkp->pending_reqs[is_write]++;
         qemu_mutex_unlock(&tg->lock);
-        qemu_co_queue_wait(&blkp->throttled_reqs[is_write]);
+        qemu_co_queue_wait(&blkp->throttled_reqs[is_write], NULL);
         qemu_mutex_lock(&tg->lock);
         blkp->pending_reqs[is_write]--;
     }
diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn v9fs_flush(void *opaque)
         /*
          * Wait for pdu to complete.
          */
-        qemu_co_queue_wait(&cancel_pdu->complete);
+        qemu_co_queue_wait(&cancel_pdu->complete, NULL);
         cancel_pdu->cancelled = 0;
         pdu_free(cancel_pdu);
     }
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@ void qemu_co_queue_init(CoQueue *queue)
     QSIMPLEQ_INIT(&queue->entries);
 }
 
-void coroutine_fn qemu_co_queue_wait(CoQueue *queue)
+void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex)
 {
     Coroutine *self = qemu_coroutine_self();
     QSIMPLEQ_INSERT_TAIL(&queue->entries, self, co_queue_next);
+
+    if (mutex) {
+        qemu_co_mutex_unlock(mutex);
+    }
+
+    /* There is no race condition here.  Other threads will call
+     * aio_co_schedule on our AioContext, which can reenter this
+     * coroutine but only after this yield and after the main loop
+     * has gone through the next iteration.
+     */
     qemu_coroutine_yield();
     assert(qemu_in_coroutine());
+
+    /* TODO: OSv implements wait morphing here, where the wakeup
+     * primitive automatically places the woken coroutine on the
+     * mutex's queue.  This avoids the thundering herd effect.
+     */
+    if (mutex) {
+        qemu_co_mutex_lock(mutex);
+    }
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_rdlock(CoRwlock *lock)
     Coroutine *self = qemu_coroutine_self();
 
     while (lock->writer) {
-        qemu_co_queue_wait(&lock->queue);
+        qemu_co_queue_wait(&lock->queue, NULL);
     }
     lock->reader++;
     self->locks_held++;
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_wrlock(CoRwlock *lock)
     Coroutine *self = qemu_coroutine_self();
 
     while (lock->writer || lock->reader) {
-        qemu_co_queue_wait(&lock->queue);
+        qemu_co_queue_wait(&lock->queue, NULL);
     }
     lock->writer = true;
     self->locks_held++;
-- 
2.9.3

From: Paolo Bonzini <pbonzini@redhat.com>

This adds a CoMutex around the existing CoQueue.  Because the write-side
can just take CoMutex, the old "writer" field is not necessary anymore.
Instead of removing it altogether, count the number of pending writers
during a read-side critical section and forbid further readers from
entering.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213181244.16297-7-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/coroutine.h   |  3 ++-
 util/qemu-coroutine-lock.c | 35 ++++++++++++++++++++++++-----------
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine.h
+++ b/include/qemu/coroutine.h
@@ -XXX,XX +XXX,XX @@ bool qemu_co_queue_empty(CoQueue *queue);
 
 
 typedef struct CoRwlock {
-    bool writer;
+    int pending_writer;
     int reader;
+    CoMutex mutex;
     CoQueue queue;
 } CoRwlock;
 
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_init(CoRwlock *lock)
 {
     memset(lock, 0, sizeof(*lock));
     qemu_co_queue_init(&lock->queue);
+    qemu_co_mutex_init(&lock->mutex);
 }
 
 void qemu_co_rwlock_rdlock(CoRwlock *lock)
 {
     Coroutine *self = qemu_coroutine_self();
 
-    while (lock->writer) {
-        qemu_co_queue_wait(&lock->queue, NULL);
+    qemu_co_mutex_lock(&lock->mutex);
+    /* For fairness, wait if a writer is in line.  */
+    while (lock->pending_writer) {
+        qemu_co_queue_wait(&lock->queue, &lock->mutex);
     }
     lock->reader++;
+    qemu_co_mutex_unlock(&lock->mutex);
+
+    /* The rest of the read-side critical section is run without the mutex.  */
     self->locks_held++;
 }
 
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_unlock(CoRwlock *lock)
     Coroutine *self = qemu_coroutine_self();
 
     assert(qemu_in_coroutine());
-    if (lock->writer) {
-        lock->writer = false;
+    if (!lock->reader) {
+        /* The critical section started in qemu_co_rwlock_wrlock.  */
         qemu_co_queue_restart_all(&lock->queue);
     } else {
+        self->locks_held--;
+
+        qemu_co_mutex_lock(&lock->mutex);
         lock->reader--;
         assert(lock->reader >= 0);
         /* Wakeup only one waiting writer */
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_unlock(CoRwlock *lock)
             qemu_co_queue_next(&lock->queue);
         }
     }
-    self->locks_held--;
+    qemu_co_mutex_unlock(&lock->mutex);
 }
 
 void qemu_co_rwlock_wrlock(CoRwlock *lock)
 {
-    Coroutine *self = qemu_coroutine_self();
-
-    while (lock->writer || lock->reader) {
-        qemu_co_queue_wait(&lock->queue, NULL);
+    qemu_co_mutex_lock(&lock->mutex);
+    lock->pending_writer++;
+    while (lock->reader) {
+        qemu_co_queue_wait(&lock->queue, &lock->mutex);
     }
-    lock->writer = true;
-    self->locks_held++;
+    lock->pending_writer--;
+
+    /* The rest of the write-side critical section is run with
+     * the mutex taken, so that lock->reader remains zero.
+     * There is no need to update self->locks_held.
+     */
 }
-- 
2.9.3