The following changes since commit 825b96dbcee23d134b691fc75618b59c5f53da32:

  Merge tag 'migration-20250310-pull-request' of https://gitlab.com/farosas/qemu into staging (2025-03-11 09:32:07 +0800)

are available in the Git repository at:

  https://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to a93c04f3cbe690877b3297a9df4767aa811fcd97:

  virtio-scsi: only expose cmd vqs via iothread-vq-mapping (2025-03-11 15:49:22 +0100)

----------------------------------------------------------------
Block layer patches

- virtio-scsi: add iothread-vq-mapping parameter
- Improve writethrough performance
- Fix missing zero init in bdrv_snapshot_goto()
- Code cleanup and iotests fixes

----------------------------------------------------------------
Kevin Wolf (8):
      block: Remove unused blk_op_is_blocked()
      block: Zero block driver state before reopening
      file-posix: Support FUA writes
      block/io: Ignore FUA with cache.no-flush=on
      aio: Create AioPolledEvent
      aio-posix: Factor out adjust_polling_time()
      aio-posix: Separate AioPolledEvent per AioHandler
      aio-posix: Adjust polling time also for new handlers

Stefan Hajnoczi (13):
      scsi-disk: drop unused SCSIDiskState->bh field
      dma: use current AioContext for dma_blk_io()
      scsi: track per-SCSIRequest AioContext
      scsi: introduce requests_lock
      virtio-scsi: introduce event and ctrl virtqueue locks
      virtio-scsi: protect events_dropped field
      virtio-scsi: perform TMFs in appropriate AioContexts
      virtio-blk: extract cleanup_iothread_vq_mapping() function
      virtio-blk: tidy up iothread_vq_mapping functions
      virtio: extract iothread-vq-mapping.h API
      virtio-scsi: add iothread-vq-mapping parameter
      virtio-scsi: handle ctrl virtqueue in main loop
      virtio-scsi: only expose cmd vqs via iothread-vq-mapping

Thomas Huth (1):
      iotests: Limit qsd-migrate to working formats

 include/block/aio.h                         |   5 +-
 include/block/raw-aio.h                     |   8 +-
 include/hw/scsi/scsi.h                      |   8 +-
 include/hw/virtio/iothread-vq-mapping.h     |  45 +++
 include/hw/virtio/virtio-scsi.h             |  15 +-
 include/system/block-backend-global-state.h |   1 -
 include/system/dma.h                        |   3 +-
 util/aio-posix.h                            |   1 +
 block/block-backend.c                       |  12 -
 block/file-posix.c                          |  26 +-
 block/io.c                                  |   4 +
 block/io_uring.c                            |  13 +-
 block/linux-aio.c                           |  24 +-
 block/snapshot.c                            |   1 +
 hw/block/virtio-blk.c                       | 132 +-------
 hw/ide/core.c                               |   3 +-
 hw/ide/macio.c                              |   3 +-
 hw/scsi/scsi-bus.c                          | 121 +++++--
 hw/scsi/scsi-disk.c                         |  24 +-
 hw/scsi/virtio-scsi-dataplane.c             | 103 ++++--
 hw/scsi/virtio-scsi.c                       | 502 ++++++++++++++++------------
 hw/virtio/iothread-vq-mapping.c             | 131 ++++++++
 system/dma-helpers.c                        |   8 +-
 util/aio-posix.c                            | 114 ++++---
 util/async.c                                |   1 -
 hw/virtio/meson.build                       |   1 +
 meson.build                                 |   4 +
 tests/qemu-iotests/tests/qsd-migrate        |   2 +-
 28 files changed, 803 insertions(+), 512 deletions(-)
 create mode 100644 include/hw/virtio/iothread-vq-mapping.h
 create mode 100644 hw/virtio/iothread-vq-mapping.c
Commit fc4e394b28 removed the last caller of blk_op_is_blocked(). Remove
the now unused function.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20250206165331.379033-1-kwolf@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/system/block-backend-global-state.h |  1 -
 block/block-backend.c                       | 12 ------------
 2 files changed, 13 deletions(-)

diff --git a/include/system/block-backend-global-state.h b/include/system/block-backend-global-state.h
index XXXXXXX..XXXXXXX 100644
--- a/include/system/block-backend-global-state.h
+++ b/include/system/block-backend-global-state.h
@@ -XXX,XX +XXX,XX @@ bool blk_supports_write_perm(BlockBackend *blk);
 bool blk_is_sg(BlockBackend *blk);
 void blk_set_enable_write_cache(BlockBackend *blk, bool wce);
 int blk_get_flags(BlockBackend *blk);
-bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp);
 int blk_set_aio_context(BlockBackend *blk, AioContext *new_context,
                         Error **errp);
 void blk_add_aio_context_notifier(BlockBackend *blk,
diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ void *blk_blockalign(BlockBackend *blk, size_t size)
     return qemu_blockalign(blk ? blk_bs(blk) : NULL, size);
 }
 
-bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
-{
-    BlockDriverState *bs = blk_bs(blk);
-    GLOBAL_STATE_CODE();
-    GRAPH_RDLOCK_GUARD_MAINLOOP();
-
-    if (!bs) {
-        return false;
-    }
-
-    return bdrv_op_is_blocked(bs, op, errp);
-}
-
 /**
  * Return BB's current AioContext. Note that this context may change
-- 
2.48.1
Block drivers assume in their .bdrv_open() implementation that their
state in bs->opaque has been zeroed; it is initially allocated with
g_malloc0() in bdrv_open_driver().

bdrv_snapshot_goto() needs to make sure that it is zeroed again before
calling drv->bdrv_open(), so that block drivers don't use stale values.

One symptom of this bug is VMDK running into a double free when the user
tries to apply an internal snapshot like 'qemu-img snapshot -a test
test.vmdk'. This should be a graceful error because VMDK doesn't support
internal snapshots.

==25507== Invalid free() / delete / delete[] / realloc()
==25507==    at 0x484B347: realloc (vg_replace_malloc.c:1801)
==25507==    by 0x54B592A: g_realloc (gmem.c:171)
==25507==    by 0x1B221D: vmdk_add_extent (../block/vmdk.c:570)
==25507==    by 0x1B1084: vmdk_open_sparse (../block/vmdk.c:1059)
==25507==    by 0x1AF3D8: vmdk_open (../block/vmdk.c:1371)
==25507==    by 0x1A2AE0: bdrv_snapshot_goto (../block/snapshot.c:299)
==25507==    by 0x205C77: img_snapshot (../qemu-img.c:3500)
==25507==    by 0x58FA087: (below main) (libc_start_call_main.h:58)
==25507==  Address 0x832f3e0 is 0 bytes inside a block of size 272 free'd
==25507==    at 0x4846B83: free (vg_replace_malloc.c:989)
==25507==    by 0x54AEAC4: g_free (gmem.c:208)
==25507==    by 0x1AF629: vmdk_close (../block/vmdk.c:2889)
==25507==    by 0x1A2A9C: bdrv_snapshot_goto (../block/snapshot.c:290)
==25507==    by 0x205C77: img_snapshot (../qemu-img.c:3500)
==25507==    by 0x58FA087: (below main) (libc_start_call_main.h:58)

This error was discovered by fuzzing qemu-img.

Cc: qemu-stable@nongnu.org
Closes: https://gitlab.com/qemu-project/qemu/-/issues/2853
Closes: https://gitlab.com/qemu-project/qemu/-/issues/2851
Reported-by: Denis Rastyogin <gerben@altlinux.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20250310104858.28221-1-kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/snapshot.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/snapshot.c b/block/snapshot.c
index XXXXXXX..XXXXXXX 100644
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -XXX,XX +XXX,XX @@ int bdrv_snapshot_goto(BlockDriverState *bs,
         bdrv_graph_wrunlock();
 
         ret = bdrv_snapshot_goto(fallback_bs, snapshot_id, errp);
+        memset(bs->opaque, 0, drv->instance_size);
         open_ret = drv->bdrv_open(bs, options, bs->open_flags, &local_err);
         qobject_unref(options);
         if (open_ret < 0) {
-- 
2.48.1
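[Editor's note: the bug class fixed above is easy to reproduce outside QEMU.
The following standalone C sketch uses a hypothetical driver, not QEMU code;
it shows how reopening on non-zeroed state turns into a realloc() of a freed
pointer, and how zeroing the state first, as the patch does, avoids it.]

#include <stdlib.h>
#include <string.h>

typedef struct DriverState {       /* stands in for bs->opaque */
    char *extents;                 /* owned buffer, NULL when unused */
} DriverState;

static int driver_open(DriverState *s)
{
    /* like .bdrv_open(), assumes zeroed state: realloc(NULL, ...) */
    s->extents = realloc(s->extents, 272);
    return s->extents ? 0 : -1;
}

static void driver_close(DriverState *s)
{
    /* like vmdk_close(), frees but does not reset the pointer */
    free(s->extents);
}

int main(void)
{
    DriverState *s = calloc(1, sizeof(*s));    /* like g_malloc0() */

    driver_open(s);
    driver_close(s);                /* s->extents now dangles */

    memset(s, 0, sizeof(*s));       /* the fix; without it the next open
                                       would realloc() a freed pointer */
    driver_open(s);

    driver_close(s);
    free(s);
    return 0;
}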
Until now, FUA was always emulated with a separate flush after the write
for file-posix. The overhead of processing a second request can reduce
performance significantly for a guest disk that has disabled the write
cache, especially if the host disk is already write through, too, and
the flush isn't actually doing anything.

Advertise support for REQ_FUA in write requests and implement it for
Linux AIO and io_uring using the RWF_DSYNC flag for write requests. The
thread pool still performs a separate fdatasync() call. This can be
improved later by using the pwritev2() syscall if available.

As an example, this is how fio numbers can be improved in some scenarios
with this patch (all using virtio-blk with cache=directsync on an nvme
block device for the VM, fio with ioengine=libaio,direct=1,sync=1):

                              |       old     | with FUA support
------------------------------+---------------+-------------------
bs=4k, iodepth=1, numjobs=1   |  45.6k iops   |  56.1k iops
bs=4k, iodepth=1, numjobs=16  | 183.3k iops   | 236.0k iops
bs=4k, iodepth=16, numjobs=1  | 258.4k iops   | 311.1k iops

However, not all scenarios are clear wins. On another slower disk I saw
little to no improvement. In fact, in two corner case scenarios, I even
observed a regression, which I however consider acceptable:

1. On slow host disks in a write through cache mode, when the guest is
   using virtio-blk in a separate iothread so that polling can be
   enabled, and each completion is quickly followed up with a new
   request (so that polling gets it), it can happen that enabling FUA
   makes things slower - the additional very fast no-op flush we used to
   have gave the adaptive polling algorithm a success so that it kept
   polling. Without it, we only have the slow write request, which
   disables polling. This is a problem in the polling algorithm that
   will be fixed later in this series.

2. With a high queue depth, it can be beneficial to have flush requests
   for another reason: The optimisation in bdrv_co_flush() that flushes
   only once per write generation acts as a synchronisation mechanism
   that lets all requests complete at the same time. This can result in
   better batching and if the disk is very fast (I only saw this with a
   null_blk backend), this can make up for the overhead of the flush and
   improve throughput. In theory, we could optionally introduce a
   similar artificial latency in the normal completion path to achieve
   the same kind of completion batching. This is not implemented in this
   series.

Compatibility is not a concern for io_uring, it has supported RWF_DSYNC
from the start. Linux AIO started supporting it in Linux 4.13 and libaio
0.3.111. The kernel is not a problem for any supported build platform,
so it's not necessary to add runtime checks. However, openSUSE is still
stuck with an older libaio version that would break the build. We must
detect this at build time to avoid build failures.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20250307221634.71951-2-kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/raw-aio.h |  8 ++++++--
 block/file-posix.c      | 26 ++++++++++++++++--------
 block/io_uring.c        | 13 ++++++++-----
 block/linux-aio.c       | 24 +++++++++++++++++++++---
 meson.build             |  4 ++++
 5 files changed, 57 insertions(+), 18 deletions(-)

diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -XXX,XX +XXX,XX @@
 #define QEMU_RAW_AIO_H
 
 #include "block/aio.h"
+#include "block/block-common.h"
 #include "qemu/iov.h"
 
 /* AIO request types */
@@ -XXX,XX +XXX,XX @@ void laio_cleanup(LinuxAioState *s);
 
 /* laio_co_submit: submit I/O requests in the thread's current AioContext. */
 int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
-                                int type, uint64_t dev_max_batch);
+                                int type, BdrvRequestFlags flags,
+                                uint64_t dev_max_batch);
 
 bool laio_has_fdsync(int);
+bool laio_has_fua(void);
 void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context);
 void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context);
 #endif
@@ -XXX,XX +XXX,XX @@ void luring_cleanup(LuringState *s);
 
 /* luring_co_submit: submit I/O requests in the thread's current AioContext. */
 int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
-                                  QEMUIOVector *qiov, int type);
+                                  QEMUIOVector *qiov, int type,
+                                  BdrvRequestFlags flags);
 void luring_detach_aio_context(LuringState *s, AioContext *old_context);
 void luring_attach_aio_context(LuringState *s, AioContext *new_context);
 #endif
diff --git a/block/file-posix.c b/block/file-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -XXX,XX +XXX,XX @@ static int fd_open(BlockDriverState *bs)
 }
 
 static int64_t raw_getlength(BlockDriverState *bs);
+static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs);
 
 typedef struct RawPosixAIOData {
     BlockDriverState *bs;
@@ -XXX,XX +XXX,XX @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
 #endif
     s->needs_alignment = raw_needs_alignment(bs);
 
+    if (!s->use_linux_aio || laio_has_fua()) {
+        bs->supported_write_flags = BDRV_REQ_FUA;
+    }
+
     bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
     if (S_ISREG(st.st_mode)) {
         /* When extending regular files, we get zeros from the OS */
@@ -XXX,XX +XXX,XX @@ static inline bool raw_check_linux_aio(BDRVRawState *s)
 #endif
 
 static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
-                                   uint64_t bytes, QEMUIOVector *qiov, int type)
+                                   uint64_t bytes, QEMUIOVector *qiov, int type,
+                                   int flags)
 {
     BDRVRawState *s = bs->opaque;
     RawPosixAIOData acb;
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
 #ifdef CONFIG_LINUX_IO_URING
     } else if (raw_check_linux_io_uring(s)) {
         assert(qiov->size == bytes);
-        ret = luring_co_submit(bs, s->fd, offset, qiov, type);
+        ret = luring_co_submit(bs, s->fd, offset, qiov, type, flags);
         goto out;
 #endif
 #ifdef CONFIG_LINUX_AIO
     } else if (raw_check_linux_aio(s)) {
         assert(qiov->size == bytes);
-        ret = laio_co_submit(s->fd, offset, qiov, type,
+        ret = laio_co_submit(s->fd, offset, qiov, type, flags,
                              s->aio_max_batch);
         goto out;
 #endif
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
 
     assert(qiov->size == bytes);
     ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
+    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
+        /* TODO Use pwritev2() instead if it's available */
+        ret = raw_co_flush_to_disk(bs);
+    }
     goto out; /* Avoid the compiler err of unused label */
 
 out:
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
                                       int64_t bytes, QEMUIOVector *qiov,
                                       BdrvRequestFlags flags)
 {
-    return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_READ);
+    return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_READ, flags);
 }
 
 static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
                                        int64_t bytes, QEMUIOVector *qiov,
                                        BdrvRequestFlags flags)
 {
-    return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_WRITE);
+    return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_WRITE, flags);
 }
 
 static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
 
 #ifdef CONFIG_LINUX_IO_URING
     if (raw_check_linux_io_uring(s)) {
-        return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH);
+        return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH, 0);
     }
 #endif
 #ifdef CONFIG_LINUX_AIO
     if (s->has_laio_fdsync && raw_check_linux_aio(s)) {
-        return laio_co_submit(s->fd, 0, NULL, QEMU_AIO_FLUSH, 0);
+        return laio_co_submit(s->fd, 0, NULL, QEMU_AIO_FLUSH, 0, 0);
     }
 #endif
     return raw_thread_pool_submit(handle_aiocb_flush, &acb);
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
     }
 
     trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS);
-    return raw_co_prw(bs, offset, len, qiov, QEMU_AIO_ZONE_APPEND);
+    return raw_co_prw(bs, offset, len, qiov, QEMU_AIO_ZONE_APPEND, 0);
 }
 #endif
 
diff --git a/block/io_uring.c b/block/io_uring.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -XXX,XX +XXX,XX @@ static void luring_deferred_fn(void *opaque)
 *
 */
static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
-                            uint64_t offset, int type)
+                            uint64_t offset, int type, BdrvRequestFlags flags)
{
    int ret;
    struct io_uring_sqe *sqes = &luringcb->sqeq;
+    int luring_flags;

    switch (type) {
    case QEMU_AIO_WRITE:
-        io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
-                             luringcb->qiov->niov, offset);
+        luring_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0;
+        io_uring_prep_writev2(sqes, fd, luringcb->qiov->iov,
+                              luringcb->qiov->niov, offset, luring_flags);
        break;
    case QEMU_AIO_ZONE_APPEND:
        io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
@@ -XXX,XX +XXX,XX @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
}

int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
-                                  QEMUIOVector *qiov, int type)
+                                  QEMUIOVector *qiov, int type,
+                                  BdrvRequestFlags flags)
{
    int ret;
    AioContext *ctx = qemu_get_current_aio_context();
@@ -XXX,XX +XXX,XX @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
    };
    trace_luring_co_submit(bs, s, &luringcb, fd, offset, qiov ? qiov->size : 0,
                           type);
-    ret = luring_do_submit(fd, &luringcb, s, offset, type);
+    ret = luring_do_submit(fd, &luringcb, s, offset, type, flags);

    if (ret < 0) {
        return ret;
diff --git a/block/linux-aio.c b/block/linux-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -XXX,XX +XXX,XX @@ static void laio_deferred_fn(void *opaque)
 }
 
 static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
-                          int type, uint64_t dev_max_batch)
+                          int type, BdrvRequestFlags flags,
+                          uint64_t dev_max_batch)
 {
     LinuxAioState *s = laiocb->ctx;
     struct iocb *iocbs = &laiocb->iocb;
     QEMUIOVector *qiov = laiocb->qiov;
+    int laio_flags;
 
     switch (type) {
     case QEMU_AIO_WRITE:
+#ifdef HAVE_IO_PREP_PWRITEV2
+        laio_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0;
+        io_prep_pwritev2(iocbs, fd, qiov->iov, qiov->niov, offset, laio_flags);
+#else
+        assert(flags == 0);
         io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
+#endif
         break;
     case QEMU_AIO_ZONE_APPEND:
         io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
@@ -XXX,XX +XXX,XX @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
 }
 
 int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
-                                int type, uint64_t dev_max_batch)
+                                int type, BdrvRequestFlags flags,
+                                uint64_t dev_max_batch)
 {
     int ret;
     AioContext *ctx = qemu_get_current_aio_context();
@@ -XXX,XX +XXX,XX @@ int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
         .qiov = qiov,
     };
 
-    ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch);
+    ret = laio_do_submit(fd, &laiocb, offset, type, flags, dev_max_batch);
     if (ret < 0) {
         return ret;
     }
@@ -XXX,XX +XXX,XX @@ bool laio_has_fdsync(int fd)
     io_destroy(ctx);
     return (ret == -EINVAL) ? false : true;
 }
+
+bool laio_has_fua(void)
+{
+#ifdef HAVE_IO_PREP_PWRITEV2
+    return true;
+#else
+    return false;
+#endif
+}
diff --git a/meson.build b/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/meson.build
+++ b/meson.build
@@ -XXX,XX +XXX,XX @@ config_host_data.set('HAVE_OPTRESET',
                      cc.has_header_symbol('getopt.h', 'optreset'))
 config_host_data.set('HAVE_IPPROTO_MPTCP',
                      cc.has_header_symbol('netinet/in.h', 'IPPROTO_MPTCP'))
+if libaio.found()
+  config_host_data.set('HAVE_IO_PREP_PWRITEV2',
+                       cc.has_header_symbol('libaio.h', 'io_prep_pwritev2'))
+endif
 
 # has_member
 config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID',
-- 
2.48.1
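[Editor's note: for readers unfamiliar with RWF_DSYNC, the following
standalone sketch - not QEMU code, and it needs Linux 4.7+ with a recent
glibc for pwritev2() - contrasts the old write-plus-flush scheme with the
single FUA write that the patch submits through Linux AIO and io_uring.]

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
    char buf[512];
    struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
    int fd = open("test.img", O_WRONLY | O_CREAT, 0644);

    if (fd < 0) {
        perror("open");
        return 1;
    }
    memset(buf, 0xaa, sizeof(buf));

    /* old scheme: the write and the flush are two separate requests */
    if (pwritev(fd, &iov, 1, 0) < 0 || fdatasync(fd) < 0) {
        perror("write+flush");
    }

    /* FUA scheme: one request; the data (and the metadata needed to
     * read it back) is durable when this single call returns */
    if (pwritev2(fd, &iov, 1, 0, RWF_DSYNC) < 0) {
        perror("pwritev2");
    }

    close(fd);
    return 0;
}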
For block drivers that don't advertise FUA support, we already call
bdrv_co_flush(), which considers BDRV_O_NO_FLUSH. However, drivers that
do support FUA still see the FUA flag with BDRV_O_NO_FLUSH and get the
associated performance penalty that cache.no-flush=on was supposed to
avoid.

Clear FUA for write requests if BDRV_O_NO_FLUSH is set.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20250307221634.71951-3-kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/io.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ bdrv_driver_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
         return -ENOMEDIUM;
     }
 
+    if (bs->open_flags & BDRV_O_NO_FLUSH) {
+        flags &= ~BDRV_REQ_FUA;
+    }
+
     if ((flags & BDRV_REQ_FUA) &&
         (~bs->supported_write_flags & BDRV_REQ_FUA)) {
         flags &= ~BDRV_REQ_FUA;
-- 
2.48.1
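[Editor's note: a simplified model of the resulting flag handling in the
write path - the constants below are illustrative, not the QEMU
definitions. cache.no-flush=on strips FUA before the driver-support check,
so neither native FUA nor the flush-based emulation triggers.]

#include <stdbool.h>
#include <stdio.h>

enum { BDRV_REQ_FUA = 1 << 0 };      /* request flag (illustrative value) */
enum { BDRV_O_NO_FLUSH = 1 << 1 };   /* open flag (illustrative value) */

static int filter_write_flags(int flags, int open_flags,
                              int supported_write_flags, bool *emulate_fua)
{
    /* cache.no-flush=on: the user asked to skip durability entirely */
    if (open_flags & BDRV_O_NO_FLUSH) {
        flags &= ~BDRV_REQ_FUA;
    }

    /* no native FUA in the driver: emulate with a flush after the write */
    *emulate_fua = false;
    if ((flags & BDRV_REQ_FUA) && !(supported_write_flags & BDRV_REQ_FUA)) {
        flags &= ~BDRV_REQ_FUA;
        *emulate_fua = true;
    }
    return flags;
}

int main(void)
{
    bool emulate_fua;
    int flags = filter_write_flags(BDRV_REQ_FUA, BDRV_O_NO_FLUSH,
                                   BDRV_REQ_FUA, &emulate_fua);

    /* prints flags=0 emulate_fua=0: FUA is gone before emulation is tried */
    printf("flags=%d emulate_fua=%d\n", flags, emulate_fua);
    return 0;
}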
The field is unused (only ever set, but never read) since commit
1
As a preparation for having multiple adaptive polling states per
2
ac9185603. Additionally, the commit message of commit 34fa110e already
2
AioContext, move the 'ns' field into a separate struct.
3
explained earlier why it's unreliable. Remove it.
4
3
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
4
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
6
Message-Id: <20220923142838.91043-1-kwolf@redhat.com>
5
Message-ID: <20250307221634.71951-4-kwolf@redhat.com>
6
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
7
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
7
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
8
---
8
---
9
block/file-posix.c | 9 ---------
9
include/block/aio.h | 6 +++++-
10
1 file changed, 9 deletions(-)
10
util/aio-posix.c | 31 ++++++++++++++++---------------
11
util/async.c | 3 ++-
12
3 files changed, 23 insertions(+), 17 deletions(-)
11
13
12
diff --git a/block/file-posix.c b/block/file-posix.c
14
diff --git a/include/block/aio.h b/include/block/aio.h
13
index XXXXXXX..XXXXXXX 100644
15
index XXXXXXX..XXXXXXX 100644
14
--- a/block/file-posix.c
16
--- a/include/block/aio.h
15
+++ b/block/file-posix.c
17
+++ b/include/block/aio.h
16
@@ -XXX,XX +XXX,XX @@ typedef struct BDRVRawState {
18
@@ -XXX,XX +XXX,XX @@ struct BHListSlice {
17
19
18
bool has_discard:1;
20
typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
19
bool has_write_zeroes:1;
21
20
- bool discard_zeroes:1;
22
+typedef struct AioPolledEvent {
21
bool use_linux_aio:1;
23
+ int64_t ns; /* current polling time in nanoseconds */
22
bool use_linux_io_uring:1;
24
+} AioPolledEvent;
23
int page_cache_inconsistent; /* errno from fdatasync failure */
25
+
24
@@ -XXX,XX +XXX,XX @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
26
struct AioContext {
25
ret = -EINVAL;
27
GSource source;
26
goto fail;
28
27
} else {
29
@@ -XXX,XX +XXX,XX @@ struct AioContext {
28
- s->discard_zeroes = true;
30
int poll_disable_cnt;
29
s->has_fallocate = true;
31
32
/* Polling mode parameters */
33
- int64_t poll_ns; /* current polling time in nanoseconds */
34
+ AioPolledEvent poll;
35
int64_t poll_max_ns; /* maximum polling time in nanoseconds */
36
int64_t poll_grow; /* polling time growth factor */
37
int64_t poll_shrink; /* polling time shrink factor */
38
diff --git a/util/aio-posix.c b/util/aio-posix.c
39
index XXXXXXX..XXXXXXX 100644
40
--- a/util/aio-posix.c
41
+++ b/util/aio-posix.c
42
@@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
43
return false;
44
}
45
46
- max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
47
+ max_ns = qemu_soonest_timeout(*timeout, ctx->poll.ns);
48
if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
49
/*
50
* Enable poll mode. It pairs with the poll_set_started() in
51
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
52
if (ctx->poll_max_ns) {
53
int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
54
55
- if (block_ns <= ctx->poll_ns) {
56
+ if (block_ns <= ctx->poll.ns) {
57
/* This is the sweet spot, no adjustment needed */
58
} else if (block_ns > ctx->poll_max_ns) {
59
/* We'd have to poll for too long, poll less */
60
- int64_t old = ctx->poll_ns;
61
+ int64_t old = ctx->poll.ns;
62
63
if (ctx->poll_shrink) {
64
- ctx->poll_ns /= ctx->poll_shrink;
65
+ ctx->poll.ns /= ctx->poll_shrink;
66
} else {
67
- ctx->poll_ns = 0;
68
+ ctx->poll.ns = 0;
69
}
70
71
- trace_poll_shrink(ctx, old, ctx->poll_ns);
72
- } else if (ctx->poll_ns < ctx->poll_max_ns &&
73
+ trace_poll_shrink(ctx, old, ctx->poll.ns);
74
+ } else if (ctx->poll.ns < ctx->poll_max_ns &&
75
block_ns < ctx->poll_max_ns) {
76
/* There is room to grow, poll longer */
77
- int64_t old = ctx->poll_ns;
78
+ int64_t old = ctx->poll.ns;
79
int64_t grow = ctx->poll_grow;
80
81
if (grow == 0) {
82
grow = 2;
83
}
84
85
- if (ctx->poll_ns) {
86
- ctx->poll_ns *= grow;
87
+ if (ctx->poll.ns) {
88
+ ctx->poll.ns *= grow;
89
} else {
90
- ctx->poll_ns = 4000; /* start polling at 4 microseconds */
91
+ ctx->poll.ns = 4000; /* start polling at 4 microseconds */
92
}
93
94
- if (ctx->poll_ns > ctx->poll_max_ns) {
95
- ctx->poll_ns = ctx->poll_max_ns;
96
+ if (ctx->poll.ns > ctx->poll_max_ns) {
97
+ ctx->poll.ns = ctx->poll_max_ns;
98
}
99
100
- trace_poll_grow(ctx, old, ctx->poll_ns);
101
+ trace_poll_grow(ctx, old, ctx->poll.ns);
30
}
102
}
31
} else {
32
@@ -XXX,XX +XXX,XX @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
33
}
103
}
34
104
35
if (S_ISBLK(st.st_mode)) {
105
@@ -XXX,XX +XXX,XX @@ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
36
-#ifdef BLKDISCARDZEROES
106
/* No thread synchronization here, it doesn't matter if an incorrect value
37
- unsigned int arg;
107
* is used once.
38
- if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
108
*/
39
- s->discard_zeroes = true;
109
+ ctx->poll.ns = 0;
40
- }
110
+
41
-#endif
111
ctx->poll_max_ns = max_ns;
42
#ifdef __linux__
112
- ctx->poll_ns = 0;
43
/* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do
113
ctx->poll_grow = grow;
44
* not rely on the contents of discarded blocks unless using O_DIRECT.
114
ctx->poll_shrink = shrink;
45
* Same for BLKZEROOUT.
115
46
*/
116
diff --git a/util/async.c b/util/async.c
47
if (!(bs->open_flags & BDRV_O_NOCACHE)) {
117
index XXXXXXX..XXXXXXX 100644
48
- s->discard_zeroes = false;
118
--- a/util/async.c
49
s->has_write_zeroes = false;
119
+++ b/util/async.c
50
}
120
@@ -XXX,XX +XXX,XX @@ AioContext *aio_context_new(Error **errp)
51
#endif
121
qemu_rec_mutex_init(&ctx->lock);
122
timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
123
124
- ctx->poll_ns = 0;
125
+ ctx->poll.ns = 0;
126
+
127
ctx->poll_max_ns = 0;
128
ctx->poll_grow = 0;
129
ctx->poll_shrink = 0;
52
--
130
--
53
2.37.3
131
2.48.1
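
To see the adaptive policy that operates on AioPolledEvent.ns in
isolation, here is a minimal standalone sketch. The 4000 ns starting
point, the default doubling growth and the clamp to poll-max-ns mirror
the diff above; the surrounding harness and its numbers are invented
for illustration.

    #include <stdint.h>
    #include <stdio.h>

    typedef struct AioPolledEvent {
        int64_t ns; /* current polling time in nanoseconds */
    } AioPolledEvent;

    /* Same policy as in the diff: shrink when we blocked longer than the
     * ceiling, grow while both the current value and the observed blocked
     * time stay below it. */
    static void adjust(AioPolledEvent *poll, int64_t block_ns,
                       int64_t max_ns, int64_t grow, int64_t shrink)
    {
        if (block_ns <= poll->ns) {
            /* sweet spot, no adjustment needed */
        } else if (block_ns > max_ns) {
            poll->ns = shrink ? poll->ns / shrink : 0;
        } else if (poll->ns < max_ns && block_ns < max_ns) {
            poll->ns = poll->ns ? poll->ns * (grow ? grow : 2)
                                : 4000; /* start polling at 4 microseconds */
            if (poll->ns > max_ns) {
                poll->ns = max_ns;
            }
        }
    }

    int main(void)
    {
        AioPolledEvent poll = { .ns = 0 };
        int64_t blocked[] = { 10000, 10000, 2000, 1000, 80000, 10000 };

        for (int i = 0; i < 6; i++) {
            adjust(&poll, blocked[i], 32000 /* poll-max-ns */, 0, 0);
            printf("blocked %6d ns -> poll for %6d ns\n",
                   (int)blocked[i], (int)poll.ns);
        }
        return 0;
    }

Running the sketch shows the window doubling while wakeups stay below
the ceiling and collapsing to zero after one long block, which is the
behaviour the later patches in this series refine per handler.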
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Message-Id: <20221006122607.162769-1-kwolf@redhat.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/quorum.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/block/quorum.c b/block/quorum.c
index XXXXXXX..XXXXXXX 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -XXX,XX +XXX,XX @@ static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb)
     return false;
 }

-static int read_fifo_child(QuorumAIOCB *acb);
-
 static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
 {
     int i;
--
2.37.3

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20250307221634.71951-5-kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 util/aio-posix.c | 77 ++++++++++++++++++++++++++----------------------
 1 file changed, 41 insertions(+), 36 deletions(-)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
     return false;
 }

+static void adjust_polling_time(AioContext *ctx, AioPolledEvent *poll,
+                                int64_t block_ns)
+{
+    if (block_ns <= poll->ns) {
+        /* This is the sweet spot, no adjustment needed */
+    } else if (block_ns > ctx->poll_max_ns) {
+        /* We'd have to poll for too long, poll less */
+        int64_t old = poll->ns;
+
+        if (ctx->poll_shrink) {
+            poll->ns /= ctx->poll_shrink;
+        } else {
+            poll->ns = 0;
+        }
+
+        trace_poll_shrink(ctx, old, poll->ns);
+    } else if (poll->ns < ctx->poll_max_ns &&
+               block_ns < ctx->poll_max_ns) {
+        /* There is room to grow, poll longer */
+        int64_t old = poll->ns;
+        int64_t grow = ctx->poll_grow;
+
+        if (grow == 0) {
+            grow = 2;
+        }
+
+        if (poll->ns) {
+            poll->ns *= grow;
+        } else {
+            poll->ns = 4000; /* start polling at 4 microseconds */
+        }
+
+        if (poll->ns > ctx->poll_max_ns) {
+            poll->ns = ctx->poll_max_ns;
+        }
+
+        trace_poll_grow(ctx, old, poll->ns);
+    }
+}
+
 bool aio_poll(AioContext *ctx, bool blocking)
 {
     AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
     /* Adjust polling time */
     if (ctx->poll_max_ns) {
         int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
-
-        if (block_ns <= ctx->poll.ns) {
-            /* This is the sweet spot, no adjustment needed */
-        } else if (block_ns > ctx->poll_max_ns) {
-            /* We'd have to poll for too long, poll less */
-            int64_t old = ctx->poll.ns;
-
-            if (ctx->poll_shrink) {
-                ctx->poll.ns /= ctx->poll_shrink;
-            } else {
-                ctx->poll.ns = 0;
-            }
-
-            trace_poll_shrink(ctx, old, ctx->poll.ns);
-        } else if (ctx->poll.ns < ctx->poll_max_ns &&
-                   block_ns < ctx->poll_max_ns) {
-            /* There is room to grow, poll longer */
-            int64_t old = ctx->poll.ns;
-            int64_t grow = ctx->poll_grow;
-
-            if (grow == 0) {
-                grow = 2;
-            }
-
-            if (ctx->poll.ns) {
-                ctx->poll.ns *= grow;
-            } else {
-                ctx->poll.ns = 4000; /* start polling at 4 microseconds */
-            }
-
-            if (ctx->poll.ns > ctx->poll_max_ns) {
-                ctx->poll.ns = ctx->poll_max_ns;
-            }
-
-            trace_poll_grow(ctx, old, ctx->poll.ns);
-        }
+        adjust_polling_time(ctx, &ctx->poll, block_ns);
     }

     progress |= aio_bh_poll(ctx);
--
2.48.1
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>

Now that the API also offers _locked() functions, take advantage
of them and give the caller control to take the lock and call the
_locked() functions.

This is especially useful for loops, because it makes no sense to have:

for(job = job_next(); ...)

where each job_next() takes the lock internally.
Instead we want

JOB_LOCK_GUARD();
for(job = job_next_locked(); ...)

In addition, protect also direct field accesses, by either creating a
new critical section or widening the existing ones.

Note: at this stage, job_{lock/unlock} and job lock guard macros
are *nop*.

Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
Message-Id: <20220926093214.506243-12-eesposit@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c            | 17 ++++++++++-------
 blockdev.c         | 14 ++++++++++----
 blockjob.c         | 35 ++++++++++++++++++++++-------------
 job-qmp.c          |  9 ++++++---
 monitor/qmp-cmds.c |  7 +++++--
 qemu-img.c         | 15 ++++++++++-----
 6 files changed, 63 insertions(+), 34 deletions(-)

diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_close(BlockDriverState *bs)

 void bdrv_close_all(void)
 {
-    assert(job_next(NULL) == NULL);
     GLOBAL_STATE_CODE();
+    assert(job_next(NULL) == NULL);

     /* Drop references from requests still in flight, such as canceled block
      * jobs whose AIO context has not been polled yet */
@@ -XXX,XX +XXX,XX @@ XDbgBlockGraph *bdrv_get_xdbg_block_graph(Error **errp)
         }
     }

-    for (job = block_job_next(NULL); job; job = block_job_next(job)) {
-        GSList *el;
+    WITH_JOB_LOCK_GUARD() {
+        for (job = block_job_next_locked(NULL); job;
+             job = block_job_next_locked(job)) {
+            GSList *el;

-        xdbg_graph_add_node(gr, job, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_JOB,
-                           job->job.id);
-        for (el = job->nodes; el; el = el->next) {
-            xdbg_graph_add_edge(gr, job, (BdrvChild *)el->data);
+            xdbg_graph_add_node(gr, job, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_JOB,
+                                job->job.id);
+            for (el = job->nodes; el; el = el->next) {
+                xdbg_graph_add_edge(gr, job, (BdrvChild *)el->data);
+            }
         }
     }

diff --git a/blockdev.c b/blockdev.c
index XXXXXXX..XXXXXXX 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -XXX,XX +XXX,XX @@ void blockdev_mark_auto_del(BlockBackend *blk)
         return;
     }

-    for (job = block_job_next(NULL); job; job = block_job_next(job)) {
+    JOB_LOCK_GUARD();
+
+    for (job = block_job_next_locked(NULL); job;
+         job = block_job_next_locked(job)) {
         if (block_job_has_bdrv(job, blk_bs(blk))) {
             AioContext *aio_context = job->job.aio_context;
             aio_context_acquire(aio_context);

-            job_cancel(&job->job, false);
+            job_cancel_locked(&job->job, false);

             aio_context_release(aio_context);
         }
@@ -XXX,XX +XXX,XX @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp)
     BlockJobInfoList *head = NULL, **tail = &head;
     BlockJob *job;

-    for (job = block_job_next(NULL); job; job = block_job_next(job)) {
+    JOB_LOCK_GUARD();
+
+    for (job = block_job_next_locked(NULL); job;
+         job = block_job_next_locked(job)) {
         BlockJobInfo *value;
         AioContext *aio_context;

@@ -XXX,XX +XXX,XX @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp)
         }
         aio_context = block_job_get_aio_context(job);
         aio_context_acquire(aio_context);
-        value = block_job_query(job, errp);
+        value = block_job_query_locked(job, errp);
         aio_context_release(aio_context);
         if (!value) {
             qapi_free_BlockJobInfoList(head);
diff --git a/blockjob.c b/blockjob.c
index XXXXXXX..XXXXXXX 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -XXX,XX +XXX,XX @@ static bool child_job_drained_poll(BdrvChild *c)
     /* An inactive or completed job doesn't have any pending requests. Jobs
      * with !job->busy are either already paused or have a pause point after
      * being reentered, so no job driver code will run before they pause. */
-    if (!job->busy || job_is_completed(job)) {
-        return false;
+    WITH_JOB_LOCK_GUARD() {
+        if (!job->busy || job_is_completed_locked(job)) {
+            return false;
+        }
     }

     /* Otherwise, assume that it isn't fully stopped yet, but allow the job to
@@ -XXX,XX +XXX,XX @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
     job->ready_notifier.notify = block_job_event_ready;
     job->idle_notifier.notify = block_job_on_idle;

-    notifier_list_add(&job->job.on_finalize_cancelled,
-                      &job->finalize_cancelled_notifier);
-    notifier_list_add(&job->job.on_finalize_completed,
-                      &job->finalize_completed_notifier);
-    notifier_list_add(&job->job.on_pending, &job->pending_notifier);
-    notifier_list_add(&job->job.on_ready, &job->ready_notifier);
-    notifier_list_add(&job->job.on_idle, &job->idle_notifier);
+    WITH_JOB_LOCK_GUARD() {
+        notifier_list_add(&job->job.on_finalize_cancelled,
+                          &job->finalize_cancelled_notifier);
+        notifier_list_add(&job->job.on_finalize_completed,
+                          &job->finalize_completed_notifier);
+        notifier_list_add(&job->job.on_pending, &job->pending_notifier);
+        notifier_list_add(&job->job.on_ready, &job->ready_notifier);
+        notifier_list_add(&job->job.on_idle, &job->idle_notifier);
+    }

     error_setg(&job->blocker, "block device is in use by block job: %s",
                job_type_str(&job->job));
@@ -XXX,XX +XXX,XX @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err,
                                         action);
     }
     if (action == BLOCK_ERROR_ACTION_STOP) {
-        if (!job->job.user_paused) {
-            job_pause(&job->job);
-            /* make the pause user visible, which will be resumed from QMP. */
-            job->job.user_paused = true;
+        WITH_JOB_LOCK_GUARD() {
+            if (!job->job.user_paused) {
+                job_pause_locked(&job->job);
+                /*
+                 * make the pause user visible, which will be
+                 * resumed from QMP.
+                 */
+                job->job.user_paused = true;
+            }
         }
         block_job_iostatus_set_err(job, error);
     }
diff --git a/job-qmp.c b/job-qmp.c
index XXXXXXX..XXXXXXX 100644
--- a/job-qmp.c
+++ b/job-qmp.c
@@ -XXX,XX +XXX,XX @@ void qmp_job_dismiss(const char *id, Error **errp)
     aio_context_release(aio_context);
 }

-static JobInfo *job_query_single(Job *job, Error **errp)
+/* Called with job_mutex held. */
+static JobInfo *job_query_single_locked(Job *job, Error **errp)
 {
     JobInfo *info;
     uint64_t progress_current;
@@ -XXX,XX +XXX,XX @@ JobInfoList *qmp_query_jobs(Error **errp)
     JobInfoList *head = NULL, **tail = &head;
     Job *job;

-    for (job = job_next(NULL); job; job = job_next(job)) {
+    JOB_LOCK_GUARD();
+
+    for (job = job_next_locked(NULL); job; job = job_next_locked(job)) {
         JobInfo *value;
         AioContext *aio_context;

@@ -XXX,XX +XXX,XX @@ JobInfoList *qmp_query_jobs(Error **errp)
         }
         aio_context = job->aio_context;
         aio_context_acquire(aio_context);
-        value = job_query_single(job, errp);
+        value = job_query_single_locked(job, errp);
         aio_context_release(aio_context);
         if (!value) {
             qapi_free_JobInfoList(head);
diff --git a/monitor/qmp-cmds.c b/monitor/qmp-cmds.c
index XXXXXXX..XXXXXXX 100644
--- a/monitor/qmp-cmds.c
+++ b/monitor/qmp-cmds.c
@@ -XXX,XX +XXX,XX @@ void qmp_cont(Error **errp)
         blk_iostatus_reset(blk);
     }

-    for (job = block_job_next(NULL); job; job = block_job_next(job)) {
-        block_job_iostatus_reset(job);
+    WITH_JOB_LOCK_GUARD() {
+        for (job = block_job_next_locked(NULL); job;
+             job = block_job_next_locked(job)) {
+            block_job_iostatus_reset_locked(job);
+        }
     }

     /* Continuing after completed migration. Images have been inactivated to
diff --git a/qemu-img.c b/qemu-img.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -XXX,XX +XXX,XX @@ static void run_block_job(BlockJob *job, Error **errp)
     int ret = 0;

     aio_context_acquire(aio_context);
-    job_ref(&job->job);
+    job_lock();
+    job_ref_locked(&job->job);
     do {
         float progress = 0.0f;
+        job_unlock();
         aio_poll(aio_context, true);

         progress_get_snapshot(&job->job.progress, &progress_current,
@@ -XXX,XX +XXX,XX @@ static void run_block_job(BlockJob *job, Error **errp)
             progress = (float)progress_current / progress_total * 100.f;
         }
         qemu_progress_print(progress, 0);
-    } while (!job_is_ready(&job->job) && !job_is_completed(&job->job));
+        job_lock();
+    } while (!job_is_ready_locked(&job->job) &&
+             !job_is_completed_locked(&job->job));

-    if (!job_is_completed(&job->job)) {
-        ret = job_complete_sync(&job->job, errp);
+    if (!job_is_completed_locked(&job->job)) {
+        ret = job_complete_sync_locked(&job->job, errp);
     } else {
         ret = job->job.ret;
     }
-    job_unref(&job->job);
+    job_unref_locked(&job->job);
+    job_unlock();
    aio_context_release(aio_context);

     /* publish completion progress only when success */
--
2.37.3

Adaptive polling has a big problem: It doesn't consider that an event
loop can wait for many different events that may have very different
typical latencies.

For example, think of a guest that tends to send a new I/O request soon
after the previous I/O request completes, but the storage on the host is
rather slow. In this case, getting the new request from guest quickly
means that polling is enabled, but the next thing is performing the I/O
request on the backend, which is slow and disables polling again for the
next guest request. This means that in such a scenario, polling could
help for every other event, but is only ever enabled when it can't
succeed.

In order to fix this, keep a separate AioPolledEvent for each
AioHandler. We will then know that the backend file descriptor always
has a high latency and isn't worth polling for, but we also know that
the guest is always fast and we should poll for it. This solves at least
half of the problem, we can now keep polling for those cases where it
makes sense and get the improved performance from it.

Since the event loop doesn't know which event will be next, we still do
some unnecessary polling while we're waiting for the slow disk. I made
some attempts to be more clever than just randomly growing and shrinking
the polling time, and even to let callers be explicit about when they
expect a new event, but so far this hasn't resulted in improved
performance or even caused performance regressions. For now, let's just
fix the part that is easy enough to fix, we can revisit the rest later.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20250307221634.71951-6-kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/aio.h |  1 -
 util/aio-posix.h    |  1 +
 util/aio-posix.c    | 26 ++++++++++++++++++++++----
 util/async.c        |  2 --
 4 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/include/block/aio.h b/include/block/aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ struct AioContext {
     int poll_disable_cnt;

     /* Polling mode parameters */
-    AioPolledEvent poll;
     int64_t poll_max_ns;    /* maximum polling time in nanoseconds */
     int64_t poll_grow;      /* polling time growth factor */
     int64_t poll_shrink;    /* polling time shrink factor */
diff --git a/util/aio-posix.h b/util/aio-posix.h
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.h
+++ b/util/aio-posix.h
@@ -XXX,XX +XXX,XX @@ struct AioHandler {
 #endif
     int64_t poll_idle_timeout; /* when to stop userspace polling */
     bool poll_ready;           /* has polling detected an event? */
+    AioPolledEvent poll;
 };

 /* Add a handler to a ready list */
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, AioHandlerList *ready_list,
 static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
                           int64_t *timeout)
 {
+    AioHandler *node;
     int64_t max_ns;

     if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
         return false;
     }

-    max_ns = qemu_soonest_timeout(*timeout, ctx->poll.ns);
+    max_ns = 0;
+    QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
+        max_ns = MAX(max_ns, node->poll.ns);
+    }
+    max_ns = qemu_soonest_timeout(*timeout, max_ns);
+
     if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
         /*
          * Enable poll mode. It pairs with the poll_set_started() in
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)

     /* Adjust polling time */
     if (ctx->poll_max_ns) {
+        AioHandler *node;
         int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
-        adjust_polling_time(ctx, &ctx->poll, block_ns);
+
+        QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
+            if (QLIST_IS_INSERTED(node, node_ready)) {
+                adjust_polling_time(ctx, &node->poll, block_ns);
+            }
+        }
     }

     progress |= aio_bh_poll(ctx);
@@ -XXX,XX +XXX,XX @@ void aio_context_use_g_source(AioContext *ctx)
 void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
                                  int64_t grow, int64_t shrink, Error **errp)
 {
+    AioHandler *node;
+
+    qemu_lockcnt_inc(&ctx->list_lock);
+    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        node->poll.ns = 0;
+    }
+    qemu_lockcnt_dec(&ctx->list_lock);
+
     /* No thread synchronization here, it doesn't matter if an incorrect value
      * is used once.
      */
-    ctx->poll.ns = 0;
-
     ctx->poll_max_ns = max_ns;
     ctx->poll_grow = grow;
     ctx->poll_shrink = shrink;
diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ AioContext *aio_context_new(Error **errp)
     qemu_rec_mutex_init(&ctx->lock);
     timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);

-    ctx->poll.ns = 0;
-
     ctx->poll_max_ns = 0;
     ctx->poll_grow = 0;
     ctx->poll_shrink = 0;
--
2.48.1
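
A small illustration of what the per-AioHandler state buys. The sketch
below is not QEMU code; only the max-over-handlers rule from
try_poll_mode() is mirrored, and the handler names and values are made
up to match the commit message's fast-guest/slow-disk scenario.

    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
        const char *name;
        int64_t poll_ns; /* per-handler AioPolledEvent.ns */
    } Handler;

    int main(void)
    {
        Handler handlers[] = {
            { "guest", 16000 }, /* polling pays off, its state has grown */
            { "disk",  0     }, /* always slow, its state shrank to zero */
        };
        int64_t max_ns = 0;

        /* try_poll_mode() now polls for the longest window that any
         * polling handler has earned, instead of one context-wide value
         * that the slow handler would keep dragging down. */
        for (int i = 0; i < 2; i++) {
            if (handlers[i].poll_ns > max_ns) {
                max_ns = handlers[i].poll_ns;
            }
            printf("%-5s earned %6d ns\n", handlers[i].name,
                   (int)handlers[i].poll_ns);
        }
        printf("event loop polls for %d ns before blocking\n", (int)max_ns);
        return 0;
    }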
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>

Both blockdev.c and job-qmp.c have TOC/TOU (time-of-check to
time-of-use) race conditions, because they first search for the job and
then perform an action on it. Therefore, we need to do the search +
action under the same job mutex critical section.

Note: at this stage, job_{lock/unlock} and job lock guard macros
are *nop*.

Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-Id: <20220926093214.506243-9-eesposit@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c | 67 +++++++++++++++++++++++++++++++++++++-----------------
 job-qmp.c  | 57 ++++++++++++++++++++++++++++++++--------------
 2 files changed, 86 insertions(+), 38 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index XXXXXXX..XXXXXXX 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -XXX,XX +XXX,XX @@ out:
     aio_context_release(aio_context);
 }

-/* Get a block job using its ID and acquire its AioContext */
-static BlockJob *find_block_job(const char *id, AioContext **aio_context,
-                                Error **errp)
+/*
+ * Get a block job using its ID and acquire its AioContext.
+ * Called with job_mutex held.
+ */
+static BlockJob *find_block_job_locked(const char *id,
+                                       AioContext **aio_context,
+                                       Error **errp)
 {
     BlockJob *job;

     *aio_context = NULL;

-    job = block_job_get(id);
+    job = block_job_get_locked(id);

     if (!job) {
         error_set(errp, ERROR_CLASS_DEVICE_NOT_ACTIVE,
@@ -XXX,XX +XXX,XX @@ static BlockJob *find_block_job(const char *id, AioContext **aio_context,
 void qmp_block_job_set_speed(const char *device, int64_t speed, Error **errp)
 {
     AioContext *aio_context;
-    BlockJob *job = find_block_job(device, &aio_context, errp);
+    BlockJob *job;
+
+    JOB_LOCK_GUARD();
+    job = find_block_job_locked(device, &aio_context, errp);

     if (!job) {
         return;
     }

-    block_job_set_speed(job, speed, errp);
+    block_job_set_speed_locked(job, speed, errp);
     aio_context_release(aio_context);
 }

@@ -XXX,XX +XXX,XX @@ void qmp_block_job_cancel(const char *device,
                           bool has_force, bool force, Error **errp)
 {
     AioContext *aio_context;
-    BlockJob *job = find_block_job(device, &aio_context, errp);
+    BlockJob *job;
+
+    JOB_LOCK_GUARD();
+    job = find_block_job_locked(device, &aio_context, errp);

     if (!job) {
         return;
@@ -XXX,XX +XXX,XX @@ void qmp_block_job_cancel(const char *device,
         force = false;
     }

-    if (job_user_paused(&job->job) && !force) {
+    if (job_user_paused_locked(&job->job) && !force) {
         error_setg(errp, "The block job for device '%s' is currently paused",
                    device);
         goto out;
     }

     trace_qmp_block_job_cancel(job);
-    job_user_cancel(&job->job, force, errp);
+    job_user_cancel_locked(&job->job, force, errp);
 out:
     aio_context_release(aio_context);
 }
@@ -XXX,XX +XXX,XX @@ out:
 void qmp_block_job_pause(const char *device, Error **errp)
 {
     AioContext *aio_context;
-    BlockJob *job = find_block_job(device, &aio_context, errp);
+    BlockJob *job;
+
+    JOB_LOCK_GUARD();
+    job = find_block_job_locked(device, &aio_context, errp);

     if (!job) {
         return;
     }

     trace_qmp_block_job_pause(job);
-    job_user_pause(&job->job, errp);
+    job_user_pause_locked(&job->job, errp);
     aio_context_release(aio_context);
 }

 void qmp_block_job_resume(const char *device, Error **errp)
 {
     AioContext *aio_context;
-    BlockJob *job = find_block_job(device, &aio_context, errp);
+    BlockJob *job;
+
+    JOB_LOCK_GUARD();
+    job = find_block_job_locked(device, &aio_context, errp);

     if (!job) {
         return;
     }

     trace_qmp_block_job_resume(job);
-    job_user_resume(&job->job, errp);
+    job_user_resume_locked(&job->job, errp);
     aio_context_release(aio_context);
 }

 void qmp_block_job_complete(const char *device, Error **errp)
 {
     AioContext *aio_context;
-    BlockJob *job = find_block_job(device, &aio_context, errp);
+    BlockJob *job;
+
+    JOB_LOCK_GUARD();
+    job = find_block_job_locked(device, &aio_context, errp);

     if (!job) {
         return;
     }

     trace_qmp_block_job_complete(job);
-    job_complete(&job->job, errp);
+    job_complete_locked(&job->job, errp);
     aio_context_release(aio_context);
 }

 void qmp_block_job_finalize(const char *id, Error **errp)
 {
     AioContext *aio_context;
-    BlockJob *job = find_block_job(id, &aio_context, errp);
+    BlockJob *job;
+
+    JOB_LOCK_GUARD();
+    job = find_block_job_locked(id, &aio_context, errp);

     if (!job) {
         return;
     }

     trace_qmp_block_job_finalize(job);
-    job_ref(&job->job);
-    job_finalize(&job->job, errp);
+    job_ref_locked(&job->job);
+    job_finalize_locked(&job->job, errp);

     /*
      * Job's context might have changed via job_finalize (and job_txn_apply
@@ -XXX,XX +XXX,XX @@ void qmp_block_job_finalize(const char *id, Error **errp)
      * one.
      */
     aio_context = block_job_get_aio_context(job);
-    job_unref(&job->job);
+    job_unref_locked(&job->job);
     aio_context_release(aio_context);
 }

 void qmp_block_job_dismiss(const char *id, Error **errp)
 {
     AioContext *aio_context;
-    BlockJob *bjob = find_block_job(id, &aio_context, errp);
+    BlockJob *bjob;
     Job *job;

+    JOB_LOCK_GUARD();
+    bjob = find_block_job_locked(id, &aio_context, errp);
+
     if (!bjob) {
         return;
     }

     trace_qmp_block_job_dismiss(bjob);
     job = &bjob->job;
-    job_dismiss(&job, errp);
+    job_dismiss_locked(&job, errp);
     aio_context_release(aio_context);
 }

diff --git a/job-qmp.c b/job-qmp.c
index XXXXXXX..XXXXXXX 100644
--- a/job-qmp.c
+++ b/job-qmp.c
@@ -XXX,XX +XXX,XX @@
 #include "qapi/error.h"
 #include "trace/trace-root.h"

-/* Get a job using its ID and acquire its AioContext */
-static Job *find_job(const char *id, AioContext **aio_context, Error **errp)
+/*
+ * Get a job using its ID and acquire its AioContext.
+ * Called with job_mutex held.
+ */
+static Job *find_job_locked(const char *id,
+                            AioContext **aio_context,
+                            Error **errp)
 {
     Job *job;

     *aio_context = NULL;

-    job = job_get(id);
+    job = job_get_locked(id);
     if (!job) {
         error_setg(errp, "Job not found");
         return NULL;
@@ -XXX,XX +XXX,XX @@ static Job *find_job(const char *id, AioContext **aio_context, Error **errp)
 void qmp_job_cancel(const char *id, Error **errp)
 {
     AioContext *aio_context;
-    Job *job = find_job(id, &aio_context, errp);
+    Job *job;
+
+    JOB_LOCK_GUARD();
+    job = find_job_locked(id, &aio_context, errp);

     if (!job) {
         return;
     }

     trace_qmp_job_cancel(job);
-    job_user_cancel(job, true, errp);
+    job_user_cancel_locked(job, true, errp);
     aio_context_release(aio_context);
 }

 void qmp_job_pause(const char *id, Error **errp)
 {
     AioContext *aio_context;
-    Job *job = find_job(id, &aio_context, errp);
+    Job *job;
+
+    JOB_LOCK_GUARD();
+    job = find_job_locked(id, &aio_context, errp);

     if (!job) {
         return;
     }

     trace_qmp_job_pause(job);
-    job_user_pause(job, errp);
+    job_user_pause_locked(job, errp);
     aio_context_release(aio_context);
 }

 void qmp_job_resume(const char *id, Error **errp)
 {
     AioContext *aio_context;
-    Job *job = find_job(id, &aio_context, errp);
+    Job *job;
+
+    JOB_LOCK_GUARD();
+    job = find_job_locked(id, &aio_context, errp);

     if (!job) {
         return;
     }

     trace_qmp_job_resume(job);
-    job_user_resume(job, errp);
+    job_user_resume_locked(job, errp);
     aio_context_release(aio_context);
 }

 void qmp_job_complete(const char *id, Error **errp)
 {
     AioContext *aio_context;
-    Job *job = find_job(id, &aio_context, errp);
+    Job *job;
+
+    JOB_LOCK_GUARD();
+    job = find_job_locked(id, &aio_context, errp);

     if (!job) {
         return;
     }

     trace_qmp_job_complete(job);
-    job_complete(job, errp);
+    job_complete_locked(job, errp);
     aio_context_release(aio_context);
 }

 void qmp_job_finalize(const char *id, Error **errp)
 {
     AioContext *aio_context;
-    Job *job = find_job(id, &aio_context, errp);
+    Job *job;
+
+    JOB_LOCK_GUARD();
+    job = find_job_locked(id, &aio_context, errp);

     if (!job) {
         return;
     }

     trace_qmp_job_finalize(job);
-    job_ref(job);
-    job_finalize(job, errp);
+    job_ref_locked(job);
+    job_finalize_locked(job, errp);

     /*
      * Job's context might have changed via job_finalize (and job_txn_apply
@@ -XXX,XX +XXX,XX @@ void qmp_job_finalize(const char *id, Error **errp)
      * one.
      */
     aio_context = job->aio_context;
-    job_unref(job);
+    job_unref_locked(job);
     aio_context_release(aio_context);
 }

 void qmp_job_dismiss(const char *id, Error **errp)
 {
     AioContext *aio_context;
-    Job *job = find_job(id, &aio_context, errp);
+    Job *job;
+
+    JOB_LOCK_GUARD();
+    job = find_job_locked(id, &aio_context, errp);

     if (!job) {
         return;
     }

     trace_qmp_job_dismiss(job);
-    job_dismiss(&job, errp);
+    job_dismiss_locked(&job, errp);
     aio_context_release(aio_context);
 }

--
2.37.3

aio_dispatch_handler() adds handlers to ctx->poll_aio_handlers if
polling should be enabled. If we call adjust_polling_time() for all
polling handlers before this, new polling handlers are still left at
poll->ns = 0 and polling is only actually enabled after the next event.
Move the adjust_polling_time() call after aio_dispatch_handler().

This fixes test-nested-aio-poll, which expects that polling becomes
effective the first time around.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20250311141912.135657-1-kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 util/aio-posix.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@
 /* Stop userspace polling on a handler if it isn't active for some time */
 #define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)

+static void adjust_polling_time(AioContext *ctx, AioPolledEvent *poll,
+                                int64_t block_ns);
+
 bool aio_poll_disabled(AioContext *ctx)
 {
     return qatomic_read(&ctx->poll_disable_cnt);
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
  * scanning all handlers with aio_dispatch_handlers().
  */
 static bool aio_dispatch_ready_handlers(AioContext *ctx,
-                                        AioHandlerList *ready_list)
+                                        AioHandlerList *ready_list,
+                                        int64_t block_ns)
 {
     bool progress = false;
     AioHandler *node;
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_ready_handlers(AioContext *ctx,
     while ((node = QLIST_FIRST(ready_list))) {
         QLIST_REMOVE(node, node_ready);
         progress = aio_dispatch_handler(ctx, node) || progress;
+
+        /*
+         * Adjust polling time only after aio_dispatch_handler(), which can
+         * add the handler to ctx->poll_aio_handlers.
+         */
+        if (ctx->poll_max_ns && QLIST_IS_INSERTED(node, node_poll)) {
+            adjust_polling_time(ctx, &node->poll, block_ns);
+        }
     }

     return progress;
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
     bool use_notify_me;
     int64_t timeout;
     int64_t start = 0;
+    int64_t block_ns = 0;

     /*
      * There cannot be two concurrent aio_poll calls for the same AioContext (or
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)

     aio_notify_accept(ctx);

-    /* Adjust polling time */
+    /* Calculate blocked time for adaptive polling */
     if (ctx->poll_max_ns) {
-        AioHandler *node;
-        int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
-
-        QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
-            if (QLIST_IS_INSERTED(node, node_ready)) {
-                adjust_polling_time(ctx, &node->poll, block_ns);
-            }
-        }
+        block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
     }

     progress |= aio_bh_poll(ctx);
-    progress |= aio_dispatch_ready_handlers(ctx, &ready_list);
+    progress |= aio_dispatch_ready_handlers(ctx, &ready_list, block_ns);

     aio_free_deleted_handlers(ctx);
--
2.48.1
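
Returning to the TOC/TOU fix in the blockdev.c/job-qmp.c patch above:
the shape of that race can be sketched generically. The following is an
illustration with a plain pthread mutex and a toy job table, not QEMU's
job_mutex or job API.

    #include <pthread.h>
    #include <stdio.h>
    #include <string.h>

    static pthread_mutex_t job_mutex = PTHREAD_MUTEX_INITIALIZER;

    typedef struct { const char *id; int cancelled; } Job;
    static Job jobs[] = { { "commit0", 0 } };

    /* Lookup that assumes job_mutex is already held. */
    static Job *job_get_locked(const char *id)
    {
        for (size_t i = 0; i < sizeof(jobs) / sizeof(jobs[0]); i++) {
            if (strcmp(jobs[i].id, id) == 0) {
                return &jobs[i];
            }
        }
        return NULL;
    }

    static void job_cancel_locked(Job *job) { job->cancelled = 1; }

    /* Racy shape: between unlock and re-lock, another thread may dismiss
     * the job, leaving a stale pointer (time-of-check to time-of-use). */
    static void cancel_racy(const char *id)
    {
        pthread_mutex_lock(&job_mutex);
        Job *job = job_get_locked(id);
        pthread_mutex_unlock(&job_mutex); /* <-- window for the race */
        if (job) {
            pthread_mutex_lock(&job_mutex);
            job_cancel_locked(job);
            pthread_mutex_unlock(&job_mutex);
        }
    }

    /* Fixed shape: search + action under one critical section, which is
     * what the find_*_locked() callers do after the patch. */
    static void cancel_safe(const char *id)
    {
        pthread_mutex_lock(&job_mutex);
        Job *job = job_get_locked(id);
        if (job) {
            job_cancel_locked(job);
        }
        pthread_mutex_unlock(&job_mutex);
    }

    int main(void)
    {
        cancel_racy("commit0");
        cancel_safe("commit0");
        printf("cancelled=%d\n", jobs[0].cancelled);
        return 0;
    }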
From: Paolo Bonzini <pbonzini@redhat.com>

Callers of coroutine_fn must be coroutine_fn themselves, or the call
must be within "if (qemu_in_coroutine())". Apply coroutine_fn to
functions where this holds.

Reviewed-by: Alberto Faria <afaria@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20220922084924.201610-8-pbonzini@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/blkverify.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blkverify.c b/block/blkverify.c
index XXXXXXX..XXXXXXX 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -XXX,XX +XXX,XX @@ blkverify_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
     return blkverify_co_prwv(bs, &r, offset, bytes, qiov, qiov, flags, true);
 }

-static int blkverify_co_flush(BlockDriverState *bs)
+static int coroutine_fn blkverify_co_flush(BlockDriverState *bs)
 {
     BDRVBlkverifyState *s = bs->opaque;

--
2.37.3

From: Thomas Huth <thuth@redhat.com>

qsd-migrate currently only works for raw, qcow2 and qed.
Other formats are failing, e.g. because they don't support migration.
Thus let's limit this test to the three usable formats now.

Suggested-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-ID: <20250224214058.205889-1-thuth@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/tests/qsd-migrate | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/tests/qsd-migrate b/tests/qemu-iotests/tests/qsd-migrate
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/tests/qsd-migrate
+++ b/tests/qemu-iotests/tests/qsd-migrate
@@ -XXX,XX +XXX,XX @@ import iotests

 from iotests import filter_qemu_io, filter_qtest

-iotests.script_initialize(supported_fmts=['generic'],
+iotests.script_initialize(supported_fmts=['qcow2', 'qed', 'raw'],
                           supported_protocols=['file'],
                           supported_platforms=['linux'])

--
2.48.1
From: Marc-André Lureau <marcandre.lureau@redhat.com>

Callers of coroutine_fn must be coroutine_fn themselves, or the call
must be within "if (qemu_in_coroutine())". Apply coroutine_fn to
functions where this holds.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Acked-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Alberto Faria <afaria@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20220922084924.201610-25-pbonzini@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/9pfs/9p.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -XXX,XX +XXX,XX @@ typedef struct V9fsGetlock
 extern int open_fd_hw;
 extern int total_open_fd;

-static inline void v9fs_path_write_lock(V9fsState *s)
+static inline void coroutine_fn
+v9fs_path_write_lock(V9fsState *s)
 {
     if (s->ctx.export_flags & V9FS_PATHNAME_FSCONTEXT) {
         qemu_co_rwlock_wrlock(&s->rename_lock);
     }
 }

-static inline void v9fs_path_read_lock(V9fsState *s)
+static inline void coroutine_fn
+v9fs_path_read_lock(V9fsState *s)
 {
     if (s->ctx.export_flags & V9FS_PATHNAME_FSCONTEXT) {
         qemu_co_rwlock_rdlock(&s->rename_lock);
     }
 }

-static inline void v9fs_path_unlock(V9fsState *s)
+static inline void coroutine_fn
+v9fs_path_unlock(V9fsState *s)
 {
     if (s->ctx.export_flags & V9FS_PATHNAME_FSCONTEXT) {
         qemu_co_rwlock_unlock(&s->rename_lock);
--
2.37.3

From: Stefan Hajnoczi <stefanha@redhat.com>

Commit 71544d30a6f8 ("scsi: push request restart to SCSIDevice") removed
the only user of SCSIDiskState->bh.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20250311132616.1049687-2-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/scsi/scsi-disk.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -XXX,XX +XXX,XX @@ struct SCSIDiskState {
     uint64_t max_unmap_size;
     uint64_t max_io_size;
     uint32_t quirks;
-    QEMUBH *bh;
     char *version;
     char *serial;
     char *vendor;
--
2.48.1
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>

Some callback implementations use bdrv_* APIs that assume the
AioContext lock is held. Make sure this invariant is documented.

Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
Message-Id: <20220926093214.506243-18-eesposit@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/qemu/job.h | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/include/qemu/job.h b/include/qemu/job.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -XXX,XX +XXX,XX @@ typedef struct Job {
     /** True if this job should automatically dismiss itself */
     bool auto_dismiss;

-    /** The completion function that will be called when the job completes. */
+    /**
+     * The completion function that will be called when the job completes.
+     * Called with AioContext lock held, since many callback implementations
+     * use bdrv_* functions that require the lock to be held.
+     */
     BlockCompletionFunc *cb;

     /** The opaque value that is passed to the completion function. */
@@ -XXX,XX +XXX,XX @@ struct JobDriver {
     *
     * This callback will not be invoked if the job has already failed.
     * If it fails, abort and then clean will be called.
+     *
+     * Called with AioContext lock held, since many callback implementations
+     * use bdrv_* functions that require the lock to be held.
     */
    int (*prepare)(Job *job);

@@ -XXX,XX +XXX,XX @@ struct JobDriver {
     *
     * All jobs will complete with a call to either .commit() or .abort() but
     * never both.
+     *
+     * Called with AioContext lock held, since many callback implementations
+     * use bdrv_* functions that require the lock to be held.
     */
    void (*commit)(Job *job);

@@ -XXX,XX +XXX,XX @@ struct JobDriver {
     *
     * All jobs will complete with a call to either .commit() or .abort() but
     * never both.
+     *
+     * Called with AioContext lock held, since many callback implementations
+     * use bdrv_* functions that require the lock to be held.
     */
    void (*abort)(Job *job);

@@ -XXX,XX +XXX,XX @@ struct JobDriver {
     * .commit() or .abort(). Regardless of which callback is invoked after
     * completion, .clean() will always be called, even if the job does not
     * belong to a transaction group.
+     *
+     * Called with AioContext lock held, since many callback implementations
+     * use bdrv_* functions that require the lock to be held.
     */
    void (*clean)(Job *job);

@@ -XXX,XX +XXX,XX @@ struct JobDriver {
     * READY).
     * (If the callback is NULL, the job is assumed to terminate
     * without I/O.)
+     *
+     * Called with AioContext lock held, since many callback implementations
+     * use bdrv_* functions that require the lock to be held.
     */
    bool (*cancel)(Job *job, bool force);


-    /** Called when the job is freed */
+    /**
+     * Called when the job is freed.
+     * Called with AioContext lock held, since many callback implementations
+     * use bdrv_* functions that require the lock to be held.
+     */
    void (*free)(Job *job);
 };

--
2.37.3

From: Stefan Hajnoczi <stefanha@redhat.com>

In the past a single AioContext was used for block I/O and it was
fetched using blk_get_aio_context(). Nowadays the block layer supports
running I/O from any AioContext and multiple AioContexts at the same
time. Remove the dma_blk_io() AioContext argument and use the current
AioContext instead.

This makes calling the function easier and enables multiple IOThreads to
use dma_blk_io() concurrently for the same block device.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20250311132616.1049687-3-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/system/dma.h |  3 +--
 hw/ide/core.c        |  3 +--
 hw/ide/macio.c       |  3 +--
 hw/scsi/scsi-disk.c  |  6 ++----
 system/dma-helpers.c |  8 ++++----
 5 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/include/system/dma.h b/include/system/dma.h
index XXXXXXX..XXXXXXX 100644
--- a/include/system/dma.h
+++ b/include/system/dma.h
@@ -XXX,XX +XXX,XX @@ typedef BlockAIOCB *DMAIOFunc(int64_t offset, QEMUIOVector *iov,
                               BlockCompletionFunc *cb, void *cb_opaque,
                               void *opaque);

-BlockAIOCB *dma_blk_io(AioContext *ctx,
-                       QEMUSGList *sg, uint64_t offset, uint32_t align,
+BlockAIOCB *dma_blk_io(QEMUSGList *sg, uint64_t offset, uint32_t align,
                        DMAIOFunc *io_func, void *io_func_opaque,
                        BlockCompletionFunc *cb, void *opaque, DMADirection dir);
 BlockAIOCB *dma_blk_read(BlockBackend *blk,
diff --git a/hw/ide/core.c b/hw/ide/core.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -XXX,XX +XXX,XX @@ static void ide_dma_cb(void *opaque, int ret)
                                            BDRV_SECTOR_SIZE, ide_dma_cb, s);
         break;
     case IDE_DMA_TRIM:
-        s->bus->dma->aiocb = dma_blk_io(blk_get_aio_context(s->blk),
-                                        &s->sg, offset, BDRV_SECTOR_SIZE,
+        s->bus->dma->aiocb = dma_blk_io(&s->sg, offset, BDRV_SECTOR_SIZE,
                                         ide_issue_trim, s, ide_dma_cb, s,
                                         DMA_DIRECTION_TO_DEVICE);
         break;
diff --git a/hw/ide/macio.c b/hw/ide/macio.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/ide/macio.c
+++ b/hw/ide/macio.c
@@ -XXX,XX +XXX,XX @@ static void pmac_ide_transfer_cb(void *opaque, int ret)
                                         pmac_ide_transfer_cb, io);
         break;
     case IDE_DMA_TRIM:
-        s->bus->dma->aiocb = dma_blk_io(blk_get_aio_context(s->blk), &s->sg,
-                                        offset, 0x1, ide_issue_trim, s,
+        s->bus->dma->aiocb = dma_blk_io(&s->sg, offset, 0x1, ide_issue_trim, s,
                                         pmac_ide_transfer_cb, io,
                                         DMA_DIRECTION_TO_DEVICE);
         break;
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -XXX,XX +XXX,XX @@ static void scsi_do_read(SCSIDiskReq *r, int ret)
     if (r->req.sg) {
         dma_acct_start(s->qdev.conf.blk, &r->acct, r->req.sg, BLOCK_ACCT_READ);
         r->req.residual -= r->req.sg->size;
-        r->req.aiocb = dma_blk_io(blk_get_aio_context(s->qdev.conf.blk),
-                                  r->req.sg, r->sector << BDRV_SECTOR_BITS,
+        r->req.aiocb = dma_blk_io(r->req.sg, r->sector << BDRV_SECTOR_BITS,
                                   BDRV_SECTOR_SIZE,
                                   sdc->dma_readv, r, scsi_dma_complete, r,
                                   DMA_DIRECTION_FROM_DEVICE);
@@ -XXX,XX +XXX,XX @@ static void scsi_write_data(SCSIRequest *req)
     if (r->req.sg) {
         dma_acct_start(s->qdev.conf.blk, &r->acct, r->req.sg, BLOCK_ACCT_WRITE);
         r->req.residual -= r->req.sg->size;
-        r->req.aiocb = dma_blk_io(blk_get_aio_context(s->qdev.conf.blk),
-                                  r->req.sg, r->sector << BDRV_SECTOR_BITS,
+        r->req.aiocb = dma_blk_io(r->req.sg, r->sector << BDRV_SECTOR_BITS,
                                   BDRV_SECTOR_SIZE,
                                   sdc->dma_writev, r, scsi_dma_complete, r,
                                   DMA_DIRECTION_TO_DEVICE);
diff --git a/system/dma-helpers.c b/system/dma-helpers.c
index XXXXXXX..XXXXXXX 100644
--- a/system/dma-helpers.c
+++ b/system/dma-helpers.c
@@ -XXX,XX +XXX,XX @@ static const AIOCBInfo dma_aiocb_info = {
     .cancel_async       = dma_aio_cancel,
 };

-BlockAIOCB *dma_blk_io(AioContext *ctx,
+BlockAIOCB *dma_blk_io(
     QEMUSGList *sg, uint64_t offset, uint32_t align,
     DMAIOFunc *io_func, void *io_func_opaque,
     BlockCompletionFunc *cb,
@@ -XXX,XX +XXX,XX @@ BlockAIOCB *dma_blk_io(AioContext *ctx,

     dbs->acb = NULL;
     dbs->sg = sg;
-    dbs->ctx = ctx;
+    dbs->ctx = qemu_get_current_aio_context();
     dbs->offset = offset;
     dbs->align = align;
     dbs->sg_cur_index = 0;
@@ -XXX,XX +XXX,XX @@ BlockAIOCB *dma_blk_read(BlockBackend *blk,
                          QEMUSGList *sg, uint64_t offset, uint32_t align,
                          void (*cb)(void *opaque, int ret), void *opaque)
 {
-    return dma_blk_io(blk_get_aio_context(blk), sg, offset, align,
+    return dma_blk_io(sg, offset, align,
                       dma_blk_read_io_func, blk, cb, opaque,
                       DMA_DIRECTION_FROM_DEVICE);
 }
@@ -XXX,XX +XXX,XX @@ BlockAIOCB *dma_blk_write(BlockBackend *blk,
                           QEMUSGList *sg, uint64_t offset, uint32_t align,
                           void (*cb)(void *opaque, int ret), void *opaque)
 {
-    return dma_blk_io(blk_get_aio_context(blk), sg, offset, align,
+    return dma_blk_io(sg, offset, align,
                       dma_blk_write_io_func, blk, cb, opaque,
                       DMA_DIRECTION_TO_DEVICE);
 }
--
2.48.1
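
The dma_blk_io() signature change above follows a common pattern: let
the helper capture the submitting thread's context instead of threading
it through every caller. A standalone sketch of that pattern follows;
the thread-local variable stands in for qemu_get_current_aio_context()
and none of this is QEMU code.

    #include <stdio.h>

    typedef struct AioContext { const char *name; } AioContext;

    /* Each thread (IOThread or main loop) sees its own context. */
    static _Thread_local AioContext *current_ctx;

    /* Old shape: the caller fetches and passes a context explicitly. */
    static void submit_old(AioContext *ctx)
    {
        printf("completion will run in %s\n", ctx->name);
    }

    /* New shape: the helper captures the caller's own context, so any
     * thread can submit without knowing a device's "home" context. */
    static void submit_new(void)
    {
        printf("completion will run in %s\n", current_ctx->name);
    }

    int main(void)
    {
        AioContext iothread0 = { "iothread0" };
        current_ctx = &iothread0;

        submit_old(&iothread0); /* explicit, pre-patch */
        submit_new();           /* implicit, post-patch */
        return 0;
    }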
From: Stefan Hajnoczi <stefanha@redhat.com>

Until now, a SCSIDevice's I/O requests have run in a single AioContext.
In order to support multiple IOThreads it will be necessary to move to
the concept of a per-SCSIRequest AioContext.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20250311132616.1049687-4-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/hw/scsi/scsi.h |  1 +
 hw/scsi/scsi-bus.c     |  1 +
 hw/scsi/scsi-disk.c    | 17 ++++++-----------
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/scsi/scsi.h
+++ b/include/hw/scsi/scsi.h
@@ -XXX,XX +XXX,XX @@ struct SCSIRequest {
     SCSIBus *bus;
     SCSIDevice *dev;
     const SCSIReqOps *ops;
+    AioContext *ctx;
     uint32_t refcount;
     uint32_t tag;
     uint32_t lun;
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c

From: Emanuele Giuseppe Esposito <eesposit@redhat.com>

By "intact" we mean that all job.h functions implicitly
take the lock. Therefore API callers are unmodified.

This means that:
- many static functions that will always be called with the job lock held
  become _locked, and call _locked functions
- all public functions take the lock internally if needed, and call _locked
  functions
- all public functions called internally by other functions in job.c will have a
  _locked counterpart (sometimes public), to avoid deadlocks (job lock already taken).
  These functions are not used for now.
- some public functions called only from external files (not job.c) do not
  have a _locked() counterpart and take the lock inside. Others won't need
  the lock at all because they use fields only set at initialization and
  never modified.

job_{lock/unlock} is independent from real_job_{lock/unlock}.

Note: at this stage, job_{lock/unlock} and job lock guard macros
are *nop*.

Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
Message-Id: <20220926093214.506243-6-eesposit@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/qemu/job.h | 138 +++++++++-
 job.c              | 610 ++++++++++++++++++++++++++++++++-------------
 2 files changed, 561 insertions(+), 187 deletions(-)

diff --git a/include/qemu/job.h b/include/qemu/job.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -XXX,XX +XXX,XX @@ JobTxn *job_txn_new(void);
  */
 void job_txn_unref(JobTxn *txn);

+/*
+ * Same as job_txn_unref(), but called with job lock held.
+ * Might release the lock temporarily.
+ */
+void job_txn_unref_locked(JobTxn *txn);
+
 /**
  * Create a new long-running job and return it.
+ * Called with job_mutex *not* held.
  *
  * @job_id: The id of the newly-created job, or %NULL for internal jobs
  * @driver: The class object for the newly-created job.
@@ -XXX,XX +XXX,XX @@ void *job_create(const char *job_id, const JobDriver *driver, JobTxn *txn,
  */
 void job_ref(Job *job);

+/* Same as job_ref(), but called with job lock held. */
+void job_ref_locked(Job *job);
+
 /**
  * Release a reference that was previously acquired with job_ref() or
  * job_create(). If it's the last reference to the object, it will be freed.
  */
 void job_unref(Job *job);

+/* Same as job_unref(), but called with job lock held. */
+void job_unref_locked(Job *job);
+
 /**
  * @job: The job that has made progress
  * @done: How much progress the job made since the last call
  *
  * Updates the progress counter of the job.
+ *
+ * May be called with mutex held or not held.
  */
 void job_progress_update(Job *job, uint64_t done);

@@ -XXX,XX +XXX,XX @@ void job_progress_update(Job *job, uint64_t done);
  *
  * Sets the expected end value of the progress counter of a job so that a
  * completion percentage can be calculated when the progress is updated.
+ *
+ * May be called with mutex held or not held.
  */
 void job_progress_set_remaining(Job *job, uint64_t remaining);

@@ -XXX,XX +XXX,XX @@ void job_progress_set_remaining(Job *job, uint64_t remaining);
  * length before, and job_progress_update() afterwards.
  * (So the operation acts as a parenthesis in regards to the main job
  * operation running in background.)
+ *
+ * May be called with mutex held or not held.
  */
 void job_progress_increase_remaining(Job *job, uint64_t delta);

@@ -XXX,XX +XXX,XX @@ void job_progress_increase_remaining(Job *job, uint64_t delta);
  */
 void job_enter_cond(Job *job, bool(*fn)(Job *job));

+/*
+ * Same as job_enter_cond(), but called with job lock held.
+ * Might release the lock temporarily.
+ */
+void job_enter_cond_locked(Job *job, bool(*fn)(Job *job));
+
 /**
  * @job: A job that has not yet been started.
  *
  * Begins execution of a job.
  * Takes ownership of one reference to the job object.
+ *
+ * Called with job_mutex *not* held.
  */
 void job_start(Job *job);

@@ -XXX,XX +XXX,XX @@ void job_start(Job *job);
  * @job: The job to enter.
  *
  * Continue the specified job by entering the coroutine.
+ * Called with job_mutex *not* held.
  */
 void job_enter(Job *job);

@@ -XXX,XX +XXX,XX @@ void job_enter(Job *job);
  *
  * Pause now if job_pause() has been called. Jobs that perform lots of I/O
  * must call this between requests so that the job can be paused.
+ *
+ * Called with job_mutex *not* held.
  */
 void coroutine_fn job_pause_point(Job *job);

@@ -XXX,XX +XXX,XX @@ void coroutine_fn job_pause_point(Job *job);
  * @job: The job that calls the function.
  *
  * Yield the job coroutine.
+ * Called with job_mutex *not* held.
  */
 void coroutine_fn job_yield(Job *job);

@@ -XXX,XX +XXX,XX @@ void coroutine_fn job_yield(Job *job);
  * Put the job to sleep (assuming that it wasn't canceled) for @ns
  * %QEMU_CLOCK_REALTIME nanoseconds. Canceling the job will immediately
  * interrupt the wait.
+ *
+ * Called with job_mutex *not* held.
  */
 void coroutine_fn job_sleep_ns(Job *job, int64_t ns);

@@ -XXX,XX +XXX,XX @@ const char *job_type_str(const Job *job);
 /** Returns true if the job should not be visible to the management layer. */
 bool job_is_internal(Job *job);

-/** Returns whether the job is being cancelled. */
+/**
+ * Returns whether the job is being cancelled.
+ * Called with job_mutex *not* held.
+ */
 bool job_is_cancelled(Job *job);

+/* Same as job_is_cancelled(), but called with job lock held. */
+bool job_is_cancelled_locked(Job *job);
+
 /**
  * Returns whether the job is scheduled for cancellation (at an
  * indefinite point).
+ * Called with job_mutex *not* held.
  */
 bool job_cancel_requested(Job *job);

-/** Returns whether the job is in a completed state. */
+/**
+ * Returns whether the job is in a completed state.
+ * Called with job_mutex *not* held.
+ */
 bool job_is_completed(Job *job);

-/** Returns whether the job is ready to be completed. */
+/* Same as job_is_completed(), but called with job lock held. */
+bool job_is_completed_locked(Job *job);
+
+/**
+ * Returns whether the job is ready to be completed.
+ * Called with job_mutex *not* held.
+ */
 bool job_is_ready(Job *job);

+/* Same as job_is_ready(), but called with job lock held. */
+bool job_is_ready_locked(Job *job);
+
 /**
  * Request @job to pause at the next pause point. Must be paired with
  * job_resume(). If the job is supposed to be resumed by user action, call
@@ -XXX,XX +XXX,XX @@ bool job_is_ready(Job *job);
  */
 void job_pause(Job *job);

+/* Same as job_pause(), but called with job lock held. */
+void job_pause_locked(Job *job);
+
 /** Resumes a @job paused with job_pause. */
 void job_resume(Job *job);

+/*
+ * Same as job_resume(), but called with job lock held.
+ * Might release the lock temporarily.
+ */
+void job_resume_locked(Job *job);
+
 /**
  * Asynchronously pause the specified @job.
  * Do not allow a resume until a matching call to job_user_resume.
  */
 void job_user_pause(Job *job, Error **errp);

+/* Same as job_user_pause(), but called with job lock held. */
+void job_user_pause_locked(Job *job, Error **errp);
+
 /** Returns true if the job is user-paused. */
 bool job_user_paused(Job *job);

+/* Same as job_user_paused(), but called with job lock held. */
+bool job_user_paused_locked(Job *job);
+
 /**
  * Resume the specified @job.
  * Must be paired with a preceding job_user_pause.
  */
 void job_user_resume(Job *job, Error **errp);

+/*
+ * Same as job_user_resume(), but called with job lock held.
+ * Might release the lock temporarily.
+ */
+void job_user_resume_locked(Job *job, Error **errp);
+
 /**
  * Get the next element from the list of block jobs after @job, or the
  * first one if @job is %NULL.
@@ -XXX,XX +XXX,XX @@ void job_user_resume(Job *job, Error **errp);
  */
 Job *job_next(Job *job);

+/* Same as job_next(), but called with job lock held. */
+Job *job_next_locked(Job *job);
+
 /**
  * Get the job identified by @id (which must not be %NULL).
  *
@@ -XXX,XX +XXX,XX @@ Job *job_next(Job *job);
  */
 Job *job_get(const char *id);

+/* Same as job_get(), but called with job lock held. */
+Job *job_get_locked(const char *id);
+
 /**
  * Check whether the verb @verb can be applied to @job in its current state.
  * Returns 0 if the verb can be applied; otherwise errp is set and -EPERM
@@ -XXX,XX +XXX,XX @@ Job *job_get(const char *id);
  */
 int job_apply_verb(Job *job, JobVerb verb, Error **errp);

-/** The @job could not be started, free it. */
+/* Same as job_apply_verb, but called with job lock held. */
+int job_apply_verb_locked(Job *job, JobVerb verb, Error **errp);
+
+/**
+ * The @job could not be started, free it.
+ * Called with job_mutex *not* held.
+ */
 void job_early_fail(Job *job);

-/** Moves the @job from RUNNING to READY */
+/**
+ * Moves the @job from RUNNING to READY.
+ * Called with job_mutex *not* held.
+ */
 void job_transition_to_ready(Job *job);

 /** Asynchronously complete the specified @job. */
 void job_complete(Job *job, Error **errp);

+/*
+ * Same as job_complete(), but called with job lock held.
+ * Might release the lock temporarily.
+ */
+void job_complete_locked(Job *job, Error **errp);
+
 /**
  * Asynchronously cancel the specified @job. If @force is true, the job should
  * be cancelled immediately without waiting for a consistent state.
  */
 void job_cancel(Job *job, bool force);

+/* Same as job_cancel(), but called with job lock held. */
+void job_cancel_locked(Job *job, bool force);
+
 /**
  * Cancels the specified job like job_cancel(), but may refuse to do so if the
  * operation isn't meaningful in the current state of the job.
  */
 void job_user_cancel(Job *job, bool force, Error **errp);

+/* Same as job_user_cancel(), but called with job lock held. */
+void job_user_cancel_locked(Job *job, bool force, Error **errp);
+
 /**
  * Synchronously cancel the @job. The completion callback is called
  * before the function returns. If @force is false, the job may
@@ -XXX,XX +XXX,XX @@ void job_user_cancel(Job *job, bool force, Error **errp);
  */
 int job_cancel_sync(Job *job, bool force);

-/** Synchronously force-cancels all jobs using job_cancel_sync(). */
+/* Same as job_cancel_sync, but called with job lock held. */
+int job_cancel_sync_locked(Job *job, bool force);
321
+
322
+/**
323
+ * Synchronously force-cancels all jobs using job_cancel_sync_locked().
324
+ *
325
+ * Called with job_lock *not* held.
326
+ */
327
void job_cancel_sync_all(void);
328
329
/**
330
@@ -XXX,XX +XXX,XX @@ void job_cancel_sync_all(void);
331
*/
332
int job_complete_sync(Job *job, Error **errp);
333
334
+/* Same as job_complete_sync, but called with job lock held. */
335
+int job_complete_sync_locked(Job *job, Error **errp);
336
+
337
/**
338
* For a @job that has finished its work and is pending awaiting explicit
339
* acknowledgement to commit its work, this will commit that work.
340
@@ -XXX,XX +XXX,XX @@ int job_complete_sync(Job *job, Error **errp);
341
*/
342
void job_finalize(Job *job, Error **errp);
343
344
+/* Same as job_finalize(), but called with job lock held. */
345
+void job_finalize_locked(Job *job, Error **errp);
346
+
347
/**
348
* Remove the concluded @job from the query list and resets the passed pointer
349
* to %NULL. Returns an error if the job is not actually concluded.
350
*/
351
void job_dismiss(Job **job, Error **errp);
352
353
+/* Same as job_dismiss(), but called with job lock held. */
354
+void job_dismiss_locked(Job **job, Error **errp);
355
+
356
/**
357
* Synchronously finishes the given @job. If @finish is given, it is called to
358
* trigger completion or cancellation of the job.
359
@@ -XXX,XX +XXX,XX @@ void job_dismiss(Job **job, Error **errp);
360
*
361
* Callers must hold the AioContext lock of job->aio_context.
362
*/
363
-int job_finish_sync(Job *job, void (*finish)(Job *, Error **errp), Error **errp);
364
+int job_finish_sync(Job *job, void (*finish)(Job *, Error **errp),
365
+ Error **errp);
366
+
367
+/*
368
+ * Same as job_finish_sync(), but called with job lock held.
369
+ * Might release the lock temporarily.
370
+ */
371
+int job_finish_sync_locked(Job *job, void (*finish)(Job *, Error **errp),
372
+ Error **errp);
373
374
#endif
375
diff --git a/job.c b/job.c
index XXXXXXX..XXXXXXX 100644
--- a/job.c
+++ b/job.c
@@ -XXX,XX +XXX,XX @@
380
*/
381
QemuMutex job_mutex;
382
383
+/* Protected by job_mutex */
384
static QLIST_HEAD(, Job) jobs = QLIST_HEAD_INITIALIZER(jobs);
385
386
/* Job State Transition Table */
387
@@ -XXX,XX +XXX,XX @@ JobTxn *job_txn_new(void)
388
return txn;
389
}
390
391
-static void job_txn_ref(JobTxn *txn)
392
+/* Called with job_mutex held. */
393
+static void job_txn_ref_locked(JobTxn *txn)
394
{
395
txn->refcnt++;
396
}
397
398
-void job_txn_unref(JobTxn *txn)
399
+void job_txn_unref_locked(JobTxn *txn)
400
{
401
if (txn && --txn->refcnt == 0) {
402
g_free(txn);
403
}
404
}
405
406
+void job_txn_unref(JobTxn *txn)
407
+{
408
+ JOB_LOCK_GUARD();
409
+ job_txn_unref_locked(txn);
410
+}
411
+
412
/**
413
* @txn: The transaction (may be NULL)
414
* @job: Job to add to the transaction
415
@@ -XXX,XX +XXX,XX @@ void job_txn_unref(JobTxn *txn)
416
* the reference that is automatically grabbed here.
417
*
418
* If @txn is NULL, the function does nothing.
419
+ *
420
+ * Called with job_mutex held.
421
*/
422
-static void job_txn_add_job(JobTxn *txn, Job *job)
423
+static void job_txn_add_job_locked(JobTxn *txn, Job *job)
424
{
425
if (!txn) {
426
return;
427
@@ -XXX,XX +XXX,XX @@ static void job_txn_add_job(JobTxn *txn, Job *job)
428
job->txn = txn;
429
430
QLIST_INSERT_HEAD(&txn->jobs, job, txn_list);
431
- job_txn_ref(txn);
432
+ job_txn_ref_locked(txn);
433
}
434
435
-static void job_txn_del_job(Job *job)
436
+/* Called with job_mutex held. */
437
+static void job_txn_del_job_locked(Job *job)
438
{
439
if (job->txn) {
440
QLIST_REMOVE(job, txn_list);
441
- job_txn_unref(job->txn);
442
+ job_txn_unref_locked(job->txn);
443
job->txn = NULL;
444
}
445
}
446
447
-static int job_txn_apply(Job *job, int fn(Job *))
448
+/* Called with job_mutex held, but releases it temporarily. */
449
+static int job_txn_apply_locked(Job *job, int fn(Job *))
450
{
451
AioContext *inner_ctx;
452
Job *other_job, *next;
453
@@ -XXX,XX +XXX,XX @@ static int job_txn_apply(Job *job, int fn(Job *))
454
* we need to release it here to avoid holding the lock twice - which would
455
* break AIO_WAIT_WHILE from within fn.
456
*/
457
- job_ref(job);
458
+ job_ref_locked(job);
459
aio_context_release(job->aio_context);
460
461
QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) {
462
@@ -XXX,XX +XXX,XX @@ static int job_txn_apply(Job *job, int fn(Job *))
463
* can't use a local variable to cache it.
464
*/
465
aio_context_acquire(job->aio_context);
466
- job_unref(job);
467
+ job_unref_locked(job);
468
return rc;
469
}
470
471
@@ -XXX,XX +XXX,XX @@ bool job_is_internal(Job *job)
472
return (job->id == NULL);
473
}
474
475
-static void job_state_transition(Job *job, JobStatus s1)
476
+/* Called with job_mutex held. */
477
+static void job_state_transition_locked(Job *job, JobStatus s1)
478
{
479
JobStatus s0 = job->status;
480
assert(s1 >= 0 && s1 < JOB_STATUS__MAX);
481
@@ -XXX,XX +XXX,XX @@ static void job_state_transition(Job *job, JobStatus s1)
482
}
483
}
484
485
-int job_apply_verb(Job *job, JobVerb verb, Error **errp)
486
+int job_apply_verb_locked(Job *job, JobVerb verb, Error **errp)
487
{
488
JobStatus s0 = job->status;
489
assert(verb >= 0 && verb < JOB_VERB__MAX);
490
@@ -XXX,XX +XXX,XX @@ int job_apply_verb(Job *job, JobVerb verb, Error **errp)
491
return -EPERM;
492
}
493
494
+int job_apply_verb(Job *job, JobVerb verb, Error **errp)
495
+{
496
+ JOB_LOCK_GUARD();
497
+ return job_apply_verb_locked(job, verb, errp);
498
+}
499
+
500
JobType job_type(const Job *job)
501
{
502
return job->driver->job_type;
503
@@ -XXX,XX +XXX,XX @@ const char *job_type_str(const Job *job)
504
return JobType_str(job_type(job));
505
}
506
507
-bool job_is_cancelled(Job *job)
508
+bool job_is_cancelled_locked(Job *job)
509
{
510
/* force_cancel may be true only if cancelled is true, too */
511
assert(job->cancelled || !job->force_cancel);
512
return job->force_cancel;
513
}
514
515
-bool job_cancel_requested(Job *job)
516
+bool job_is_cancelled(Job *job)
517
+{
518
+ JOB_LOCK_GUARD();
519
+ return job_is_cancelled_locked(job);
520
+}
521
+
522
+/* Called with job_mutex held. */
523
+static bool job_cancel_requested_locked(Job *job)
524
{
525
return job->cancelled;
526
}
527
528
-bool job_is_ready(Job *job)
529
+bool job_cancel_requested(Job *job)
530
+{
531
+ JOB_LOCK_GUARD();
532
+ return job_cancel_requested_locked(job);
533
+}
534
+
535
+bool job_is_ready_locked(Job *job)
536
{
537
switch (job->status) {
538
case JOB_STATUS_UNDEFINED:
539
@@ -XXX,XX +XXX,XX @@ bool job_is_ready(Job *job)
540
return false;
541
}
542
543
-bool job_is_completed(Job *job)
544
+bool job_is_ready(Job *job)
545
+{
546
+ JOB_LOCK_GUARD();
547
+ return job_is_ready_locked(job);
548
+}
549
+
550
+bool job_is_completed_locked(Job *job)
551
{
552
switch (job->status) {
553
case JOB_STATUS_UNDEFINED:
554
@@ -XXX,XX +XXX,XX @@ bool job_is_completed(Job *job)
555
return false;
556
}
557
558
-static bool job_started(Job *job)
559
+bool job_is_completed(Job *job)
560
+{
561
+ JOB_LOCK_GUARD();
562
+ return job_is_completed_locked(job);
563
+}
564
+
565
+static bool job_started_locked(Job *job)
566
{
567
return job->co;
568
}
569
570
-static bool job_should_pause(Job *job)
571
+/* Called with job_mutex held. */
572
+static bool job_should_pause_locked(Job *job)
573
{
574
return job->pause_count > 0;
575
}
576
577
-Job *job_next(Job *job)
578
+Job *job_next_locked(Job *job)
579
{
580
if (!job) {
581
return QLIST_FIRST(&jobs);
582
@@ -XXX,XX +XXX,XX @@ Job *job_next(Job *job)
583
return QLIST_NEXT(job, job_list);
584
}
585
586
-Job *job_get(const char *id)
587
+Job *job_next(Job *job)
588
+{
589
+ JOB_LOCK_GUARD();
590
+ return job_next_locked(job);
591
+}
592
+
593
+Job *job_get_locked(const char *id)
594
{
595
Job *job;
596
597
@@ -XXX,XX +XXX,XX @@ Job *job_get(const char *id)
598
return NULL;
599
}
600
601
+Job *job_get(const char *id)
602
+{
603
+ JOB_LOCK_GUARD();
604
+ return job_get_locked(id);
605
+}
606
+
607
+/* Called with job_mutex *not* held. */
608
static void job_sleep_timer_cb(void *opaque)
609
{
610
Job *job = opaque;
611
@@ -XXX,XX +XXX,XX @@ void *job_create(const char *job_id, const JobDriver *driver, JobTxn *txn,
612
{
613
Job *job;
614
615
+ JOB_LOCK_GUARD();
616
+
617
if (job_id) {
618
if (flags & JOB_INTERNAL) {
619
error_setg(errp, "Cannot specify job ID for internal job");
620
@@ -XXX,XX +XXX,XX @@ void *job_create(const char *job_id, const JobDriver *driver, JobTxn *txn,
621
error_setg(errp, "Invalid job ID '%s'", job_id);
622
return NULL;
623
}
624
- if (job_get(job_id)) {
625
+ if (job_get_locked(job_id)) {
626
error_setg(errp, "Job ID '%s' already in use", job_id);
627
return NULL;
628
}
629
@@ -XXX,XX +XXX,XX @@ void *job_create(const char *job_id, const JobDriver *driver, JobTxn *txn,
630
notifier_list_init(&job->on_ready);
631
notifier_list_init(&job->on_idle);
632
633
- job_state_transition(job, JOB_STATUS_CREATED);
634
+ job_state_transition_locked(job, JOB_STATUS_CREATED);
635
aio_timer_init(qemu_get_aio_context(), &job->sleep_timer,
636
QEMU_CLOCK_REALTIME, SCALE_NS,
637
job_sleep_timer_cb, job);
638
@@ -XXX,XX +XXX,XX @@ void *job_create(const char *job_id, const JobDriver *driver, JobTxn *txn,
639
* consolidating the job management logic */
640
if (!txn) {
641
txn = job_txn_new();
642
- job_txn_add_job(txn, job);
643
- job_txn_unref(txn);
644
+ job_txn_add_job_locked(txn, job);
645
+ job_txn_unref_locked(txn);
646
} else {
647
- job_txn_add_job(txn, job);
648
+ job_txn_add_job_locked(txn, job);
649
}
650
651
return job;
652
}
653
654
-void job_ref(Job *job)
655
+void job_ref_locked(Job *job)
656
{
657
++job->refcnt;
658
}
659
660
-void job_unref(Job *job)
661
+void job_ref(Job *job)
662
+{
663
+ JOB_LOCK_GUARD();
664
+ job_ref_locked(job);
665
+}
666
+
667
+void job_unref_locked(Job *job)
668
{
669
GLOBAL_STATE_CODE();
670
671
@@ -XXX,XX +XXX,XX @@ void job_unref(Job *job)
672
assert(!job->txn);
673
674
if (job->driver->free) {
675
+ job_unlock();
676
job->driver->free(job);
677
+ job_lock();
678
}
679
680
QLIST_REMOVE(job, job_list);
681
@@ -XXX,XX +XXX,XX @@ void job_unref(Job *job)
682
}
683
}
684
685
+void job_unref(Job *job)
686
+{
687
+ JOB_LOCK_GUARD();
688
+ job_unref_locked(job);
689
+}
690
+
691
void job_progress_update(Job *job, uint64_t done)
692
{
693
progress_work_done(&job->progress, done);
694
@@ -XXX,XX +XXX,XX @@ void job_progress_increase_remaining(Job *job, uint64_t delta)
695
696
/**
697
* To be called when a cancelled job is finalised.
698
+ * Called with job_mutex held.
699
*/
700
-static void job_event_cancelled(Job *job)
701
+static void job_event_cancelled_locked(Job *job)
702
{
703
notifier_list_notify(&job->on_finalize_cancelled, job);
704
}
705
706
/**
707
* To be called when a successfully completed job is finalised.
708
+ * Called with job_mutex held.
709
*/
710
-static void job_event_completed(Job *job)
711
+static void job_event_completed_locked(Job *job)
712
{
713
notifier_list_notify(&job->on_finalize_completed, job);
714
}
715
716
-static void job_event_pending(Job *job)
717
+/* Called with job_mutex held. */
718
+static void job_event_pending_locked(Job *job)
719
{
720
notifier_list_notify(&job->on_pending, job);
721
}
722
723
-static void job_event_ready(Job *job)
724
+/* Called with job_mutex held. */
725
+static void job_event_ready_locked(Job *job)
726
{
727
notifier_list_notify(&job->on_ready, job);
728
}
729
730
-static void job_event_idle(Job *job)
731
+/* Called with job_mutex held. */
732
+static void job_event_idle_locked(Job *job)
733
{
734
notifier_list_notify(&job->on_idle, job);
735
}
736
737
-void job_enter_cond(Job *job, bool(*fn)(Job *job))
738
+void job_enter_cond_locked(Job *job, bool(*fn)(Job *job))
739
{
740
- if (!job_started(job)) {
741
+ if (!job_started_locked(job)) {
742
return;
743
}
744
if (job->deferred_to_main_loop) {
745
@@ -XXX,XX +XXX,XX @@ void job_enter_cond(Job *job, bool(*fn)(Job *job))
746
timer_del(&job->sleep_timer);
747
job->busy = true;
748
real_job_unlock();
749
+ job_unlock();
750
aio_co_enter(job->aio_context, job->co);
751
+ job_lock();
752
+}
753
+
754
+void job_enter_cond(Job *job, bool(*fn)(Job *job))
755
+{
756
+ JOB_LOCK_GUARD();
757
+ job_enter_cond_locked(job, fn);
758
}
759
760
void job_enter(Job *job)
761
{
762
- job_enter_cond(job, NULL);
763
+ JOB_LOCK_GUARD();
764
+ job_enter_cond_locked(job, NULL);
765
}
766
767
/* Yield, and schedule a timer to reenter the coroutine after @ns nanoseconds.
768
@@ -XXX,XX +XXX,XX @@ void job_enter(Job *job)
769
* is allowed and cancels the timer.
770
*
771
* If @ns is (uint64_t) -1, no timer is scheduled and job_enter() must be
772
- * called explicitly. */
773
-static void coroutine_fn job_do_yield(Job *job, uint64_t ns)
774
+ * called explicitly.
775
+ *
776
+ * Called with job_mutex held, but releases it temporarily.
777
+ */
778
+static void coroutine_fn job_do_yield_locked(Job *job, uint64_t ns)
779
{
780
real_job_lock();
781
if (ns != -1) {
782
timer_mod(&job->sleep_timer, ns);
783
}
784
job->busy = false;
785
- job_event_idle(job);
786
+ job_event_idle_locked(job);
787
real_job_unlock();
788
+ job_unlock();
789
qemu_coroutine_yield();
790
+ job_lock();
791
792
/* Set by job_enter_cond() before re-entering the coroutine. */
793
assert(job->busy);
794
}
795
796
-void coroutine_fn job_pause_point(Job *job)
797
+/* Called with job_mutex held, but releases it temporarily. */
798
+static void coroutine_fn job_pause_point_locked(Job *job)
799
{
800
- assert(job && job_started(job));
801
+ assert(job && job_started_locked(job));
802
803
- if (!job_should_pause(job)) {
804
+ if (!job_should_pause_locked(job)) {
805
return;
806
}
807
- if (job_is_cancelled(job)) {
808
+ if (job_is_cancelled_locked(job)) {
809
return;
810
}
811
812
if (job->driver->pause) {
813
+ job_unlock();
814
job->driver->pause(job);
815
+ job_lock();
816
}
817
818
- if (job_should_pause(job) && !job_is_cancelled(job)) {
819
+ if (job_should_pause_locked(job) && !job_is_cancelled_locked(job)) {
820
JobStatus status = job->status;
821
- job_state_transition(job, status == JOB_STATUS_READY
822
- ? JOB_STATUS_STANDBY
823
- : JOB_STATUS_PAUSED);
824
+ job_state_transition_locked(job, status == JOB_STATUS_READY
825
+ ? JOB_STATUS_STANDBY
826
+ : JOB_STATUS_PAUSED);
827
job->paused = true;
828
- job_do_yield(job, -1);
829
+ job_do_yield_locked(job, -1);
830
job->paused = false;
831
- job_state_transition(job, status);
832
+ job_state_transition_locked(job, status);
833
}
834
835
if (job->driver->resume) {
836
+ job_unlock();
837
job->driver->resume(job);
838
+ job_lock();
839
}
840
}
841
842
-void coroutine_fn job_yield(Job *job)
843
+void coroutine_fn job_pause_point(Job *job)
844
+{
845
+ JOB_LOCK_GUARD();
846
+ job_pause_point_locked(job);
847
+}
848
+
849
+static void coroutine_fn job_yield_locked(Job *job)
850
{
851
assert(job->busy);
852
853
/* Check cancellation *before* setting busy = false, too! */
854
- if (job_is_cancelled(job)) {
855
+ if (job_is_cancelled_locked(job)) {
856
return;
857
}
858
859
- if (!job_should_pause(job)) {
860
- job_do_yield(job, -1);
861
+ if (!job_should_pause_locked(job)) {
862
+ job_do_yield_locked(job, -1);
863
}
864
865
- job_pause_point(job);
866
+ job_pause_point_locked(job);
867
+}
868
+
869
+void coroutine_fn job_yield(Job *job)
870
+{
871
+ JOB_LOCK_GUARD();
872
+ job_yield_locked(job);
873
}
874
875
void coroutine_fn job_sleep_ns(Job *job, int64_t ns)
876
{
877
+ JOB_LOCK_GUARD();
878
assert(job->busy);
879
880
/* Check cancellation *before* setting busy = false, too! */
881
- if (job_is_cancelled(job)) {
882
+ if (job_is_cancelled_locked(job)) {
883
return;
884
}
885
886
- if (!job_should_pause(job)) {
887
- job_do_yield(job, qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + ns);
888
+ if (!job_should_pause_locked(job)) {
889
+ job_do_yield_locked(job, qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + ns);
890
}
891
892
- job_pause_point(job);
893
+ job_pause_point_locked(job);
894
}
895
896
-/* Assumes the block_job_mutex is held */
897
-static bool job_timer_not_pending(Job *job)
898
+/* Assumes the job_mutex is held */
899
+static bool job_timer_not_pending_locked(Job *job)
900
{
901
return !timer_pending(&job->sleep_timer);
902
}
903
904
-void job_pause(Job *job)
905
+void job_pause_locked(Job *job)
906
{
907
job->pause_count++;
908
if (!job->paused) {
909
- job_enter(job);
910
+ job_enter_cond_locked(job, NULL);
911
}
912
}
913
914
-void job_resume(Job *job)
915
+void job_pause(Job *job)
916
+{
917
+ JOB_LOCK_GUARD();
918
+ job_pause_locked(job);
919
+}
920
+
921
+void job_resume_locked(Job *job)
922
{
923
assert(job->pause_count > 0);
924
job->pause_count--;
925
@@ -XXX,XX +XXX,XX @@ void job_resume(Job *job)
926
}
927
928
/* kick only if no timer is pending */
929
- job_enter_cond(job, job_timer_not_pending);
930
+ job_enter_cond_locked(job, job_timer_not_pending_locked);
931
}
932
933
-void job_user_pause(Job *job, Error **errp)
934
+void job_resume(Job *job)
935
{
936
- if (job_apply_verb(job, JOB_VERB_PAUSE, errp)) {
937
+ JOB_LOCK_GUARD();
938
+ job_resume_locked(job);
939
+}
940
+
941
+void job_user_pause_locked(Job *job, Error **errp)
942
+{
943
+ if (job_apply_verb_locked(job, JOB_VERB_PAUSE, errp)) {
944
return;
945
}
946
if (job->user_paused) {
947
@@ -XXX,XX +XXX,XX @@ void job_user_pause(Job *job, Error **errp)
948
return;
949
}
950
job->user_paused = true;
951
- job_pause(job);
952
+ job_pause_locked(job);
953
}
954
955
-bool job_user_paused(Job *job)
956
+void job_user_pause(Job *job, Error **errp)
957
+{
958
+ JOB_LOCK_GUARD();
959
+ job_user_pause_locked(job, errp);
960
+}
961
+
962
+bool job_user_paused_locked(Job *job)
963
{
964
return job->user_paused;
965
}
966
967
-void job_user_resume(Job *job, Error **errp)
968
+bool job_user_paused(Job *job)
969
+{
970
+ JOB_LOCK_GUARD();
971
+ return job_user_paused_locked(job);
972
+}
973
+
974
+void job_user_resume_locked(Job *job, Error **errp)
975
{
976
assert(job);
977
GLOBAL_STATE_CODE();
978
@@ -XXX,XX +XXX,XX @@ void job_user_resume(Job *job, Error **errp)
979
error_setg(errp, "Can't resume a job that was not paused");
980
return;
981
}
982
- if (job_apply_verb(job, JOB_VERB_RESUME, errp)) {
983
+ if (job_apply_verb_locked(job, JOB_VERB_RESUME, errp)) {
984
return;
985
}
986
if (job->driver->user_resume) {
987
+ job_unlock();
988
job->driver->user_resume(job);
989
+ job_lock();
990
}
991
job->user_paused = false;
992
- job_resume(job);
993
+ job_resume_locked(job);
994
}
995
996
-static void job_do_dismiss(Job *job)
997
+void job_user_resume(Job *job, Error **errp)
998
+{
999
+ JOB_LOCK_GUARD();
1000
+ job_user_resume_locked(job, errp);
1001
+}
1002
+
1003
+/* Called with job_mutex held, but releases it temporarily. */
1004
+static void job_do_dismiss_locked(Job *job)
1005
{
1006
assert(job);
1007
job->busy = false;
1008
job->paused = false;
1009
job->deferred_to_main_loop = true;
1010
1011
- job_txn_del_job(job);
1012
+ job_txn_del_job_locked(job);
1013
1014
- job_state_transition(job, JOB_STATUS_NULL);
1015
- job_unref(job);
1016
+ job_state_transition_locked(job, JOB_STATUS_NULL);
1017
+ job_unref_locked(job);
1018
}
1019
1020
-void job_dismiss(Job **jobptr, Error **errp)
1021
+void job_dismiss_locked(Job **jobptr, Error **errp)
1022
{
1023
Job *job = *jobptr;
1024
/* similarly to _complete, this is QMP-interface only. */
1025
assert(job->id);
1026
- if (job_apply_verb(job, JOB_VERB_DISMISS, errp)) {
1027
+ if (job_apply_verb_locked(job, JOB_VERB_DISMISS, errp)) {
1028
return;
1029
}
1030
1031
- job_do_dismiss(job);
1032
+ job_do_dismiss_locked(job);
1033
*jobptr = NULL;
1034
}
1035
1036
+void job_dismiss(Job **jobptr, Error **errp)
1037
+{
1038
+ JOB_LOCK_GUARD();
1039
+ job_dismiss_locked(jobptr, errp);
1040
+}
1041
+
1042
void job_early_fail(Job *job)
1043
{
1044
+ JOB_LOCK_GUARD();
1045
assert(job->status == JOB_STATUS_CREATED);
1046
- job_do_dismiss(job);
1047
+ job_do_dismiss_locked(job);
1048
}
1049
1050
-static void job_conclude(Job *job)
1051
+/* Called with job_mutex held. */
1052
+static void job_conclude_locked(Job *job)
1053
{
1054
- job_state_transition(job, JOB_STATUS_CONCLUDED);
1055
- if (job->auto_dismiss || !job_started(job)) {
1056
- job_do_dismiss(job);
1057
+ job_state_transition_locked(job, JOB_STATUS_CONCLUDED);
1058
+ if (job->auto_dismiss || !job_started_locked(job)) {
1059
+ job_do_dismiss_locked(job);
1060
}
1061
}
1062
1063
-static void job_update_rc(Job *job)
1064
+/* Called with job_mutex held. */
1065
+static void job_update_rc_locked(Job *job)
1066
{
1067
- if (!job->ret && job_is_cancelled(job)) {
1068
+ if (!job->ret && job_is_cancelled_locked(job)) {
1069
job->ret = -ECANCELED;
1070
}
1071
if (job->ret) {
1072
if (!job->err) {
1073
error_setg(&job->err, "%s", strerror(-job->ret));
1074
}
1075
- job_state_transition(job, JOB_STATUS_ABORTING);
1076
+ job_state_transition_locked(job, JOB_STATUS_ABORTING);
1077
}
1078
}
1079
1080
@@ -XXX,XX +XXX,XX @@ static void job_clean(Job *job)
1081
}
1082
}
1083
1084
-static int job_finalize_single(Job *job)
1085
+/* Called with job_mutex held, but releases it temporarily */
1086
+static int job_finalize_single_locked(Job *job)
1087
{
1088
- assert(job_is_completed(job));
1089
+ int job_ret;
1090
+
1091
+ assert(job_is_completed_locked(job));
1092
1093
/* Ensure abort is called for late-transactional failures */
1094
- job_update_rc(job);
1095
+ job_update_rc_locked(job);
1096
+
1097
+ job_ret = job->ret;
1098
+ job_unlock();
1099
1100
- if (!job->ret) {
1101
+ if (!job_ret) {
1102
job_commit(job);
1103
} else {
1104
job_abort(job);
1105
}
1106
job_clean(job);
1107
1108
+ job_lock();
1109
+
1110
if (job->cb) {
1111
- job->cb(job->opaque, job->ret);
1112
+ job_ret = job->ret;
1113
+ job_unlock();
1114
+ job->cb(job->opaque, job_ret);
1115
+ job_lock();
1116
}
1117
1118
/* Emit events only if we actually started */
1119
- if (job_started(job)) {
1120
- if (job_is_cancelled(job)) {
1121
- job_event_cancelled(job);
1122
+ if (job_started_locked(job)) {
1123
+ if (job_is_cancelled_locked(job)) {
1124
+ job_event_cancelled_locked(job);
1125
} else {
1126
- job_event_completed(job);
1127
+ job_event_completed_locked(job);
}
}

- job_txn_del_job(job);
- job_conclude(job);
+ job_txn_del_job_locked(job);
+ job_conclude_locked(job);
return 0;
}

-static void job_cancel_async(Job *job, bool force)
+/* Called with job_mutex held, but releases it temporarily */
+static void job_cancel_async_locked(Job *job, bool force)
{
GLOBAL_STATE_CODE();
if (job->driver->cancel) {
+ job_unlock();
force = job->driver->cancel(job, force);
+ job_lock();
} else {
/* No .cancel() means the job will behave as if force-cancelled */
force = true;
@@ -XXX,XX +XXX,XX @@ static void job_cancel_async(Job *job, bool force)
if (job->user_paused) {
/* Do not call job_enter here, the caller will handle it. */
if (job->driver->user_resume) {
+ job_unlock();
job->driver->user_resume(job);
+ job_lock();
}
job->user_paused = false;
assert(job->pause_count > 0);
@@ -XXX,XX +XXX,XX @@ static void job_cancel_async(Job *job, bool force)
}
}

-static void job_completed_txn_abort(Job *job)
+/* Called with job_mutex held, but releases it temporarily. */
+static void job_completed_txn_abort_locked(Job *job)
{
AioContext *ctx;
JobTxn *txn = job->txn;
@@ -XXX,XX +XXX,XX @@ static void job_completed_txn_abort(Job *job)
return;
}
txn->aborting = true;
- job_txn_ref(txn);
+ job_txn_ref_locked(txn);

/*
* We can only hold the single job's AioContext lock while calling
@@ -XXX,XX +XXX,XX @@ static void job_completed_txn_abort(Job *job)
1180
* calls of AIO_WAIT_WHILE(), which could deadlock otherwise.
1181
* Note that the job's AioContext may change when it is finalized.
1182
*/
1183
- job_ref(job);
1184
+ job_ref_locked(job);
1185
aio_context_release(job->aio_context);
1186
1187
/* Other jobs are effectively cancelled by us, set the status for
1188
@@ -XXX,XX +XXX,XX @@ static void job_completed_txn_abort(Job *job)
1189
* Therefore, pass force=true to terminate all other jobs as quickly
1190
* as possible.
1191
*/
1192
- job_cancel_async(other_job, true);
1193
+ job_cancel_async_locked(other_job, true);
1194
aio_context_release(ctx);
1195
}
1196
}
1197
@@ -XXX,XX +XXX,XX @@ static void job_completed_txn_abort(Job *job)
1198
*/
1199
ctx = other_job->aio_context;
1200
aio_context_acquire(ctx);
1201
- if (!job_is_completed(other_job)) {
1202
- assert(job_cancel_requested(other_job));
1203
- job_finish_sync(other_job, NULL, NULL);
1204
+ if (!job_is_completed_locked(other_job)) {
1205
+ assert(job_cancel_requested_locked(other_job));
1206
+ job_finish_sync_locked(other_job, NULL, NULL);
1207
}
1208
- job_finalize_single(other_job);
1209
+ job_finalize_single_locked(other_job);
1210
aio_context_release(ctx);
1211
}
1212
1213
@@ -XXX,XX +XXX,XX @@ static void job_completed_txn_abort(Job *job)
1214
* even if the job went away during job_finalize_single().
1215
*/
1216
aio_context_acquire(job->aio_context);
1217
- job_unref(job);
1218
+ job_unref_locked(job);
1219
1220
- job_txn_unref(txn);
1221
+ job_txn_unref_locked(txn);
1222
}
1223
1224
-static int job_prepare(Job *job)
1225
+/* Called with job_mutex held, but releases it temporarily */
1226
+static int job_prepare_locked(Job *job)
1227
{
1228
+ int ret;
1229
+
1230
GLOBAL_STATE_CODE();
1231
if (job->ret == 0 && job->driver->prepare) {
1232
- job->ret = job->driver->prepare(job);
1233
- job_update_rc(job);
1234
+ job_unlock();
1235
+ ret = job->driver->prepare(job);
1236
+ job_lock();
1237
+ job->ret = ret;
1238
+ job_update_rc_locked(job);
1239
}
1240
return job->ret;
1241
}
1242
1243
-static int job_needs_finalize(Job *job)
1244
+/* Called with job_mutex held */
1245
+static int job_needs_finalize_locked(Job *job)
1246
{
1247
return !job->auto_finalize;
1248
}
1249
1250
-static void job_do_finalize(Job *job)
1251
+/* Called with job_mutex held */
1252
+static void job_do_finalize_locked(Job *job)
1253
{
1254
int rc;
1255
assert(job && job->txn);
1256
1257
/* prepare the transaction to complete */
1258
- rc = job_txn_apply(job, job_prepare);
1259
+ rc = job_txn_apply_locked(job, job_prepare_locked);
1260
if (rc) {
1261
- job_completed_txn_abort(job);
1262
+ job_completed_txn_abort_locked(job);
1263
} else {
1264
- job_txn_apply(job, job_finalize_single);
1265
+ job_txn_apply_locked(job, job_finalize_single_locked);
1266
}
1267
}
1268
1269
-void job_finalize(Job *job, Error **errp)
1270
+void job_finalize_locked(Job *job, Error **errp)
1271
{
1272
assert(job && job->id);
1273
- if (job_apply_verb(job, JOB_VERB_FINALIZE, errp)) {
1274
+ if (job_apply_verb_locked(job, JOB_VERB_FINALIZE, errp)) {
1275
return;
1276
}
1277
- job_do_finalize(job);
1278
+ job_do_finalize_locked(job);
1279
}
1280
1281
-static int job_transition_to_pending(Job *job)
1282
+void job_finalize(Job *job, Error **errp)
1283
{
1284
- job_state_transition(job, JOB_STATUS_PENDING);
1285
+ JOB_LOCK_GUARD();
1286
+ job_finalize_locked(job, errp);
1287
+}
1288
+
1289
+/* Called with job_mutex held. */
1290
+static int job_transition_to_pending_locked(Job *job)
1291
+{
1292
+ job_state_transition_locked(job, JOB_STATUS_PENDING);
1293
if (!job->auto_finalize) {
1294
- job_event_pending(job);
1295
+ job_event_pending_locked(job);
1296
}
1297
return 0;
1298
}
1299
1300
void job_transition_to_ready(Job *job)
1301
{
1302
- job_state_transition(job, JOB_STATUS_READY);
1303
- job_event_ready(job);
1304
+ JOB_LOCK_GUARD();
1305
+ job_state_transition_locked(job, JOB_STATUS_READY);
1306
+ job_event_ready_locked(job);
1307
}
1308
1309
-static void job_completed_txn_success(Job *job)
1310
+/* Called with job_mutex held. */
1311
+static void job_completed_txn_success_locked(Job *job)
1312
{
1313
JobTxn *txn = job->txn;
1314
Job *other_job;
1315
1316
- job_state_transition(job, JOB_STATUS_WAITING);
1317
+ job_state_transition_locked(job, JOB_STATUS_WAITING);
1318
1319
/*
1320
* Successful completion, see if there are other running jobs in this
1321
* txn.
1322
*/
1323
QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
1324
- if (!job_is_completed(other_job)) {
1325
+ if (!job_is_completed_locked(other_job)) {
1326
return;
1327
}
1328
assert(other_job->ret == 0);
1329
}
1330
1331
- job_txn_apply(job, job_transition_to_pending);
1332
+ job_txn_apply_locked(job, job_transition_to_pending_locked);
1333
1334
/* If no jobs need manual finalization, automatically do so */
1335
- if (job_txn_apply(job, job_needs_finalize) == 0) {
1336
- job_do_finalize(job);
1337
+ if (job_txn_apply_locked(job, job_needs_finalize_locked) == 0) {
1338
+ job_do_finalize_locked(job);
1339
}
1340
}
1341
1342
-static void job_completed(Job *job)
1343
+/* Called with job_mutex held. */
1344
+static void job_completed_locked(Job *job)
1345
{
1346
- assert(job && job->txn && !job_is_completed(job));
1347
+ assert(job && job->txn && !job_is_completed_locked(job));
1348
1349
- job_update_rc(job);
1350
+ job_update_rc_locked(job);
1351
trace_job_completed(job, job->ret);
1352
if (job->ret) {
1353
- job_completed_txn_abort(job);
1354
+ job_completed_txn_abort_locked(job);
1355
} else {
1356
- job_completed_txn_success(job);
1357
+ job_completed_txn_success_locked(job);
1358
}
1359
}
1360
1361
-/** Useful only as a type shim for aio_bh_schedule_oneshot. */
1362
+/**
1363
+ * Useful only as a type shim for aio_bh_schedule_oneshot.
1364
+ * Called with job_mutex *not* held.
1365
+ */
1366
static void job_exit(void *opaque)
1367
{
1368
Job *job = (Job *)opaque;
1369
AioContext *ctx;
1370
+ JOB_LOCK_GUARD();
1371
1372
- job_ref(job);
1373
+ job_ref_locked(job);
1374
aio_context_acquire(job->aio_context);
1375
1376
/* This is a lie, we're not quiescent, but still doing the completion
1377
@@ -XXX,XX +XXX,XX @@ static void job_exit(void *opaque)
1378
* drain block nodes, and if .drained_poll still returned true, we would
1379
* deadlock. */
1380
job->busy = false;
1381
- job_event_idle(job);
1382
+ job_event_idle_locked(job);
1383
1384
- job_completed(job);
1385
+ job_completed_locked(job);
1386
1387
/*
1388
* Note that calling job_completed can move the job to a different
1389
@@ -XXX,XX +XXX,XX @@ static void job_exit(void *opaque)
1390
* the job underneath us.
1391
*/
1392
ctx = job->aio_context;
1393
- job_unref(job);
1394
+ job_unref_locked(job);
1395
aio_context_release(ctx);
1396
}
1397
1398
@@ -XXX,XX +XXX,XX @@ static void job_exit(void *opaque)
1399
static void coroutine_fn job_co_entry(void *opaque)
1400
{
1401
Job *job = opaque;
1402
+ int ret;
1403
1404
assert(job && job->driver && job->driver->run);
1405
- assert(job->aio_context == qemu_get_current_aio_context());
1406
- job_pause_point(job);
1407
- job->ret = job->driver->run(job, &job->err);
1408
- job->deferred_to_main_loop = true;
1409
- job->busy = true;
1410
+ WITH_JOB_LOCK_GUARD() {
1411
+ assert(job->aio_context == qemu_get_current_aio_context());
1412
+ job_pause_point_locked(job);
1413
+ }
1414
+ ret = job->driver->run(job, &job->err);
1415
+ WITH_JOB_LOCK_GUARD() {
1416
+ job->ret = ret;
1417
+ job->deferred_to_main_loop = true;
1418
+ job->busy = true;
1419
+ }
1420
aio_bh_schedule_oneshot(qemu_get_aio_context(), job_exit, job);
1421
}
1422
1423
void job_start(Job *job)
1424
{
1425
- assert(job && !job_started(job) && job->paused &&
1426
- job->driver && job->driver->run);
1427
- job->co = qemu_coroutine_create(job_co_entry, job);
1428
- job->pause_count--;
1429
- job->busy = true;
1430
- job->paused = false;
1431
- job_state_transition(job, JOB_STATUS_RUNNING);
1432
+ assert(qemu_in_main_thread());
1433
+
1434
+ WITH_JOB_LOCK_GUARD() {
1435
+ assert(job && !job_started_locked(job) && job->paused &&
1436
+ job->driver && job->driver->run);
1437
+ job->co = qemu_coroutine_create(job_co_entry, job);
1438
+ job->pause_count--;
1439
+ job->busy = true;
1440
+ job->paused = false;
1441
+ job_state_transition_locked(job, JOB_STATUS_RUNNING);
1442
+ }
1443
aio_co_enter(job->aio_context, job->co);
1444
}
1445
1446
-void job_cancel(Job *job, bool force)
1447
+void job_cancel_locked(Job *job, bool force)
1448
{
1449
if (job->status == JOB_STATUS_CONCLUDED) {
1450
- job_do_dismiss(job);
1451
+ job_do_dismiss_locked(job);
1452
return;
1453
}
1454
- job_cancel_async(job, force);
1455
- if (!job_started(job)) {
1456
- job_completed(job);
1457
+ job_cancel_async_locked(job, force);
1458
+ if (!job_started_locked(job)) {
1459
+ job_completed_locked(job);
1460
} else if (job->deferred_to_main_loop) {
1461
/*
1462
* job_cancel_async() ignores soft-cancel requests for jobs
1463
@@ -XXX,XX +XXX,XX @@ void job_cancel(Job *job, bool force)
1464
* choose to call job_is_cancelled() to show that we invoke
1465
* job_completed_txn_abort() only for force-cancelled jobs.)
1466
*/
1467
- if (job_is_cancelled(job)) {
1468
- job_completed_txn_abort(job);
1469
+ if (job_is_cancelled_locked(job)) {
1470
+ job_completed_txn_abort_locked(job);
1471
}
1472
} else {
1473
- job_enter(job);
1474
+ job_enter_cond_locked(job, NULL);
1475
}
1476
}
1477
1478
-void job_user_cancel(Job *job, bool force, Error **errp)
1479
+void job_cancel(Job *job, bool force)
1480
{
1481
- if (job_apply_verb(job, JOB_VERB_CANCEL, errp)) {
1482
+ JOB_LOCK_GUARD();
1483
+ job_cancel_locked(job, force);
1484
+}
1485
+
1486
+void job_user_cancel_locked(Job *job, bool force, Error **errp)
1487
+{
1488
+ if (job_apply_verb_locked(job, JOB_VERB_CANCEL, errp)) {
1489
return;
1490
}
1491
- job_cancel(job, force);
1492
+ job_cancel_locked(job, force);
1493
+}
1494
+
1495
+void job_user_cancel(Job *job, bool force, Error **errp)
1496
+{
1497
+ JOB_LOCK_GUARD();
1498
+ job_user_cancel_locked(job, force, errp);
1499
}
1500
1501
/* A wrapper around job_cancel() taking an Error ** parameter so it may be
1502
* used with job_finish_sync() without the need for (rather nasty) function
1503
- * pointer casts there. */
1504
-static void job_cancel_err(Job *job, Error **errp)
1505
+ * pointer casts there.
1506
+ *
1507
+ * Called with job_mutex held.
1508
+ */
1509
+static void job_cancel_err_locked(Job *job, Error **errp)
1510
{
1511
- job_cancel(job, false);
1512
+ job_cancel_locked(job, false);
1513
}
1514
1515
/**
1516
* Same as job_cancel_err(), but force-cancel.
1517
+ * Called with job_mutex held.
1518
*/
1519
-static void job_force_cancel_err(Job *job, Error **errp)
1520
+static void job_force_cancel_err_locked(Job *job, Error **errp)
1521
{
1522
- job_cancel(job, true);
1523
+ job_cancel_locked(job, true);
1524
}
1525
1526
-int job_cancel_sync(Job *job, bool force)
1527
+int job_cancel_sync_locked(Job *job, bool force)
1528
{
1529
if (force) {
1530
- return job_finish_sync(job, &job_force_cancel_err, NULL);
1531
+ return job_finish_sync_locked(job, &job_force_cancel_err_locked, NULL);
1532
} else {
1533
- return job_finish_sync(job, &job_cancel_err, NULL);
1534
+ return job_finish_sync_locked(job, &job_cancel_err_locked, NULL);
1535
}
1536
}
1537
1538
+int job_cancel_sync(Job *job, bool force)
1539
+{
1540
+ JOB_LOCK_GUARD();
1541
+ return job_cancel_sync_locked(job, force);
1542
+}
1543
+
1544
void job_cancel_sync_all(void)
1545
{
1546
Job *job;
1547
AioContext *aio_context;
1548
+ JOB_LOCK_GUARD();
1549
1550
- while ((job = job_next(NULL))) {
1551
+ while ((job = job_next_locked(NULL))) {
1552
aio_context = job->aio_context;
1553
aio_context_acquire(aio_context);
1554
- job_cancel_sync(job, true);
1555
+ job_cancel_sync_locked(job, true);
1556
aio_context_release(aio_context);
1557
}
1558
}
1559
1560
+int job_complete_sync_locked(Job *job, Error **errp)
1561
+{
1562
+ return job_finish_sync_locked(job, job_complete_locked, errp);
1563
+}
1564
+
1565
int job_complete_sync(Job *job, Error **errp)
1566
{
1567
- return job_finish_sync(job, job_complete, errp);
1568
+ JOB_LOCK_GUARD();
1569
+ return job_complete_sync_locked(job, errp);
1570
}
1571
1572
-void job_complete(Job *job, Error **errp)
1573
+void job_complete_locked(Job *job, Error **errp)
1574
{
1575
/* Should not be reachable via external interface for internal jobs */
1576
assert(job->id);
1577
GLOBAL_STATE_CODE();
1578
- if (job_apply_verb(job, JOB_VERB_COMPLETE, errp)) {
1579
+ if (job_apply_verb_locked(job, JOB_VERB_COMPLETE, errp)) {
1580
return;
1581
}
1582
- if (job_cancel_requested(job) || !job->driver->complete) {
1583
+ if (job_cancel_requested_locked(job) || !job->driver->complete) {
1584
error_setg(errp, "The active block job '%s' cannot be completed",
1585
job->id);
1586
return;
1587
}
1588
1589
+ job_unlock();
1590
job->driver->complete(job, errp);
1591
+ job_lock();
1592
}
1593
1594
-int job_finish_sync(Job *job, void (*finish)(Job *, Error **errp), Error **errp)
1595
+void job_complete(Job *job, Error **errp)
1596
+{
1597
+ JOB_LOCK_GUARD();
1598
+ job_complete_locked(job, errp);
1599
+}
1600
+
1601
+int job_finish_sync_locked(Job *job,
1602
+ void (*finish)(Job *, Error **errp),
1603
+ Error **errp)
1604
{
1605
Error *local_err = NULL;
1606
int ret;
1607
1608
- job_ref(job);
1609
+ job_ref_locked(job);
1610
1611
if (finish) {
1612
finish(job, &local_err);
1613
}
1614
if (local_err) {
1615
error_propagate(errp, local_err);
1616
- job_unref(job);
1617
+ job_unref_locked(job);
1618
return -EBUSY;
1619
}
1620
1621
+ job_unlock();
1622
AIO_WAIT_WHILE(job->aio_context,
1623
(job_enter(job), !job_is_completed(job)));
1624
+ job_lock();
1625
1626
- ret = (job_is_cancelled(job) && job->ret == 0) ? -ECANCELED : job->ret;
1627
- job_unref(job);
1628
+ ret = (job_is_cancelled_locked(job) && job->ret == 0)
1629
+ ? -ECANCELED : job->ret;
1630
+ job_unref_locked(job);
1631
return ret;
1632
}
1633
+
1634
+int job_finish_sync(Job *job, void (*finish)(Job *, Error **errp), Error **errp)
1635
+{
1636
+ JOB_LOCK_GUARD();
1637
+ return job_finish_sync_locked(job, finish, errp);
1638
+}
1639
--
2.37.3

index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -XXX,XX +XXX,XX @@ invalid_opcode:
}
}

+ req->ctx = qemu_get_current_aio_context();
req->cmd = cmd;
req->residual = req->cmd.xfer;

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -XXX,XX +XXX,XX @@ static void scsi_aio_complete(void *opaque, int ret)
SCSIDiskReq *r = (SCSIDiskReq *)opaque;
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);

- /* The request must only run in the BlockBackend's AioContext */
- assert(blk_get_aio_context(s->qdev.conf.blk) ==
- qemu_get_current_aio_context());
+ /* The request must run in its AioContext */
+ assert(r->req.ctx == qemu_get_current_aio_context());

assert(r->req.aiocb != NULL);
r->req.aiocb = NULL;
@@ -XXX,XX +XXX,XX @@ static void scsi_dma_complete(void *opaque, int ret)

static void scsi_read_complete_noio(SCSIDiskReq *r, int ret)
{
- SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
uint32_t n;

- /* The request must only run in the BlockBackend's AioContext */
- assert(blk_get_aio_context(s->qdev.conf.blk) ==
- qemu_get_current_aio_context());
+ /* The request must run in its AioContext */
+ assert(r->req.ctx == qemu_get_current_aio_context());

assert(r->req.aiocb == NULL);
if (scsi_disk_req_check_error(r, ret, ret > 0)) {
@@ -XXX,XX +XXX,XX @@ static void scsi_read_data(SCSIRequest *req)

static void scsi_write_complete_noio(SCSIDiskReq *r, int ret)
{
- SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
uint32_t n;

- /* The request must only run in the BlockBackend's AioContext */
- assert(blk_get_aio_context(s->qdev.conf.blk) ==
- qemu_get_current_aio_context());
+ /* The request must run in its AioContext */
+ assert(r->req.ctx == qemu_get_current_aio_context());

assert (r->req.aiocb == NULL);
if (scsi_disk_req_check_error(r, ret, ret > 0)) {
--
2.48.1
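The job API in the series above follows a single convention: every public function gains a _locked() twin that expects job_mutex to be held, and the unlocked entry point becomes JOB_LOCK_GUARD() plus a call to the twin. What follows is a minimal sketch of a caller written against that convention; the query_job_status() helper is hypothetical and not part of the series, and the header path is assumed to be QEMU's include/qemu/job.h:

    #include "qemu/osdep.h"
    #include "qemu/job.h"   /* Job, JOB_LOCK_GUARD(), job_*_locked() */

    /* Hypothetical caller: reads several job fields under one critical
     * section so the values stay coherent with each other. */
    static bool query_job_status(const char *id, bool *is_ready)
    {
        JOB_LOCK_GUARD();   /* takes job_mutex for the rest of the scope */

        Job *job = job_get_locked(id);
        if (!job) {
            return false;
        }

        /* The _locked() variants avoid retaking job_mutex per call. */
        *is_ready = job_is_ready_locked(job);
        return !job_is_completed_locked(job);
    }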
1
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>

These public functions are not used anywhere and can thus be dropped.

Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
Message-Id: <20220926093214.506243-21-eesposit@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
include/block/blockjob.h | 31 ++++++++++++-------------------
blockjob.c | 16 ++--------------
2 files changed, 14 insertions(+), 33 deletions(-)

From: Stefan Hajnoczi <stefanha@redhat.com>

SCSIDevice keeps track of in-flight requests for device reset and Task
Management Functions (TMFs). The request list requires protection so
that multi-threaded SCSI emulation can be implemented in commits that
follow.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20250311132616.1049687-5-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
include/hw/scsi/scsi.h | 7 ++-
hw/scsi/scsi-bus.c | 120 +++++++++++++++++++++++++++++------------
2 files changed, 88 insertions(+), 39 deletions(-)
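The heart of the change below is a conventional pattern: a QemuMutex guarding a QTAILQ of in-flight requests, with every list access wrapped in WITH_QEMU_LOCK_GUARD(). A standalone sketch of that pattern, under the assumption of QEMU's qemu/lockable.h and qemu/queue.h APIs; the Dev and Req types are illustrative stand-ins, not QEMU's own:

    #include "qemu/osdep.h"
    #include "qemu/lockable.h"
    #include "qemu/queue.h"
    #include "qemu/thread.h"

    typedef struct Req Req;
    struct Req {
        QTAILQ_ENTRY(Req) next;
    };

    typedef struct {
        QemuMutex requests_lock;        /* protects the requests list */
        QTAILQ_HEAD(, Req) requests;
    } Dev;

    static void dev_init(Dev *d)
    {
        qemu_mutex_init(&d->requests_lock);
        QTAILQ_INIT(&d->requests);
    }

    /* Mirrors scsi_req_enqueue_internal(): insert under the lock. */
    static void dev_enqueue(Dev *d, Req *r)
    {
        WITH_QEMU_LOCK_GUARD(&d->requests_lock) {
            QTAILQ_INSERT_TAIL(&d->requests, r, next);
        }
    }

    /* Mirrors scsi_req_dequeue(): remove under the same lock. */
    static void dev_dequeue(Dev *d, Req *r)
    {
        WITH_QEMU_LOCK_GUARD(&d->requests_lock) {
            QTAILQ_REMOVE(&d->requests, r, next);
        }
    }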
15
16
16
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
17
diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
17
index XXXXXXX..XXXXXXX 100644
18
index XXXXXXX..XXXXXXX 100644
18
--- a/include/block/blockjob.h
19
--- a/include/hw/scsi/scsi.h
19
+++ b/include/block/blockjob.h
20
+++ b/include/hw/scsi/scsi.h
20
@@ -XXX,XX +XXX,XX @@ typedef struct BlockJob {
21
@@ -XXX,XX +XXX,XX @@ struct SCSIRequest {
22
bool dma_started;
23
BlockAIOCB *aiocb;
24
QEMUSGList *sg;
25
+
26
+ /* Protected by SCSIDevice->requests_lock */
27
QTAILQ_ENTRY(SCSIRequest) next;
28
};
29
30
@@ -XXX,XX +XXX,XX @@ struct SCSIDevice
31
uint8_t sense[SCSI_SENSE_BUF_SIZE];
32
uint32_t sense_len;
33
34
- /*
35
- * The requests list is only accessed from the AioContext that executes
36
- * requests or from the main loop when IOThread processing is stopped.
37
- */
38
+ QemuMutex requests_lock; /* protects the requests list */
39
QTAILQ_HEAD(, SCSIRequest) requests;
40
41
uint32_t channel;
42
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
43
index XXXXXXX..XXXXXXX 100644
44
--- a/hw/scsi/scsi-bus.c
45
+++ b/hw/scsi/scsi-bus.c
46
@@ -XXX,XX +XXX,XX @@ static void scsi_device_for_each_req_sync(SCSIDevice *s,
47
assert(!runstate_is_running());
48
assert(qemu_in_main_thread());
49
50
- QTAILQ_FOREACH_SAFE(req, &s->requests, next, next_req) {
51
- fn(req, opaque);
52
+ /*
53
+ * Locking is not necessary because the guest is stopped and no other
54
+ * threads can be accessing the requests list, but take the lock for
55
+ * consistency.
56
+ */
57
+ WITH_QEMU_LOCK_GUARD(&s->requests_lock) {
58
+ QTAILQ_FOREACH_SAFE(req, &s->requests, next, next_req) {
59
+ fn(req, opaque);
60
+ }
61
}
62
}
63
64
@@ -XXX,XX +XXX,XX @@ static void scsi_device_for_each_req_async_bh(void *opaque)
65
{
66
g_autofree SCSIDeviceForEachReqAsyncData *data = opaque;
67
SCSIDevice *s = data->s;
68
- AioContext *ctx;
69
- SCSIRequest *req;
70
- SCSIRequest *next;
71
+ g_autoptr(GList) reqs = NULL;
72
73
/*
74
- * The BB cannot have changed contexts between this BH being scheduled and
75
- * now: BBs' AioContexts, when they have a node attached, can only be
76
- * changed via bdrv_try_change_aio_context(), in a drained section. While
77
- * we have the in-flight counter incremented, that drain must block.
78
+ * Build a list of requests in this AioContext so fn() can be invoked later
79
+ * outside requests_lock.
80
*/
81
- ctx = blk_get_aio_context(s->conf.blk);
82
- assert(ctx == qemu_get_current_aio_context());
83
+ WITH_QEMU_LOCK_GUARD(&s->requests_lock) {
84
+ AioContext *ctx = qemu_get_current_aio_context();
85
+ SCSIRequest *req;
86
+ SCSIRequest *next;
87
+
88
+ QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) {
89
+ if (req->ctx == ctx) {
90
+ scsi_req_ref(req); /* dropped after calling fn() */
91
+ reqs = g_list_prepend(reqs, req);
92
+ }
93
+ }
94
+ }
95
96
- QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) {
97
- data->fn(req, data->fn_opaque);
98
+ /* Call fn() on each request */
99
+ for (GList *elem = g_list_first(reqs); elem; elem = g_list_next(elem)) {
100
+ data->fn(elem->data, data->fn_opaque);
101
+ scsi_req_unref(elem->data);
102
}
103
104
/* Drop the reference taken by scsi_device_for_each_req_async() */
105
@@ -XXX,XX +XXX,XX @@ static void scsi_device_for_each_req_async_bh(void *opaque)
106
blk_dec_in_flight(s->conf.blk);
107
}
108
109
+static void scsi_device_for_each_req_async_do_ctx(gpointer key, gpointer value,
110
+ gpointer user_data)
111
+{
112
+ AioContext *ctx = key;
113
+ SCSIDeviceForEachReqAsyncData *params = user_data;
114
+ SCSIDeviceForEachReqAsyncData *data;
115
+
116
+ data = g_new(SCSIDeviceForEachReqAsyncData, 1);
117
+ data->s = params->s;
118
+ data->fn = params->fn;
119
+ data->fn_opaque = params->fn_opaque;
120
+
121
+ /*
122
+ * Hold a reference to the SCSIDevice until
123
+ * scsi_device_for_each_req_async_bh() finishes.
124
+ */
125
+ object_ref(OBJECT(data->s));
126
+
127
+ /* Paired with scsi_device_for_each_req_async_bh() */
128
+ blk_inc_in_flight(data->s->conf.blk);
129
+
130
+ aio_bh_schedule_oneshot(ctx, scsi_device_for_each_req_async_bh, data);
131
+}
132
+
133
/*
134
* Schedule @fn() to be invoked for each enqueued request in device @s. @fn()
135
- * runs in the AioContext that is executing the request.
136
+ * must be thread-safe because it runs concurrently in each AioContext that is
137
+ * executing a request.
138
+ *
139
* Keeps the BlockBackend's in-flight counter incremented until everything is
140
* done, so draining it will settle all scheduled @fn() calls.
21
*/
141
*/
22
142
@@ -XXX,XX +XXX,XX @@ static void scsi_device_for_each_req_async(SCSIDevice *s,
23
/**
143
{
24
- * block_job_next:
144
assert(qemu_in_main_thread());
25
+ * block_job_next_locked:
145
26
* @job: A block job, or %NULL.
146
- SCSIDeviceForEachReqAsyncData *data =
27
*
147
- g_new(SCSIDeviceForEachReqAsyncData, 1);
28
* Get the next element from the list of block jobs after @job, or the
29
* first one if @job is %NULL.
30
*
31
* Returns the requested job, or %NULL if there are no more jobs left.
32
+ * Called with job lock held.
33
*/
34
-BlockJob *block_job_next(BlockJob *job);
35
-
148
-
36
-/* Same as block_job_next(), but called with job lock held. */
149
- data->s = s;
37
BlockJob *block_job_next_locked(BlockJob *job);
150
- data->fn = fn;
38
151
- data->fn_opaque = opaque;
39
/**
40
@@ -XXX,XX +XXX,XX @@ BlockJob *block_job_next_locked(BlockJob *job);
41
* Get the block job identified by @id (which must not be %NULL).
42
*
43
* Returns the requested job, or %NULL if it doesn't exist.
44
+ * Called with job lock *not* held.
45
*/
46
BlockJob *block_job_get(const char *id);
47
48
@@ -XXX,XX +XXX,XX @@ void block_job_remove_all_bdrv(BlockJob *job);
49
bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs);
50
51
/**
52
- * block_job_set_speed:
53
+ * block_job_set_speed_locked:
54
* @job: The job to set the speed for.
55
* @speed: The new value
56
* @errp: Error object.
57
*
58
* Set a rate-limiting parameter for the job; the actual meaning may
59
* vary depending on the job type.
60
- */
61
-bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp);
62
-
152
-
63
-/*
153
- /*
64
- * Same as block_job_set_speed(), but called with job lock held.
154
- * Hold a reference to the SCSIDevice until
65
- * Might release the lock temporarily.
155
- * scsi_device_for_each_req_async_bh() finishes.
66
+ *
156
- */
67
+ * Called with job lock held, but might release it temporarily.
157
- object_ref(OBJECT(s));
68
*/
158
+ /* The set of AioContexts where the requests are being processed */
69
bool block_job_set_speed_locked(BlockJob *job, int64_t speed, Error **errp);
159
+ g_autoptr(GHashTable) aio_contexts = g_hash_table_new(NULL, NULL);
70
160
+ WITH_QEMU_LOCK_GUARD(&s->requests_lock) {
71
/**
161
+ SCSIRequest *req;
72
- * block_job_query:
162
+ QTAILQ_FOREACH(req, &s->requests, next) {
73
+ * block_job_query_locked:
163
+ g_hash_table_add(aio_contexts, req->ctx);
74
* @job: The job to get information about.
164
+ }
75
*
165
+ }
76
* Return information about a job.
166
77
+ *
167
- /* Paired with blk_dec_in_flight() in scsi_device_for_each_req_async_bh() */
78
+ * Called with job lock held.
168
- blk_inc_in_flight(s->conf.blk);
79
*/
169
- aio_bh_schedule_oneshot(blk_get_aio_context(s->conf.blk),
80
-BlockJobInfo *block_job_query(BlockJob *job, Error **errp);
170
- scsi_device_for_each_req_async_bh,
81
-
171
- data);
82
-/* Same as block_job_query(), but called with job lock held. */
172
+ /* Schedule a BH for each AioContext */
83
BlockJobInfo *block_job_query_locked(BlockJob *job, Error **errp);
173
+ SCSIDeviceForEachReqAsyncData params = {
84
174
+ .s = s,
85
/**
175
+ .fn = fn,
86
- * block_job_iostatus_reset:
176
+ .fn_opaque = opaque,
87
+ * block_job_iostatus_reset_locked:
177
+ };
88
* @job: The job whose I/O status should be reset.
178
+ g_hash_table_foreach(
89
*
179
+ aio_contexts,
90
* Reset I/O status on @job and on BlockDriverState objects it uses,
180
+ scsi_device_for_each_req_async_do_ctx,
91
* other than job->blk.
181
+ &params
92
+ *
182
+ );
93
+ * Called with job lock held.
183
}
94
*/
184
95
-void block_job_iostatus_reset(BlockJob *job);
185
static void scsi_device_realize(SCSIDevice *s, Error **errp)
96
-
186
@@ -XXX,XX +XXX,XX @@ static void scsi_qdev_realize(DeviceState *qdev, Error **errp)
97
-/* Same as block_job_iostatus_reset(), but called with job lock held. */
187
dev->lun = lun;
98
void block_job_iostatus_reset_locked(BlockJob *job);
188
}
99
189
100
/*
190
+ qemu_mutex_init(&dev->requests_lock);
101
diff --git a/blockjob.c b/blockjob.c
191
QTAILQ_INIT(&dev->requests);
102
index XXXXXXX..XXXXXXX 100644
192
scsi_device_realize(dev, &local_err);
103
--- a/blockjob.c
193
if (local_err) {
104
+++ b/blockjob.c
194
@@ -XXX,XX +XXX,XX @@ static void scsi_qdev_unrealize(DeviceState *qdev)
105
@@ -XXX,XX +XXX,XX @@ BlockJob *block_job_next_locked(BlockJob *bjob)
195
106
return job ? container_of(job, BlockJob, job) : NULL;
196
scsi_device_purge_requests(dev, SENSE_CODE(NO_SENSE));
107
}
197
108
198
+ qemu_mutex_destroy(&dev->requests_lock);
109
-BlockJob *block_job_next(BlockJob *bjob)
199
+
110
-{
200
scsi_device_unrealize(dev);
111
- JOB_LOCK_GUARD();
201
112
- return block_job_next_locked(bjob);
202
blockdev_mark_auto_del(dev->conf.blk);
113
-}
203
@@ -XXX,XX +XXX,XX @@ static void scsi_req_enqueue_internal(SCSIRequest *req)
114
-
204
req->sg = NULL;
115
BlockJob *block_job_get_locked(const char *id)
205
}
206
req->enqueued = true;
207
- QTAILQ_INSERT_TAIL(&req->dev->requests, req, next);
208
+
209
+ WITH_QEMU_LOCK_GUARD(&req->dev->requests_lock) {
210
+ QTAILQ_INSERT_TAIL(&req->dev->requests, req, next);
211
+ }
212
}
213
214
int32_t scsi_req_enqueue(SCSIRequest *req)
215
@@ -XXX,XX +XXX,XX @@ static void scsi_req_dequeue(SCSIRequest *req)
216
trace_scsi_req_dequeue(req->dev->id, req->lun, req->tag);
217
req->retry = false;
218
if (req->enqueued) {
219
- QTAILQ_REMOVE(&req->dev->requests, req, next);
220
+ WITH_QEMU_LOCK_GUARD(&req->dev->requests_lock) {
221
+ QTAILQ_REMOVE(&req->dev->requests, req, next);
222
+ }
223
req->enqueued = false;
224
scsi_req_unref(req);
225
}
226
@@ -XXX,XX +XXX,XX @@ static void scsi_device_class_init(ObjectClass *klass, void *data)
227
228
static void scsi_dev_instance_init(Object *obj)
116
{
229
{
117
Job *job = job_get_locked(id);
230
- DeviceState *dev = DEVICE(obj);
118
@@ -XXX,XX +XXX,XX @@ bool block_job_set_speed_locked(BlockJob *job, int64_t speed, Error **errp)
231
- SCSIDevice *s = SCSI_DEVICE(dev);
119
return true;
232
+ SCSIDevice *s = SCSI_DEVICE(obj);
120
}
233
121
234
device_add_bootindex_property(obj, &s->conf.bootindex,
122
-bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
235
"bootindex", NULL,
123
+static bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
124
{
125
JOB_LOCK_GUARD();
126
return block_job_set_speed_locked(job, speed, errp);
127
@@ -XXX,XX +XXX,XX @@ BlockJobInfo *block_job_query_locked(BlockJob *job, Error **errp)
128
return info;
129
}
130
131
-BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
132
-{
133
- JOB_LOCK_GUARD();
134
- return block_job_query_locked(job, errp);
135
-}
136
-
137
/* Called with job lock held */
138
static void block_job_iostatus_set_err_locked(BlockJob *job, int error)
139
{
140
@@ -XXX,XX +XXX,XX @@ void block_job_iostatus_reset_locked(BlockJob *job)
141
job->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
142
}
143
144
-void block_job_iostatus_reset(BlockJob *job)
145
+static void block_job_iostatus_reset(BlockJob *job)
146
{
147
JOB_LOCK_GUARD();
148
block_job_iostatus_reset_locked(job);
149
--
2.37.3

--
2.48.1
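One detail of the scsi-bus.c change above is worth spelling out: scsi_device_for_each_req_async_bh() snapshots the matching requests into a GList while holding requests_lock, then runs the callback only after the lock is dropped, so the callback itself may enqueue or dequeue requests without deadlocking. A generic sketch of that shape with illustrative names (the real code additionally takes a reference on each request to keep it alive across the callback; that is omitted here):

    #include "qemu/osdep.h"
    #include "qemu/lockable.h"
    #include "qemu/queue.h"
    #include "qemu/thread.h"

    typedef struct Item Item;
    struct Item {
        QTAILQ_ENTRY(Item) next;
    };

    typedef struct {
        QemuMutex lock;
        QTAILQ_HEAD(, Item) items;
    } ItemList;

    static void for_each_item(ItemList *l,
                              void (*fn)(Item *, void *), void *opaque)
    {
        g_autoptr(GList) snapshot = NULL;
        Item *it;

        /* Build a snapshot while holding the lock... */
        WITH_QEMU_LOCK_GUARD(&l->lock) {
            QTAILQ_FOREACH(it, &l->items, next) {
                snapshot = g_list_prepend(snapshot, it);
            }
        }

        /* ...then invoke the callback with the lock dropped, so fn() can
         * itself add or remove items without deadlocking. */
        for (GList *elem = snapshot; elem; elem = elem->next) {
            fn(elem->data, opaque);
        }
    }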
1
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
1
From: Stefan Hajnoczi <stefanha@redhat.com>
2
2
3
They all are called with job_lock held, in job_event_*_locked()
3
Virtqueues are not thread-safe. Until now this was not a major issue
4
4
since all virtqueue processing happened in the same thread. The ctrl
5
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
5
queue's Task Management Function (TMF) requests sometimes need the main
6
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
6
loop, so a BH was used to schedule the virtqueue completion back in the
7
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
7
thread that has virtqueue access.
8
9
When IOThread Virtqueue Mapping is introduced in later commits, event
10
and ctrl virtqueue accesses from other threads will become necessary.
11
Introduce an optional per-virtqueue lock so the event and ctrl
12
virtqueues can be protected in the commits that follow.
13
14
The addition of the ctrl virtqueue lock makes
15
virtio_scsi_complete_req_from_main_loop() and its BH unnecessary.
16
Instead, take the ctrl virtqueue lock from the main loop thread.
17
18
The cmd virtqueue does not have a lock because the entirety of SCSI
19
command processing happens in one thread. Only one thread accesses the
20
cmd virtqueue, so a lock is unnecessary.
21
22
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
8
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
23
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
9
Message-Id: <20220926093214.506243-16-eesposit@redhat.com>
24
Message-ID: <20250311132616.1049687-6-stefanha@redhat.com>
10
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
25
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
11
---
26
---
12
blockjob.c | 25 +++++++++++++++----------
27
include/hw/virtio/virtio-scsi.h | 3 ++
13
1 file changed, 15 insertions(+), 10 deletions(-)
28
hw/scsi/virtio-scsi.c | 84 ++++++++++++++++++---------------
14
29
2 files changed, 49 insertions(+), 38 deletions(-)
15
diff --git a/blockjob.c b/blockjob.c
30
31
diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
16
index XXXXXXX..XXXXXXX 100644
32
index XXXXXXX..XXXXXXX 100644
17
--- a/blockjob.c
33
--- a/include/hw/virtio/virtio-scsi.h
18
+++ b/blockjob.c
34
+++ b/include/hw/virtio/virtio-scsi.h
19
@@ -XXX,XX +XXX,XX @@ int block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs,
35
@@ -XXX,XX +XXX,XX @@ struct VirtIOSCSI {
36
int resetting; /* written from main loop thread, read from any thread */
37
bool events_dropped;
38
39
+ QemuMutex ctrl_lock; /* protects ctrl_vq */
40
+ QemuMutex event_lock; /* protects event_vq */
41
+
42
/*
43
* TMFs deferred to main loop BH. These fields are protected by
44
* tmf_bh_lock.
45
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
46
index XXXXXXX..XXXXXXX 100644
47
--- a/hw/scsi/virtio-scsi.c
48
+++ b/hw/scsi/virtio-scsi.c
49
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_free_req(VirtIOSCSIReq *req)
50
g_free(req);
51
}
52
53
-static void virtio_scsi_complete_req(VirtIOSCSIReq *req)
54
+static void virtio_scsi_complete_req(VirtIOSCSIReq *req, QemuMutex *vq_lock)
55
{
56
VirtIOSCSI *s = req->dev;
57
VirtQueue *vq = req->vq;
58
VirtIODevice *vdev = VIRTIO_DEVICE(s);
59
60
qemu_iovec_from_buf(&req->resp_iov, 0, &req->resp, req->resp_size);
61
+
62
+ if (vq_lock) {
63
+ qemu_mutex_lock(vq_lock);
64
+ }
65
+
66
virtqueue_push(vq, &req->elem, req->qsgl.size + req->resp_iov.size);
67
if (s->dataplane_started && !s->dataplane_fenced) {
68
virtio_notify_irqfd(vdev, vq);
69
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_complete_req(VirtIOSCSIReq *req)
70
virtio_notify(vdev, vq);
71
}
72
73
+ if (vq_lock) {
74
+ qemu_mutex_unlock(vq_lock);
75
+ }
76
+
77
if (req->sreq) {
78
req->sreq->hba_private = NULL;
79
scsi_req_unref(req->sreq);
80
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_complete_req(VirtIOSCSIReq *req)
81
virtio_scsi_free_req(req);
82
}
83
84
-static void virtio_scsi_complete_req_bh(void *opaque)
85
+static void virtio_scsi_bad_req(VirtIOSCSIReq *req, QemuMutex *vq_lock)
86
{
87
- VirtIOSCSIReq *req = opaque;
88
+ virtio_error(VIRTIO_DEVICE(req->dev), "wrong size for virtio-scsi headers");
89
90
- virtio_scsi_complete_req(req);
91
-}
92
+ if (vq_lock) {
93
+ qemu_mutex_lock(vq_lock);
94
+ }
95
96
-/*
97
- * Called from virtio_scsi_do_one_tmf_bh() in main loop thread. The main loop
98
- * thread cannot touch the virtqueue since that could race with an IOThread.
99
- */
100
-static void virtio_scsi_complete_req_from_main_loop(VirtIOSCSIReq *req)
101
-{
102
- VirtIOSCSI *s = req->dev;
103
+ virtqueue_detach_element(req->vq, &req->elem, 0);
104
105
- if (!s->ctx || s->ctx == qemu_get_aio_context()) {
106
- /* No need to schedule a BH when there is no IOThread */
107
- virtio_scsi_complete_req(req);
108
- } else {
109
- /* Run request completion in the IOThread */
110
- aio_wait_bh_oneshot(s->ctx, virtio_scsi_complete_req_bh, req);
111
+ if (vq_lock) {
112
+ qemu_mutex_unlock(vq_lock);
113
}
114
-}
115
116
-static void virtio_scsi_bad_req(VirtIOSCSIReq *req)
117
-{
118
- virtio_error(VIRTIO_DEVICE(req->dev), "wrong size for virtio-scsi headers");
119
- virtqueue_detach_element(req->vq, &req->elem, 0);
120
virtio_scsi_free_req(req);
121
}
122
123
@@ -XXX,XX +XXX,XX @@ static int virtio_scsi_parse_req(VirtIOSCSIReq *req,
20
return 0;
124
return 0;
21
}
125
}
22
126
23
-static void block_job_on_idle(Notifier *n, void *opaque)
127
-static VirtIOSCSIReq *virtio_scsi_pop_req(VirtIOSCSI *s, VirtQueue *vq)
24
+/* Called with job_mutex lock held. */
128
+static VirtIOSCSIReq *virtio_scsi_pop_req(VirtIOSCSI *s, VirtQueue *vq, QemuMutex *vq_lock)
25
+static void block_job_on_idle_locked(Notifier *n, void *opaque)
129
{
26
{
130
VirtIOSCSICommon *vs = (VirtIOSCSICommon *)s;
27
aio_wait_kick();
131
VirtIOSCSIReq *req;
28
}
132
29
@@ -XXX,XX +XXX,XX @@ static void block_job_iostatus_set_err(BlockJob *job, int error)
133
+ if (vq_lock) {
30
}
134
+ qemu_mutex_lock(vq_lock);
31
}
135
+ }
32
136
+
33
-static void block_job_event_cancelled(Notifier *n, void *opaque)
137
req = virtqueue_pop(vq, sizeof(VirtIOSCSIReq) + vs->cdb_size);
34
+/* Called with job_mutex lock held. */
138
+
35
+static void block_job_event_cancelled_locked(Notifier *n, void *opaque)
139
+ if (vq_lock) {
36
{
140
+ qemu_mutex_unlock(vq_lock);
37
BlockJob *job = opaque;
141
+ }
38
uint64_t progress_current, progress_total;
142
+
39
@@ -XXX,XX +XXX,XX @@ static void block_job_event_cancelled(Notifier *n, void *opaque)
143
if (!req) {
40
job->speed);
144
return NULL;
41
}
145
}
42
146
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_cancel_notify(Notifier *notifier, void *data)
43
-static void block_job_event_completed(Notifier *n, void *opaque)
147
44
+/* Called with job_mutex lock held. */
148
trace_virtio_scsi_tmf_resp(virtio_scsi_get_lun(req->req.tmf.lun),
45
+static void block_job_event_completed_locked(Notifier *n, void *opaque)
149
req->req.tmf.tag, req->resp.tmf.response);
46
{
150
- virtio_scsi_complete_req(req);
47
BlockJob *job = opaque;
151
+ virtio_scsi_complete_req(req, &req->dev->ctrl_lock);
48
const char *msg = NULL;
152
}
49
@@ -XXX,XX +XXX,XX @@ static void block_job_event_completed(Notifier *n, void *opaque)
153
g_free(n);
50
msg);
154
}
51
}
155
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_do_one_tmf_bh(VirtIOSCSIReq *req)
52
156
53
-static void block_job_event_pending(Notifier *n, void *opaque)
157
out:
54
+/* Called with job_mutex lock held. */
158
object_unref(OBJECT(d));
55
+static void block_job_event_pending_locked(Notifier *n, void *opaque)
159
- virtio_scsi_complete_req_from_main_loop(req);
56
{
160
+ virtio_scsi_complete_req(req, &s->ctrl_lock);
57
BlockJob *job = opaque;
161
}
58
162
59
@@ -XXX,XX +XXX,XX @@ static void block_job_event_pending(Notifier *n, void *opaque)
163
/* Some TMFs must be processed from the main loop thread */
60
job->job.id);
164
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_reset_tmf_bh(VirtIOSCSI *s)
61
}
165
62
166
/* SAM-6 6.3.2 Hard reset */
63
-static void block_job_event_ready(Notifier *n, void *opaque)
167
req->resp.tmf.response = VIRTIO_SCSI_S_TARGET_FAILURE;
64
+/* Called with job_mutex lock held. */
168
- virtio_scsi_complete_req(req);
65
+static void block_job_event_ready_locked(Notifier *n, void *opaque)
169
+ virtio_scsi_complete_req(req, &req->dev->ctrl_lock);
66
{
170
}
67
BlockJob *job = opaque;
171
}
68
uint64_t progress_current, progress_total;
172
69
@@ -XXX,XX +XXX,XX @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
173
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_ctrl_req(VirtIOSCSI *s, VirtIOSCSIReq *req)
70
174
71
ratelimit_init(&job->limit);
175
if (iov_to_buf(req->elem.out_sg, req->elem.out_num, 0,
72
176
&type, sizeof(type)) < sizeof(type)) {
73
- job->finalize_cancelled_notifier.notify = block_job_event_cancelled;
177
- virtio_scsi_bad_req(req);
74
- job->finalize_completed_notifier.notify = block_job_event_completed;
178
+ virtio_scsi_bad_req(req, &s->ctrl_lock);
75
- job->pending_notifier.notify = block_job_event_pending;
179
return;
76
- job->ready_notifier.notify = block_job_event_ready;
180
}
77
- job->idle_notifier.notify = block_job_on_idle;
181
78
+ job->finalize_cancelled_notifier.notify = block_job_event_cancelled_locked;
182
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_ctrl_req(VirtIOSCSI *s, VirtIOSCSIReq *req)
79
+ job->finalize_completed_notifier.notify = block_job_event_completed_locked;
183
if (type == VIRTIO_SCSI_T_TMF) {
80
+ job->pending_notifier.notify = block_job_event_pending_locked;
184
if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICtrlTMFReq),
81
+ job->ready_notifier.notify = block_job_event_ready_locked;
185
sizeof(VirtIOSCSICtrlTMFResp)) < 0) {
82
+ job->idle_notifier.notify = block_job_on_idle_locked;
186
- virtio_scsi_bad_req(req);
83
187
+ virtio_scsi_bad_req(req, &s->ctrl_lock);
84
WITH_JOB_LOCK_GUARD() {
188
return;
85
notifier_list_add(&job->job.on_finalize_cancelled,
189
} else {
190
r = virtio_scsi_do_tmf(s, req);
191
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_ctrl_req(VirtIOSCSI *s, VirtIOSCSIReq *req)
192
type == VIRTIO_SCSI_T_AN_SUBSCRIBE) {
193
if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICtrlANReq),
194
sizeof(VirtIOSCSICtrlANResp)) < 0) {
195
- virtio_scsi_bad_req(req);
196
+ virtio_scsi_bad_req(req, &s->ctrl_lock);
197
return;
198
} else {
199
req->req.an.event_requested =
200
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_ctrl_req(VirtIOSCSI *s, VirtIOSCSIReq *req)
201
type == VIRTIO_SCSI_T_AN_SUBSCRIBE)
202
trace_virtio_scsi_an_resp(virtio_scsi_get_lun(req->req.an.lun),
203
req->resp.an.response);
204
- virtio_scsi_complete_req(req);
205
+ virtio_scsi_complete_req(req, &s->ctrl_lock);
206
} else {
207
assert(r == -EINPROGRESS);
208
}
209
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq)
210
{
211
VirtIOSCSIReq *req;
212
213
- while ((req = virtio_scsi_pop_req(s, vq))) {
214
+ while ((req = virtio_scsi_pop_req(s, vq, &s->ctrl_lock))) {
215
virtio_scsi_handle_ctrl_req(s, req);
216
}
217
}
218
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_complete_cmd_req(VirtIOSCSIReq *req)
219
* in virtio_scsi_command_complete.
220
*/
221
req->resp_size = sizeof(VirtIOSCSICmdResp);
222
- virtio_scsi_complete_req(req);
223
+ virtio_scsi_complete_req(req, NULL);
224
}
225
226
static void virtio_scsi_command_failed(SCSIRequest *r)
227
@@ -XXX,XX +XXX,XX @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req)
228
virtio_scsi_fail_cmd_req(req);
229
return -ENOTSUP;
230
} else {
231
- virtio_scsi_bad_req(req);
232
+ virtio_scsi_bad_req(req, NULL);
233
return -EINVAL;
234
}
235
}
236
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
237
virtio_queue_set_notification(vq, 0);
238
}
239
240
- while ((req = virtio_scsi_pop_req(s, vq))) {
241
+ while ((req = virtio_scsi_pop_req(s, vq, NULL))) {
242
ret = virtio_scsi_handle_cmd_req_prepare(s, req);
243
if (!ret) {
244
QTAILQ_INSERT_TAIL(&reqs, req, next);
245
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_push_event(VirtIOSCSI *s,
246
return;
247
}
248
249
- req = virtio_scsi_pop_req(s, vs->event_vq);
250
+ req = virtio_scsi_pop_req(s, vs->event_vq, &s->event_lock);
251
if (!req) {
252
s->events_dropped = true;
253
return;
254
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_push_event(VirtIOSCSI *s,
255
}
256
257
if (virtio_scsi_parse_req(req, 0, sizeof(VirtIOSCSIEvent))) {
258
- virtio_scsi_bad_req(req);
259
+ virtio_scsi_bad_req(req, &s->event_lock);
260
return;
261
}
262
263
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_push_event(VirtIOSCSI *s,
264
}
265
trace_virtio_scsi_event(virtio_scsi_get_lun(evt->lun), event, reason);
266
267
- virtio_scsi_complete_req(req);
268
+ virtio_scsi_complete_req(req, &s->event_lock);
269
}
270
271
static void virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq)
272
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_device_realize(DeviceState *dev, Error **errp)
273
Error *err = NULL;
274
275
QTAILQ_INIT(&s->tmf_bh_list);
276
+ qemu_mutex_init(&s->ctrl_lock);
277
+ qemu_mutex_init(&s->event_lock);
278
qemu_mutex_init(&s->tmf_bh_lock);
279
280
virtio_scsi_common_realize(dev,
281
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_device_unrealize(DeviceState *dev)
282
qbus_set_hotplug_handler(BUS(&s->bus), NULL);
283
virtio_scsi_common_unrealize(dev);
284
qemu_mutex_destroy(&s->tmf_bh_lock);
285
+ qemu_mutex_destroy(&s->event_lock);
286
+ qemu_mutex_destroy(&s->ctrl_lock);
287
}
288
289
static const Property virtio_scsi_properties[] = {
86
--
290
--
87
2.37.3
291
2.48.1
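
The optional vq_lock argument introduced above follows a simple idiom:
callers that already run in the virtqueue's only thread pass NULL and the
locking vanishes. A sketch under that assumption (complete_one_req() is a
made-up name; the virtqueue work is reduced to a comment):

    static void complete_one_req(VirtIOSCSIReq *req, QemuMutex *vq_lock)
    {
        if (vq_lock) {
            qemu_mutex_lock(vq_lock);
        }

        /* virtqueue_push() and guest notification happen here */

        if (vq_lock) {
            qemu_mutex_unlock(vq_lock);
        }
    }

The cmd path passes NULL, the ctrl path passes &s->ctrl_lock and the event
path passes &s->event_lock, matching the call sites in the diff above.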
1
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
1
From: Stefan Hajnoczi <stefanha@redhat.com>
2
2
3
Add missing job synchronization in the unit tests, with
3
The block layer can invoke the resize callback from any AioContext that
4
explicit locks.
4
is processing requests. The virtqueue is already protected but the
5
events_dropped field also needs to be protected against races. Cover it
6
using the event virtqueue lock because it is closely associated with
7
accesses to the virtqueue.
5
8
6
We are deliberately using _locked functions wrapped by a guard
9
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
7
instead of a normal call because the normal call will be removed
8
in the future, since its only users are the tests.
9
10
In other words, if a function like job_pause() is/will be only used
11
in tests to avoid:
12
13
WITH_JOB_LOCK_GUARD(){
14
job_pause_locked();
15
}
16
17
then it is not worth keeping job_pause(), and just use the guard.
18
19
Note: at this stage, job_{lock/unlock} and job lock guard macros
20
are *nop*.
21
22
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
23
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
24
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
10
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
25
Message-Id: <20220926093214.506243-10-eesposit@redhat.com>
11
Message-ID: <20250311132616.1049687-7-stefanha@redhat.com>
26
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
27
---
13
---
28
tests/unit/test-bdrv-drain.c | 76 ++++++++++++--------
14
include/hw/virtio/virtio-scsi.h | 3 ++-
29
tests/unit/test-block-iothread.c | 8 ++-
15
hw/scsi/virtio-scsi.c | 29 ++++++++++++++++++++---------
30
tests/unit/test-blockjob-txn.c | 24 ++++---
16
2 files changed, 22 insertions(+), 10 deletions(-)
31
tests/unit/test-blockjob.c | 115 +++++++++++++++++++------------
32
4 files changed, 140 insertions(+), 83 deletions(-)
33
17
34
diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c
18
diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
35
index XXXXXXX..XXXXXXX 100644
19
index XXXXXXX..XXXXXXX 100644
36
--- a/tests/unit/test-bdrv-drain.c
20
--- a/include/hw/virtio/virtio-scsi.h
37
+++ b/tests/unit/test-bdrv-drain.c
21
+++ b/include/hw/virtio/virtio-scsi.h
38
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_common_drain_node(enum drain_type drain_type,
22
@@ -XXX,XX +XXX,XX @@ struct VirtIOSCSI {
39
}
23
24
SCSIBus bus;
25
int resetting; /* written from main loop thread, read from any thread */
26
+
27
+ QemuMutex event_lock; /* protects event_vq and events_dropped */
28
bool events_dropped;
29
30
QemuMutex ctrl_lock; /* protects ctrl_vq */
31
- QemuMutex event_lock; /* protects event_vq */
32
33
/*
34
* TMFs deferred to main loop BH. These fields are protected by
35
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
36
index XXXXXXX..XXXXXXX 100644
37
--- a/hw/scsi/virtio-scsi.c
38
+++ b/hw/scsi/virtio-scsi.c
39
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_reset(VirtIODevice *vdev)
40
41
vs->sense_size = VIRTIO_SCSI_SENSE_DEFAULT_SIZE;
42
vs->cdb_size = VIRTIO_SCSI_CDB_DEFAULT_SIZE;
43
- s->events_dropped = false;
44
+
45
+ WITH_QEMU_LOCK_GUARD(&s->event_lock) {
46
+ s->events_dropped = false;
47
+ }
48
}
49
50
typedef struct {
51
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_push_event(VirtIOSCSI *s,
40
}
52
}
41
53
42
- g_assert_cmpint(job->job.pause_count, ==, 0);
54
req = virtio_scsi_pop_req(s, vs->event_vq, &s->event_lock);
43
- g_assert_false(job->job.paused);
55
- if (!req) {
44
- g_assert_true(tjob->running);
56
- s->events_dropped = true;
45
- g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */
57
- return;
46
+ WITH_JOB_LOCK_GUARD() {
58
- }
47
+ g_assert_cmpint(job->job.pause_count, ==, 0);
59
+ WITH_QEMU_LOCK_GUARD(&s->event_lock) {
48
+ g_assert_false(job->job.paused);
60
+ if (!req) {
49
+ g_assert_true(tjob->running);
61
+ s->events_dropped = true;
50
+ g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */
62
+ return;
51
+ }
52
53
do_drain_begin_unlocked(drain_type, drain_bs);
54
55
- if (drain_type == BDRV_DRAIN_ALL) {
56
- /* bdrv_drain_all() drains both src and target */
57
- g_assert_cmpint(job->job.pause_count, ==, 2);
58
- } else {
59
- g_assert_cmpint(job->job.pause_count, ==, 1);
60
+ WITH_JOB_LOCK_GUARD() {
61
+ if (drain_type == BDRV_DRAIN_ALL) {
62
+ /* bdrv_drain_all() drains both src and target */
63
+ g_assert_cmpint(job->job.pause_count, ==, 2);
64
+ } else {
65
+ g_assert_cmpint(job->job.pause_count, ==, 1);
66
+ }
63
+ }
67
+ g_assert_true(job->job.paused);
64
68
+ g_assert_false(job->job.busy); /* The job is paused */
65
- if (s->events_dropped) {
69
}
66
- event |= VIRTIO_SCSI_T_EVENTS_MISSED;
70
- g_assert_true(job->job.paused);
67
- s->events_dropped = false;
71
- g_assert_false(job->job.busy); /* The job is paused */
68
+ if (s->events_dropped) {
72
69
+ event |= VIRTIO_SCSI_T_EVENTS_MISSED;
73
do_drain_end_unlocked(drain_type, drain_bs);
70
+ s->events_dropped = false;
74
75
if (use_iothread) {
76
- /* paused is reset in the I/O thread, wait for it */
77
+ /*
78
+ * Here we are waiting for the paused status to change,
79
+ * so don't bother protecting the read every time.
80
+ *
81
+ * paused is reset in the I/O thread, wait for it
82
+ */
83
while (job->job.paused) {
84
aio_poll(qemu_get_aio_context(), false);
85
}
86
}
87
88
- g_assert_cmpint(job->job.pause_count, ==, 0);
89
- g_assert_false(job->job.paused);
90
- g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */
91
+ WITH_JOB_LOCK_GUARD() {
92
+ g_assert_cmpint(job->job.pause_count, ==, 0);
93
+ g_assert_false(job->job.paused);
94
+ g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */
95
+ }
96
97
do_drain_begin_unlocked(drain_type, target);
98
99
- if (drain_type == BDRV_DRAIN_ALL) {
100
- /* bdrv_drain_all() drains both src and target */
101
- g_assert_cmpint(job->job.pause_count, ==, 2);
102
- } else {
103
- g_assert_cmpint(job->job.pause_count, ==, 1);
104
+ WITH_JOB_LOCK_GUARD() {
105
+ if (drain_type == BDRV_DRAIN_ALL) {
106
+ /* bdrv_drain_all() drains both src and target */
107
+ g_assert_cmpint(job->job.pause_count, ==, 2);
108
+ } else {
109
+ g_assert_cmpint(job->job.pause_count, ==, 1);
110
+ }
111
+ g_assert_true(job->job.paused);
112
+ g_assert_false(job->job.busy); /* The job is paused */
113
}
114
- g_assert_true(job->job.paused);
115
- g_assert_false(job->job.busy); /* The job is paused */
116
117
do_drain_end_unlocked(drain_type, target);
118
119
if (use_iothread) {
120
- /* paused is reset in the I/O thread, wait for it */
121
+ /*
122
+ * Here we are waiting for the paused status to change,
123
+ * so don't bother protecting the read every time.
124
+ *
125
+ * paused is reset in the I/O thread, wait for it
126
+ */
127
while (job->job.paused) {
128
aio_poll(qemu_get_aio_context(), false);
129
}
130
}
131
132
- g_assert_cmpint(job->job.pause_count, ==, 0);
133
- g_assert_false(job->job.paused);
134
- g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */
135
+ WITH_JOB_LOCK_GUARD() {
136
+ g_assert_cmpint(job->job.pause_count, ==, 0);
137
+ g_assert_false(job->job.paused);
138
+ g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */
139
+ }
140
141
aio_context_acquire(ctx);
142
- ret = job_complete_sync(&job->job, &error_abort);
143
+ WITH_JOB_LOCK_GUARD() {
144
+ ret = job_complete_sync_locked(&job->job, &error_abort);
145
+ }
146
g_assert_cmpint(ret, ==, (result == TEST_JOB_SUCCESS ? 0 : -EIO));
147
148
if (use_iothread) {
149
diff --git a/tests/unit/test-block-iothread.c b/tests/unit/test-block-iothread.c
150
index XXXXXXX..XXXXXXX 100644
151
--- a/tests/unit/test-block-iothread.c
152
+++ b/tests/unit/test-block-iothread.c
153
@@ -XXX,XX +XXX,XX @@ static void test_attach_blockjob(void)
154
}
155
156
aio_context_acquire(ctx);
157
- job_complete_sync(&tjob->common.job, &error_abort);
158
+ WITH_JOB_LOCK_GUARD() {
159
+ job_complete_sync_locked(&tjob->common.job, &error_abort);
160
+ }
161
blk_set_aio_context(blk, qemu_get_aio_context(), &error_abort);
162
aio_context_release(ctx);
163
164
@@ -XXX,XX +XXX,XX @@ static void test_propagate_mirror(void)
165
BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
166
false, "filter_node", MIRROR_COPY_MODE_BACKGROUND,
167
&error_abort);
168
- job = job_get("job0");
169
+ WITH_JOB_LOCK_GUARD() {
170
+ job = job_get_locked("job0");
171
+ }
172
filter = bdrv_find_node("filter_node");
173
174
/* Change the AioContext of src */
175
diff --git a/tests/unit/test-blockjob-txn.c b/tests/unit/test-blockjob-txn.c
176
index XXXXXXX..XXXXXXX 100644
177
--- a/tests/unit/test-blockjob-txn.c
178
+++ b/tests/unit/test-blockjob-txn.c
179
@@ -XXX,XX +XXX,XX @@ static void test_single_job(int expected)
180
job = test_block_job_start(1, true, expected, &result, txn);
181
job_start(&job->job);
182
183
- if (expected == -ECANCELED) {
184
- job_cancel(&job->job, false);
185
+ WITH_JOB_LOCK_GUARD() {
186
+ if (expected == -ECANCELED) {
187
+ job_cancel_locked(&job->job, false);
188
+ }
71
+ }
189
}
72
}
190
73
191
while (result == -EINPROGRESS) {
74
if (virtio_scsi_parse_req(req, 0, sizeof(VirtIOSCSIEvent))) {
192
@@ -XXX,XX +XXX,XX @@ static void test_pair_jobs(int expected1, int expected2)
75
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_push_event(VirtIOSCSI *s,
193
/* Release our reference now to trigger as many nice
76
194
* use-after-free bugs as possible.
77
static void virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq)
195
*/
78
{
196
- job_txn_unref(txn);
79
- if (s->events_dropped) {
197
+ WITH_JOB_LOCK_GUARD() {
80
+ bool events_dropped;
198
+ job_txn_unref_locked(txn);
81
+
199
82
+ WITH_QEMU_LOCK_GUARD(&s->event_lock) {
200
- if (expected1 == -ECANCELED) {
83
+ events_dropped = s->events_dropped;
201
- job_cancel(&job1->job, false);
202
- }
203
- if (expected2 == -ECANCELED) {
204
- job_cancel(&job2->job, false);
205
+ if (expected1 == -ECANCELED) {
206
+ job_cancel_locked(&job1->job, false);
207
+ }
208
+ if (expected2 == -ECANCELED) {
209
+ job_cancel_locked(&job2->job, false);
210
+ }
211
}
212
213
while (result1 == -EINPROGRESS || result2 == -EINPROGRESS) {
214
@@ -XXX,XX +XXX,XX @@ static void test_pair_jobs_fail_cancel_race(void)
215
job_start(&job1->job);
216
job_start(&job2->job);
217
218
- job_cancel(&job1->job, false);
219
+ WITH_JOB_LOCK_GUARD() {
220
+ job_cancel_locked(&job1->job, false);
221
+ }
222
223
/* Now make job2 finish before the main loop kicks jobs. This simulates
224
* the race between a pending kick and another job completing.
225
diff --git a/tests/unit/test-blockjob.c b/tests/unit/test-blockjob.c
226
index XXXXXXX..XXXXXXX 100644
227
--- a/tests/unit/test-blockjob.c
228
+++ b/tests/unit/test-blockjob.c
229
@@ -XXX,XX +XXX,XX @@ static CancelJob *create_common(Job **pjob)
230
bjob = mk_job(blk, "Steve", &test_cancel_driver, true,
231
JOB_MANUAL_FINALIZE | JOB_MANUAL_DISMISS);
232
job = &bjob->job;
233
- job_ref(job);
234
- assert(job->status == JOB_STATUS_CREATED);
235
+ WITH_JOB_LOCK_GUARD() {
236
+ job_ref_locked(job);
237
+ assert(job->status == JOB_STATUS_CREATED);
238
+ }
84
+ }
239
+
85
+
240
s = container_of(bjob, CancelJob, common);
86
+ if (events_dropped) {
241
s->blk = blk;
87
VirtIOSCSIEventInfo info = {
242
88
.event = VIRTIO_SCSI_T_NO_EVENT,
243
@@ -XXX,XX +XXX,XX @@ static void cancel_common(CancelJob *s)
89
};
244
aio_context_acquire(ctx);
245
246
job_cancel_sync(&job->job, true);
247
- if (sts != JOB_STATUS_CREATED && sts != JOB_STATUS_CONCLUDED) {
248
- Job *dummy = &job->job;
249
- job_dismiss(&dummy, &error_abort);
250
+ WITH_JOB_LOCK_GUARD() {
251
+ if (sts != JOB_STATUS_CREATED && sts != JOB_STATUS_CONCLUDED) {
252
+ Job *dummy = &job->job;
253
+ job_dismiss_locked(&dummy, &error_abort);
254
+ }
255
+ assert(job->job.status == JOB_STATUS_NULL);
256
+ job_unref_locked(&job->job);
257
}
258
- assert(job->job.status == JOB_STATUS_NULL);
259
- job_unref(&job->job);
260
destroy_blk(blk);
261
262
aio_context_release(ctx);
263
@@ -XXX,XX +XXX,XX @@ static void test_cancel_created(void)
264
cancel_common(s);
265
}
266
267
+static void assert_job_status_is(Job *job, int status)
268
+{
269
+ WITH_JOB_LOCK_GUARD() {
270
+ assert(job->status == status);
271
+ }
272
+}
273
+
274
static void test_cancel_running(void)
275
{
276
Job *job;
277
@@ -XXX,XX +XXX,XX @@ static void test_cancel_running(void)
278
s = create_common(&job);
279
280
job_start(job);
281
- assert(job->status == JOB_STATUS_RUNNING);
282
+ assert_job_status_is(job, JOB_STATUS_RUNNING);
283
284
cancel_common(s);
285
}
286
@@ -XXX,XX +XXX,XX @@ static void test_cancel_paused(void)
287
s = create_common(&job);
288
289
job_start(job);
290
- assert(job->status == JOB_STATUS_RUNNING);
291
-
292
- job_user_pause(job, &error_abort);
293
+ WITH_JOB_LOCK_GUARD() {
294
+ assert(job->status == JOB_STATUS_RUNNING);
295
+ job_user_pause_locked(job, &error_abort);
296
+ }
297
job_enter(job);
298
- assert(job->status == JOB_STATUS_PAUSED);
299
+ assert_job_status_is(job, JOB_STATUS_PAUSED);
300
301
cancel_common(s);
302
}
303
@@ -XXX,XX +XXX,XX @@ static void test_cancel_ready(void)
304
s = create_common(&job);
305
306
job_start(job);
307
- assert(job->status == JOB_STATUS_RUNNING);
308
+ assert_job_status_is(job, JOB_STATUS_RUNNING);
309
310
s->should_converge = true;
311
job_enter(job);
312
- assert(job->status == JOB_STATUS_READY);
313
+ assert_job_status_is(job, JOB_STATUS_READY);
314
315
cancel_common(s);
316
}
317
@@ -XXX,XX +XXX,XX @@ static void test_cancel_standby(void)
318
s = create_common(&job);
319
320
job_start(job);
321
- assert(job->status == JOB_STATUS_RUNNING);
322
+ assert_job_status_is(job, JOB_STATUS_RUNNING);
323
324
s->should_converge = true;
325
job_enter(job);
326
- assert(job->status == JOB_STATUS_READY);
327
-
328
- job_user_pause(job, &error_abort);
329
+ WITH_JOB_LOCK_GUARD() {
330
+ assert(job->status == JOB_STATUS_READY);
331
+ job_user_pause_locked(job, &error_abort);
332
+ }
333
job_enter(job);
334
- assert(job->status == JOB_STATUS_STANDBY);
335
+ assert_job_status_is(job, JOB_STATUS_STANDBY);
336
337
cancel_common(s);
338
}
339
@@ -XXX,XX +XXX,XX @@ static void test_cancel_pending(void)
340
s = create_common(&job);
341
342
job_start(job);
343
- assert(job->status == JOB_STATUS_RUNNING);
344
+ assert_job_status_is(job, JOB_STATUS_RUNNING);
345
346
s->should_converge = true;
347
job_enter(job);
348
- assert(job->status == JOB_STATUS_READY);
349
-
350
- job_complete(job, &error_abort);
351
+ WITH_JOB_LOCK_GUARD() {
352
+ assert(job->status == JOB_STATUS_READY);
353
+ job_complete_locked(job, &error_abort);
354
+ }
355
job_enter(job);
356
while (!job->deferred_to_main_loop) {
357
aio_poll(qemu_get_aio_context(), true);
358
}
359
- assert(job->status == JOB_STATUS_READY);
360
+ assert_job_status_is(job, JOB_STATUS_READY);
361
aio_poll(qemu_get_aio_context(), true);
362
- assert(job->status == JOB_STATUS_PENDING);
363
+ assert_job_status_is(job, JOB_STATUS_PENDING);
364
365
cancel_common(s);
366
}
367
@@ -XXX,XX +XXX,XX @@ static void test_cancel_concluded(void)
368
s = create_common(&job);
369
370
job_start(job);
371
- assert(job->status == JOB_STATUS_RUNNING);
372
+ assert_job_status_is(job, JOB_STATUS_RUNNING);
373
374
s->should_converge = true;
375
job_enter(job);
376
- assert(job->status == JOB_STATUS_READY);
377
-
378
- job_complete(job, &error_abort);
379
+ WITH_JOB_LOCK_GUARD() {
380
+ assert(job->status == JOB_STATUS_READY);
381
+ job_complete_locked(job, &error_abort);
382
+ }
383
job_enter(job);
384
while (!job->deferred_to_main_loop) {
385
aio_poll(qemu_get_aio_context(), true);
386
}
387
- assert(job->status == JOB_STATUS_READY);
388
+ assert_job_status_is(job, JOB_STATUS_READY);
389
aio_poll(qemu_get_aio_context(), true);
390
- assert(job->status == JOB_STATUS_PENDING);
391
+ assert_job_status_is(job, JOB_STATUS_PENDING);
392
393
aio_context_acquire(job->aio_context);
394
- job_finalize(job, &error_abort);
395
+ WITH_JOB_LOCK_GUARD() {
396
+ job_finalize_locked(job, &error_abort);
397
+ }
398
aio_context_release(job->aio_context);
399
- assert(job->status == JOB_STATUS_CONCLUDED);
400
+ assert_job_status_is(job, JOB_STATUS_CONCLUDED);
401
402
cancel_common(s);
403
}
404
@@ -XXX,XX +XXX,XX @@ static void test_complete_in_standby(void)
405
bjob = mk_job(blk, "job", &test_yielding_driver, true,
406
JOB_MANUAL_FINALIZE | JOB_MANUAL_DISMISS);
407
job = &bjob->job;
408
- assert(job->status == JOB_STATUS_CREATED);
409
+ assert_job_status_is(job, JOB_STATUS_CREATED);
410
411
/* Wait for the job to become READY */
412
job_start(job);
413
aio_context_acquire(ctx);
414
+ /*
415
+ * Here we are waiting for the status to change, so don't bother
416
+ * protecting the read every time.
417
+ */
418
AIO_WAIT_WHILE(ctx, job->status != JOB_STATUS_READY);
419
aio_context_release(ctx);
420
421
/* Begin the drained section, pausing the job */
422
bdrv_drain_all_begin();
423
- assert(job->status == JOB_STATUS_STANDBY);
424
+ assert_job_status_is(job, JOB_STATUS_STANDBY);
425
+
426
/* Lock the IO thread to prevent the job from being run */
427
aio_context_acquire(ctx);
428
/* This will schedule the job to resume it */
429
bdrv_drain_all_end();
430
431
- /* But the job cannot run, so it will remain on standby */
432
- assert(job->status == JOB_STATUS_STANDBY);
433
+ WITH_JOB_LOCK_GUARD() {
434
+ /* But the job cannot run, so it will remain on standby */
435
+ assert(job->status == JOB_STATUS_STANDBY);
436
437
- /* Even though the job is on standby, this should work */
438
- job_complete(job, &error_abort);
439
+ /* Even though the job is on standby, this should work */
440
+ job_complete_locked(job, &error_abort);
441
442
- /* The test is done now, clean up. */
443
- job_finish_sync(job, NULL, &error_abort);
444
- assert(job->status == JOB_STATUS_PENDING);
445
+ /* The test is done now, clean up. */
446
+ job_finish_sync_locked(job, NULL, &error_abort);
447
+ assert(job->status == JOB_STATUS_PENDING);
448
449
- job_finalize(job, &error_abort);
450
- assert(job->status == JOB_STATUS_CONCLUDED);
451
+ job_finalize_locked(job, &error_abort);
452
+ assert(job->status == JOB_STATUS_CONCLUDED);
453
454
- job_dismiss(&job, &error_abort);
455
+ job_dismiss_locked(&job, &error_abort);
456
+ }
457
458
destroy_blk(blk);
459
aio_context_release(ctx);
460
--
90
--
461
2.37.3
91
2.48.1
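
The events_dropped access pattern above is copy-under-lock: take event_lock
just long enough to snapshot the flag, then act on the local copy outside
the critical section. A condensed sketch of virtio_scsi_handle_event_vq()
from the patch, with the event reporting reduced to a comment:

    static void handle_event_vq(VirtIOSCSI *s, VirtQueue *vq)
    {
        bool events_dropped;

        WITH_QEMU_LOCK_GUARD(&s->event_lock) {
            events_dropped = s->events_dropped;
        }

        if (events_dropped) {
            /* push a VIRTIO_SCSI_T_NO_EVENT report with EVENTS_MISSED set */
        }
    }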
1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Stefan Hajnoczi <stefanha@redhat.com>
2
2
3
Callers of coroutine_fn must be coroutine_fn themselves, or the call
3
With IOThread Virtqueue Mapping there will be multiple AioContexts
4
must be within "if (qemu_in_coroutine())". Apply coroutine_fn to
4
processing SCSI requests. scsi_req_cancel() and other SCSI request
5
functions where this holds.
5
operations must be performed from the AioContext where the request is
6
6
running.
7
Reviewed-by: Alberto Faria <afaria@redhat.com>
7
8
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
8
Introduce a virtio_scsi_defer_tmf_to_aio_context() function and the
9
Message-Id: <20220922084924.201610-19-pbonzini@redhat.com>
9
necessary VirtIOSCSIReq->remaining refcount infrastructure to move the
10
[kwolf: Fixed up coding style]
10
TMF code into the AioContext where the request is running.
11
12
For the time being there is still just one AioContext: the main loop or
13
the IOThread. When the iothread-vq-mapping parameter is added in a later
14
patch this will be changed to per-virtqueue AioContexts.
15
16
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
11
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
17
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
18
Message-ID: <20250311132616.1049687-8-stefanha@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
19
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
13
---
20
---
14
block/quorum.c | 36 +++++++++++++++++++-----------------
21
hw/scsi/virtio-scsi.c | 270 ++++++++++++++++++++++++++++++++----------
15
1 file changed, 19 insertions(+), 17 deletions(-)
22
1 file changed, 206 insertions(+), 64 deletions(-)
16
23
17
diff --git a/block/quorum.c b/block/quorum.c
24
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
18
index XXXXXXX..XXXXXXX 100644
25
index XXXXXXX..XXXXXXX 100644
19
--- a/block/quorum.c
26
--- a/hw/scsi/virtio-scsi.c
20
+++ b/block/quorum.c
27
+++ b/hw/scsi/virtio-scsi.c
21
@@ -XXX,XX +XXX,XX @@ static bool quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b)
28
@@ -XXX,XX +XXX,XX @@ typedef struct VirtIOSCSIReq {
22
return a->l == b->l;
29
/* Used for two-stage request submission and TMFs deferred to BH */
30
QTAILQ_ENTRY(VirtIOSCSIReq) next;
31
32
- /* Used for cancellation of request during TMFs */
33
+ /* Used for cancellation of request during TMFs. Atomic. */
34
int remaining;
35
36
SCSIRequest *sreq;
37
@@ -XXX,XX +XXX,XX @@ typedef struct {
38
VirtIOSCSIReq *tmf_req;
39
} VirtIOSCSICancelNotifier;
40
41
+static void virtio_scsi_tmf_dec_remaining(VirtIOSCSIReq *tmf)
42
+{
43
+ if (qatomic_fetch_dec(&tmf->remaining) == 1) {
44
+ trace_virtio_scsi_tmf_resp(virtio_scsi_get_lun(tmf->req.tmf.lun),
45
+ tmf->req.tmf.tag, tmf->resp.tmf.response);
46
+
47
+ virtio_scsi_complete_req(tmf, &tmf->dev->ctrl_lock);
48
+ }
49
+}
50
+
51
static void virtio_scsi_cancel_notify(Notifier *notifier, void *data)
52
{
53
VirtIOSCSICancelNotifier *n = container_of(notifier,
54
VirtIOSCSICancelNotifier,
55
notifier);
56
57
- if (--n->tmf_req->remaining == 0) {
58
- VirtIOSCSIReq *req = n->tmf_req;
59
-
60
- trace_virtio_scsi_tmf_resp(virtio_scsi_get_lun(req->req.tmf.lun),
61
- req->req.tmf.tag, req->resp.tmf.response);
62
- virtio_scsi_complete_req(req, &req->dev->ctrl_lock);
63
- }
64
+ virtio_scsi_tmf_dec_remaining(n->tmf_req);
65
g_free(n);
23
}
66
}
24
67
25
-static QuorumAIOCB *quorum_aio_get(BlockDriverState *bs,
68
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_reset_tmf_bh(VirtIOSCSI *s)
26
- QEMUIOVector *qiov,
27
- uint64_t offset,
28
- uint64_t bytes,
29
- int flags)
30
+static QuorumAIOCB *coroutine_fn quorum_aio_get(BlockDriverState *bs,
31
+ QEMUIOVector *qiov,
32
+ uint64_t offset, uint64_t bytes,
33
+ int flags)
34
{
35
BDRVQuorumState *s = bs->opaque;
36
QuorumAIOCB *acb = g_new(QuorumAIOCB, 1);
37
@@ -XXX,XX +XXX,XX @@ static void quorum_report_bad_versions(BDRVQuorumState *s,
38
}
69
}
39
}
70
}
40
71
41
-static void quorum_rewrite_entry(void *opaque)
72
-static void virtio_scsi_defer_tmf_to_bh(VirtIOSCSIReq *req)
42
+static void coroutine_fn quorum_rewrite_entry(void *opaque)
73
+static void virtio_scsi_defer_tmf_to_main_loop(VirtIOSCSIReq *req)
43
{
74
{
44
QuorumCo *co = opaque;
75
VirtIOSCSI *s = req->dev;
45
QuorumAIOCB *acb = co->acb;
76
46
@@ -XXX,XX +XXX,XX @@ free_exit:
77
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_defer_tmf_to_bh(VirtIOSCSIReq *req)
47
quorum_free_vote_list(&acb->votes);
48
}
49
50
-static void read_quorum_children_entry(void *opaque)
51
+static void coroutine_fn read_quorum_children_entry(void *opaque)
52
{
53
QuorumCo *co = opaque;
54
QuorumAIOCB *acb = co->acb;
55
@@ -XXX,XX +XXX,XX @@ static void read_quorum_children_entry(void *opaque)
56
}
78
}
57
}
79
}
58
80
59
-static int read_quorum_children(QuorumAIOCB *acb)
81
+static void virtio_scsi_tmf_cancel_req(VirtIOSCSIReq *tmf, SCSIRequest *r)
60
+static int coroutine_fn read_quorum_children(QuorumAIOCB *acb)
82
+{
83
+ VirtIOSCSICancelNotifier *notifier;
84
+
85
+ assert(r->ctx == qemu_get_current_aio_context());
86
+
87
+ /* Decremented in virtio_scsi_cancel_notify() */
88
+ qatomic_inc(&tmf->remaining);
89
+
90
+ notifier = g_new(VirtIOSCSICancelNotifier, 1);
91
+ notifier->notifier.notify = virtio_scsi_cancel_notify;
92
+ notifier->tmf_req = tmf;
93
+ scsi_req_cancel_async(r, &notifier->notifier);
94
+}
95
+
96
+/* Execute a TMF on the requests in the current AioContext */
97
+static void virtio_scsi_do_tmf_aio_context(void *opaque)
98
+{
99
+ AioContext *ctx = qemu_get_current_aio_context();
100
+ VirtIOSCSIReq *tmf = opaque;
101
+ VirtIOSCSI *s = tmf->dev;
102
+ SCSIDevice *d = virtio_scsi_device_get(s, tmf->req.tmf.lun);
103
+ SCSIRequest *r;
104
+ bool match_tag;
105
+
106
+ if (!d) {
107
+ tmf->resp.tmf.response = VIRTIO_SCSI_S_BAD_TARGET;
108
+ virtio_scsi_tmf_dec_remaining(tmf);
109
+ return;
110
+ }
111
+
112
+ /*
113
+ * This function could handle other subtypes that need to be processed in
114
+ * the request's AioContext in the future, but for now only request
115
+ * cancelation subtypes are performed here.
116
+ */
117
+ switch (tmf->req.tmf.subtype) {
118
+ case VIRTIO_SCSI_T_TMF_ABORT_TASK:
119
+ match_tag = true;
120
+ break;
121
+ case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET:
122
+ case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET:
123
+ match_tag = false;
124
+ break;
125
+ default:
126
+ g_assert_not_reached();
127
+ }
128
+
129
+ WITH_QEMU_LOCK_GUARD(&d->requests_lock) {
130
+ QTAILQ_FOREACH(r, &d->requests, next) {
131
+ VirtIOSCSIReq *cmd_req = r->hba_private;
132
+ assert(cmd_req); /* request has hba_private while enqueued */
133
+
134
+ if (r->ctx != ctx) {
135
+ continue;
136
+ }
137
+ if (match_tag && cmd_req->req.cmd.tag != tmf->req.tmf.tag) {
138
+ continue;
139
+ }
140
+ virtio_scsi_tmf_cancel_req(tmf, r);
141
+ }
142
+ }
143
+
144
+ /* Incremented by virtio_scsi_do_tmf() */
145
+ virtio_scsi_tmf_dec_remaining(tmf);
146
+
147
+ object_unref(d);
148
+}
149
+
150
+static void dummy_bh(void *opaque)
151
+{
152
+ /* Do nothing */
153
+}
154
+
155
+/*
156
+ * Wait for pending virtio_scsi_defer_tmf_to_aio_context() BHs.
157
+ */
158
+static void virtio_scsi_flush_defer_tmf_to_aio_context(VirtIOSCSI *s)
159
+{
160
+ GLOBAL_STATE_CODE();
161
+
162
+ assert(!s->dataplane_started);
163
+
164
+ if (s->ctx) {
165
+ /* Our BH only runs after previously scheduled BHs */
166
+ aio_wait_bh_oneshot(s->ctx, dummy_bh, NULL);
167
+ }
168
+}
169
+
170
+/*
171
+ * Run the TMF in a specific AioContext, handling only requests in that
172
+ * AioContext. This is necessary because requests can run in different
173
+ * AioContext and it is only possible to cancel them from the AioContext where
174
+ * they are running.
175
+ */
176
+static void virtio_scsi_defer_tmf_to_aio_context(VirtIOSCSIReq *tmf,
177
+ AioContext *ctx)
178
+{
179
+ /* Decremented in virtio_scsi_do_tmf_aio_context() */
180
+ qatomic_inc(&tmf->remaining);
181
+
182
+ /* See virtio_scsi_flush_defer_tmf_to_aio_context() cleanup during reset */
183
+ aio_bh_schedule_oneshot(ctx, virtio_scsi_do_tmf_aio_context, tmf);
184
+}
185
+
186
+/*
187
+ * Returns the AioContext for a given TMF's tag field or NULL. Note that the
188
+ * request identified by the tag may have completed by the time you can execute
189
+ * a BH in the AioContext, so don't assume the request still exists in your BH.
190
+ */
191
+static AioContext *find_aio_context_for_tmf_tag(SCSIDevice *d,
192
+ VirtIOSCSIReq *tmf)
193
+{
194
+ WITH_QEMU_LOCK_GUARD(&d->requests_lock) {
195
+ SCSIRequest *r;
196
+ SCSIRequest *next;
197
+
198
+ QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) {
199
+ VirtIOSCSIReq *cmd_req = r->hba_private;
200
+
201
+ /* hba_private is non-NULL while the request is enqueued */
202
+ assert(cmd_req);
203
+
204
+ if (cmd_req->req.cmd.tag == tmf->req.tmf.tag) {
205
+ return r->ctx;
206
+ }
207
+ }
208
+ }
209
+ return NULL;
210
+}
211
+
212
/* Return 0 if the request is ready to be completed and return to guest;
213
* -EINPROGRESS if the request is submitted and will be completed later, in the
214
* case of async cancellation. */
215
@@ -XXX,XX +XXX,XX @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
61
{
216
{
62
BDRVQuorumState *s = acb->bs->opaque;
217
SCSIDevice *d = virtio_scsi_device_get(s, req->req.tmf.lun);
63
int i;
218
SCSIRequest *r, *next;
64
@@ -XXX,XX +XXX,XX @@ static int read_quorum_children(QuorumAIOCB *acb)
219
+ AioContext *ctx;
65
return acb->vote_ret;
220
int ret = 0;
66
}
221
67
222
virtio_scsi_ctx_check(s, d);
68
-static int read_fifo_child(QuorumAIOCB *acb)
223
@@ -XXX,XX +XXX,XX @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
69
+static int coroutine_fn read_fifo_child(QuorumAIOCB *acb)
224
req->req.tmf.tag, req->req.tmf.subtype);
70
{
225
71
BDRVQuorumState *s = acb->bs->opaque;
226
switch (req->req.tmf.subtype) {
72
int n, ret;
227
- case VIRTIO_SCSI_T_TMF_ABORT_TASK:
73
@@ -XXX,XX +XXX,XX @@ static int read_fifo_child(QuorumAIOCB *acb)
228
- case VIRTIO_SCSI_T_TMF_QUERY_TASK:
74
return ret;
229
+ case VIRTIO_SCSI_T_TMF_ABORT_TASK: {
75
}
230
if (!d) {
76
231
goto fail;
77
-static int quorum_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
232
}
78
- QEMUIOVector *qiov, BdrvRequestFlags flags)
233
if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) {
79
+static int coroutine_fn quorum_co_preadv(BlockDriverState *bs,
234
goto incorrect_lun;
80
+ int64_t offset, int64_t bytes,
235
}
81
+ QEMUIOVector *qiov,
236
- QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) {
82
+ BdrvRequestFlags flags)
237
- VirtIOSCSIReq *cmd_req = r->hba_private;
83
{
238
- if (cmd_req && cmd_req->req.cmd.tag == req->req.tmf.tag) {
84
BDRVQuorumState *s = bs->opaque;
239
- break;
85
QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes, flags);
240
- }
86
@@ -XXX,XX +XXX,XX @@ static int quorum_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
241
+
87
return ret;
242
+ ctx = find_aio_context_for_tmf_tag(d, req);
88
}
243
+ if (ctx) {
89
244
+ virtio_scsi_defer_tmf_to_aio_context(req, ctx);
90
-static void write_quorum_entry(void *opaque)
245
+ ret = -EINPROGRESS;
91
+static void coroutine_fn write_quorum_entry(void *opaque)
246
}
92
{
247
- if (r) {
93
QuorumCo *co = opaque;
248
- /*
94
QuorumAIOCB *acb = co->acb;
249
- * Assert that the request has not been completed yet, we
95
@@ -XXX,XX +XXX,XX @@ static void write_quorum_entry(void *opaque)
250
- * check for it in the loop above.
96
}
251
- */
97
}
252
- assert(r->hba_private);
98
253
- if (req->req.tmf.subtype == VIRTIO_SCSI_T_TMF_QUERY_TASK) {
99
-static int quorum_co_pwritev(BlockDriverState *bs, int64_t offset,
254
- /* "If the specified command is present in the task set, then
100
- int64_t bytes, QEMUIOVector *qiov,
255
- * return a service response set to FUNCTION SUCCEEDED".
101
- BdrvRequestFlags flags)
256
- */
102
+static int coroutine_fn quorum_co_pwritev(BlockDriverState *bs, int64_t offset,
257
- req->resp.tmf.response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED;
103
+ int64_t bytes, QEMUIOVector *qiov,
258
- } else {
104
+ BdrvRequestFlags flags)
259
- VirtIOSCSICancelNotifier *notifier;
105
{
260
-
106
BDRVQuorumState *s = bs->opaque;
261
- req->remaining = 1;
107
QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes, flags);
262
- notifier = g_new(VirtIOSCSICancelNotifier, 1);
108
@@ -XXX,XX +XXX,XX @@ static int quorum_co_pwritev(BlockDriverState *bs, int64_t offset,
263
- notifier->tmf_req = req;
109
return ret;
264
- notifier->notifier.notify = virtio_scsi_cancel_notify;
110
}
265
- scsi_req_cancel_async(r, &notifier->notifier);
111
266
- ret = -EINPROGRESS;
112
-static int quorum_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
267
+ break;
113
- int64_t bytes, BdrvRequestFlags flags)
268
+ }
114
+static int coroutine_fn quorum_co_pwrite_zeroes(BlockDriverState *bs,
269
+
115
+ int64_t offset, int64_t bytes,
270
+ case VIRTIO_SCSI_T_TMF_QUERY_TASK:
116
+ BdrvRequestFlags flags)
271
+ if (!d) {
117
272
+ goto fail;
118
{
273
+ }
119
return quorum_co_pwritev(bs, offset, bytes, NULL,
274
+ if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) {
275
+ goto incorrect_lun;
276
+ }
277
+
278
+ WITH_QEMU_LOCK_GUARD(&d->requests_lock) {
279
+ QTAILQ_FOREACH(r, &d->requests, next) {
280
+ VirtIOSCSIReq *cmd_req = r->hba_private;
281
+ assert(cmd_req); /* request has hba_private while enqueued */
282
+
283
+ if (cmd_req->req.cmd.tag == req->req.tmf.tag) {
284
+ /*
285
+ * "If the specified command is present in the task set,
286
+ * then return a service response set to FUNCTION
287
+ * SUCCEEDED".
288
+ */
289
+ req->resp.tmf.response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED;
290
+ }
291
}
292
}
293
break;
294
295
case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET:
296
case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET:
297
- virtio_scsi_defer_tmf_to_bh(req);
298
+ virtio_scsi_defer_tmf_to_main_loop(req);
299
ret = -EINPROGRESS;
300
break;
301
302
case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET:
303
- case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET:
304
+ case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: {
305
+ if (!d) {
306
+ goto fail;
307
+ }
308
+ if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) {
309
+ goto incorrect_lun;
310
+ }
311
+
312
+ qatomic_inc(&req->remaining);
313
+
314
+ ctx = s->ctx ?: qemu_get_aio_context();
315
+ virtio_scsi_defer_tmf_to_aio_context(req, ctx);
316
+
317
+ virtio_scsi_tmf_dec_remaining(req);
318
+ ret = -EINPROGRESS;
319
+ break;
320
+ }
321
+
322
case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET:
323
if (!d) {
324
goto fail;
325
@@ -XXX,XX +XXX,XX @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
326
goto incorrect_lun;
327
}
328
329
- /* Add 1 to "remaining" until virtio_scsi_do_tmf returns.
330
- * This way, if the bus starts calling back to the notifiers
331
- * even before we finish the loop, virtio_scsi_cancel_notify
332
- * will not complete the TMF too early.
333
- */
334
- req->remaining = 1;
335
- QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) {
336
- if (r->hba_private) {
337
- if (req->req.tmf.subtype == VIRTIO_SCSI_T_TMF_QUERY_TASK_SET) {
338
- /* "If there is any command present in the task set, then
339
- * return a service response set to FUNCTION SUCCEEDED".
340
- */
341
- req->resp.tmf.response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED;
342
- break;
343
- } else {
344
- VirtIOSCSICancelNotifier *notifier;
345
-
346
- req->remaining++;
347
- notifier = g_new(VirtIOSCSICancelNotifier, 1);
348
- notifier->notifier.notify = virtio_scsi_cancel_notify;
349
- notifier->tmf_req = req;
350
- scsi_req_cancel_async(r, &notifier->notifier);
351
- }
352
+ WITH_QEMU_LOCK_GUARD(&d->requests_lock) {
353
+ QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) {
354
+ /* Request has hba_private while enqueued */
355
+ assert(r->hba_private);
356
+
357
+ /*
358
+ * "If there is any command present in the task set, then
359
+ * return a service response set to FUNCTION SUCCEEDED".
360
+ */
361
+ req->resp.tmf.response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED;
362
+ break;
363
}
364
}
365
- if (--req->remaining > 0) {
366
- ret = -EINPROGRESS;
367
- }
368
break;
369
370
case VIRTIO_SCSI_T_TMF_CLEAR_ACA:
371
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_reset(VirtIODevice *vdev)
372
assert(!s->dataplane_started);
373
374
virtio_scsi_reset_tmf_bh(s);
375
+ virtio_scsi_flush_defer_tmf_to_aio_context(s);
376
377
qatomic_inc(&s->resetting);
378
bus_cold_reset(BUS(&s->bus));
120
--
379
--
121
2.37.3
380
2.48.1
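
The remaining counter above is a plain atomic refcount: the dispatcher holds
one reference for the duration of virtio_scsi_do_tmf(), each deferred BH or
cancellation notifier holds another, and whichever context drops the count
to zero completes the TMF. A sketch assuming QEMU's qemu/atomic.h (tmf_bh
and tmf_defer_to_ctx are made-up names, not the patch's identifiers):

    static void tmf_dec_remaining(VirtIOSCSIReq *tmf)
    {
        if (qatomic_fetch_dec(&tmf->remaining) == 1) {
            /* last reference: safe to complete the TMF, under ctrl_lock */
        }
    }

    static void tmf_bh(void *opaque)
    {
        VirtIOSCSIReq *tmf = opaque;

        /* cancel the matching requests running in this AioContext ... */

        tmf_dec_remaining(tmf);     /* drop the reference taken below */
    }

    static void tmf_defer_to_ctx(VirtIOSCSIReq *tmf, AioContext *ctx)
    {
        qatomic_inc(&tmf->remaining);
        aio_bh_schedule_oneshot(ctx, tmf_bh, tmf);
    }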
1
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
1
From: Stefan Hajnoczi <stefanha@redhat.com>
2
2
3
In order to make it thread-safe, implement a "fake rwlock",
3
This is the cleanup function that must be called after
4
where we allow reads under BQL *or* job_mutex held, but
4
apply_iothread_vq_mapping() succeeds. virtio-scsi will need this
5
writes only under BQL *and* job_mutex.
5
function too, so extract it.
6
6
7
The only write we have is in child_job_set_aio_ctx, which always
7
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
8
happens under drain (so the job is paused).
9
For this reason, introduce job_set_aio_context and make sure that
10
the context is set under BQL, job_mutex and drain.
11
Also make sure all other places where the AioContext is read
12
are protected.
13
14
The reads in commit.c and mirror.c are actually safe, because they are
15
always done under the BQL.
16
17
Note: at this stage, job_{lock/unlock} and job lock guard macros
18
are no-ops.
19
20
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
21
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
22
Message-Id: <20220926093214.506243-14-eesposit@redhat.com>
23
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
8
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
24
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
9
Message-ID: <20250311132616.1049687-9-stefanha@redhat.com>
25
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
10
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
26
---
11
---
27
include/qemu/job.h | 23 ++++++++++++++++++++---
12
hw/block/virtio-blk.c | 27 +++++++++++++++++++++------
28
block/replication.c | 1 +
13
1 file changed, 21 insertions(+), 6 deletions(-)
29
blockjob.c | 3 ++-
30
job.c | 12 ++++++++++++
31
4 files changed, 35 insertions(+), 4 deletions(-)
32
14
33
diff --git a/include/qemu/job.h b/include/qemu/job.h
15
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
34
index XXXXXXX..XXXXXXX 100644
16
index XXXXXXX..XXXXXXX 100644
35
--- a/include/qemu/job.h
17
--- a/hw/block/virtio-blk.c
36
+++ b/include/qemu/job.h
18
+++ b/hw/block/virtio-blk.c
37
@@ -XXX,XX +XXX,XX @@ typedef struct Job {
19
@@ -XXX,XX +XXX,XX @@ validate_iothread_vq_mapping_list(IOThreadVirtQueueMappingList *list,
38
/* ProgressMeter API is thread-safe */
20
* Fill in the AioContext for each virtqueue in the @vq_aio_context array given
39
ProgressMeter progress;
21
* the iothread-vq-mapping parameter in @iothread_vq_mapping_list.
40
22
*
41
+ /**
23
+ * cleanup_iothread_vq_mapping() must be called to free IOThread object
42
+ * AioContext to run the job coroutine in.
24
+ * references after this function returns success.
43
+ * The job Aiocontext can be read when holding *either*
25
+ *
44
+ * the BQL (so we are in the main loop) or the job_mutex.
26
* Returns: %true on success, %false on failure.
45
+ * It can only be written when we hold *both* BQL
27
**/
46
+ * and the job_mutex.
28
static bool apply_iothread_vq_mapping(
47
+ */
29
@@ -XXX,XX +XXX,XX @@ static bool apply_iothread_vq_mapping(
48
+ AioContext *aio_context;
30
return true;
49
31
}
50
- /** Protected by AioContext lock */
51
52
- /** AioContext to run the job coroutine in */
53
- AioContext *aio_context;
54
+ /** Protected by AioContext lock */
55
56
/** Reference count of the block job */
57
int refcnt;
58
@@ -XXX,XX +XXX,XX @@ int job_finish_sync(Job *job, void (*finish)(Job *, Error **errp),
59
int job_finish_sync_locked(Job *job, void (*finish)(Job *, Error **errp),
60
Error **errp);
61
32
62
+/**
33
+/**
63
+ * Sets the @job->aio_context.
34
+ * cleanup_iothread_vq_mapping:
64
+ * Called with job_mutex *not* held.
35
+ * @list: The mapping of virtqueues to IOThreads.
65
+ *
36
+ *
66
+ * This function must run in the main thread to protect against
37
+ * Release IOThread object references that were acquired by
67
+ * concurrent read in job_finish_sync_locked(), takes the job_mutex
38
+ * apply_iothread_vq_mapping().
68
+ * lock to protect against the read in job_do_yield_locked(), and must
69
+ * be called when the job is quiescent.
70
+ */
39
+ */
71
+void job_set_aio_context(Job *job, AioContext *ctx);
40
+static void cleanup_iothread_vq_mapping(IOThreadVirtQueueMappingList *list)
41
+{
42
+ IOThreadVirtQueueMappingList *node;
72
+
43
+
73
#endif
44
+ for (node = list; node; node = node->next) {
74
diff --git a/block/replication.c b/block/replication.c
45
+ IOThread *iothread = iothread_by_id(node->value->iothread);
75
index XXXXXXX..XXXXXXX 100644
46
+ object_unref(OBJECT(iothread));
76
--- a/block/replication.c
47
+ }
77
+++ b/block/replication.c
78
@@ -XXX,XX +XXX,XX @@ static void replication_close(BlockDriverState *bs)
79
{
80
BDRVReplicationState *s = bs->opaque;
81
Job *commit_job;
82
+ GLOBAL_STATE_CODE();
83
84
if (s->stage == BLOCK_REPLICATION_RUNNING) {
85
replication_stop(s->rs, false, NULL);
86
diff --git a/blockjob.c b/blockjob.c
87
index XXXXXXX..XXXXXXX 100644
88
--- a/blockjob.c
89
+++ b/blockjob.c
90
@@ -XXX,XX +XXX,XX @@ static void child_job_set_aio_ctx(BdrvChild *c, AioContext *ctx,
91
bdrv_set_aio_context_ignore(sibling->bs, ctx, ignore);
92
}
93
94
- job->job.aio_context = ctx;
95
+ job_set_aio_context(&job->job, ctx);
96
}
97
98
static AioContext *child_job_get_parent_aio_context(BdrvChild *c)
99
{
100
BlockJob *job = c->opaque;
101
+ GLOBAL_STATE_CODE();
102
103
return job->job.aio_context;
104
}
105
diff --git a/job.c b/job.c
106
index XXXXXXX..XXXXXXX 100644
107
--- a/job.c
108
+++ b/job.c
109
@@ -XXX,XX +XXX,XX @@ Job *job_get(const char *id)
110
return job_get_locked(id);
111
}
112
113
+void job_set_aio_context(Job *job, AioContext *ctx)
114
+{
115
+ /* protect against read in job_finish_sync_locked and job_start */
116
+ GLOBAL_STATE_CODE();
117
+ /* protect against read in job_do_yield_locked */
118
+ JOB_LOCK_GUARD();
119
+ /* ensure the job is quiescent while the AioContext is changed */
120
+ assert(job->paused || job_is_completed_locked(job));
121
+ job->aio_context = ctx;
122
+}
48
+}
123
+
49
+
124
/* Called with job_mutex *not* held. */
50
/* Context: BQL held */
125
static void job_sleep_timer_cb(void *opaque)
51
static bool virtio_blk_vq_aio_context_init(VirtIOBlock *s, Error **errp)
126
{
52
{
127
@@ -XXX,XX +XXX,XX @@ int job_finish_sync_locked(Job *job,
53
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_vq_aio_context_cleanup(VirtIOBlock *s)
128
{
54
assert(!s->ioeventfd_started);
129
Error *local_err = NULL;
55
130
int ret;
56
if (conf->iothread_vq_mapping_list) {
131
+ GLOBAL_STATE_CODE();
57
- IOThreadVirtQueueMappingList *node;
132
58
-
133
job_ref_locked(job);
59
- for (node = conf->iothread_vq_mapping_list; node; node = node->next) {
134
60
- IOThread *iothread = iothread_by_id(node->value->iothread);
61
- object_unref(OBJECT(iothread));
62
- }
63
+ cleanup_iothread_vq_mapping(conf->iothread_vq_mapping_list);
64
}
65
66
if (conf->iothread) {
135
--
67
--
136
2.37.3
68
2.48.1
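
The "fake rwlock" rule above reduces to: readers hold either the BQL or
job_mutex, while the single writer holds both and only runs while the job
is quiescent. job_set_aio_context() below is condensed from the patch (its
inline comments dropped); the reader is an illustrative addition, not QEMU
code:

    /* write: main thread (BQL) plus job_mutex, job quiescent */
    void job_set_aio_context(Job *job, AioContext *ctx)
    {
        GLOBAL_STATE_CODE();
        JOB_LOCK_GUARD();
        assert(job->paused || job_is_completed_locked(job));
        job->aio_context = ctx;
    }

    /* read outside the main thread: job_mutex alone is enough */
    static AioContext *job_get_aio_context_example(Job *job)
    {
        JOB_LOCK_GUARD();
        return job->aio_context;
    }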
1
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
1
From: Stefan Hajnoczi <stefanha@redhat.com>
2
2
3
Just as done with job.h, create _locked() functions in blockjob.h
3
Use noun_verb() function naming instead of verb_noun() because the
4
former is the most common naming style for APIs. The next commit will
5
move these functions into a header file so that virtio-scsi can call
6
them.
4
7
5
These functions will later be useful when the caller has already taken
8
Shorten iothread_vq_mapping_apply()'s iothread_vq_mapping_list argument
6
the lock. All blockjob _locked functions call job _locked functions.
9
to just "list" like in the other functions.
7
10
8
Note: at this stage, job_{lock/unlock} and job lock guard macros
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
9
are no-ops.
10
11
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
12
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
13
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
12
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
14
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
13
Message-ID: <20250311132616.1049687-10-stefanha@redhat.com>
15
Message-Id: <20220926093214.506243-8-eesposit@redhat.com>
16
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
14
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
17
---
15
---
18
include/block/blockjob.h | 18 ++++++++++++++
16
hw/block/virtio-blk.c | 33 ++++++++++++++++-----------------
19
blockjob.c | 52 ++++++++++++++++++++++++++++++++--------
17
1 file changed, 16 insertions(+), 17 deletions(-)
20
2 files changed, 60 insertions(+), 10 deletions(-)
21
18
22
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
19
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
23
index XXXXXXX..XXXXXXX 100644
20
index XXXXXXX..XXXXXXX 100644
24
--- a/include/block/blockjob.h
21
--- a/hw/block/virtio-blk.c
25
+++ b/include/block/blockjob.h
22
+++ b/hw/block/virtio-blk.c
26
@@ -XXX,XX +XXX,XX @@ typedef struct BlockJob {
23
@@ -XXX,XX +XXX,XX @@ static const BlockDevOps virtio_block_ops = {
27
*/
24
};
28
BlockJob *block_job_next(BlockJob *job);
25
29
26
static bool
30
+/* Same as block_job_next(), but called with job lock held. */
27
-validate_iothread_vq_mapping_list(IOThreadVirtQueueMappingList *list,
31
+BlockJob *block_job_next_locked(BlockJob *job);
28
- uint16_t num_queues, Error **errp)
32
+
29
+iothread_vq_mapping_validate(IOThreadVirtQueueMappingList *list, uint16_t
30
+ num_queues, Error **errp)
31
{
32
g_autofree unsigned long *vqs = bitmap_new(num_queues);
33
g_autoptr(GHashTable) iothreads =
34
@@ -XXX,XX +XXX,XX @@ validate_iothread_vq_mapping_list(IOThreadVirtQueueMappingList *list,
35
}
36
33
/**
37
/**
34
* block_job_get:
38
- * apply_iothread_vq_mapping:
35
* @id: The id of the block job.
39
- * @iothread_vq_mapping_list: The mapping of virtqueues to IOThreads.
36
@@ -XXX,XX +XXX,XX @@ BlockJob *block_job_next(BlockJob *job);
40
+ * iothread_vq_mapping_apply:
37
*/
41
+ * @list: The mapping of virtqueues to IOThreads.
38
BlockJob *block_job_get(const char *id);
42
* @vq_aio_context: The array of AioContext pointers to fill in.
39
43
* @num_queues: The length of @vq_aio_context.
40
+/* Same as block_job_get(), but called with job lock held. */
44
* @errp: If an error occurs, a pointer to the area to store the error.
41
+BlockJob *block_job_get_locked(const char *id);
42
+
43
/**
44
* block_job_add_bdrv:
45
* @job: A block job
46
@@ -XXX,XX +XXX,XX @@ bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs);
47
*/
48
bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp);
49
50
+/*
51
+ * Same as block_job_set_speed(), but called with job lock held.
52
+ * Might release the lock temporarily.
53
+ */
54
+bool block_job_set_speed_locked(BlockJob *job, int64_t speed, Error **errp);
55
+
56
/**
57
* block_job_query:
58
* @job: The job to get information about.
59
@@ -XXX,XX +XXX,XX @@ bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp);
60
*/
61
BlockJobInfo *block_job_query(BlockJob *job, Error **errp);
62
63
+/* Same as block_job_query(), but called with job lock held. */
64
+BlockJobInfo *block_job_query_locked(BlockJob *job, Error **errp);
65
+
66
/**
67
* block_job_iostatus_reset:
68
* @job: The job whose I/O status should be reset.
69
@@ -XXX,XX +XXX,XX @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp);
70
*/
71
void block_job_iostatus_reset(BlockJob *job);
72
73
+/* Same as block_job_iostatus_reset(), but called with job lock held. */
74
+void block_job_iostatus_reset_locked(BlockJob *job);
75
+
76
/*
77
* block_job_get_aio_context:
78
*
45
*
79
diff --git a/blockjob.c b/blockjob.c
46
* Fill in the AioContext for each virtqueue in the @vq_aio_context array given
80
index XXXXXXX..XXXXXXX 100644
47
- * the iothread-vq-mapping parameter in @iothread_vq_mapping_list.
81
--- a/blockjob.c
48
+ * the iothread-vq-mapping parameter in @list.
82
+++ b/blockjob.c
49
*
83
@@ -XXX,XX +XXX,XX @@ static bool is_block_job(Job *job)
50
- * cleanup_iothread_vq_mapping() must be called to free IOThread object
84
job_type(job) == JOB_TYPE_STREAM;
51
+ * iothread_vq_mapping_cleanup() must be called to free IOThread object
85
}
52
* references after this function returns success.
86
53
*
87
-BlockJob *block_job_next(BlockJob *bjob)
54
* Returns: %true on success, %false on failure.
88
+BlockJob *block_job_next_locked(BlockJob *bjob)
55
**/
89
{
56
-static bool apply_iothread_vq_mapping(
90
Job *job = bjob ? &bjob->job : NULL;
57
- IOThreadVirtQueueMappingList *iothread_vq_mapping_list,
91
GLOBAL_STATE_CODE();
58
+static bool iothread_vq_mapping_apply(
92
59
+ IOThreadVirtQueueMappingList *list,
93
do {
60
AioContext **vq_aio_context,
94
- job = job_next(job);
61
uint16_t num_queues,
95
+ job = job_next_locked(job);
62
Error **errp)
96
} while (job && !is_block_job(job));
63
@@ -XXX,XX +XXX,XX @@ static bool apply_iothread_vq_mapping(
97
64
size_t num_iothreads = 0;
98
return job ? container_of(job, BlockJob, job) : NULL;
65
size_t cur_iothread = 0;
99
}
66
100
67
- if (!validate_iothread_vq_mapping_list(iothread_vq_mapping_list,
101
-BlockJob *block_job_get(const char *id)
68
- num_queues, errp)) {
102
+BlockJob *block_job_next(BlockJob *bjob)
69
+ if (!iothread_vq_mapping_validate(list, num_queues, errp)) {
103
{
104
- Job *job = job_get(id);
105
+ JOB_LOCK_GUARD();
106
+ return block_job_next_locked(bjob);
107
+}
108
+
109
+BlockJob *block_job_get_locked(const char *id)
110
+{
111
+ Job *job = job_get_locked(id);
112
GLOBAL_STATE_CODE();
113
114
if (job && is_block_job(job)) {
115
@@ -XXX,XX +XXX,XX @@ BlockJob *block_job_get(const char *id)
116
}
117
}
118
119
+BlockJob *block_job_get(const char *id)
120
+{
121
+ JOB_LOCK_GUARD();
122
+ return block_job_get_locked(id);
123
+}
124
+
125
void block_job_free(Job *job)
126
{
127
BlockJob *bjob = container_of(job, BlockJob, job);
128
@@ -XXX,XX +XXX,XX @@ static bool job_timer_pending(Job *job)
129
return timer_pending(&job->sleep_timer);
130
}
131
132
-bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
133
+bool block_job_set_speed_locked(BlockJob *job, int64_t speed, Error **errp)
134
{
135
const BlockJobDriver *drv = block_job_driver(job);
136
int64_t old_speed = job->speed;
137
138
GLOBAL_STATE_CODE();
139
140
- if (job_apply_verb(&job->job, JOB_VERB_SET_SPEED, errp) < 0) {
141
+ if (job_apply_verb_locked(&job->job, JOB_VERB_SET_SPEED, errp) < 0) {
142
return false;
70
return false;
143
}
71
}
144
if (speed < 0) {
72
145
@@ -XXX,XX +XXX,XX @@ bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
73
- for (node = iothread_vq_mapping_list; node; node = node->next) {
146
job->speed = speed;
74
+ for (node = list; node; node = node->next) {
147
75
num_iothreads++;
148
if (drv->set_speed) {
149
+ job_unlock();
150
drv->set_speed(job, speed);
151
+ job_lock();
152
}
76
}
153
77
154
if (speed && speed <= old_speed) {
78
- for (node = iothread_vq_mapping_list; node; node = node->next) {
155
@@ -XXX,XX +XXX,XX @@ bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
79
+ for (node = list; node; node = node->next) {
80
IOThread *iothread = iothread_by_id(node->value->iothread);
81
AioContext *ctx = iothread_get_aio_context(iothread);
82
83
@@ -XXX,XX +XXX,XX @@ static bool apply_iothread_vq_mapping(
84
}
85
86
/**
87
- * cleanup_iothread_vq_mapping:
88
+ * iothread_vq_mapping_cleanup:
89
* @list: The mapping of virtqueues to IOThreads.
90
*
91
* Release IOThread object references that were acquired by
92
- * apply_iothread_vq_mapping().
93
+ * iothread_vq_mapping_apply().
94
*/
95
-static void cleanup_iothread_vq_mapping(IOThreadVirtQueueMappingList *list)
96
+static void iothread_vq_mapping_cleanup(IOThreadVirtQueueMappingList *list)
97
{
98
IOThreadVirtQueueMappingList *node;
99
100
@@ -XXX,XX +XXX,XX @@ static bool virtio_blk_vq_aio_context_init(VirtIOBlock *s, Error **errp)
101
s->vq_aio_context = g_new(AioContext *, conf->num_queues);
102
103
if (conf->iothread_vq_mapping_list) {
104
- if (!apply_iothread_vq_mapping(conf->iothread_vq_mapping_list,
105
+ if (!iothread_vq_mapping_apply(conf->iothread_vq_mapping_list,
106
s->vq_aio_context,
107
conf->num_queues,
108
errp)) {
109
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_vq_aio_context_cleanup(VirtIOBlock *s)
110
assert(!s->ioeventfd_started);
111
112
if (conf->iothread_vq_mapping_list) {
113
- cleanup_iothread_vq_mapping(conf->iothread_vq_mapping_list);
114
+ iothread_vq_mapping_cleanup(conf->iothread_vq_mapping_list);
156
}
115
}
157
116
158
/* kick only if a timer is pending */
117
if (conf->iothread) {
159
- job_enter_cond(&job->job, job_timer_pending);
160
+ job_enter_cond_locked(&job->job, job_timer_pending);
161
162
return true;
163
}
164
165
+bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
166
+{
167
+ JOB_LOCK_GUARD();
168
+ return block_job_set_speed_locked(job, speed, errp);
169
+}
170
+
171
int64_t block_job_ratelimit_get_delay(BlockJob *job, uint64_t n)
172
{
173
IO_CODE();
174
return ratelimit_calculate_delay(&job->limit, n);
175
}
176
177
-BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
178
+BlockJobInfo *block_job_query_locked(BlockJob *job, Error **errp)
179
{
180
BlockJobInfo *info;
181
uint64_t progress_current, progress_total;
182
@@ -XXX,XX +XXX,XX @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
183
info->len = progress_total;
184
info->speed = job->speed;
185
info->io_status = job->iostatus;
186
- info->ready = job_is_ready(&job->job),
187
+ info->ready = job_is_ready_locked(&job->job),
188
info->status = job->job.status;
189
info->auto_finalize = job->job.auto_finalize;
190
info->auto_dismiss = job->job.auto_dismiss;
191
@@ -XXX,XX +XXX,XX @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
192
return info;
193
}
194
195
+BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
196
+{
197
+ JOB_LOCK_GUARD();
198
+ return block_job_query_locked(job, errp);
199
+}
200
+
201
static void block_job_iostatus_set_err(BlockJob *job, int error)
202
{
203
if (job->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
204
@@ -XXX,XX +XXX,XX @@ fail:
205
return NULL;
206
}
207
208
-void block_job_iostatus_reset(BlockJob *job)
209
+void block_job_iostatus_reset_locked(BlockJob *job)
210
{
211
GLOBAL_STATE_CODE();
212
if (job->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
213
@@ -XXX,XX +XXX,XX @@ void block_job_iostatus_reset(BlockJob *job)
214
job->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
215
}
216
217
+void block_job_iostatus_reset(BlockJob *job)
218
+{
219
+ JOB_LOCK_GUARD();
220
+ block_job_iostatus_reset_locked(job);
221
+}
222
+
223
void block_job_user_resume(Job *job)
224
{
225
BlockJob *bjob = container_of(job, BlockJob, job);
226
--
118
--
227
2.37.3
119
2.48.1
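
The motivation for the _locked() variants above, as a short sketch
(a hypothetical caller, not part of the series): a caller that already
holds job_mutex can combine a lookup and a follow-up operation without
dropping and re-acquiring the lock in between.

    /* Hypothetical example: look up a block job and throttle it. */
    void example_find_and_throttle(const char *id, int64_t speed,
                                   Error **errp)
    {
        JOB_LOCK_GUARD();
        BlockJob *job = block_job_get_locked(id);
        if (job) {
            /* may release the lock temporarily, see header comment */
            block_job_set_speed_locked(job, speed, errp);
        }
    }
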
From: Paolo Bonzini <pbonzini@redhat.com>

nvme_get_free_req has very different semantics when called in
coroutine context (where it waits) and in non-coroutine context
(where it doesn't). Split the two cases to make it clear what
is being requested.

Cc: qemu-block@nongnu.org
Reviewed-by: Alberto Faria <afaria@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20220922084924.201610-2-pbonzini@redhat.com>
[kwolf: Fixed up coding style]
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/nvme.c | 48 ++++++++++++++++++++++++++++--------------------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static void nvme_kick(NVMeQueuePair *q)
     q->need_kick = 0;
 }
 
-/* Find a free request element if any, otherwise:
- *   a) if in coroutine context, try to wait for one to become available;
- *   b) if not in coroutine, return NULL;
- */
-static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
+static NVMeRequest *nvme_get_free_req_nofail_locked(NVMeQueuePair *q)
 {
     NVMeRequest *req;
 
-    qemu_mutex_lock(&q->lock);
-
-    while (q->free_req_head == -1) {
-        if (qemu_in_coroutine()) {
-            trace_nvme_free_req_queue_wait(q->s, q->index);
-            qemu_co_queue_wait(&q->free_req_queue, &q->lock);
-        } else {
-            qemu_mutex_unlock(&q->lock);
-            return NULL;
-        }
-    }
-
     req = &q->reqs[q->free_req_head];
     q->free_req_head = req->free_req_next;
     req->free_req_next = -1;
-
-    qemu_mutex_unlock(&q->lock);
     return req;
 }
 
+/* Return a free request element if any, otherwise return NULL. */
+static NVMeRequest *nvme_get_free_req_nowait(NVMeQueuePair *q)
+{
+    QEMU_LOCK_GUARD(&q->lock);
+    if (q->free_req_head == -1) {
+        return NULL;
+    }
+    return nvme_get_free_req_nofail_locked(q);
+}
+
+/*
+ * Wait for a free request to become available if necessary, then
+ * return it.
+ */
+static coroutine_fn NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
+{
+    QEMU_LOCK_GUARD(&q->lock);
+
+    while (q->free_req_head == -1) {
+        trace_nvme_free_req_queue_wait(q->s, q->index);
+        qemu_co_queue_wait(&q->free_req_queue, &q->lock);
+    }
+
+    return nvme_get_free_req_nofail_locked(q);
+}
+
 /* With q->lock */
 static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
 {
@@ -XXX,XX +XXX,XX @@ static int nvme_admin_cmd_sync(BlockDriverState *bs, NvmeCmd *cmd)
     AioContext *aio_context = bdrv_get_aio_context(bs);
     NVMeRequest *req;
     int ret = -EINPROGRESS;
-    req = nvme_get_free_req(q);
+    req = nvme_get_free_req_nowait(q);
     if (!req) {
         return -EBUSY;
     }
--
2.37.3
From: Stefan Hajnoczi <stefanha@redhat.com>

The code that builds an array of AioContext pointers indexed by the
virtqueue is not specific to virtio-blk. virtio-scsi will need to do the
same thing, so extract the functions.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20250311132616.1049687-11-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/hw/virtio/iothread-vq-mapping.h |  45 ++++++++
 hw/block/virtio-blk.c                   | 142 +-----------------------
 hw/virtio/iothread-vq-mapping.c         | 131 ++++++++++++++++++++++
 hw/virtio/meson.build                   |   1 +
 4 files changed, 178 insertions(+), 141 deletions(-)
 create mode 100644 include/hw/virtio/iothread-vq-mapping.h
 create mode 100644 hw/virtio/iothread-vq-mapping.c

diff --git a/include/hw/virtio/iothread-vq-mapping.h b/include/hw/virtio/iothread-vq-mapping.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/hw/virtio/iothread-vq-mapping.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * IOThread Virtqueue Mapping
+ *
+ * Copyright Red Hat, Inc
+ *
+ * SPDX-License-Identifier: GPL-2.0-only
+ */
+
+#ifndef HW_VIRTIO_IOTHREAD_VQ_MAPPING_H
+#define HW_VIRTIO_IOTHREAD_VQ_MAPPING_H
+
+#include "qapi/error.h"
+#include "qapi/qapi-types-virtio.h"
+
+/**
+ * iothread_vq_mapping_apply:
+ * @list: The mapping of virtqueues to IOThreads.
+ * @vq_aio_context: The array of AioContext pointers to fill in.
+ * @num_queues: The length of @vq_aio_context.
+ * @errp: If an error occurs, a pointer to the area to store the error.
+ *
+ * Fill in the AioContext for each virtqueue in the @vq_aio_context array given
+ * the iothread-vq-mapping parameter in @list.
+ *
+ * iothread_vq_mapping_cleanup() must be called to free IOThread object
+ * references after this function returns success.
+ *
+ * Returns: %true on success, %false on failure.
+ **/
+bool iothread_vq_mapping_apply(
+        IOThreadVirtQueueMappingList *list,
+        AioContext **vq_aio_context,
+        uint16_t num_queues,
+        Error **errp);
+
+/**
+ * iothread_vq_mapping_cleanup:
+ * @list: The mapping of virtqueues to IOThreads.
+ *
+ * Release IOThread object references that were acquired by
+ * iothread_vq_mapping_apply().
+ */
+void iothread_vq_mapping_cleanup(IOThreadVirtQueueMappingList *list);
+
+#endif /* HW_VIRTIO_IOTHREAD_VQ_MAPPING_H */
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@
 #endif
 #include "hw/virtio/virtio-bus.h"
 #include "migration/qemu-file-types.h"
+#include "hw/virtio/iothread-vq-mapping.h"
 #include "hw/virtio/virtio-access.h"
 #include "hw/virtio/virtio-blk-common.h"
 #include "qemu/coroutine.h"
@@ -XXX,XX +XXX,XX @@ static const BlockDevOps virtio_block_ops = {
     .drained_end = virtio_blk_drained_end,
 };
 
-static bool
-iothread_vq_mapping_validate(IOThreadVirtQueueMappingList *list, uint16_t
-                             num_queues, Error **errp)
-{
-    g_autofree unsigned long *vqs = bitmap_new(num_queues);
-    g_autoptr(GHashTable) iothreads =
-        g_hash_table_new(g_str_hash, g_str_equal);
-
-    for (IOThreadVirtQueueMappingList *node = list; node; node = node->next) {
-        const char *name = node->value->iothread;
-        uint16List *vq;
-
-        if (!iothread_by_id(name)) {
-            error_setg(errp, "IOThread \"%s\" object does not exist", name);
-            return false;
-        }
-
-        if (!g_hash_table_add(iothreads, (gpointer)name)) {
-            error_setg(errp,
-                       "duplicate IOThread name \"%s\" in iothread-vq-mapping",
-                       name);
-            return false;
-        }
-
-        if (node != list) {
-            if (!!node->value->vqs != !!list->value->vqs) {
-                error_setg(errp, "either all items in iothread-vq-mapping "
-                                 "must have vqs or none of them must have it");
-                return false;
-            }
-        }
-
-        for (vq = node->value->vqs; vq; vq = vq->next) {
-            if (vq->value >= num_queues) {
-                error_setg(errp, "vq index %u for IOThread \"%s\" must be "
-                           "less than num_queues %u in iothread-vq-mapping",
-                           vq->value, name, num_queues);
-                return false;
-            }
-
-            if (test_and_set_bit(vq->value, vqs)) {
-                error_setg(errp, "cannot assign vq %u to IOThread \"%s\" "
-                           "because it is already assigned", vq->value, name);
-                return false;
-            }
-        }
-    }
-
-    if (list->value->vqs) {
-        for (uint16_t i = 0; i < num_queues; i++) {
-            if (!test_bit(i, vqs)) {
-                error_setg(errp,
-                        "missing vq %u IOThread assignment in iothread-vq-mapping",
-                        i);
-                return false;
-            }
-        }
-    }
-
-    return true;
-}
-
-/**
- * iothread_vq_mapping_apply:
- * @list: The mapping of virtqueues to IOThreads.
- * @vq_aio_context: The array of AioContext pointers to fill in.
- * @num_queues: The length of @vq_aio_context.
- * @errp: If an error occurs, a pointer to the area to store the error.
- *
- * Fill in the AioContext for each virtqueue in the @vq_aio_context array given
- * the iothread-vq-mapping parameter in @list.
- *
- * iothread_vq_mapping_cleanup() must be called to free IOThread object
- * references after this function returns success.
- *
- * Returns: %true on success, %false on failure.
- **/
-static bool iothread_vq_mapping_apply(
-    IOThreadVirtQueueMappingList *list,
-    AioContext **vq_aio_context,
-    uint16_t num_queues,
-    Error **errp)
-{
-    IOThreadVirtQueueMappingList *node;
-    size_t num_iothreads = 0;
-    size_t cur_iothread = 0;
-
-    if (!iothread_vq_mapping_validate(list, num_queues, errp)) {
-        return false;
-    }
-
-    for (node = list; node; node = node->next) {
-        num_iothreads++;
-    }
-
-    for (node = list; node; node = node->next) {
-        IOThread *iothread = iothread_by_id(node->value->iothread);
-        AioContext *ctx = iothread_get_aio_context(iothread);
-
-        /* Released in virtio_blk_vq_aio_context_cleanup() */
-        object_ref(OBJECT(iothread));
-
-        if (node->value->vqs) {
-            uint16List *vq;
-
-            /* Explicit vq:IOThread assignment */
-            for (vq = node->value->vqs; vq; vq = vq->next) {
-                assert(vq->value < num_queues);
-                vq_aio_context[vq->value] = ctx;
-            }
-        } else {
-            /* Round-robin vq:IOThread assignment */
-            for (unsigned i = cur_iothread; i < num_queues;
-                 i += num_iothreads) {
-                vq_aio_context[i] = ctx;
-            }
-        }
-
-        cur_iothread++;
-    }
-
-    return true;
-}
-
-/**
- * iothread_vq_mapping_cleanup:
- * @list: The mapping of virtqueues to IOThreads.
- *
- * Release IOThread object references that were acquired by
- * iothread_vq_mapping_apply().
- */
-static void iothread_vq_mapping_cleanup(IOThreadVirtQueueMappingList *list)
-{
-    IOThreadVirtQueueMappingList *node;
-
-    for (node = list; node; node = node->next) {
-        IOThread *iothread = iothread_by_id(node->value->iothread);
-        object_unref(OBJECT(iothread));
-    }
-}
-
 /* Context: BQL held */
 static bool virtio_blk_vq_aio_context_init(VirtIOBlock *s, Error **errp)
 {
diff --git a/hw/virtio/iothread-vq-mapping.c b/hw/virtio/iothread-vq-mapping.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/hw/virtio/iothread-vq-mapping.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * IOThread Virtqueue Mapping
+ *
+ * Copyright Red Hat, Inc
+ *
+ * SPDX-License-Identifier: GPL-2.0-only
+ */
+
+#include "qemu/osdep.h"
+#include "system/iothread.h"
+#include "hw/virtio/iothread-vq-mapping.h"
+
+static bool
+iothread_vq_mapping_validate(IOThreadVirtQueueMappingList *list, uint16_t
+                             num_queues, Error **errp)
+{
+    g_autofree unsigned long *vqs = bitmap_new(num_queues);
+    g_autoptr(GHashTable) iothreads =
+        g_hash_table_new(g_str_hash, g_str_equal);
+
+    for (IOThreadVirtQueueMappingList *node = list; node; node = node->next) {
+        const char *name = node->value->iothread;
+        uint16List *vq;
+
+        if (!iothread_by_id(name)) {
+            error_setg(errp, "IOThread \"%s\" object does not exist", name);
+            return false;
+        }
+
+        if (!g_hash_table_add(iothreads, (gpointer)name)) {
+            error_setg(errp,
+                       "duplicate IOThread name \"%s\" in iothread-vq-mapping",
+                       name);
+            return false;
+        }
+
+        if (node != list) {
+            if (!!node->value->vqs != !!list->value->vqs) {
+                error_setg(errp, "either all items in iothread-vq-mapping "
+                                 "must have vqs or none of them must have it");
+                return false;
+            }
+        }
+
+        for (vq = node->value->vqs; vq; vq = vq->next) {
+            if (vq->value >= num_queues) {
+                error_setg(errp, "vq index %u for IOThread \"%s\" must be "
+                           "less than num_queues %u in iothread-vq-mapping",
+                           vq->value, name, num_queues);
+                return false;
+            }
+
+            if (test_and_set_bit(vq->value, vqs)) {
+                error_setg(errp, "cannot assign vq %u to IOThread \"%s\" "
+                           "because it is already assigned", vq->value, name);
+                return false;
+            }
+        }
+    }
+
+    if (list->value->vqs) {
+        for (uint16_t i = 0; i < num_queues; i++) {
+            if (!test_bit(i, vqs)) {
+                error_setg(errp,
+                        "missing vq %u IOThread assignment in iothread-vq-mapping",
+                        i);
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+bool iothread_vq_mapping_apply(
+        IOThreadVirtQueueMappingList *list,
+        AioContext **vq_aio_context,
+        uint16_t num_queues,
+        Error **errp)
+{
+    IOThreadVirtQueueMappingList *node;
+    size_t num_iothreads = 0;
+    size_t cur_iothread = 0;
+
+    if (!iothread_vq_mapping_validate(list, num_queues, errp)) {
+        return false;
+    }
+
+    for (node = list; node; node = node->next) {
+        num_iothreads++;
+    }
+
+    for (node = list; node; node = node->next) {
+        IOThread *iothread = iothread_by_id(node->value->iothread);
+        AioContext *ctx = iothread_get_aio_context(iothread);
+
+        /* Released in virtio_blk_vq_aio_context_cleanup() */
+        object_ref(OBJECT(iothread));
+
+        if (node->value->vqs) {
+            uint16List *vq;
+
+            /* Explicit vq:IOThread assignment */
+            for (vq = node->value->vqs; vq; vq = vq->next) {
+                assert(vq->value < num_queues);
+                vq_aio_context[vq->value] = ctx;
+            }
+        } else {
+            /* Round-robin vq:IOThread assignment */
+            for (unsigned i = cur_iothread; i < num_queues;
+                 i += num_iothreads) {
+                vq_aio_context[i] = ctx;
+            }
+        }
+
+        cur_iothread++;
+    }
+
+    return true;
+}
+
+void iothread_vq_mapping_cleanup(IOThreadVirtQueueMappingList *list)
+{
+    IOThreadVirtQueueMappingList *node;
+
+    for (node = list; node; node = node->next) {
+        IOThread *iothread = iothread_by_id(node->value->iothread);
+        object_unref(OBJECT(iothread));
+    }
+}
+
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -XXX,XX +XXX,XX @@
 system_virtio_ss = ss.source_set()
 system_virtio_ss.add(files('virtio-bus.c'))
+system_virtio_ss.add(files('iothread-vq-mapping.c'))
 system_virtio_ss.add(when: 'CONFIG_VIRTIO_PCI', if_true: files('virtio-pci.c'))
 system_virtio_ss.add(when: 'CONFIG_VIRTIO_MMIO', if_true: files('virtio-mmio.c'))
 system_virtio_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('virtio-crypto.c'))
--
2.48.1
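
For context, a usage sketch of the extracted API (illustrative command
line, assuming the same iothread-vq-mapping syntax that virtio-blk
already accepts; not taken from this series):

    --object iothread,id=iothread0 \
    --object iothread,id=iothread1 \
    --device '{"driver":"virtio-blk-pci","drive":"drive0",
               "iothread-vq-mapping":[{"iothread":"iothread0"},
                                      {"iothread":"iothread1"}]}'

Without an explicit "vqs" list, iothread_vq_mapping_apply() assigns
queues round-robin: with num_queues=4 above, iothread0 gets vqs 0 and 2
and iothread1 gets vqs 1 and 3, following the loop
"for (i = cur_iothread; i < num_queues; i += num_iothreads)".
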
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
Marking these functions coroutine_fn is incorrect because qcow2_mark_clean() calls qcow2_flush_caches().
4
qcow2_mark_clean() is called from non-coroutine context in
5
qcow2_inactivate() and qcow2_amend_options().
6
7
Reviewed-by: Alberto Faria <afaria@redhat.com>
8
Reviewed-by: Eric Blake <eblake@redhat.com>
9
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
10
Message-Id: <20220922084924.201610-4-pbonzini@redhat.com>
11
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
13
---
14
block/qcow2.h | 4 ++--
15
block/qcow2-refcount.c | 4 ++--
16
2 files changed, 4 insertions(+), 4 deletions(-)
17
18
diff --git a/block/qcow2.h b/block/qcow2.h
19
index XXXXXXX..XXXXXXX 100644
20
--- a/block/qcow2.h
21
+++ b/block/qcow2.h
22
@@ -XXX,XX +XXX,XX @@ void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry,
23
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
24
int64_t l1_table_offset, int l1_size, int addend);
25
26
-int coroutine_fn qcow2_flush_caches(BlockDriverState *bs);
27
-int coroutine_fn qcow2_write_caches(BlockDriverState *bs);
28
+int qcow2_flush_caches(BlockDriverState *bs);
29
+int qcow2_write_caches(BlockDriverState *bs);
30
int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
31
BdrvCheckMode fix);
32
33
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
34
index XXXXXXX..XXXXXXX 100644
35
--- a/block/qcow2-refcount.c
36
+++ b/block/qcow2-refcount.c
37
@@ -XXX,XX +XXX,XX @@ void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry,
38
}
39
}
40
41
-int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
42
+int qcow2_write_caches(BlockDriverState *bs)
43
{
44
BDRVQcow2State *s = bs->opaque;
45
int ret;
46
@@ -XXX,XX +XXX,XX @@ int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
47
return 0;
48
}
49
50
-int coroutine_fn qcow2_flush_caches(BlockDriverState *bs)
51
+int qcow2_flush_caches(BlockDriverState *bs)
52
{
53
int ret = qcow2_write_caches(bs);
54
if (ret < 0) {
55
--
56
2.37.3
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
nbd_co_establish_connection_cancel() cancels a coroutine but is not called
4
from coroutine context itself, for example in nbd_cancel_in_flight()
5
and in timer callbacks reconnect_delay_timer_cb() and open_timer_cb().
6
7
Reviewed-by: Alberto Faria <afaria@redhat.com>
8
Reviewed-by: Eric Blake <eblake@redhat.com>
9
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
10
Message-Id: <20220922084924.201610-5-pbonzini@redhat.com>
11
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
13
---
14
include/block/nbd.h | 2 +-
15
1 file changed, 1 insertion(+), 1 deletion(-)
16
17
diff --git a/include/block/nbd.h b/include/block/nbd.h
18
index XXXXXXX..XXXXXXX 100644
19
--- a/include/block/nbd.h
20
+++ b/include/block/nbd.h
21
@@ -XXX,XX +XXX,XX @@ QIOChannel *coroutine_fn
22
nbd_co_establish_connection(NBDClientConnection *conn, NBDExportInfo *info,
23
bool blocking, Error **errp);
24
25
-void coroutine_fn nbd_co_establish_connection_cancel(NBDClientConnection *conn);
26
+void nbd_co_establish_connection_cancel(NBDClientConnection *conn);
27
28
#endif
29
--
30
2.37.3
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
qemu_coroutine_get_aio_context inspects a coroutine, but it does
4
not have to be called from the coroutine itself (or from any
5
coroutine).
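
A sketch of what the dropped annotation documents (hypothetical caller,
not from the patch): the function may be called outside coroutine
context to inspect a coroutine that is not currently running.

    static void example_log_ctx(Coroutine *co)
    {
        /* Legal outside any coroutine after this change. */
        AioContext *ctx = qemu_coroutine_get_aio_context(co);
        fprintf(stderr, "coroutine bound to AioContext %p\n", (void *)ctx);
    }
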
6
7
Reviewed-by: Alberto Faria <afaria@redhat.com>
8
Reviewed-by: Eric Blake <eblake@redhat.com>
9
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
10
Message-Id: <20220922084924.201610-6-pbonzini@redhat.com>
11
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
13
---
14
include/qemu/coroutine.h | 2 +-
15
util/qemu-coroutine.c | 2 +-
16
2 files changed, 2 insertions(+), 2 deletions(-)
17
18
diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
19
index XXXXXXX..XXXXXXX 100644
20
--- a/include/qemu/coroutine.h
21
+++ b/include/qemu/coroutine.h
22
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_coroutine_yield(void);
23
/**
24
* Get the AioContext of the given coroutine
25
*/
26
-AioContext *coroutine_fn qemu_coroutine_get_aio_context(Coroutine *co);
27
+AioContext *qemu_coroutine_get_aio_context(Coroutine *co);
28
29
/**
30
* Get the currently executing coroutine
31
diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
32
index XXXXXXX..XXXXXXX 100644
33
--- a/util/qemu-coroutine.c
34
+++ b/util/qemu-coroutine.c
35
@@ -XXX,XX +XXX,XX @@ bool qemu_coroutine_entered(Coroutine *co)
36
return co->caller;
37
}
38
39
-AioContext *coroutine_fn qemu_coroutine_get_aio_context(Coroutine *co)
40
+AioContext *qemu_coroutine_get_aio_context(Coroutine *co)
41
{
42
return co->ctx;
43
}
44
--
45
2.37.3
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
Callers of coroutine_fn must be coroutine_fn themselves, or the call
4
must be within "if (qemu_in_coroutine())". Apply coroutine_fn to
5
functions where this holds.
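
A sketch of the rule stated above (hypothetical helpers, not from the
patch):

    static void coroutine_fn do_io(void);    /* may yield */

    static void coroutine_fn coroutine_caller(void)
    {
        do_io();           /* OK: the caller is coroutine_fn itself */
    }

    static void mixed_context_caller(void)
    {
        if (qemu_in_coroutine()) {
            do_io();       /* OK: guarded by qemu_in_coroutine() */
        }
    }
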
6
7
Reviewed-by: Alberto Faria <afaria@redhat.com>
8
Reviewed-by: Eric Blake <eblake@redhat.com>
9
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
10
Message-Id: <20220922084924.201610-10-pbonzini@redhat.com>
11
[kwolf: Fixed up coding style]
12
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
13
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
14
---
15
block/iscsi.c | 3 ++-
16
1 file changed, 2 insertions(+), 1 deletion(-)
17
18
diff --git a/block/iscsi.c b/block/iscsi.c
19
index XXXXXXX..XXXXXXX 100644
20
--- a/block/iscsi.c
21
+++ b/block/iscsi.c
22
@@ -XXX,XX +XXX,XX @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
23
}
24
}
25
26
-static void iscsi_co_init_iscsitask(IscsiLun *iscsilun, struct IscsiTask *iTask)
27
+static void coroutine_fn
28
+iscsi_co_init_iscsitask(IscsiLun *iscsilun, struct IscsiTask *iTask)
29
{
30
*iTask = (struct IscsiTask) {
31
.co = qemu_coroutine_self(),
32
--
33
2.37.3
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
Callers of coroutine_fn must be coroutine_fn themselves, or the call
4
must be within "if (qemu_in_coroutine())". Apply coroutine_fn to
5
functions where this holds.
6
7
Reviewed-by: Alberto Faria <afaria@redhat.com>
8
Reviewed-by: Eric Blake <eblake@redhat.com>
9
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
10
Message-Id: <20220922084924.201610-11-pbonzini@redhat.com>
11
[kwolf: Fixed up coding style]
12
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
13
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
14
---
15
block/nbd.c | 11 ++++++-----
16
1 file changed, 6 insertions(+), 5 deletions(-)
17
18
diff --git a/block/nbd.c b/block/nbd.c
19
index XXXXXXX..XXXXXXX 100644
20
--- a/block/nbd.c
21
+++ b/block/nbd.c
22
@@ -XXX,XX +XXX,XX @@ static void nbd_iter_request_error(NBDReplyChunkIter *iter, int ret)
23
* nbd_reply_chunk_iter_receive
24
* The pointer stored in @payload requires g_free() to free it.
25
*/
26
-static bool nbd_reply_chunk_iter_receive(BDRVNBDState *s,
27
- NBDReplyChunkIter *iter,
28
- uint64_t handle,
29
- QEMUIOVector *qiov, NBDReply *reply,
30
- void **payload)
31
+static bool coroutine_fn nbd_reply_chunk_iter_receive(BDRVNBDState *s,
32
+ NBDReplyChunkIter *iter,
33
+ uint64_t handle,
34
+ QEMUIOVector *qiov,
35
+ NBDReply *reply,
36
+ void **payload)
37
{
38
int ret, request_ret;
39
NBDReply local_reply;
40
--
41
2.37.3
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
Callers of coroutine_fn must be coroutine_fn themselves, or the call
4
must be within "if (qemu_in_coroutine())". Apply coroutine_fn to
5
functions where this holds.
6
7
Reviewed-by: Alberto Faria <afaria@redhat.com>
8
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
9
Message-Id: <20220922084924.201610-12-pbonzini@redhat.com>
10
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
11
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
12
---
13
block/nfs.c | 2 +-
14
1 file changed, 1 insertion(+), 1 deletion(-)
15
16
diff --git a/block/nfs.c b/block/nfs.c
17
index XXXXXXX..XXXXXXX 100644
18
--- a/block/nfs.c
19
+++ b/block/nfs.c
20
@@ -XXX,XX +XXX,XX @@ static void nfs_process_write(void *arg)
21
qemu_mutex_unlock(&client->mutex);
22
}
23
24
-static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
25
+static void coroutine_fn nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
26
{
27
*task = (NFSRPC) {
28
.co = qemu_coroutine_self(),
29
--
30
2.37.3
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
Callers of coroutine_fn must be coroutine_fn themselves, or the call
4
must be within "if (qemu_in_coroutine())". Apply coroutine_fn to
5
functions where this holds.
6
7
Reviewed-by: Alberto Faria <afaria@redhat.com>
8
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
9
Message-Id: <20220922084924.201610-13-pbonzini@redhat.com>
10
[kwolf: Fixed up coding style]
11
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
13
---
14
block/nvme.c | 6 ++++--
15
1 file changed, 4 insertions(+), 2 deletions(-)
16
17
diff --git a/block/nvme.c b/block/nvme.c
18
index XXXXXXX..XXXXXXX 100644
19
--- a/block/nvme.c
20
+++ b/block/nvme.c
21
@@ -XXX,XX +XXX,XX @@ static inline bool nvme_qiov_aligned(BlockDriverState *bs,
22
return true;
23
}
24
25
-static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
26
- QEMUIOVector *qiov, bool is_write, int flags)
27
+static coroutine_fn int nvme_co_prw(BlockDriverState *bs,
28
+ uint64_t offset, uint64_t bytes,
29
+ QEMUIOVector *qiov, bool is_write,
30
+ int flags)
31
{
32
BDRVNVMeState *s = bs->opaque;
33
int r;
34
--
35
2.37.3
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
Callers of coroutine_fn must be coroutine_fn themselves, or the call
4
must be within "if (qemu_in_coroutine())". Apply coroutine_fn to
5
functions where this holds.
6
7
Reviewed-by: Alberto Faria <afaria@redhat.com>
8
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
9
Message-Id: <20220922084924.201610-14-pbonzini@redhat.com>
10
[kwolf: Fixed up coding style]
11
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
13
---
14
block/parallels.c | 5 +++--
15
1 file changed, 3 insertions(+), 2 deletions(-)
16
17
diff --git a/block/parallels.c b/block/parallels.c
18
index XXXXXXX..XXXXXXX 100644
19
--- a/block/parallels.c
20
+++ b/block/parallels.c
21
@@ -XXX,XX +XXX,XX @@ static int64_t block_status(BDRVParallelsState *s, int64_t sector_num,
22
return start_off;
23
}
24
25
-static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num,
26
- int nb_sectors, int *pnum)
27
+static coroutine_fn int64_t allocate_clusters(BlockDriverState *bs,
28
+ int64_t sector_num,
29
+ int nb_sectors, int *pnum)
30
{
31
int ret = 0;
32
BDRVParallelsState *s = bs->opaque;
33
--
34
2.37.3
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
Callers of coroutine_fn must be coroutine_fn themselves, or the call
4
must be within "if (qemu_in_coroutine())". Apply coroutine_fn to
5
functions where this holds.
6
7
Reviewed-by: Alberto Faria <afaria@redhat.com>
8
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
9
Message-Id: <20220922084924.201610-15-pbonzini@redhat.com>
10
[kwolf: Fixed up coding style]
11
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
13
---
14
block/qcow2.h | 15 ++++++++-------
15
block/qcow2-cluster.c | 21 ++++++++++++---------
16
block/qcow2-refcount.c | 2 +-
17
block/qcow2.c | 5 +++--
18
4 files changed, 24 insertions(+), 19 deletions(-)
19
20
diff --git a/block/qcow2.h b/block/qcow2.h
21
index XXXXXXX..XXXXXXX 100644
22
--- a/block/qcow2.h
23
+++ b/block/qcow2.h
24
@@ -XXX,XX +XXX,XX @@ int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
25
void *cb_opaque, Error **errp);
26
int qcow2_shrink_reftable(BlockDriverState *bs);
27
int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size);
28
-int qcow2_detect_metadata_preallocation(BlockDriverState *bs);
29
+int coroutine_fn qcow2_detect_metadata_preallocation(BlockDriverState *bs);
30
31
/* qcow2-cluster.c functions */
32
int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
33
@@ -XXX,XX +XXX,XX @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
34
int qcow2_get_host_offset(BlockDriverState *bs, uint64_t offset,
35
unsigned int *bytes, uint64_t *host_offset,
36
QCow2SubclusterType *subcluster_type);
37
-int qcow2_alloc_host_offset(BlockDriverState *bs, uint64_t offset,
38
- unsigned int *bytes, uint64_t *host_offset,
39
- QCowL2Meta **m);
40
+int coroutine_fn qcow2_alloc_host_offset(BlockDriverState *bs, uint64_t offset,
41
+ unsigned int *bytes,
42
+ uint64_t *host_offset, QCowL2Meta **m);
43
int qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
44
uint64_t offset,
45
int compressed_size,
46
@@ -XXX,XX +XXX,XX @@ int qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
47
void qcow2_parse_compressed_l2_entry(BlockDriverState *bs, uint64_t l2_entry,
48
uint64_t *coffset, int *csize);
49
50
-int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m);
51
+int coroutine_fn qcow2_alloc_cluster_link_l2(BlockDriverState *bs,
52
+ QCowL2Meta *m);
53
void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m);
54
int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
55
uint64_t bytes, enum qcow2_discard_type type,
56
bool full_discard);
57
-int qcow2_subcluster_zeroize(BlockDriverState *bs, uint64_t offset,
58
- uint64_t bytes, int flags);
59
+int coroutine_fn qcow2_subcluster_zeroize(BlockDriverState *bs, uint64_t offset,
60
+ uint64_t bytes, int flags);
61
62
int qcow2_expand_zero_clusters(BlockDriverState *bs,
63
BlockDriverAmendStatusCB *status_cb,
64
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
65
index XXXXXXX..XXXXXXX 100644
66
--- a/block/qcow2-cluster.c
67
+++ b/block/qcow2-cluster.c
68
@@ -XXX,XX +XXX,XX @@ int qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
69
return 0;
70
}
71
72
-static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
73
+static int coroutine_fn perform_cow(BlockDriverState *bs, QCowL2Meta *m)
74
{
75
BDRVQcow2State *s = bs->opaque;
76
Qcow2COWRegion *start = &m->cow_start;
77
@@ -XXX,XX +XXX,XX @@ fail:
78
return ret;
79
}
80
81
-int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
82
+int coroutine_fn qcow2_alloc_cluster_link_l2(BlockDriverState *bs,
83
+ QCowL2Meta *m)
84
{
85
BDRVQcow2State *s = bs->opaque;
86
int i, j = 0, l2_index, ret;
87
@@ -XXX,XX +XXX,XX @@ static int count_single_write_clusters(BlockDriverState *bs, int nb_clusters,
88
* information on cluster allocation may be invalid now. The caller
89
* must start over anyway, so consider *cur_bytes undefined.
90
*/
91
-static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
92
- uint64_t *cur_bytes, QCowL2Meta **m)
93
+static int coroutine_fn handle_dependencies(BlockDriverState *bs,
94
+ uint64_t guest_offset,
95
+ uint64_t *cur_bytes, QCowL2Meta **m)
96
{
97
BDRVQcow2State *s = bs->opaque;
98
QCowL2Meta *old_alloc;
99
@@ -XXX,XX +XXX,XX @@ out:
100
*
101
* Return 0 on success and -errno in error cases
102
*/
103
-int qcow2_alloc_host_offset(BlockDriverState *bs, uint64_t offset,
104
- unsigned int *bytes, uint64_t *host_offset,
105
- QCowL2Meta **m)
106
+int coroutine_fn qcow2_alloc_host_offset(BlockDriverState *bs, uint64_t offset,
107
+ unsigned int *bytes,
108
+ uint64_t *host_offset,
109
+ QCowL2Meta **m)
110
{
111
BDRVQcow2State *s = bs->opaque;
112
uint64_t start, remaining;
113
@@ -XXX,XX +XXX,XX @@ out:
114
return ret;
115
}
116
117
-int qcow2_subcluster_zeroize(BlockDriverState *bs, uint64_t offset,
118
- uint64_t bytes, int flags)
119
+int coroutine_fn qcow2_subcluster_zeroize(BlockDriverState *bs, uint64_t offset,
120
+ uint64_t bytes, int flags)
121
{
122
BDRVQcow2State *s = bs->opaque;
123
uint64_t end_offset = offset + bytes;
124
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
125
index XXXXXXX..XXXXXXX 100644
126
--- a/block/qcow2-refcount.c
127
+++ b/block/qcow2-refcount.c
128
@@ -XXX,XX +XXX,XX @@ int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
129
return -EIO;
130
}
131
132
-int qcow2_detect_metadata_preallocation(BlockDriverState *bs)
133
+int coroutine_fn qcow2_detect_metadata_preallocation(BlockDriverState *bs)
134
{
135
BDRVQcow2State *s = bs->opaque;
136
int64_t i, end_cluster, cluster_count = 0, threshold;
137
diff --git a/block/qcow2.c b/block/qcow2.c
138
index XXXXXXX..XXXXXXX 100644
139
--- a/block/qcow2.c
140
+++ b/block/qcow2.c
141
@@ -XXX,XX +XXX,XX @@ static bool merge_cow(uint64_t offset, unsigned bytes,
142
* Return 1 if the COW regions read as zeroes, 0 if not, < 0 on error.
143
* Note that returning 0 does not guarantee non-zero data.
144
*/
145
-static int is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
146
+static int coroutine_fn is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
147
{
148
/*
149
* This check is designed for optimization shortcut so it must be
150
@@ -XXX,XX +XXX,XX @@ static int is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
151
m->cow_end.nb_bytes);
152
}
153
154
-static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
155
+static int coroutine_fn handle_alloc_space(BlockDriverState *bs,
156
+ QCowL2Meta *l2meta)
157
{
158
BDRVQcow2State *s = bs->opaque;
159
QCowL2Meta *m;
160
--
161
2.37.3
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
Callers of coroutine_fn must be coroutine_fn themselves, or the call
4
must be within "if (qemu_in_coroutine())". Apply coroutine_fn to
5
functions where this holds.
6
7
Reviewed-by: Alberto Faria <afaria@redhat.com>
8
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
9
Message-Id: <20220922084924.201610-16-pbonzini@redhat.com>
10
[kwolf: Fixed up coding style]
11
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
13
---
14
block/copy-before-write.c | 9 +++++----
15
1 file changed, 5 insertions(+), 4 deletions(-)
16
17
diff --git a/block/copy-before-write.c b/block/copy-before-write.c
18
index XXXXXXX..XXXXXXX 100644
19
--- a/block/copy-before-write.c
20
+++ b/block/copy-before-write.c
21
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn cbw_co_flush(BlockDriverState *bs)
22
* It's guaranteed that guest writes will not interact in the region until
23
* cbw_snapshot_read_unlock() called.
24
*/
25
-static BlockReq *cbw_snapshot_read_lock(BlockDriverState *bs,
26
- int64_t offset, int64_t bytes,
27
- int64_t *pnum, BdrvChild **file)
28
+static coroutine_fn BlockReq *
29
+cbw_snapshot_read_lock(BlockDriverState *bs, int64_t offset, int64_t bytes,
30
+ int64_t *pnum, BdrvChild **file)
31
{
32
BDRVCopyBeforeWriteState *s = bs->opaque;
33
BlockReq *req = g_new(BlockReq, 1);
34
@@ -XXX,XX +XXX,XX @@ static BlockReq *cbw_snapshot_read_lock(BlockDriverState *bs,
35
return req;
36
}
37
38
-static void cbw_snapshot_read_unlock(BlockDriverState *bs, BlockReq *req)
39
+static coroutine_fn void
40
+cbw_snapshot_read_unlock(BlockDriverState *bs, BlockReq *req)
41
{
42
BDRVCopyBeforeWriteState *s = bs->opaque;
43
44
--
45
2.37.3
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
Callers of coroutine_fn must be coroutine_fn themselves, or the call
4
must be within "if (qemu_in_coroutine())". Apply coroutine_fn to
5
functions where this holds.
6
7
Reviewed-by: Alberto Faria <afaria@redhat.com>
8
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
9
Message-Id: <20220922084924.201610-17-pbonzini@redhat.com>
10
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
11
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
12
---
13
block/curl.c | 2 +-
14
1 file changed, 1 insertion(+), 1 deletion(-)
15
16
diff --git a/block/curl.c b/block/curl.c
17
index XXXXXXX..XXXXXXX 100644
18
--- a/block/curl.c
19
+++ b/block/curl.c
20
@@ -XXX,XX +XXX,XX @@ out_noclean:
21
return -EINVAL;
22
}
23
24
-static void curl_setup_preadv(BlockDriverState *bs, CURLAIOCB *acb)
25
+static void coroutine_fn curl_setup_preadv(BlockDriverState *bs, CURLAIOCB *acb)
26
{
27
CURLState *state;
28
int running;
29
--
30
2.37.3
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
Callers of coroutine_fn must be coroutine_fn themselves, or the call
4
must be within "if (qemu_in_coroutine())". Apply coroutine_fn to
5
functions where this holds.
6
7
Reviewed-by: Alberto Faria <afaria@redhat.com>
8
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
9
Message-Id: <20220922084924.201610-18-pbonzini@redhat.com>
10
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
11
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
12
---
13
block/qed.c | 4 ++--
14
1 file changed, 2 insertions(+), 2 deletions(-)
15
16
diff --git a/block/qed.c b/block/qed.c
17
index XXXXXXX..XXXXXXX 100644
18
--- a/block/qed.c
19
+++ b/block/qed.c
20
@@ -XXX,XX +XXX,XX @@ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
21
return l2_table;
22
}
23
24
-static bool qed_plug_allocating_write_reqs(BDRVQEDState *s)
25
+static bool coroutine_fn qed_plug_allocating_write_reqs(BDRVQEDState *s)
26
{
27
qemu_co_mutex_lock(&s->table_lock);
28
29
@@ -XXX,XX +XXX,XX @@ static bool qed_plug_allocating_write_reqs(BDRVQEDState *s)
30
return true;
31
}
32
33
-static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
34
+static void coroutine_fn qed_unplug_allocating_write_reqs(BDRVQEDState *s)
35
{
36
qemu_co_mutex_lock(&s->table_lock);
37
assert(s->allocating_write_reqs_plugged);
38
--
39
2.37.3
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
Callers of coroutine_fn must be coroutine_fn themselves, or the call
4
must be within "if (qemu_in_coroutine())". Apply coroutine_fn to
5
functions where this holds.
6
7
Reviewed-by: Alberto Faria <afaria@redhat.com>
8
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
9
Message-Id: <20220922084924.201610-20-pbonzini@redhat.com>
10
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
11
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
12
---
13
block/throttle.c | 2 +-
14
1 file changed, 1 insertion(+), 1 deletion(-)
15
16
diff --git a/block/throttle.c b/block/throttle.c
17
index XXXXXXX..XXXXXXX 100644
18
--- a/block/throttle.c
19
+++ b/block/throttle.c
20
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn throttle_co_pwritev_compressed(BlockDriverState *bs,
21
BDRV_REQ_WRITE_COMPRESSED);
22
}
23
24
-static int throttle_co_flush(BlockDriverState *bs)
25
+static int coroutine_fn throttle_co_flush(BlockDriverState *bs)
26
{
27
return bdrv_co_flush(bs->file->bs);
28
}
29
--
30
2.37.3
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
Callers of coroutine_fn must be coroutine_fn themselves, or the call
4
must be within "if (qemu_in_coroutine())". Apply coroutine_fn to
5
functions where this holds.
6
7
Reviewed-by: Alberto Faria <afaria@redhat.com>
8
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
9
Message-Id: <20220922084924.201610-21-pbonzini@redhat.com>
10
[kwolf: Fixed up coding style]
11
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
13
---
14
block/vmdk.c | 22 ++++++++++++----------
15
1 file changed, 12 insertions(+), 10 deletions(-)
16
17
diff --git a/block/vmdk.c b/block/vmdk.c
18
index XXXXXXX..XXXXXXX 100644
19
--- a/block/vmdk.c
20
+++ b/block/vmdk.c
21
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn vmdk_co_block_status(BlockDriverState *bs,
22
return ret;
23
}
24
25
-static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
26
- int64_t offset_in_cluster, QEMUIOVector *qiov,
27
- uint64_t qiov_offset, uint64_t n_bytes,
28
- uint64_t offset)
29
+static int coroutine_fn
30
+vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
31
+ int64_t offset_in_cluster, QEMUIOVector *qiov,
32
+ uint64_t qiov_offset, uint64_t n_bytes,
33
+ uint64_t offset)
34
{
35
int ret;
36
VmdkGrainMarker *data = NULL;
37
@@ -XXX,XX +XXX,XX @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
38
return ret;
39
}
40
41
-static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
42
- int64_t offset_in_cluster, QEMUIOVector *qiov,
43
- int bytes)
44
+static int coroutine_fn
45
+vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
46
+ int64_t offset_in_cluster, QEMUIOVector *qiov,
47
+ int bytes)
48
{
49
int ret;
50
int cluster_bytes, buf_bytes;
51
@@ -XXX,XX +XXX,XX @@ fail:
52
*
53
* Returns: error code with 0 for success.
54
*/
55
-static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset,
56
- uint64_t bytes, QEMUIOVector *qiov,
57
- bool zeroed, bool zero_dry_run)
58
+static int coroutine_fn vmdk_pwritev(BlockDriverState *bs, uint64_t offset,
59
+ uint64_t bytes, QEMUIOVector *qiov,
60
+ bool zeroed, bool zero_dry_run)
61
{
62
BDRVVmdkState *s = bs->opaque;
63
VmdkExtent *extent = NULL;
64
--
65
2.37.3
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
Callers of coroutine_fn must be coroutine_fn themselves, or the call
4
must be within "if (qemu_in_coroutine())". Apply coroutine_fn to
5
functions where this holds.
6
7
Reviewed-by: Alberto Faria <afaria@redhat.com>
8
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
9
Message-Id: <20220922084924.201610-22-pbonzini@redhat.com>
10
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
11
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
12
---
13
include/qemu/job.h | 2 +-
14
job.c | 2 +-
15
2 files changed, 2 insertions(+), 2 deletions(-)
16
17
diff --git a/include/qemu/job.h b/include/qemu/job.h
18
index XXXXXXX..XXXXXXX 100644
19
--- a/include/qemu/job.h
20
+++ b/include/qemu/job.h
21
@@ -XXX,XX +XXX,XX @@ void coroutine_fn job_pause_point(Job *job);
22
*
23
* Yield the job coroutine.
24
*/
25
-void job_yield(Job *job);
26
+void coroutine_fn job_yield(Job *job);
27
28
/**
29
* @job: The job that calls the function.
30
diff --git a/job.c b/job.c
31
index XXXXXXX..XXXXXXX 100644
32
--- a/job.c
33
+++ b/job.c
34
@@ -XXX,XX +XXX,XX @@ void coroutine_fn job_pause_point(Job *job)
35
}
36
}
37
38
-void job_yield(Job *job)
39
+void coroutine_fn job_yield(Job *job)
40
{
41
assert(job->busy);
42
43
--
44
2.37.3
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
Callers of coroutine_fn must be coroutine_fn themselves, or the call
4
must be within "if (qemu_in_coroutine())". Apply coroutine_fn to
5
functions where this holds.
6
7
Reviewed-by: Alberto Faria <afaria@redhat.com>
8
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
9
Message-Id: <20220922084924.201610-23-pbonzini@redhat.com>
10
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
11
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
12
---
13
util/qemu-coroutine-lock.c | 14 +++++++-------
14
1 file changed, 7 insertions(+), 7 deletions(-)
15
16
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
17
index XXXXXXX..XXXXXXX 100644
18
--- a/util/qemu-coroutine-lock.c
19
+++ b/util/qemu-coroutine-lock.c
20
@@ -XXX,XX +XXX,XX @@ typedef struct CoWaitRecord {
21
QSLIST_ENTRY(CoWaitRecord) next;
22
} CoWaitRecord;
23
24
-static void push_waiter(CoMutex *mutex, CoWaitRecord *w)
25
+static void coroutine_fn push_waiter(CoMutex *mutex, CoWaitRecord *w)
26
{
27
w->co = qemu_coroutine_self();
28
QSLIST_INSERT_HEAD_ATOMIC(&mutex->from_push, w, next);
29
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_init(CoRwlock *lock)
30
}
31
32
/* Releases the internal CoMutex. */
33
-static void qemu_co_rwlock_maybe_wake_one(CoRwlock *lock)
34
+static void coroutine_fn qemu_co_rwlock_maybe_wake_one(CoRwlock *lock)
35
{
36
CoRwTicket *tkt = QSIMPLEQ_FIRST(&lock->tickets);
37
Coroutine *co = NULL;
38
@@ -XXX,XX +XXX,XX @@ static void qemu_co_rwlock_maybe_wake_one(CoRwlock *lock)
39
}
40
}
41
42
-void qemu_co_rwlock_rdlock(CoRwlock *lock)
43
+void coroutine_fn qemu_co_rwlock_rdlock(CoRwlock *lock)
44
{
45
Coroutine *self = qemu_coroutine_self();
46
47
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_rdlock(CoRwlock *lock)
48
self->locks_held++;
49
}
50
51
-void qemu_co_rwlock_unlock(CoRwlock *lock)
52
+void coroutine_fn qemu_co_rwlock_unlock(CoRwlock *lock)
53
{
54
Coroutine *self = qemu_coroutine_self();
55
56
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_unlock(CoRwlock *lock)
57
qemu_co_rwlock_maybe_wake_one(lock);
58
}
59
60
-void qemu_co_rwlock_downgrade(CoRwlock *lock)
61
+void coroutine_fn qemu_co_rwlock_downgrade(CoRwlock *lock)
62
{
63
qemu_co_mutex_lock(&lock->mutex);
64
assert(lock->owners == -1);
65
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_downgrade(CoRwlock *lock)
66
qemu_co_rwlock_maybe_wake_one(lock);
67
}
68
69
-void qemu_co_rwlock_wrlock(CoRwlock *lock)
70
+void coroutine_fn qemu_co_rwlock_wrlock(CoRwlock *lock)
71
{
72
Coroutine *self = qemu_coroutine_self();
73
74
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_wrlock(CoRwlock *lock)
75
self->locks_held++;
76
}
77
78
-void qemu_co_rwlock_upgrade(CoRwlock *lock)
79
+void coroutine_fn qemu_co_rwlock_upgrade(CoRwlock *lock)
80
{
81
qemu_co_mutex_lock(&lock->mutex);
82
assert(lock->owners > 0);
83
--
84
2.37.3
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
Callers of coroutine_fn must be coroutine_fn themselves, or the call
4
must be within "if (qemu_in_coroutine())". Apply coroutine_fn to
5
functions where this holds.
6
7
Reviewed-by: Alberto Faria <afaria@redhat.com>
8
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
9
Message-Id: <20220922084924.201610-24-pbonzini@redhat.com>
10
[kwolf: Fixed up coding style]
11
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
13
---
14
block/raw-format.c | 3 ++-
15
1 file changed, 2 insertions(+), 1 deletion(-)
16
17
diff --git a/block/raw-format.c b/block/raw-format.c
18
index XXXXXXX..XXXXXXX 100644
19
--- a/block/raw-format.c
20
+++ b/block/raw-format.c
21
@@ -XXX,XX +XXX,XX @@ static void raw_lock_medium(BlockDriverState *bs, bool locked)
22
bdrv_lock_medium(bs->file->bs, locked);
23
}
24
25
-static int raw_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
26
+static int coroutine_fn raw_co_ioctl(BlockDriverState *bs,
27
+ unsigned long int req, void *buf)
28
{
29
BDRVRawState *s = bs->opaque;
30
if (s->offset || s->has_size) {
31
--
32
2.37.3
Deleted patch
1
From: Marc-André Lureau <marcandre.lureau@redhat.com>
2
1
3
Callers of coroutine_fn must be coroutine_fn themselves, or the call
4
must be within "if (qemu_in_coroutine())". Apply coroutine_fn to
5
functions where this holds.
6
7
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
8
Reviewed-by: Alberto Faria <afaria@redhat.com>
9
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
10
Message-Id: <20220922084924.201610-27-pbonzini@redhat.com>
11
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
13
---
14
tests/unit/test-coroutine.c | 2 +-
15
1 file changed, 1 insertion(+), 1 deletion(-)
16
17
diff --git a/tests/unit/test-coroutine.c b/tests/unit/test-coroutine.c
18
index XXXXXXX..XXXXXXX 100644
19
--- a/tests/unit/test-coroutine.c
20
+++ b/tests/unit/test-coroutine.c
21
@@ -XXX,XX +XXX,XX @@ static void perf_baseline(void)
22
g_test_message("Function call %u iterations: %f s", maxcycles, duration);
23
}
24
25
-static __attribute__((noinline)) void perf_cost_func(void *opaque)
26
+static __attribute__((noinline)) void coroutine_fn perf_cost_func(void *opaque)
27
{
28
qemu_coroutine_yield();
29
}
30
--
31
2.37.3
32
33
diff view generated by jsdifflib
Deleted patch
1
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
2
1
3
The job mutex will be used to protect the job struct fields and the jobs list,
4
replacing AioContext locks.
5
6
Right now we use a shared lock for all jobs, in order to keep things
7
simple. Once the AioContext lock is gone, we can introduce per-job
8
locks.
9
10
To simplify the switch from the AioContext lock to the job lock, introduce
11
*nop* lock/unlock functions and macros.
12
We want to always call job_lock/unlock outside the AioContext locks,
13
and not vice-versa, otherwise we might get a deadlock. This is not
14
straightforward to do, and that's why we start with nop functions.
15
Once everything is protected by job_lock/unlock, we can change the nop into
16
an actual mutex and remove the AioContext lock.
17
18
Since job_mutex is already being used, add static
19
real_job_{lock/unlock} for the existing usage.
20
21
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
22
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
23
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
24
Message-Id: <20220926093214.506243-2-eesposit@redhat.com>
25
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
26
---
27
include/qemu/job.h | 24 ++++++++++++++++++++++++
28
job.c | 35 +++++++++++++++++++++++------------
29
2 files changed, 47 insertions(+), 12 deletions(-)
30
31
diff --git a/include/qemu/job.h b/include/qemu/job.h
32
index XXXXXXX..XXXXXXX 100644
33
--- a/include/qemu/job.h
34
+++ b/include/qemu/job.h
35
@@ -XXX,XX +XXX,XX @@ typedef enum JobCreateFlags {
36
JOB_MANUAL_DISMISS = 0x04,
37
} JobCreateFlags;
38
39
+extern QemuMutex job_mutex;
40
+
41
+#define JOB_LOCK_GUARD() /* QEMU_LOCK_GUARD(&job_mutex) */
42
+
43
+#define WITH_JOB_LOCK_GUARD() /* WITH_QEMU_LOCK_GUARD(&job_mutex) */
44
+
45
+/**
46
+ * job_lock:
47
+ *
48
+ * Take the mutex protecting the list of jobs and their status.
49
+ * Most functions called by the monitor need to call job_lock
50
+ * and job_unlock manually. On the other hand, function called
51
+ * by the block jobs themselves and by the block layer will take the
52
+ * lock for you.
53
+ */
54
+void job_lock(void);
55
+
56
+/**
57
+ * job_unlock:
58
+ *
59
+ * Release the mutex protecting the list of jobs and their status.
60
+ */
61
+void job_unlock(void);
62
+
63
/**
64
* Allocate and return a new job transaction. Jobs can be added to the
65
* transaction using job_txn_add_job().
66
diff --git a/job.c b/job.c
67
index XXXXXXX..XXXXXXX 100644
68
--- a/job.c
69
+++ b/job.c
70
@@ -XXX,XX +XXX,XX @@
71
#include "trace/trace-root.h"
72
#include "qapi/qapi-events-job.h"
73
74
+/*
75
+ * job_mutex protects the jobs list, but also makes the
76
+ * struct job fields thread-safe.
77
+ */
78
+QemuMutex job_mutex;
79
+
80
static QLIST_HEAD(, Job) jobs = QLIST_HEAD_INITIALIZER(jobs);
81
82
/* Job State Transition Table */
83
@@ -XXX,XX +XXX,XX @@ struct JobTxn {
84
int refcnt;
85
};
86
87
-/* Right now, this mutex is only needed to synchronize accesses to job->busy
88
- * and job->sleep_timer, such as concurrent calls to job_do_yield and
89
- * job_enter. */
90
-static QemuMutex job_mutex;
91
+void job_lock(void)
92
+{
93
+ /* nop */
94
+}
95
+
96
+void job_unlock(void)
97
+{
98
+ /* nop */
99
+}
100
101
-static void job_lock(void)
102
+static void real_job_lock(void)
103
{
104
qemu_mutex_lock(&job_mutex);
105
}
106
107
-static void job_unlock(void)
108
+static void real_job_unlock(void)
109
{
110
qemu_mutex_unlock(&job_mutex);
111
}
112
@@ -XXX,XX +XXX,XX @@ void job_enter_cond(Job *job, bool(*fn)(Job *job))
113
return;
114
}
115
116
- job_lock();
117
+ real_job_lock();
118
if (job->busy) {
119
- job_unlock();
120
+ real_job_unlock();
121
return;
122
}
123
124
if (fn && !fn(job)) {
125
- job_unlock();
126
+ real_job_unlock();
127
return;
128
}
129
130
assert(!job->deferred_to_main_loop);
131
timer_del(&job->sleep_timer);
132
job->busy = true;
133
- job_unlock();
134
+ real_job_unlock();
135
aio_co_enter(job->aio_context, job->co);
136
}
137
138
@@ -XXX,XX +XXX,XX @@ void job_enter(Job *job)
139
* called explicitly. */
140
static void coroutine_fn job_do_yield(Job *job, uint64_t ns)
141
{
142
- job_lock();
143
+ real_job_lock();
144
if (ns != -1) {
145
timer_mod(&job->sleep_timer, ns);
146
}
147
job->busy = false;
148
job_event_idle(job);
149
- job_unlock();
150
+ real_job_unlock();
151
qemu_coroutine_yield();
152
153
/* Set by job_enter_cond() before re-entering the coroutine. */
154
--
155
2.37.3
diff view generated by jsdifflib
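The nop-first approach above is a general migration pattern: introduce the locking API as a no-op, convert call sites incrementally, then flip the implementation once every access is covered. A standalone sketch with plain pthreads (all names invented, not QEMU code):

    #include <pthread.h>

    static pthread_mutex_t state_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* Step 1: a nop, so call sites can be added gradually without
     * creating lock-ordering deadlocks against existing locks. */
    void state_lock(void)   { /* nop */ }
    void state_unlock(void) { /* nop */ }

    /* Step 2, in a later commit, once all accesses go through the
     * API: switch to the real mutex.
     *
     * void state_lock(void)   { pthread_mutex_lock(&state_mutex); }
     * void state_unlock(void) { pthread_mutex_unlock(&state_mutex); }
     */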
Deleted patch
1
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
2
1
3
Categorize the fields in struct Job to understand which ones
4
need to be protected by the job mutex and which don't.
5
6
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
7
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
8
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Message-Id: <20220926093214.506243-3-eesposit@redhat.com>
11
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
12
---
13
include/qemu/job.h | 61 +++++++++++++++++++++++++++-------------------
14
1 file changed, 36 insertions(+), 25 deletions(-)
15
16
diff --git a/include/qemu/job.h b/include/qemu/job.h
17
index XXXXXXX..XXXXXXX 100644
18
--- a/include/qemu/job.h
19
+++ b/include/qemu/job.h
20
@@ -XXX,XX +XXX,XX @@ typedef struct JobTxn JobTxn;
21
* Long-running operation.
22
*/
23
typedef struct Job {
24
+
25
+ /* Fields set at initialization (job_create), and never modified */
26
+
27
/** The ID of the job. May be NULL for internal jobs. */
28
char *id;
29
30
- /** The type of this job. */
31
+ /**
32
+ * The type of this job.
33
+ * All callbacks are called with job_mutex *not* held.
34
+ */
35
const JobDriver *driver;
36
37
- /** Reference count of the block job */
38
- int refcnt;
39
-
40
- /** Current state; See @JobStatus for details. */
41
- JobStatus status;
42
-
43
- /** AioContext to run the job coroutine in */
44
- AioContext *aio_context;
45
-
46
/**
47
* The coroutine that executes the job. If not NULL, it is reentered when
48
* busy is false and the job is cancelled.
49
+ * Initialized in job_start()
50
*/
51
Coroutine *co;
52
53
+ /** True if this job should automatically finalize itself */
54
+ bool auto_finalize;
55
+
56
+ /** True if this job should automatically dismiss itself */
57
+ bool auto_dismiss;
58
+
59
+ /** The completion function that will be called when the job completes. */
60
+ BlockCompletionFunc *cb;
61
+
62
+ /** The opaque value that is passed to the completion function. */
63
+ void *opaque;
64
+
65
+ /* ProgressMeter API is thread-safe */
66
+ ProgressMeter progress;
67
+
68
+
69
+ /** Protected by AioContext lock */
70
+
71
+ /** AioContext to run the job coroutine in */
72
+ AioContext *aio_context;
73
+
74
+ /** Reference count of the block job */
75
+ int refcnt;
76
+
77
+ /** Current state; See @JobStatus for details. */
78
+ JobStatus status;
79
+
80
/**
81
* Timer that is used by @job_sleep_ns. Accessed under job_mutex (in
82
* job.c).
83
@@ -XXX,XX +XXX,XX @@ typedef struct Job {
84
/** Set to true when the job has deferred work to the main loop. */
85
bool deferred_to_main_loop;
86
87
- /** True if this job should automatically finalize itself */
88
- bool auto_finalize;
89
-
90
- /** True if this job should automatically dismiss itself */
91
- bool auto_dismiss;
92
-
93
- ProgressMeter progress;
94
-
95
/**
96
* Return code from @run and/or @prepare callback(s).
97
* Not final until the job has reached the CONCLUDED status.
98
@@ -XXX,XX +XXX,XX @@ typedef struct Job {
99
*/
100
Error *err;
101
102
- /** The completion function that will be called when the job completes. */
103
- BlockCompletionFunc *cb;
104
-
105
- /** The opaque value that is passed to the completion function. */
106
- void *opaque;
107
-
108
/** Notifiers called when a cancelled job is finalised */
109
NotifierList on_finalize_cancelled;
110
111
@@ -XXX,XX +XXX,XX @@ typedef struct Job {
112
113
/**
114
* Callbacks and other information about a Job driver.
115
+ * All callbacks are invoked with job_mutex *not* held.
116
*/
117
struct JobDriver {
118
119
@@ -XXX,XX +XXX,XX @@ void coroutine_fn job_yield(Job *job);
120
*/
121
void coroutine_fn job_sleep_ns(Job *job, int64_t ns);
122
123
-
124
/** Returns the JobType of a given Job. */
125
JobType job_type(const Job *job);
126
127
--
128
2.37.3
diff view generated by jsdifflib
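The reordering above boils down to one documentation convention: group struct fields by the synchronization rule that protects them, so reviewers can audit each access. The same convention on an invented structure, not part of QEMU:

    typedef struct Example {
        /* Set at creation and never modified: no locking needed. */
        char *id;

        /* Protected by example_mutex. */
        int refcnt;
        int status;

        /* Only touched from the main loop (BQL): no extra lock. */
        void *main_loop_state;
    } Example;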
Deleted patch
1
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
2
1
3
job_event_* functions can all be static, as they are not used
4
outside job.c.
5
6
The same applies to job_txn_add_job().
7
8
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
11
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
12
Message-Id: <20220926093214.506243-4-eesposit@redhat.com>
13
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
14
---
15
include/qemu/job.h | 18 ------------------
16
job.c | 22 +++++++++++++++++++---
17
2 files changed, 19 insertions(+), 21 deletions(-)
18
19
diff --git a/include/qemu/job.h b/include/qemu/job.h
20
index XXXXXXX..XXXXXXX 100644
21
--- a/include/qemu/job.h
22
+++ b/include/qemu/job.h
23
@@ -XXX,XX +XXX,XX @@ JobTxn *job_txn_new(void);
24
*/
25
void job_txn_unref(JobTxn *txn);
26
27
-/**
28
- * @txn: The transaction (may be NULL)
29
- * @job: Job to add to the transaction
30
- *
31
- * Add @job to the transaction. The @job must not already be in a transaction.
32
- * The caller must call either job_txn_unref() or job_completed() to release
33
- * the reference that is automatically grabbed here.
34
- *
35
- * If @txn is NULL, the function does nothing.
36
- */
37
-void job_txn_add_job(JobTxn *txn, Job *job);
38
-
39
/**
40
* Create a new long-running job and return it.
41
*
42
@@ -XXX,XX +XXX,XX @@ void job_progress_set_remaining(Job *job, uint64_t remaining);
43
*/
44
void job_progress_increase_remaining(Job *job, uint64_t delta);
45
46
-/** To be called when a cancelled job is finalised. */
47
-void job_event_cancelled(Job *job);
48
-
49
-/** To be called when a successfully completed job is finalised. */
50
-void job_event_completed(Job *job);
51
-
52
/**
53
* Conditionally enter the job coroutine if the job is ready to run, not
54
* already busy and fn() returns true. fn() is called while under the job_lock
55
diff --git a/job.c b/job.c
56
index XXXXXXX..XXXXXXX 100644
57
--- a/job.c
58
+++ b/job.c
59
@@ -XXX,XX +XXX,XX @@ void job_txn_unref(JobTxn *txn)
60
}
61
}
62
63
-void job_txn_add_job(JobTxn *txn, Job *job)
64
+/**
65
+ * @txn: The transaction (may be NULL)
66
+ * @job: Job to add to the transaction
67
+ *
68
+ * Add @job to the transaction. The @job must not already be in a transaction.
69
+ * The caller must call either job_txn_unref() or job_completed() to release
70
+ * the reference that is automatically grabbed here.
71
+ *
72
+ * If @txn is NULL, the function does nothing.
73
+ */
74
+static void job_txn_add_job(JobTxn *txn, Job *job)
75
{
76
if (!txn) {
77
return;
78
@@ -XXX,XX +XXX,XX @@ void job_progress_increase_remaining(Job *job, uint64_t delta)
79
progress_increase_remaining(&job->progress, delta);
80
}
81
82
-void job_event_cancelled(Job *job)
83
+/**
84
+ * To be called when a cancelled job is finalised.
85
+ */
86
+static void job_event_cancelled(Job *job)
87
{
88
notifier_list_notify(&job->on_finalize_cancelled, job);
89
}
90
91
-void job_event_completed(Job *job)
92
+/**
93
+ * To be called when a successfully completed job is finalised.
94
+ */
95
+static void job_event_completed(Job *job)
96
{
97
notifier_list_notify(&job->on_finalize_completed, job);
98
}
99
--
100
2.37.3
diff view generated by jsdifflib
Deleted patch
1
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
2
1
3
Same as the AIO_WAIT_WHILE macro, but if we are in the main loop,
4
do not release and then re-acquire ctx_'s AioContext.
5
6
Once all AioContext locks go away, this macro will replace
7
AIO_WAIT_WHILE.
8
9
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
10
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
11
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
12
Message-Id: <20220926093214.506243-5-eesposit@redhat.com>
13
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
14
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
15
---
16
include/block/aio-wait.h | 17 +++++++++++++----
17
1 file changed, 13 insertions(+), 4 deletions(-)
18
19
diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h
20
index XXXXXXX..XXXXXXX 100644
21
--- a/include/block/aio-wait.h
22
+++ b/include/block/aio-wait.h
23
@@ -XXX,XX +XXX,XX @@ typedef struct {
24
extern AioWait global_aio_wait;
25
26
/**
27
- * AIO_WAIT_WHILE:
28
+ * AIO_WAIT_WHILE_INTERNAL:
29
* @ctx: the aio context, or NULL if multiple aio contexts (for which the
30
* caller does not hold a lock) are involved in the polling condition.
31
* @cond: wait while this conditional expression is true
32
+ * @unlock: whether to unlock and then lock again @ctx. This applies
33
+ * only when waiting for another AioContext from the main loop.
34
+ * Otherwise it's ignored.
35
*
36
* Wait while a condition is true. Use this to implement synchronous
37
* operations that require event loop activity.
38
@@ -XXX,XX +XXX,XX @@ extern AioWait global_aio_wait;
39
* wait on conditions between two IOThreads since that could lead to deadlock,
40
* go via the main loop instead.
41
*/
42
-#define AIO_WAIT_WHILE(ctx, cond) ({ \
43
+#define AIO_WAIT_WHILE_INTERNAL(ctx, cond, unlock) ({ \
44
bool waited_ = false; \
45
AioWait *wait_ = &global_aio_wait; \
46
AioContext *ctx_ = (ctx); \
47
@@ -XXX,XX +XXX,XX @@ extern AioWait global_aio_wait;
48
assert(qemu_get_current_aio_context() == \
49
qemu_get_aio_context()); \
50
while ((cond)) { \
51
- if (ctx_) { \
52
+ if (unlock && ctx_) { \
53
aio_context_release(ctx_); \
54
} \
55
aio_poll(qemu_get_aio_context(), true); \
56
- if (ctx_) { \
57
+ if (unlock && ctx_) { \
58
aio_context_acquire(ctx_); \
59
} \
60
waited_ = true; \
61
@@ -XXX,XX +XXX,XX @@ extern AioWait global_aio_wait;
62
qatomic_dec(&wait_->num_waiters); \
63
waited_; })
64
65
+#define AIO_WAIT_WHILE(ctx, cond) \
66
+ AIO_WAIT_WHILE_INTERNAL(ctx, cond, true)
67
+
68
+#define AIO_WAIT_WHILE_UNLOCKED(ctx, cond) \
69
+ AIO_WAIT_WHILE_INTERNAL(ctx, cond, false)
70
+
71
/**
72
* aio_wait_kick:
73
* Wake up the main thread if it is waiting on AIO_WAIT_WHILE(). During
74
--
75
2.37.3
diff view generated by jsdifflib
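The change above is the usual "internal macro plus thin wrappers" refactoring: the behavioral difference becomes a compile-time flag, and since the flag is a constant at each expansion the dead branch disappears entirely. A generic sketch with invented names:

    #include <stdbool.h>

    void release_lock(void);
    void acquire_lock(void);
    void poll_events(void);

    #define BUSY_WAIT_INTERNAL(cond, unlock) do {   \
        while ((cond)) {                            \
            if (unlock) { release_lock(); }         \
            poll_events();                          \
            if (unlock) { acquire_lock(); }         \
        }                                           \
    } while (0)

    #define BUSY_WAIT(cond)          BUSY_WAIT_INTERNAL(cond, true)
    #define BUSY_WAIT_UNLOCKED(cond) BUSY_WAIT_INTERNAL(cond, false)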
Deleted patch
1
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
2
1
3
This comment applies more to job.c; it was left in blockjob.c because in the past
4
the whole job logic was implemented there.
5
6
Note: at this stage, job_{lock/unlock} and job lock guard macros
7
are *nop*.
8
9
No functional change intended.
10
11
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
12
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
13
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
14
Message-Id: <20220926093214.506243-7-eesposit@redhat.com>
15
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
16
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
17
---
18
blockjob.c | 20 --------------------
19
job.c | 16 ++++++++++++++++
20
2 files changed, 16 insertions(+), 20 deletions(-)
21
22
diff --git a/blockjob.c b/blockjob.c
23
index XXXXXXX..XXXXXXX 100644
24
--- a/blockjob.c
25
+++ b/blockjob.c
26
@@ -XXX,XX +XXX,XX @@
27
#include "qemu/main-loop.h"
28
#include "qemu/timer.h"
29
30
-/*
31
- * The block job API is composed of two categories of functions.
32
- *
33
- * The first includes functions used by the monitor. The monitor is
34
- * peculiar in that it accesses the block job list with block_job_get, and
35
- * therefore needs consistency across block_job_get and the actual operation
36
- * (e.g. block_job_set_speed). The consistency is achieved with
37
- * aio_context_acquire/release. These functions are declared in blockjob.h.
38
- *
39
- * The second includes functions used by the block job drivers and sometimes
40
- * by the core block layer. These do not care about locking, because the
41
- * whole coroutine runs under the AioContext lock, and are declared in
42
- * blockjob_int.h.
43
- */
44
-
45
static bool is_block_job(Job *job)
46
{
47
return job_type(job) == JOB_TYPE_BACKUP ||
48
@@ -XXX,XX +XXX,XX @@ static void block_job_event_ready(Notifier *n, void *opaque)
49
}
50
51
52
-/*
53
- * API for block job drivers and the block layer. These functions are
54
- * declared in blockjob_int.h.
55
- */
56
-
57
void *block_job_create(const char *job_id, const BlockJobDriver *driver,
58
JobTxn *txn, BlockDriverState *bs, uint64_t perm,
59
uint64_t shared_perm, int64_t speed, int flags,
60
diff --git a/job.c b/job.c
61
index XXXXXXX..XXXXXXX 100644
62
--- a/job.c
63
+++ b/job.c
64
@@ -XXX,XX +XXX,XX @@
65
#include "trace/trace-root.h"
66
#include "qapi/qapi-events-job.h"
67
68
+/*
69
+ * The job API is composed of two categories of functions.
70
+ *
71
+ * The first includes functions used by the monitor. The monitor is
72
+ * peculiar in that it accesses the job list with job_get, and
73
+ * therefore needs consistency across job_get and the actual operation
74
+ * (e.g. job_user_cancel). To achieve this consistency, the caller
75
+ * calls job_lock/job_unlock itself around the whole operation.
76
+ *
77
+ *
78
+ * The second includes functions used by the job drivers and sometimes
79
+ * by the core block layer. These delegate the locking to the callee instead.
80
+ *
81
+ * TODO Actually make this true
82
+ */
83
+
84
/*
85
* job_mutex protects the jobs list, but also makes the
86
* struct job fields thread-safe.
87
--
88
2.37.3
diff view generated by jsdifflib
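The TODO above is resolved over the rest of the series through the _locked naming convention: a _locked function expects the caller to already hold job_mutex, while the unsuffixed wrapper takes the lock itself. A sketch with an invented function name:

    /* Caller must hold job_mutex. */
    void job_do_something_locked(Job *job);

    /* Takes and releases job_mutex around the _locked variant. */
    void job_do_something(Job *job)
    {
        JOB_LOCK_GUARD();
        job_do_something_locked(job);
    }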
1
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
1
From: Stefan Hajnoczi <stefanha@redhat.com>
2
2
3
Once the job lock is used and the AioContext lock is removed, mirror has
3
Allow virtio-scsi virtqueues to be assigned to different IOThreads. This
4
to perform job operations under the same critical section.
4
makes it possible to take advantage of host multi-queue block layer
5
Note: at this stage, job_{lock/unlock} and job lock guard macros
5
scalability by assigning virtqueues that have affinity with vCPUs to
6
are *nop*.
6
different IOThreads that have affinity with host CPUs. The same feature
7
was introduced for virtio-blk in the past:
8
https://developers.redhat.com/articles/2024/09/05/scaling-virtio-blk-disk-io-iothread-virtqueue-mapping
7
9
8
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
10
Here are fio randread 4k iodepth=64 results from a 4 vCPU guest with an
9
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
11
Intel P4800X SSD:
10
Message-Id: <20220926093214.506243-11-eesposit@redhat.com>
12
iothreads IOPS
11
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
13
------------------------------
14
1         189576
15
2         312698
16
4         346744
17
18
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
19
Message-ID: <20250311132616.1049687-12-stefanha@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
20
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
13
---
21
---
14
block/mirror.c | 13 +++++++++----
22
include/hw/virtio/virtio-scsi.h | 5 +-
15
1 file changed, 9 insertions(+), 4 deletions(-)
23
hw/scsi/virtio-scsi-dataplane.c | 90 ++++++++++++++++++++++++---------
24
hw/scsi/virtio-scsi.c | 63 ++++++++++++++---------
25
3 files changed, 107 insertions(+), 51 deletions(-)
16
26
17
diff --git a/block/mirror.c b/block/mirror.c
27
diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
18
index XXXXXXX..XXXXXXX 100644
28
index XXXXXXX..XXXXXXX 100644
19
--- a/block/mirror.c
29
--- a/include/hw/virtio/virtio-scsi.h
20
+++ b/block/mirror.c
30
+++ b/include/hw/virtio/virtio-scsi.h
21
@@ -XXX,XX +XXX,XX @@ static void mirror_complete(Job *job, Error **errp)
31
@@ -XXX,XX +XXX,XX @@
22
s->should_complete = true;
32
#include "hw/virtio/virtio.h"
23
33
#include "hw/scsi/scsi.h"
24
/* If the job is paused, it will be re-entered when it is resumed */
34
#include "chardev/char-fe.h"
25
- if (!job->paused) {
35
+#include "qapi/qapi-types-virtio.h"
26
- job_enter(job);
36
#include "system/iothread.h"
27
+ WITH_JOB_LOCK_GUARD() {
37
28
+ if (!job->paused) {
38
#define TYPE_VIRTIO_SCSI_COMMON "virtio-scsi-common"
29
+ job_enter_cond_locked(job, NULL);
39
@@ -XXX,XX +XXX,XX @@ struct VirtIOSCSIConf {
40
CharBackend chardev;
41
uint32_t boot_tpgt;
42
IOThread *iothread;
43
+ IOThreadVirtQueueMappingList *iothread_vq_mapping_list;
44
};
45
46
struct VirtIOSCSI;
47
@@ -XXX,XX +XXX,XX @@ struct VirtIOSCSI {
48
QTAILQ_HEAD(, VirtIOSCSIReq) tmf_bh_list;
49
50
/* Fields for dataplane below */
51
- AioContext *ctx; /* one iothread per virtio-scsi-pci for now */
52
+ AioContext **vq_aio_context; /* per-virtqueue AioContext pointer */
53
54
bool dataplane_started;
55
bool dataplane_starting;
56
@@ -XXX,XX +XXX,XX @@ void virtio_scsi_common_realize(DeviceState *dev,
57
void virtio_scsi_common_unrealize(DeviceState *dev);
58
59
void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp);
60
+void virtio_scsi_dataplane_cleanup(VirtIOSCSI *s);
61
int virtio_scsi_dataplane_start(VirtIODevice *s);
62
void virtio_scsi_dataplane_stop(VirtIODevice *s);
63
64
diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
65
index XXXXXXX..XXXXXXX 100644
66
--- a/hw/scsi/virtio-scsi-dataplane.c
67
+++ b/hw/scsi/virtio-scsi-dataplane.c
68
@@ -XXX,XX +XXX,XX @@
69
#include "system/block-backend.h"
70
#include "hw/scsi/scsi.h"
71
#include "scsi/constants.h"
72
+#include "hw/virtio/iothread-vq-mapping.h"
73
#include "hw/virtio/virtio-bus.h"
74
75
/* Context: BQL held */
76
@@ -XXX,XX +XXX,XX @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp)
77
VirtIODevice *vdev = VIRTIO_DEVICE(s);
78
BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
79
VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
80
+ uint16_t num_vqs = vs->conf.num_queues + VIRTIO_SCSI_VQ_NUM_FIXED;
81
82
- if (vs->conf.iothread) {
83
+ if (vs->conf.iothread && vs->conf.iothread_vq_mapping_list) {
84
+ error_setg(errp,
85
+ "iothread and iothread-vq-mapping properties cannot be set "
86
+ "at the same time");
87
+ return;
88
+ }
89
+
90
+ if (vs->conf.iothread || vs->conf.iothread_vq_mapping_list) {
91
if (!k->set_guest_notifiers || !k->ioeventfd_assign) {
92
error_setg(errp,
93
"device is incompatible with iothread "
94
@@ -XXX,XX +XXX,XX @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp)
95
error_setg(errp, "ioeventfd is required for iothread");
96
return;
97
}
98
- s->ctx = iothread_get_aio_context(vs->conf.iothread);
99
- } else {
100
- if (!virtio_device_ioeventfd_enabled(vdev)) {
101
+ }
102
+
103
+ s->vq_aio_context = g_new(AioContext *, num_vqs);
104
+
105
+ if (vs->conf.iothread_vq_mapping_list) {
106
+ if (!iothread_vq_mapping_apply(vs->conf.iothread_vq_mapping_list,
107
+ s->vq_aio_context, num_vqs, errp)) {
108
+ g_free(s->vq_aio_context);
109
+ s->vq_aio_context = NULL;
110
return;
111
}
112
- s->ctx = qemu_get_aio_context();
113
+ } else if (vs->conf.iothread) {
114
+ AioContext *ctx = iothread_get_aio_context(vs->conf.iothread);
115
+ for (uint16_t i = 0; i < num_vqs; i++) {
116
+ s->vq_aio_context[i] = ctx;
30
+ }
117
+ }
31
}
118
+
32
}
119
+ /* Released in virtio_scsi_dataplane_cleanup() */
33
120
+ object_ref(OBJECT(vs->conf.iothread));
34
@@ -XXX,XX +XXX,XX @@ static bool mirror_drained_poll(BlockJob *job)
121
+ } else {
35
* from one of our own drain sections, to avoid a deadlock waiting for
122
+ AioContext *ctx = qemu_get_aio_context();
36
* ourselves.
123
+ for (unsigned i = 0; i < num_vqs; i++) {
124
+ s->vq_aio_context[i] = ctx;
125
+ }
126
+ }
127
+}
128
+
129
+/* Context: BQL held */
130
+void virtio_scsi_dataplane_cleanup(VirtIOSCSI *s)
131
+{
132
+ VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
133
+
134
+ if (vs->conf.iothread_vq_mapping_list) {
135
+ iothread_vq_mapping_cleanup(vs->conf.iothread_vq_mapping_list);
136
}
137
+
138
+ if (vs->conf.iothread) {
139
+ object_unref(OBJECT(vs->conf.iothread));
140
+ }
141
+
142
+ g_free(s->vq_aio_context);
143
+ s->vq_aio_context = NULL;
144
}
145
146
static int virtio_scsi_set_host_notifier(VirtIOSCSI *s, VirtQueue *vq, int n)
147
@@ -XXX,XX +XXX,XX @@ static int virtio_scsi_set_host_notifier(VirtIOSCSI *s, VirtQueue *vq, int n)
148
}
149
150
/* Context: BH in IOThread */
151
-static void virtio_scsi_dataplane_stop_bh(void *opaque)
152
+static void virtio_scsi_dataplane_stop_vq_bh(void *opaque)
153
{
154
- VirtIOSCSI *s = opaque;
155
- VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
156
+ AioContext *ctx = qemu_get_current_aio_context();
157
+ VirtQueue *vq = opaque;
158
EventNotifier *host_notifier;
159
- int i;
160
161
- virtio_queue_aio_detach_host_notifier(vs->ctrl_vq, s->ctx);
162
- host_notifier = virtio_queue_get_host_notifier(vs->ctrl_vq);
163
+ virtio_queue_aio_detach_host_notifier(vq, ctx);
164
+ host_notifier = virtio_queue_get_host_notifier(vq);
165
166
/*
167
* Test and clear notifier after disabling event, in case poll callback
168
* didn't have time to run.
37
*/
169
*/
38
- if (!s->common.job.paused && !job_is_cancelled(&job->job) && !s->in_drain) {
170
virtio_queue_host_notifier_read(host_notifier);
39
- return true;
171
-
40
+ WITH_JOB_LOCK_GUARD() {
172
- virtio_queue_aio_detach_host_notifier(vs->event_vq, s->ctx);
41
+ if (!s->common.job.paused && !job_is_cancelled_locked(&job->job)
173
- host_notifier = virtio_queue_get_host_notifier(vs->event_vq);
42
+ && !s->in_drain) {
174
- virtio_queue_host_notifier_read(host_notifier);
43
+ return true;
175
-
176
- for (i = 0; i < vs->conf.num_queues; i++) {
177
- virtio_queue_aio_detach_host_notifier(vs->cmd_vqs[i], s->ctx);
178
- host_notifier = virtio_queue_get_host_notifier(vs->cmd_vqs[i]);
179
- virtio_queue_host_notifier_read(host_notifier);
180
- }
181
}
182
183
/* Context: BQL held */
184
@@ -XXX,XX +XXX,XX @@ int virtio_scsi_dataplane_start(VirtIODevice *vdev)
185
smp_wmb(); /* paired with aio_notify_accept() */
186
187
if (s->bus.drain_count == 0) {
188
- virtio_queue_aio_attach_host_notifier(vs->ctrl_vq, s->ctx);
189
- virtio_queue_aio_attach_host_notifier_no_poll(vs->event_vq, s->ctx);
190
+ virtio_queue_aio_attach_host_notifier(vs->ctrl_vq,
191
+ s->vq_aio_context[0]);
192
+ virtio_queue_aio_attach_host_notifier_no_poll(vs->event_vq,
193
+ s->vq_aio_context[1]);
194
195
for (i = 0; i < vs->conf.num_queues; i++) {
196
- virtio_queue_aio_attach_host_notifier(vs->cmd_vqs[i], s->ctx);
197
+ AioContext *ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i];
198
+ virtio_queue_aio_attach_host_notifier(vs->cmd_vqs[i], ctx);
199
}
200
}
201
return 0;
202
@@ -XXX,XX +XXX,XX @@ void virtio_scsi_dataplane_stop(VirtIODevice *vdev)
203
s->dataplane_stopping = true;
204
205
if (s->bus.drain_count == 0) {
206
- aio_wait_bh_oneshot(s->ctx, virtio_scsi_dataplane_stop_bh, s);
207
+ for (i = 0; i < vs->conf.num_queues + VIRTIO_SCSI_VQ_NUM_FIXED; i++) {
208
+ VirtQueue *vq = virtio_get_queue(&vs->parent_obj, i);
209
+ AioContext *ctx = s->vq_aio_context[i];
210
+ aio_wait_bh_oneshot(ctx, virtio_scsi_dataplane_stop_vq_bh, vq);
44
+ }
211
+ }
45
}
212
}
46
213
47
return !!s->in_flight;
214
blk_drain_all(); /* ensure there are no in-flight requests */
215
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
216
index XXXXXXX..XXXXXXX 100644
217
--- a/hw/scsi/virtio-scsi.c
218
+++ b/hw/scsi/virtio-scsi.c
219
@@ -XXX,XX +XXX,XX @@
220
#include "hw/qdev-properties.h"
221
#include "hw/scsi/scsi.h"
222
#include "scsi/constants.h"
223
+#include "hw/virtio/iothread-vq-mapping.h"
224
#include "hw/virtio/virtio-bus.h"
225
#include "hw/virtio/virtio-access.h"
226
#include "trace.h"
227
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_cancel_notify(Notifier *notifier, void *data)
228
g_free(n);
229
}
230
231
-static inline void virtio_scsi_ctx_check(VirtIOSCSI *s, SCSIDevice *d)
232
-{
233
- if (s->dataplane_started && d && blk_is_available(d->conf.blk)) {
234
- assert(blk_get_aio_context(d->conf.blk) == s->ctx);
235
- }
236
-}
237
-
238
static void virtio_scsi_do_one_tmf_bh(VirtIOSCSIReq *req)
239
{
240
VirtIOSCSI *s = req->dev;
241
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_flush_defer_tmf_to_aio_context(VirtIOSCSI *s)
242
243
assert(!s->dataplane_started);
244
245
- if (s->ctx) {
246
+ for (uint32_t i = 0; i < s->parent_obj.conf.num_queues; i++) {
247
+ AioContext *ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i];
248
+
249
/* Our BH only runs after previously scheduled BHs */
250
- aio_wait_bh_oneshot(s->ctx, dummy_bh, NULL);
251
+ aio_wait_bh_oneshot(ctx, dummy_bh, NULL);
252
}
253
}
254
255
@@ -XXX,XX +XXX,XX @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
256
AioContext *ctx;
257
int ret = 0;
258
259
- virtio_scsi_ctx_check(s, d);
260
/* Here VIRTIO_SCSI_S_OK means "FUNCTION COMPLETE". */
261
req->resp.tmf.response = VIRTIO_SCSI_S_OK;
262
263
@@ -XXX,XX +XXX,XX @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
264
265
case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET:
266
case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: {
267
+ g_autoptr(GHashTable) aio_contexts = g_hash_table_new(NULL, NULL);
268
+
269
if (!d) {
270
goto fail;
271
}
272
@@ -XXX,XX +XXX,XX @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
273
274
qatomic_inc(&req->remaining);
275
276
- ctx = s->ctx ?: qemu_get_aio_context();
277
- virtio_scsi_defer_tmf_to_aio_context(req, ctx);
278
+ for (uint32_t i = 0; i < s->parent_obj.conf.num_queues; i++) {
279
+ ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i];
280
+
281
+ if (!g_hash_table_add(aio_contexts, ctx)) {
282
+ continue; /* skip previously added AioContext */
283
+ }
284
+
285
+ virtio_scsi_defer_tmf_to_aio_context(req, ctx);
286
+ }
287
288
virtio_scsi_tmf_dec_remaining(req);
289
ret = -EINPROGRESS;
290
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq)
291
*/
292
static bool virtio_scsi_defer_to_dataplane(VirtIOSCSI *s)
293
{
294
- if (!s->ctx || s->dataplane_started) {
295
+ if (s->dataplane_started) {
296
return false;
297
}
298
+ if (s->vq_aio_context[0] == qemu_get_aio_context()) {
299
+ return false; /* not using IOThreads */
300
+ }
301
302
virtio_device_start_ioeventfd(&s->parent_obj.parent_obj);
303
return !s->dataplane_fenced;
304
@@ -XXX,XX +XXX,XX @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req)
305
virtio_scsi_complete_cmd_req(req);
306
return -ENOENT;
307
}
308
- virtio_scsi_ctx_check(s, d);
309
req->sreq = scsi_req_new(d, req->req.cmd.tag,
310
virtio_scsi_get_lun(req->req.cmd.lun),
311
req->req.cmd.cdb, vs->cdb_size, req);
312
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev,
313
{
314
VirtIODevice *vdev = VIRTIO_DEVICE(hotplug_dev);
315
VirtIOSCSI *s = VIRTIO_SCSI(vdev);
316
+ AioContext *ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED];
317
SCSIDevice *sd = SCSI_DEVICE(dev);
318
- int ret;
319
320
- if (s->ctx && !s->dataplane_fenced) {
321
- ret = blk_set_aio_context(sd->conf.blk, s->ctx, errp);
322
- if (ret < 0) {
323
- return;
324
- }
325
+ if (ctx != qemu_get_aio_context() && !s->dataplane_fenced) {
326
+ /*
327
+ * Try to make the BlockBackend's AioContext match ours. Ignore failure
328
+ * because I/O will still work although block jobs and other users
329
+ * might be slower when multiple AioContexts use a BlockBackend.
330
+ */
331
+ blk_set_aio_context(sd->conf.blk, ctx, errp);
332
}
333
334
if (virtio_vdev_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) {
335
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_hotunplug(HotplugHandler *hotplug_dev, DeviceState *dev,
336
337
qdev_simple_device_unplug_cb(hotplug_dev, dev, errp);
338
339
- if (s->ctx) {
340
+ if (s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED] != qemu_get_aio_context()) {
341
/* If other users keep the BlockBackend in the iothread, that's ok */
342
blk_set_aio_context(sd->conf.blk, qemu_get_aio_context(), NULL);
343
}
344
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_drained_begin(SCSIBus *bus)
345
346
for (uint32_t i = 0; i < total_queues; i++) {
347
VirtQueue *vq = virtio_get_queue(vdev, i);
348
- virtio_queue_aio_detach_host_notifier(vq, s->ctx);
349
+ virtio_queue_aio_detach_host_notifier(vq, s->vq_aio_context[i]);
350
}
351
}
352
353
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_drained_end(SCSIBus *bus)
354
355
for (uint32_t i = 0; i < total_queues; i++) {
356
VirtQueue *vq = virtio_get_queue(vdev, i);
357
+ AioContext *ctx = s->vq_aio_context[i];
358
+
359
if (vq == vs->event_vq) {
360
- virtio_queue_aio_attach_host_notifier_no_poll(vq, s->ctx);
361
+ virtio_queue_aio_attach_host_notifier_no_poll(vq, ctx);
362
} else {
363
- virtio_queue_aio_attach_host_notifier(vq, s->ctx);
364
+ virtio_queue_aio_attach_host_notifier(vq, ctx);
365
}
366
}
367
}
368
@@ -XXX,XX +XXX,XX @@ void virtio_scsi_common_unrealize(DeviceState *dev)
369
virtio_cleanup(vdev);
370
}
371
372
+/* main loop */
373
static void virtio_scsi_device_unrealize(DeviceState *dev)
374
{
375
VirtIOSCSI *s = VIRTIO_SCSI(dev);
376
377
virtio_scsi_reset_tmf_bh(s);
378
-
379
+ virtio_scsi_dataplane_cleanup(s);
380
qbus_set_hotplug_handler(BUS(&s->bus), NULL);
381
virtio_scsi_common_unrealize(dev);
382
qemu_mutex_destroy(&s->tmf_bh_lock);
383
@@ -XXX,XX +XXX,XX @@ static const Property virtio_scsi_properties[] = {
384
VIRTIO_SCSI_F_CHANGE, true),
385
DEFINE_PROP_LINK("iothread", VirtIOSCSI, parent_obj.conf.iothread,
386
TYPE_IOTHREAD, IOThread *),
387
+ DEFINE_PROP_IOTHREAD_VQ_MAPPING_LIST("iothread-vq-mapping", VirtIOSCSI,
388
+ parent_obj.conf.iothread_vq_mapping_list),
389
};
390
391
static const VMStateDescription vmstate_virtio_scsi = {
48
--
392
--
49
2.37.3
393
2.48.1
diff view generated by jsdifflib
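Like virtio-blk, the iothread-vq-mapping property takes a JSON list of IOThread mappings, e.g. -device '{"driver":"virtio-scsi-pci","iothread-vq-mapping":[{"iothread":"iot0"},{"iothread":"iot1"}]}', assuming the same syntax described in the virtio-blk article linked above. When the per-entry "vqs" lists are omitted, virtqueues are spread round-robin across the listed IOThreads, roughly like this invented helper:

    /* Illustration only: default round-robin virtqueue assignment. */
    static void assign_round_robin(AioContext **vq_ctx, unsigned num_vqs,
                                   AioContext **iothread_ctx,
                                   unsigned num_iothreads)
    {
        for (unsigned i = 0; i < num_vqs; i++) {
            vq_ctx[i] = iothread_ctx[i % num_iothreads];
        }
    }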
1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Stefan Hajnoczi <stefanha@redhat.com>
2
2
3
We want to make sure that access to job->aio_context is always done
3
Previously the ctrl virtqueue was handled in the AioContext where SCSI
4
under either the BQL or job_mutex. The problem is that using
4
requests were processed. When IOThread Virtqueue Mapping was added, things
5
aio_co_enter(job->aio_context, job->co) in job_start and job_enter_cond
5
became more complicated because SCSI requests could run in other
6
makes the coroutine immediately resume, so we can't hold the job lock.
6
AioContexts.
7
And caching it is not safe either, as it might change.
7
8
8
Simplify by handling the ctrl virtqueue in the main loop where reset
9
job_start is under the BQL, so it can freely read job->aio_context, but
9
operations can be performed. Note that BHs are still used for canceling SCSI
10
job_enter_cond is not.
10
requests in their AioContexts, but at least the main loop activity
11
We want to avoid reading job->aio_context in job_enter_cond, therefore:
11
doesn't need BHs anymore.
12
1) use aio_co_wake(), since it doesn't take an AioContext as an argument
12
13
but uses job->co->ctx
13
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
14
2) detect possible discrepancy between job->co->ctx and job->aio_context
14
Message-ID: <20250311132616.1049687-13-stefanha@redhat.com>
15
by checking right after the coroutine resumes back from yielding if
16
job->aio_context has changed. If so, reschedule the coroutine to the
17
new context.
18
19
Calling bdrv_try_set_aio_context() will issue the following calls
20
(simplified):
21
* in terms of bdrv callbacks:
22
.drained_begin -> .set_aio_context -> .drained_end
23
* in terms of child_job functions:
24
child_job_drained_begin -> child_job_set_aio_context -> child_job_drained_end
25
* in terms of job functions:
26
job_pause_locked -> job_set_aio_context -> job_resume_locked
27
28
We can see that after setting the new aio_context, job_resume_locked
29
calls again job_enter_cond, which then invokes aio_co_wake(). But
30
while job->aiocontext has been set in job_set_aio_context,
31
job->co->ctx has not changed, so the coroutine would be entering in
32
the wrong aiocontext.
33
34
Using aio_co_schedule in job_resume_locked() might seem as a valid
35
alternative, but the problem is that the bh resuming the coroutine
36
is not scheduled immediately, and if in the meanwhile another
37
bdrv_try_set_aio_context() is run (see test_propagate_mirror() in
38
test-block-iothread.c), we would have the first schedule in the
39
wrong aiocontext, and the second set of drains won't even manage
40
to schedule the coroutine, as job->busy would still be true from
41
the previous job_resume_locked().
42
43
The solution is to stick with aio_co_wake() and detect every time
44
the coroutine resumes back from yielding if job->aio_context
45
has changed. If so, we can reschedule it to the new context.
46
47
Check for the aiocontext change in job_do_yield_locked because:
48
1) aio_co_reschedule_self requires to be in the running coroutine
49
2) since child_job_set_aio_context allows changing the aiocontext only
50
while the job is paused, this is the exact place where the coroutine
51
resumes, before running JobDriver's code.
52
53
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
54
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
55
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
56
Message-Id: <20220926093214.506243-13-eesposit@redhat.com>
57
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
58
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
15
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
59
---
16
---
60
job.c | 19 +++++++++++++++++--
17
include/hw/virtio/virtio-scsi.h | 8 --
61
1 file changed, 17 insertions(+), 2 deletions(-)
18
hw/scsi/virtio-scsi-dataplane.c | 6 ++
62
19
hw/scsi/virtio-scsi.c | 144 ++++++--------------------------
63
diff --git a/job.c b/job.c
20
3 files changed, 33 insertions(+), 125 deletions(-)
21
22
diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
64
index XXXXXXX..XXXXXXX 100644
23
index XXXXXXX..XXXXXXX 100644
65
--- a/job.c
24
--- a/include/hw/virtio/virtio-scsi.h
66
+++ b/job.c
25
+++ b/include/hw/virtio/virtio-scsi.h
67
@@ -XXX,XX +XXX,XX @@ void job_enter_cond_locked(Job *job, bool(*fn)(Job *job))
26
@@ -XXX,XX +XXX,XX @@ struct VirtIOSCSI {
68
job->busy = true;
27
69
real_job_unlock();
28
QemuMutex ctrl_lock; /* protects ctrl_vq */
70
job_unlock();
29
71
- aio_co_enter(job->aio_context, job->co);
30
- /*
72
+ aio_co_wake(job->co);
31
- * TMFs deferred to main loop BH. These fields are protected by
73
job_lock();
32
- * tmf_bh_lock.
33
- */
34
- QemuMutex tmf_bh_lock;
35
- QEMUBH *tmf_bh;
36
- QTAILQ_HEAD(, VirtIOSCSIReq) tmf_bh_list;
37
-
38
/* Fields for dataplane below */
39
AioContext **vq_aio_context; /* per-virtqueue AioContext pointer */
40
41
diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
42
index XXXXXXX..XXXXXXX 100644
43
--- a/hw/scsi/virtio-scsi-dataplane.c
44
+++ b/hw/scsi/virtio-scsi-dataplane.c
45
@@ -XXX,XX +XXX,XX @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp)
46
s->vq_aio_context[i] = ctx;
47
}
48
}
49
+
50
+ /*
51
+ * Always handle the ctrl virtqueue in the main loop thread where device
52
+ * resets can be performed.
53
+ */
54
+ s->vq_aio_context[0] = qemu_get_aio_context();
74
}
55
}
75
56
76
@@ -XXX,XX +XXX,XX @@ void job_enter(Job *job)
57
/* Context: BQL held */
77
*/
58
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
78
static void coroutine_fn job_do_yield_locked(Job *job, uint64_t ns)
59
index XXXXXXX..XXXXXXX 100644
60
--- a/hw/scsi/virtio-scsi.c
61
+++ b/hw/scsi/virtio-scsi.c
62
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_cancel_notify(Notifier *notifier, void *data)
63
g_free(n);
64
}
65
66
-static void virtio_scsi_do_one_tmf_bh(VirtIOSCSIReq *req)
67
-{
68
- VirtIOSCSI *s = req->dev;
69
- SCSIDevice *d = virtio_scsi_device_get(s, req->req.tmf.lun);
70
- BusChild *kid;
71
- int target;
72
-
73
- switch (req->req.tmf.subtype) {
74
- case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET:
75
- if (!d) {
76
- req->resp.tmf.response = VIRTIO_SCSI_S_BAD_TARGET;
77
- goto out;
78
- }
79
- if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) {
80
- req->resp.tmf.response = VIRTIO_SCSI_S_INCORRECT_LUN;
81
- goto out;
82
- }
83
- qatomic_inc(&s->resetting);
84
- device_cold_reset(&d->qdev);
85
- qatomic_dec(&s->resetting);
86
- break;
87
-
88
- case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET:
89
- target = req->req.tmf.lun[1];
90
- qatomic_inc(&s->resetting);
91
-
92
- rcu_read_lock();
93
- QTAILQ_FOREACH_RCU(kid, &s->bus.qbus.children, sibling) {
94
- SCSIDevice *d1 = SCSI_DEVICE(kid->child);
95
- if (d1->channel == 0 && d1->id == target) {
96
- device_cold_reset(&d1->qdev);
97
- }
98
- }
99
- rcu_read_unlock();
100
-
101
- qatomic_dec(&s->resetting);
102
- break;
103
-
104
- default:
105
- g_assert_not_reached();
106
- }
107
-
108
-out:
109
- object_unref(OBJECT(d));
110
- virtio_scsi_complete_req(req, &s->ctrl_lock);
111
-}
112
-
113
-/* Some TMFs must be processed from the main loop thread */
114
-static void virtio_scsi_do_tmf_bh(void *opaque)
115
-{
116
- VirtIOSCSI *s = opaque;
117
- QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs);
118
- VirtIOSCSIReq *req;
119
- VirtIOSCSIReq *tmp;
120
-
121
- GLOBAL_STATE_CODE();
122
-
123
- WITH_QEMU_LOCK_GUARD(&s->tmf_bh_lock) {
124
- QTAILQ_FOREACH_SAFE(req, &s->tmf_bh_list, next, tmp) {
125
- QTAILQ_REMOVE(&s->tmf_bh_list, req, next);
126
- QTAILQ_INSERT_TAIL(&reqs, req, next);
127
- }
128
-
129
- qemu_bh_delete(s->tmf_bh);
130
- s->tmf_bh = NULL;
131
- }
132
-
133
- QTAILQ_FOREACH_SAFE(req, &reqs, next, tmp) {
134
- QTAILQ_REMOVE(&reqs, req, next);
135
- virtio_scsi_do_one_tmf_bh(req);
136
- }
137
-}
138
-
139
-static void virtio_scsi_reset_tmf_bh(VirtIOSCSI *s)
140
-{
141
- VirtIOSCSIReq *req;
142
- VirtIOSCSIReq *tmp;
143
-
144
- GLOBAL_STATE_CODE();
145
-
146
- /* Called after ioeventfd has been stopped, so tmf_bh_lock is not needed */
147
- if (s->tmf_bh) {
148
- qemu_bh_delete(s->tmf_bh);
149
- s->tmf_bh = NULL;
150
- }
151
-
152
- QTAILQ_FOREACH_SAFE(req, &s->tmf_bh_list, next, tmp) {
153
- QTAILQ_REMOVE(&s->tmf_bh_list, req, next);
154
-
155
- /* SAM-6 6.3.2 Hard reset */
156
- req->resp.tmf.response = VIRTIO_SCSI_S_TARGET_FAILURE;
157
- virtio_scsi_complete_req(req, &req->dev->ctrl_lock);
158
- }
159
-}
160
-
161
-static void virtio_scsi_defer_tmf_to_main_loop(VirtIOSCSIReq *req)
162
-{
163
- VirtIOSCSI *s = req->dev;
164
-
165
- WITH_QEMU_LOCK_GUARD(&s->tmf_bh_lock) {
166
- QTAILQ_INSERT_TAIL(&s->tmf_bh_list, req, next);
167
-
168
- if (!s->tmf_bh) {
169
- s->tmf_bh = qemu_bh_new(virtio_scsi_do_tmf_bh, s);
170
- qemu_bh_schedule(s->tmf_bh);
171
- }
172
- }
173
-}
174
-
175
static void virtio_scsi_tmf_cancel_req(VirtIOSCSIReq *tmf, SCSIRequest *r)
79
{
176
{
80
+ AioContext *next_aio_context;
177
VirtIOSCSICancelNotifier *notifier;
81
+
178
@@ -XXX,XX +XXX,XX @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
82
real_job_lock();
179
break;
83
if (ns != -1) {
180
84
timer_mod(&job->sleep_timer, ns);
181
case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET:
85
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn job_do_yield_locked(Job *job, uint64_t ns)
182
- case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET:
86
qemu_coroutine_yield();
183
- virtio_scsi_defer_tmf_to_main_loop(req);
87
job_lock();
184
- ret = -EINPROGRESS;
88
185
+ if (!d) {
89
- /* Set by job_enter_cond() before re-entering the coroutine. */
186
+ goto fail;
90
+ next_aio_context = job->aio_context;
187
+ }
91
+ /*
188
+ if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) {
92
+ * Coroutine has resumed, but in the meanwhile the job AioContext
189
+ goto incorrect_lun;
93
+ * might have changed via bdrv_try_set_aio_context(), so we need to move
190
+ }
94
+ * the coroutine too in the new aiocontext.
191
+ qatomic_inc(&s->resetting);
95
+ */
192
+ device_cold_reset(&d->qdev);
96
+ while (qemu_get_current_aio_context() != next_aio_context) {
193
+ qatomic_dec(&s->resetting);
97
+ job_unlock();
194
break;
98
+ aio_co_reschedule_self(next_aio_context);
195
99
+ job_lock();
196
+ case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: {
100
+ next_aio_context = job->aio_context;
197
+ BusChild *kid;
198
+ int target = req->req.tmf.lun[1];
199
+ qatomic_inc(&s->resetting);
200
+
201
+ rcu_read_lock();
202
+ QTAILQ_FOREACH_RCU(kid, &s->bus.qbus.children, sibling) {
203
+ SCSIDevice *d1 = SCSI_DEVICE(kid->child);
204
+ if (d1->channel == 0 && d1->id == target) {
205
+ device_cold_reset(&d1->qdev);
206
+ }
207
+ }
208
+ rcu_read_unlock();
209
+
210
+ qatomic_dec(&s->resetting);
211
+ break;
101
+ }
212
+ }
102
+
213
+
103
+ /* Set by job_enter_cond_locked() before re-entering the coroutine. */
214
case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET:
104
assert(job->busy);
215
case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: {
216
g_autoptr(GHashTable) aio_contexts = g_hash_table_new(NULL, NULL);
217
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_reset(VirtIODevice *vdev)
218
219
assert(!s->dataplane_started);
220
221
- virtio_scsi_reset_tmf_bh(s);
222
virtio_scsi_flush_defer_tmf_to_aio_context(s);
223
224
qatomic_inc(&s->resetting);
225
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_device_realize(DeviceState *dev, Error **errp)
226
VirtIOSCSI *s = VIRTIO_SCSI(dev);
227
Error *err = NULL;
228
229
- QTAILQ_INIT(&s->tmf_bh_list);
230
qemu_mutex_init(&s->ctrl_lock);
231
qemu_mutex_init(&s->event_lock);
232
- qemu_mutex_init(&s->tmf_bh_lock);
233
234
virtio_scsi_common_realize(dev,
235
virtio_scsi_handle_ctrl,
236
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_device_unrealize(DeviceState *dev)
237
{
238
VirtIOSCSI *s = VIRTIO_SCSI(dev);
239
240
- virtio_scsi_reset_tmf_bh(s);
241
virtio_scsi_dataplane_cleanup(s);
242
qbus_set_hotplug_handler(BUS(&s->bus), NULL);
243
virtio_scsi_common_unrealize(dev);
244
- qemu_mutex_destroy(&s->tmf_bh_lock);
245
qemu_mutex_destroy(&s->event_lock);
246
qemu_mutex_destroy(&s->ctrl_lock);
105
}
247
}
106
107
--
248
--
108
2.37.3
249
2.48.1
diff view generated by jsdifflib
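The re-check-after-yield pattern from the commit message, distilled into a sketch; yield_and_follow_context() is an invented wrapper, and the real job_do_yield_locked() hunk above samples job->aio_context under job_mutex:

    static void coroutine_fn yield_and_follow_context(Job *job)
    {
        AioContext *next;

        qemu_coroutine_yield();

        /* The job may have been moved to another AioContext while we
         * were yielded; follow it before running any more job code. */
        next = job->aio_context;
        while (qemu_get_current_aio_context() != next) {
            aio_co_reschedule_self(next);
            next = job->aio_context;
        }
    }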
Deleted patch
1
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
2
1
3
The same job lock is being used also to protect some of blockjob fields.
4
Categorize them just as done in job.h.
5
6
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
7
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
8
Message-Id: <20220926093214.506243-15-eesposit@redhat.com>
9
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
10
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
11
---
12
include/block/blockjob.h | 32 ++++++++++++++++++++++++++------
13
1 file changed, 26 insertions(+), 6 deletions(-)
14
15
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
16
index XXXXXXX..XXXXXXX 100644
17
--- a/include/block/blockjob.h
18
+++ b/include/block/blockjob.h
19
@@ -XXX,XX +XXX,XX @@ typedef struct BlockJobDriver BlockJobDriver;
20
* Long-running operation on a BlockDriverState.
21
*/
22
typedef struct BlockJob {
23
- /** Data belonging to the generic Job infrastructure */
24
+ /**
25
+ * Data belonging to the generic Job infrastructure.
26
+ * Protected by job mutex.
27
+ */
28
Job job;
29
30
- /** Status that is published by the query-block-jobs QMP API */
31
+ /**
32
+ * Status that is published by the query-block-jobs QMP API.
33
+ * Protected by job mutex.
34
+ */
35
BlockDeviceIoStatus iostatus;
36
37
- /** Speed that was set with @block_job_set_speed. */
38
+ /**
39
+ * Speed that was set with @block_job_set_speed.
40
+ * Always modified and read under QEMU global mutex (GLOBAL_STATE_CODE).
41
+ */
42
int64_t speed;
43
44
- /** Rate limiting data structure for implementing @speed. */
45
+ /**
46
+ * Rate limiting data structure for implementing @speed.
47
+ * RateLimit API is thread-safe.
48
+ */
49
RateLimit limit;
50
51
- /** Block other operations when block job is running */
52
+ /**
53
+ * Block other operations when block job is running.
54
+ * Always modified and read under QEMU global mutex (GLOBAL_STATE_CODE).
55
+ */
56
Error *blocker;
57
58
+ /** All notifiers are set once in block_job_create() and never modified. */
59
+
60
/** Called when a cancelled job is finalised. */
61
Notifier finalize_cancelled_notifier;
62
63
@@ -XXX,XX +XXX,XX @@ typedef struct BlockJob {
64
/** Called when the job coroutine yields or terminates */
65
Notifier idle_notifier;
66
67
- /** BlockDriverStates that are involved in this block job */
68
+ /**
69
+ * BlockDriverStates that are involved in this block job.
70
+ * Always modified and read under QEMU global mutex (GLOBAL_STATE_CODE).
71
+ */
72
GSList *nodes;
73
} BlockJob;
74
75
--
76
2.37.3
diff view generated by jsdifflib
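In code, the per-field rules above turn into assertions and lock scopes. A sketch with an invented setter; GLOBAL_STATE_CODE() asserts that we are running in main-loop (BQL) code:

    /* job->speed is documented above as BQL-only, so no job_mutex is
     * needed here, but assert that we really are global-state code. */
    static void example_set_speed(BlockJob *job, int64_t speed)
    {
        GLOBAL_STATE_CODE();
        job->speed = speed;
    }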
Deleted patch
1
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
2
1
3
iostatus is the only field (together with .job) that needs
4
protection using the job mutex.
5
6
It is set in the main loop (GLOBAL_STATE functions) but read
7
in I/O code (block_job_error_action).
8
9
In order to protect it, change block_job_iostatus_set_err
10
to block_job_iostatus_set_err_locked(), always called under
11
job lock.
12
13
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
14
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
15
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
16
Message-Id: <20220926093214.506243-17-eesposit@redhat.com>
17
[kwolf: Fixed up type of iostatus]
18
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
19
---
20
block/mirror.c | 6 +++++-
21
blockjob.c | 5 +++--
22
2 files changed, 8 insertions(+), 3 deletions(-)
23
24
diff --git a/block/mirror.c b/block/mirror.c
25
index XXXXXXX..XXXXXXX 100644
26
--- a/block/mirror.c
27
+++ b/block/mirror.c
28
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
29
BlockDriverState *bs = s->mirror_top_bs->backing->bs;
30
BlockDriverState *target_bs = blk_bs(s->target);
31
bool need_drain = true;
32
+ BlockDeviceIoStatus iostatus;
33
int64_t length;
34
int64_t target_length;
35
BlockDriverInfo bdi;
36
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
37
* We do so every BLOCK_JOB_SLICE_TIME nanoseconds, or when there is
38
* an error, or when the source is clean, whichever comes first. */
39
delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns;
40
+ WITH_JOB_LOCK_GUARD() {
41
+ iostatus = s->common.iostatus;
42
+ }
43
if (delta < BLOCK_JOB_SLICE_TIME &&
44
- s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
45
+ iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
46
if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
47
(cnt == 0 && s->in_flight > 0)) {
48
trace_mirror_yield(s, cnt, s->buf_free_count, s->in_flight);
49
diff --git a/blockjob.c b/blockjob.c
50
index XXXXXXX..XXXXXXX 100644
51
--- a/blockjob.c
52
+++ b/blockjob.c
53
@@ -XXX,XX +XXX,XX @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
54
return block_job_query_locked(job, errp);
55
}
56
57
-static void block_job_iostatus_set_err(BlockJob *job, int error)
58
+/* Called with job lock held */
59
+static void block_job_iostatus_set_err_locked(BlockJob *job, int error)
60
{
61
if (job->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
62
job->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
63
@@ -XXX,XX +XXX,XX @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err,
64
*/
65
job->job.user_paused = true;
66
}
67
+ block_job_iostatus_set_err_locked(job, error);
68
}
69
- block_job_iostatus_set_err(job, error);
70
}
71
return action;
72
}
73
--
74
2.37.3
diff view generated by jsdifflib
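The mirror_run() hunk above uses the usual copy-under-lock idiom: snapshot the job_mutex-protected field inside the guard, then branch on the local copy outside the critical section:

    BlockDeviceIoStatus iostatus;

    WITH_JOB_LOCK_GUARD() {
        iostatus = s->common.iostatus;   /* consistent snapshot */
    }
    if (iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        /* ... hot path runs without holding job_mutex ... */
    }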
1
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
1
From: Stefan Hajnoczi <stefanha@redhat.com>
2
2
3
Change job_{lock/unlock} and the lock guard macros to use job_mutex.
3
Peter Krempa and Kevin Wolf observed that iothread-vq-mapping is
4
confusing to use because the control and event virtqueues have a fixed
5
location before the command virtqueues but need to be treated
6
differently.
4
7
5
Now that they are not nops anymore, remove the AioContext lock
8
Only expose the command virtqueues via iothread-vq-mapping so that the
6
to avoid deadlocks.
9
command-line parameter is intuitive: it controls where SCSI requests are
10
processed.
7
11
8
Therefore:
12
The control virtqueue needs to be hardcoded to the main loop thread for
9
- when possible, completely remove the AioContext lock/unlock pair
13
technical reasons anyway. Kevin also pointed out that it's better to
10
- if it is used by some other function too, reduce the locking
14
place the event virtqueue in the main loop thread since its no-poll
11
section as much as possible, leaving the job API outside.
15
behavior would prevent polling if assigned to an IOThread.
12
- change AIO_WAIT_WHILE into AIO_WAIT_WHILE_UNLOCKED, since we
13
are not using the AioContext lock anymore
14
16
15
The only functions that still need the aiocontext lock are:
17
This change is kept as its own commit to avoid squashing it into the previous commit.
16
- the JobDriver callbacks, already documented in job.h
17
- job_cancel_sync() in replication.c is called with aio_context_lock
18
taken, but now job is using AIO_WAIT_WHILE_UNLOCKED so we need to
19
release the lock.
20
18
21
Reduce the locking section to only cover the callback invocation
19
Suggested-by: Kevin Wolf <kwolf@redhat.com>
22
and document the functions that take the AioContext lock,
20
Suggested-by: Peter Krempa <pkrempa@redhat.com>
23
to avoid taking it twice.
21
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
24
22
Message-ID: <20250311132616.1049687-14-stefanha@redhat.com>
25
Also remove real_job_{lock/unlock}, as they are replaced by the
26
public functions.
27
28
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
29
Message-Id: <20220926093214.506243-19-eesposit@redhat.com>
30
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
31
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
32
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
23
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
33
---
24
---
34
include/qemu/job.h | 17 ++---
25
hw/scsi/virtio-scsi-dataplane.c | 33 ++++++++++++++++++++-------------
35
block/replication.c | 2 +
26
1 file changed, 20 insertions(+), 13 deletions(-)
36
blockdev.c | 72 +++-----------------
37
job-qmp.c | 46 +++----------
38
job.c | 111 +++++++++----------------------
39
qemu-img.c | 2 -
40
tests/unit/test-bdrv-drain.c | 4 +-
41
tests/unit/test-block-iothread.c | 2 +-
42
tests/unit/test-blockjob.c | 19 +++---
43
9 files changed, 72 insertions(+), 203 deletions(-)
44
27

diff --git a/include/qemu/job.h b/include/qemu/job.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -XXX,XX +XXX,XX @@ typedef struct Job {
     AioContext *aio_context;

-    /** Protected by AioContext lock */
+    /** Protected by job_mutex */

     /** Reference count of the block job */
     int refcnt;
@@ -XXX,XX +XXX,XX @@ typedef struct Job {
     /**
      * Set to false by the job while the coroutine has yielded and may be
      * re-entered by job_enter(). There may still be I/O or event loop activity
-     * pending. Accessed under block_job_mutex (in blockjob.c).
+     * pending. Accessed under job_mutex.
      *
      * When the job is deferred to the main loop, busy is true as long as the
      * bottom half is still pending.
@@ -XXX,XX +XXX,XX @@ typedef enum JobCreateFlags {

 extern QemuMutex job_mutex;

-#define JOB_LOCK_GUARD() /* QEMU_LOCK_GUARD(&job_mutex) */
+#define JOB_LOCK_GUARD() QEMU_LOCK_GUARD(&job_mutex)

-#define WITH_JOB_LOCK_GUARD() /* WITH_QEMU_LOCK_GUARD(&job_mutex) */
+#define WITH_JOB_LOCK_GUARD() WITH_QEMU_LOCK_GUARD(&job_mutex)

 /**
  * job_lock:
@@ -XXX,XX +XXX,XX @@ void job_ref_locked(Job *job);
 /**
  * Release a reference that was previously acquired with job_ref() or
  * job_create(). If it's the last reference to the object, it will be freed.
+ *
+ * Takes AioContext lock internally to invoke a job->driver callback.
  */
 void job_unref(Job *job);

@@ -XXX,XX +XXX,XX @@ void job_user_cancel_locked(Job *job, bool force, Error **errp);
  * Returns the return value from the job if the job actually completed
  * during the call, or -ECANCELED if it was canceled.
  *
- * Callers must hold the AioContext lock of job->aio_context.
+ * Called with job_lock *not* held.
  */
 int job_cancel_sync(Job *job, bool force);

@@ -XXX,XX +XXX,XX @@ void job_cancel_sync_all(void);
  * function).
  *
  * Returns the return value from the job.
- *
- * Callers must hold the AioContext lock of job->aio_context.
+ * Called with job_lock *not* held.
  */
 int job_complete_sync(Job *job, Error **errp);

@@ -XXX,XX +XXX,XX @@ void job_dismiss_locked(Job **job, Error **errp);
 * Returns 0 if the job is successfully completed, -ECANCELED if the job was
 * cancelled before completing, and -errno in other error cases.
 *
- * Callers must hold the AioContext lock of job->aio_context.
+ * Called with job_lock *not* held.
 */
 int job_finish_sync(Job *job, void (*finish)(Job *, Error **errp),
                     Error **errp);
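
With the guards now expanding to real lock guards, a caller can be
sketched like this (illustrative only; it assumes the job_next_locked()
iterator introduced elsewhere in this series):

    static int example_count_paused_jobs(void)
    {
        int count = 0;

        WITH_JOB_LOCK_GUARD() {
            /* job_next_locked() and job->paused require job_mutex */
            for (Job *job = job_next_locked(NULL); job;
                 job = job_next_locked(job)) {
                if (job->paused) {
                    count++;
                }
            }
        }
        return count;
    }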

diff --git a/block/replication.c b/block/replication.c
index XXXXXXX..XXXXXXX 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -XXX,XX +XXX,XX @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
      * disk, secondary disk in backup_job_completed().
      */
     if (s->backup_job) {
+        aio_context_release(aio_context);
         job_cancel_sync(&s->backup_job->job, true);
+        aio_context_acquire(aio_context);
     }

     if (!failover) {
diff --git a/blockdev.c b/blockdev.c
index XXXXXXX..XXXXXXX 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -XXX,XX +XXX,XX @@ void blockdev_mark_auto_del(BlockBackend *blk)
     for (job = block_job_next_locked(NULL); job;
          job = block_job_next_locked(job)) {
         if (block_job_has_bdrv(job, blk_bs(blk))) {
-            AioContext *aio_context = job->job.aio_context;
-            aio_context_acquire(aio_context);
-
             job_cancel_locked(&job->job, false);
-
-            aio_context_release(aio_context);
         }
     }

@@ -XXX,XX +XXX,XX @@ static void drive_backup_abort(BlkActionState *common)
     DriveBackupState *state = DO_UPCAST(DriveBackupState, common, common);

     if (state->job) {
-        AioContext *aio_context;
-
-        aio_context = bdrv_get_aio_context(state->bs);
-        aio_context_acquire(aio_context);
-
         job_cancel_sync(&state->job->job, true);
-
-        aio_context_release(aio_context);
     }
 }

@@ -XXX,XX +XXX,XX @@ static void blockdev_backup_abort(BlkActionState *common)
     BlockdevBackupState *state = DO_UPCAST(BlockdevBackupState, common, common);

     if (state->job) {
-        AioContext *aio_context;
-
-        aio_context = bdrv_get_aio_context(state->bs);
-        aio_context_acquire(aio_context);
-
         job_cancel_sync(&state->job->job, true);
-
-        aio_context_release(aio_context);
     }
 }

@@ -XXX,XX +XXX,XX @@ out:
 }

 /*
- * Get a block job using its ID and acquire its AioContext.
- * Called with job_mutex held.
+ * Get a block job using its ID. Called with job_mutex held.
  */
-static BlockJob *find_block_job_locked(const char *id,
-                                       AioContext **aio_context,
-                                       Error **errp)
+static BlockJob *find_block_job_locked(const char *id, Error **errp)
 {
     BlockJob *job;

     assert(id != NULL);

-    *aio_context = NULL;
-
     job = block_job_get_locked(id);

     if (!job) {
@@ -XXX,XX +XXX,XX @@ static BlockJob *find_block_job_locked(const char *id,
         return NULL;
     }

-    *aio_context = block_job_get_aio_context(job);
-    aio_context_acquire(*aio_context);
-
     return job;
 }

 void qmp_block_job_set_speed(const char *device, int64_t speed, Error **errp)
 {
-    AioContext *aio_context;
     BlockJob *job;

     JOB_LOCK_GUARD();
-    job = find_block_job_locked(device, &aio_context, errp);
+    job = find_block_job_locked(device, errp);

     if (!job) {
         return;
     }

     block_job_set_speed_locked(job, speed, errp);
-    aio_context_release(aio_context);
 }

 void qmp_block_job_cancel(const char *device,
                           bool has_force, bool force, Error **errp)
 {
-    AioContext *aio_context;
     BlockJob *job;

     JOB_LOCK_GUARD();
-    job = find_block_job_locked(device, &aio_context, errp);
+    job = find_block_job_locked(device, errp);

     if (!job) {
         return;
@@ -XXX,XX +XXX,XX @@ void qmp_block_job_cancel(const char *device,
     if (job_user_paused_locked(&job->job) && !force) {
         error_setg(errp, "The block job for device '%s' is currently paused",
                    device);
-        goto out;
+        return;
     }

     trace_qmp_block_job_cancel(job);
     job_user_cancel_locked(&job->job, force, errp);
-out:
-    aio_context_release(aio_context);
 }

 void qmp_block_job_pause(const char *device, Error **errp)
 {
-    AioContext *aio_context;
     BlockJob *job;

     JOB_LOCK_GUARD();
-    job = find_block_job_locked(device, &aio_context, errp);
+    job = find_block_job_locked(device, errp);

     if (!job) {
         return;
@@ -XXX,XX +XXX,XX @@ void qmp_block_job_pause(const char *device, Error **errp)

     trace_qmp_block_job_pause(job);
     job_user_pause_locked(&job->job, errp);
-    aio_context_release(aio_context);
 }

 void qmp_block_job_resume(const char *device, Error **errp)
 {
-    AioContext *aio_context;
     BlockJob *job;

     JOB_LOCK_GUARD();
-    job = find_block_job_locked(device, &aio_context, errp);
+    job = find_block_job_locked(device, errp);

     if (!job) {
         return;
@@ -XXX,XX +XXX,XX @@ void qmp_block_job_resume(const char *device, Error **errp)

     trace_qmp_block_job_resume(job);
     job_user_resume_locked(&job->job, errp);
-    aio_context_release(aio_context);
 }

 void qmp_block_job_complete(const char *device, Error **errp)
 {
-    AioContext *aio_context;
     BlockJob *job;

     JOB_LOCK_GUARD();
-    job = find_block_job_locked(device, &aio_context, errp);
+    job = find_block_job_locked(device, errp);

     if (!job) {
         return;
@@ -XXX,XX +XXX,XX @@ void qmp_block_job_complete(const char *device, Error **errp)

     trace_qmp_block_job_complete(job);
     job_complete_locked(&job->job, errp);
-    aio_context_release(aio_context);
 }

 void qmp_block_job_finalize(const char *id, Error **errp)
 {
-    AioContext *aio_context;
     BlockJob *job;

     JOB_LOCK_GUARD();
-    job = find_block_job_locked(id, &aio_context, errp);
+    job = find_block_job_locked(id, errp);

     if (!job) {
         return;
@@ -XXX,XX +XXX,XX @@ void qmp_block_job_finalize(const char *id, Error **errp)
     job_ref_locked(&job->job);
     job_finalize_locked(&job->job, errp);

-    /*
-     * Job's context might have changed via job_finalize (and job_txn_apply
-     * automatically acquires the new one), so make sure we release the correct
-     * one.
-     */
-    aio_context = block_job_get_aio_context(job);
     job_unref_locked(&job->job);
-    aio_context_release(aio_context);
 }

 void qmp_block_job_dismiss(const char *id, Error **errp)
 {
-    AioContext *aio_context;
     BlockJob *bjob;
     Job *job;

     JOB_LOCK_GUARD();
-    bjob = find_block_job_locked(id, &aio_context, errp);
+    bjob = find_block_job_locked(id, errp);

     if (!bjob) {
         return;
@@ -XXX,XX +XXX,XX @@ void qmp_block_job_dismiss(const char *id, Error **errp)
     trace_qmp_block_job_dismiss(bjob);
     job = &bjob->job;
     job_dismiss_locked(&job, errp);
-    aio_context_release(aio_context);
 }

 void qmp_change_backing_file(const char *device,
@@ -XXX,XX +XXX,XX @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp)
     for (job = block_job_next_locked(NULL); job;
          job = block_job_next_locked(job)) {
         BlockJobInfo *value;
-        AioContext *aio_context;

         if (block_job_is_internal(job)) {
             continue;
         }
-        aio_context = block_job_get_aio_context(job);
-        aio_context_acquire(aio_context);
         value = block_job_query_locked(job, errp);
-        aio_context_release(aio_context);
         if (!value) {
             qapi_free_BlockJobInfoList(head);
             return NULL;
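
The QMP handlers above now share a single shape; a hypothetical handler
following the same pattern (names invented for illustration, not part of
the patch) would be:

    void example_qmp_block_job_touch(const char *device, Error **errp)
    {
        BlockJob *job;

        JOB_LOCK_GUARD();                          /* dropped at end of scope */
        job = find_block_job_locked(device, errp); /* no AioContext lock taken */
        if (!job) {
            return;
        }
        job_user_pause_locked(&job->job, errp);    /* runs under job_mutex */
    }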

diff --git a/job-qmp.c b/job-qmp.c
index XXXXXXX..XXXXXXX 100644
--- a/job-qmp.c
+++ b/job-qmp.c
@@ -XXX,XX +XXX,XX @@
 #include "trace/trace-root.h"

 /*
- * Get a job using its ID and acquire its AioContext.
- * Called with job_mutex held.
+ * Get a job using its ID. Called with job_mutex held.
  */
-static Job *find_job_locked(const char *id,
-                            AioContext **aio_context,
-                            Error **errp)
+static Job *find_job_locked(const char *id, Error **errp)
 {
     Job *job;

-    *aio_context = NULL;
-
     job = job_get_locked(id);
     if (!job) {
         error_setg(errp, "Job not found");
         return NULL;
     }

-    *aio_context = job->aio_context;
-    aio_context_acquire(*aio_context);
-
     return job;
 }

 void qmp_job_cancel(const char *id, Error **errp)
 {
-    AioContext *aio_context;
     Job *job;

     JOB_LOCK_GUARD();
-    job = find_job_locked(id, &aio_context, errp);
+    job = find_job_locked(id, errp);

     if (!job) {
         return;
@@ -XXX,XX +XXX,XX @@ void qmp_job_cancel(const char *id, Error **errp)

     trace_qmp_job_cancel(job);
     job_user_cancel_locked(job, true, errp);
-    aio_context_release(aio_context);
 }

 void qmp_job_pause(const char *id, Error **errp)
 {
-    AioContext *aio_context;
     Job *job;

     JOB_LOCK_GUARD();
-    job = find_job_locked(id, &aio_context, errp);
+    job = find_job_locked(id, errp);

     if (!job) {
         return;
@@ -XXX,XX +XXX,XX @@ void qmp_job_pause(const char *id, Error **errp)

     trace_qmp_job_pause(job);
     job_user_pause_locked(job, errp);
-    aio_context_release(aio_context);
 }

 void qmp_job_resume(const char *id, Error **errp)
 {
-    AioContext *aio_context;
     Job *job;

     JOB_LOCK_GUARD();
-    job = find_job_locked(id, &aio_context, errp);
+    job = find_job_locked(id, errp);

     if (!job) {
         return;
@@ -XXX,XX +XXX,XX @@ void qmp_job_resume(const char *id, Error **errp)

     trace_qmp_job_resume(job);
     job_user_resume_locked(job, errp);
-    aio_context_release(aio_context);
 }

 void qmp_job_complete(const char *id, Error **errp)
 {
-    AioContext *aio_context;
     Job *job;

     JOB_LOCK_GUARD();
-    job = find_job_locked(id, &aio_context, errp);
+    job = find_job_locked(id, errp);

     if (!job) {
         return;
@@ -XXX,XX +XXX,XX @@ void qmp_job_complete(const char *id, Error **errp)

     trace_qmp_job_complete(job);
     job_complete_locked(job, errp);
-    aio_context_release(aio_context);
 }

 void qmp_job_finalize(const char *id, Error **errp)
 {
-    AioContext *aio_context;
     Job *job;

     JOB_LOCK_GUARD();
-    job = find_job_locked(id, &aio_context, errp);
+    job = find_job_locked(id, errp);

     if (!job) {
         return;
@@ -XXX,XX +XXX,XX @@ void qmp_job_finalize(const char *id, Error **errp)
     job_ref_locked(job);
     job_finalize_locked(job, errp);

-    /*
-     * Job's context might have changed via job_finalize (and job_txn_apply
-     * automatically acquires the new one), so make sure we release the correct
-     * one.
-     */
-    aio_context = job->aio_context;
     job_unref_locked(job);
-    aio_context_release(aio_context);
 }

 void qmp_job_dismiss(const char *id, Error **errp)
 {
-    AioContext *aio_context;
     Job *job;

     JOB_LOCK_GUARD();
-    job = find_job_locked(id, &aio_context, errp);
+    job = find_job_locked(id, errp);

     if (!job) {
         return;
@@ -XXX,XX +XXX,XX @@ void qmp_job_dismiss(const char *id, Error **errp)

     trace_qmp_job_dismiss(job);
     job_dismiss_locked(&job, errp);
-    aio_context_release(aio_context);
 }

 /* Called with job_mutex held. */
@@ -XXX,XX +XXX,XX @@ JobInfoList *qmp_query_jobs(Error **errp)

     for (job = job_next_locked(NULL); job; job = job_next_locked(job)) {
         JobInfo *value;
-        AioContext *aio_context;

         if (job_is_internal(job)) {
             continue;
         }
-        aio_context = job->aio_context;
-        aio_context_acquire(aio_context);
         value = job_query_single_locked(job, errp);
-        aio_context_release(aio_context);
         if (!value) {
             qapi_free_JobInfoList(head);
             return NULL;
diff --git a/job.c b/job.c
index XXXXXXX..XXXXXXX 100644
--- a/job.c
+++ b/job.c
@@ -XXX,XX +XXX,XX @@
  *
  * The second includes functions used by the job drivers and sometimes
  * by the core block layer. These delegate the locking to the callee instead.
- *
- * TODO Actually make this true
  */

 /*
@@ -XXX,XX +XXX,XX @@ struct JobTxn {
 };

 void job_lock(void)
-{
-    /* nop */
-}
-
-void job_unlock(void)
-{
-    /* nop */
-}
-
-static void real_job_lock(void)
 {
     qemu_mutex_lock(&job_mutex);
 }

-static void real_job_unlock(void)
+void job_unlock(void)
 {
     qemu_mutex_unlock(&job_mutex);
 }
@@ -XXX,XX +XXX,XX @@ static void job_txn_del_job_locked(Job *job)
 /* Called with job_mutex held, but releases it temporarily. */
 static int job_txn_apply_locked(Job *job, int fn(Job *))
 {
-    AioContext *inner_ctx;
     Job *other_job, *next;
     JobTxn *txn = job->txn;
     int rc = 0;
@@ -XXX,XX +XXX,XX @@ static int job_txn_apply_locked(Job *job, int fn(Job *))
      * break AIO_WAIT_WHILE from within fn.
      */
     job_ref_locked(job);
-    aio_context_release(job->aio_context);

     QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) {
-        inner_ctx = other_job->aio_context;
-        aio_context_acquire(inner_ctx);
         rc = fn(other_job);
-        aio_context_release(inner_ctx);
         if (rc) {
             break;
         }
     }

-    /*
-     * Note that job->aio_context might have been changed by calling fn, so we
-     * can't use a local variable to cache it.
-     */
-    aio_context_acquire(job->aio_context);
     job_unref_locked(job);
     return rc;
 }
@@ -XXX,XX +XXX,XX @@ void job_unref_locked(Job *job)
     assert(!job->txn);

     if (job->driver->free) {
+        AioContext *aio_context = job->aio_context;
         job_unlock();
+        /* FIXME: aiocontext lock is required because cb calls blk_unref */
+        aio_context_acquire(aio_context);
         job->driver->free(job);
+        aio_context_release(aio_context);
         job_lock();
     }

@@ -XXX,XX +XXX,XX @@ void job_enter_cond_locked(Job *job, bool(*fn)(Job *job))
         return;
     }

-    real_job_lock();
     if (job->busy) {
-        real_job_unlock();
         return;
     }

     if (fn && !fn(job)) {
-        real_job_unlock();
         return;
     }

     assert(!job->deferred_to_main_loop);
     timer_del(&job->sleep_timer);
     job->busy = true;
-    real_job_unlock();
     job_unlock();
     aio_co_wake(job->co);
     job_lock();
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn job_do_yield_locked(Job *job, uint64_t ns)
 {
     AioContext *next_aio_context;

-    real_job_lock();
     if (ns != -1) {
         timer_mod(&job->sleep_timer, ns);
     }
     job->busy = false;
     job_event_idle_locked(job);
-    real_job_unlock();
     job_unlock();
     qemu_coroutine_yield();
     job_lock();
@@ -XXX,XX +XXX,XX @@ static void job_clean(Job *job)
     }
 }

-/* Called with job_mutex held, but releases it temporarily */
+/*
+ * Called with job_mutex held, but releases it temporarily.
+ * Takes AioContext lock internally to invoke a job->driver callback.
+ */
 static int job_finalize_single_locked(Job *job)
 {
     int job_ret;
+    AioContext *ctx = job->aio_context;

     assert(job_is_completed_locked(job));

@@ -XXX,XX +XXX,XX @@ static int job_finalize_single_locked(Job *job)

     job_ret = job->ret;
     job_unlock();
+    aio_context_acquire(ctx);

     if (!job_ret) {
         job_commit(job);
@@ -XXX,XX +XXX,XX @@ static int job_finalize_single_locked(Job *job)
     }
     job_clean(job);

-    job_lock();
-
     if (job->cb) {
-        job_ret = job->ret;
-        job_unlock();
         job->cb(job->opaque, job_ret);
-        job_lock();
     }

+    aio_context_release(ctx);
+    job_lock();
+
     /* Emit events only if we actually started */
     if (job_started_locked(job)) {
         if (job_is_cancelled_locked(job)) {
@@ -XXX,XX +XXX,XX @@ static int job_finalize_single_locked(Job *job)
     return 0;
 }

-/* Called with job_mutex held, but releases it temporarily */
+/*
+ * Called with job_mutex held, but releases it temporarily.
+ * Takes AioContext lock internally to invoke a job->driver callback.
+ */
 static void job_cancel_async_locked(Job *job, bool force)
 {
+    AioContext *ctx = job->aio_context;
     GLOBAL_STATE_CODE();
     if (job->driver->cancel) {
         job_unlock();
+        aio_context_acquire(ctx);
         force = job->driver->cancel(job, force);
+        aio_context_release(ctx);
         job_lock();
     } else {
         /* No .cancel() means the job will behave as if force-cancelled */
@@ -XXX,XX +XXX,XX @@ static void job_cancel_async_locked(Job *job, bool force)
     }
 }

-/* Called with job_mutex held, but releases it temporarily. */
+/*
+ * Called with job_mutex held, but releases it temporarily.
+ * Takes AioContext lock internally to invoke a job->driver callback.
+ */
 static void job_completed_txn_abort_locked(Job *job)
 {
-    AioContext *ctx;
     JobTxn *txn = job->txn;
     Job *other_job;

@@ -XXX,XX +XXX,XX @@ static void job_completed_txn_abort_locked(Job *job)
     txn->aborting = true;
     job_txn_ref_locked(txn);

-    /*
-     * We can only hold the single job's AioContext lock while calling
-     * job_finalize_single() because the finalization callbacks can involve
-     * calls of AIO_WAIT_WHILE(), which could deadlock otherwise.
-     * Note that the job's AioContext may change when it is finalized.
-     */
     job_ref_locked(job);
-    aio_context_release(job->aio_context);

     /* Other jobs are effectively cancelled by us, set the status for
      * them; this job, however, may or may not be cancelled, depending
      * on the caller, so leave it. */
     QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
         if (other_job != job) {
-            ctx = other_job->aio_context;
-            aio_context_acquire(ctx);
             /*
              * This is a transaction: If one job failed, no result will matter.
              * Therefore, pass force=true to terminate all other jobs as quickly
              * as possible.
              */
             job_cancel_async_locked(other_job, true);
-            aio_context_release(ctx);
         }
     }
     while (!QLIST_EMPTY(&txn->jobs)) {
         other_job = QLIST_FIRST(&txn->jobs);
-        /*
-         * The job's AioContext may change, so store it in @ctx so we
-         * release the same context that we have acquired before.
-         */
-        ctx = other_job->aio_context;
-        aio_context_acquire(ctx);
         if (!job_is_completed_locked(other_job)) {
             assert(job_cancel_requested_locked(other_job));
             job_finish_sync_locked(other_job, NULL, NULL);
         }
         job_finalize_single_locked(other_job);
-        aio_context_release(ctx);
     }

-    /*
-     * Use job_ref()/job_unref() so we can read the AioContext here
-     * even if the job went away during job_finalize_single().
-     */
-    aio_context_acquire(job->aio_context);
     job_unref_locked(job);
-
     job_txn_unref_locked(txn);
 }

@@ -XXX,XX +XXX,XX @@ static void job_completed_txn_abort_locked(Job *job)
 static int job_prepare_locked(Job *job)
 {
     int ret;
+    AioContext *ctx = job->aio_context;

     GLOBAL_STATE_CODE();
+
     if (job->ret == 0 && job->driver->prepare) {
         job_unlock();
+        aio_context_acquire(ctx);
         ret = job->driver->prepare(job);
+        aio_context_release(ctx);
         job_lock();
         job->ret = ret;
         job_update_rc_locked(job);
     }
+
     return job->ret;
 }

@@ -XXX,XX +XXX,XX @@ static void job_completed_locked(Job *job)
 static void job_exit(void *opaque)
 {
     Job *job = (Job *)opaque;
-    AioContext *ctx;
     JOB_LOCK_GUARD();
-
     job_ref_locked(job);
-    aio_context_acquire(job->aio_context);

     /* This is a lie, we're not quiescent, but still doing the completion
      * callbacks. However, completion callbacks tend to involve operations that
@@ -XXX,XX +XXX,XX @@ static void job_exit(void *opaque)
     job_event_idle_locked(job);

     job_completed_locked(job);
-
-    /*
-     * Note that calling job_completed can move the job to a different
-     * aio_context, so we cannot cache from above. job_txn_apply takes care of
-     * acquiring the new lock, and we ref/unref to avoid job_completed freeing
-     * the job underneath us.
-     */
-    ctx = job->aio_context;
     job_unref_locked(job);
-    aio_context_release(ctx);
 }

 /**
@@ -XXX,XX +XXX,XX @@ int job_cancel_sync(Job *job, bool force)
 void job_cancel_sync_all(void)
 {
     Job *job;
-    AioContext *aio_context;
     JOB_LOCK_GUARD();

     while ((job = job_next_locked(NULL))) {
-        aio_context = job->aio_context;
-        aio_context_acquire(aio_context);
         job_cancel_sync_locked(job, true);
-        aio_context_release(aio_context);
     }
 }

@@ -XXX,XX +XXX,XX @@ int job_finish_sync_locked(Job *job,
     }

     job_unlock();
-    AIO_WAIT_WHILE(job->aio_context,
-                   (job_enter(job), !job_is_completed(job)));
+    AIO_WAIT_WHILE_UNLOCKED(job->aio_context,
+                            (job_enter(job), !job_is_completed(job)));
     job_lock();

     ret = (job_is_cancelled_locked(job) && job->ret == 0)
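
The recurring job.c pattern above can be condensed into one hypothetical
helper (not part of the patch) that shows the lock ordering around
job->driver callbacks:

    static void example_call_driver_cb(Job *job, void (*cb)(Job *))
    {
        AioContext *ctx = job->aio_context;

        job_unlock();             /* never run driver code with job_mutex held */
        aio_context_acquire(ctx); /* callbacks may touch the block graph */
        cb(job);
        aio_context_release(ctx);
        job_lock();
    }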
diff --git a/qemu-img.c b/qemu-img.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -XXX,XX +XXX,XX @@ static void run_block_job(BlockJob *job, Error **errp)
     AioContext *aio_context = block_job_get_aio_context(job);
     int ret = 0;

-    aio_context_acquire(aio_context);
     job_lock();
     job_ref_locked(&job->job);
     do {
@@ -XXX,XX +XXX,XX @@ static void run_block_job(BlockJob *job, Error **errp)
     }
     job_unref_locked(&job->job);
     job_unlock();
-    aio_context_release(aio_context);

     /* publish completion progress only when success */
     if (!ret) {
diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/unit/test-bdrv-drain.c
+++ b/tests/unit/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_common_drain_node(enum drain_type drain_type,
         tjob->prepare_ret = -EIO;
         break;
     }
+    aio_context_release(ctx);

     job_start(&job->job);
-    aio_context_release(ctx);

     if (use_iothread) {
         /* job_co_entry() is run in the I/O thread, wait for the actual job
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_common_drain_node(enum drain_type drain_type,
         g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */
     }

-    aio_context_acquire(ctx);
     WITH_JOB_LOCK_GUARD() {
         ret = job_complete_sync_locked(&job->job, &error_abort);
     }
     g_assert_cmpint(ret, ==, (result == TEST_JOB_SUCCESS ? 0 : -EIO));

+    aio_context_acquire(ctx);
     if (use_iothread) {
         blk_set_aio_context(blk_src, qemu_get_aio_context(), &error_abort);
         assert(blk_get_aio_context(blk_target) == qemu_get_aio_context());
diff --git a/tests/unit/test-block-iothread.c b/tests/unit/test-block-iothread.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/unit/test-block-iothread.c
+++ b/tests/unit/test-block-iothread.c
@@ -XXX,XX +XXX,XX @@ static void test_attach_blockjob(void)
         aio_poll(qemu_get_aio_context(), false);
     }

-    aio_context_acquire(ctx);
     WITH_JOB_LOCK_GUARD() {
         job_complete_sync_locked(&tjob->common.job, &error_abort);
     }
+    aio_context_acquire(ctx);
     blk_set_aio_context(blk, qemu_get_aio_context(), &error_abort);
     aio_context_release(ctx);

diff --git a/tests/unit/test-blockjob.c b/tests/unit/test-blockjob.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/unit/test-blockjob.c
+++ b/tests/unit/test-blockjob.c
@@ -XXX,XX +XXX,XX @@ static void cancel_common(CancelJob *s)
     BlockJob *job = &s->common;
     BlockBackend *blk = s->blk;
     JobStatus sts = job->job.status;
-    AioContext *ctx;
-
-    ctx = job->job.aio_context;
-    aio_context_acquire(ctx);
+    AioContext *ctx = job->job.aio_context;

     job_cancel_sync(&job->job, true);
     WITH_JOB_LOCK_GUARD() {
@@ -XXX,XX +XXX,XX @@ static void cancel_common(CancelJob *s)
         assert(job->job.status == JOB_STATUS_NULL);
         job_unref_locked(&job->job);
     }
-    destroy_blk(blk);

+    aio_context_acquire(ctx);
+    destroy_blk(blk);
     aio_context_release(ctx);
+
 }

 static void test_cancel_created(void)
@@ -XXX,XX +XXX,XX @@ static void test_cancel_concluded(void)
     aio_poll(qemu_get_aio_context(), true);
     assert_job_status_is(job, JOB_STATUS_PENDING);

-    aio_context_acquire(job->aio_context);
     WITH_JOB_LOCK_GUARD() {
         job_finalize_locked(job, &error_abort);
+        assert(job->status == JOB_STATUS_CONCLUDED);
     }
-    aio_context_release(job->aio_context);
-    assert_job_status_is(job, JOB_STATUS_CONCLUDED);

     cancel_common(s);
 }
@@ -XXX,XX +XXX,XX @@ static void test_complete_in_standby(void)

     /* Wait for the job to become READY */
     job_start(job);
-    aio_context_acquire(ctx);
     /*
      * Here we are waiting for the status to change, so don't bother
      * protecting the read every time.
      */
-    AIO_WAIT_WHILE(ctx, job->status != JOB_STATUS_READY);
-    aio_context_release(ctx);
+    AIO_WAIT_WHILE_UNLOCKED(ctx, job->status != JOB_STATUS_READY);

     /* Begin the drained section, pausing the job */
     bdrv_drain_all_begin();
@@ -XXX,XX +XXX,XX @@ static void test_complete_in_standby(void)
     aio_context_acquire(ctx);
     /* This will schedule the job to resume it */
     bdrv_drain_all_end();
+    aio_context_release(ctx);

     WITH_JOB_LOCK_GUARD() {
         /* But the job cannot run, so it will remain on standby */
@@ -XXX,XX +XXX,XX @@ static void test_complete_in_standby(void)
         job_dismiss_locked(&job, &error_abort);
     }

+    aio_context_acquire(ctx);
     destroy_blk(blk);
     aio_context_release(ctx);
     iothread_join(iothread);
--
2.37.3

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20250311132616.1049687-14-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/scsi/virtio-scsi-dataplane.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -XXX,XX +XXX,XX @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp)
     VirtIODevice *vdev = VIRTIO_DEVICE(s);
     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
-    uint16_t num_vqs = vs->conf.num_queues + VIRTIO_SCSI_VQ_NUM_FIXED;

     if (vs->conf.iothread && vs->conf.iothread_vq_mapping_list) {
         error_setg(errp,
@@ -XXX,XX +XXX,XX @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp)
         }
     }

-    s->vq_aio_context = g_new(AioContext *, num_vqs);
+    s->vq_aio_context = g_new(AioContext *, vs->conf.num_queues +
+                                            VIRTIO_SCSI_VQ_NUM_FIXED);
+
+    /*
+     * Handle the ctrl virtqueue in the main loop thread where device resets
+     * can be performed.
+     */
+    s->vq_aio_context[0] = qemu_get_aio_context();
+
+    /*
+     * Handle the event virtqueue in the main loop thread where its no_poll
+     * behavior won't stop IOThread polling.
+     */
+    s->vq_aio_context[1] = qemu_get_aio_context();

     if (vs->conf.iothread_vq_mapping_list) {
         if (!iothread_vq_mapping_apply(vs->conf.iothread_vq_mapping_list,
-                                       s->vq_aio_context, num_vqs, errp)) {
+                                       &s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED],
+                                       vs->conf.num_queues, errp)) {
             g_free(s->vq_aio_context);
             s->vq_aio_context = NULL;
             return;
         }
     } else if (vs->conf.iothread) {
         AioContext *ctx = iothread_get_aio_context(vs->conf.iothread);
-        for (uint16_t i = 0; i < num_vqs; i++) {
-            s->vq_aio_context[i] = ctx;
+        for (uint16_t i = 0; i < vs->conf.num_queues; i++) {
+            s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i] = ctx;
         }

         /* Released in virtio_scsi_dataplane_cleanup() */
         object_ref(OBJECT(vs->conf.iothread));
     } else {
         AioContext *ctx = qemu_get_aio_context();
-        for (unsigned i = 0; i < num_vqs; i++) {
-            s->vq_aio_context[i] = ctx;
+        for (unsigned i = 0; i < vs->conf.num_queues; i++) {
+            s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i] = ctx;
         }
     }
-
-    /*
-     * Always handle the ctrl virtqueue in the main loop thread where device
-     * resets can be performed.
-     */
-    s->vq_aio_context[0] = qemu_get_aio_context();
 }

 /* Context: BQL held */
--
2.48.1
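
For the virtio-scsi change above, the resulting queue-to-AioContext layout
can be sketched as follows (a hypothetical helper, not part of the patch;
in QEMU, VIRTIO_SCSI_VQ_NUM_FIXED counts the fixed ctrl and event queues):

    static bool example_vq_runs_in_main_loop(VirtIOSCSI *s, uint16_t vq_idx)
    {
        if (vq_idx < VIRTIO_SCSI_VQ_NUM_FIXED) {
            /* ctrl (index 0) and event (index 1) are pinned to the main loop */
            return true;
        }
        /* command queues may be mapped to IOThreads */
        return s->vq_aio_context[vq_idx] == qemu_get_aio_context();
    }
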
Deleted patch
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>

Not sure what the atomic here was supposed to do, since job.busy
is protected by the job lock. Since the whole function
is called under job_mutex, just remove the atomic.
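
A minimal sketch of why the atomic is unnecessary (illustrative only, not
part of the patch): readers and writers of job.busy now both hold
job_mutex, so a plain load is race-free:

    static bool example_read_busy(BlockJob *job)
    {
        bool busy;

        WITH_JOB_LOCK_GUARD() {
            busy = job->job.busy; /* plain read; writers also hold job_mutex */
        }
        return busy;
    }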

Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-Id: <20220926093214.506243-20-eesposit@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockjob.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/blockjob.c b/blockjob.c
index XXXXXXX..XXXXXXX 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -XXX,XX +XXX,XX @@ BlockJobInfo *block_job_query_locked(BlockJob *job, Error **errp)
     info = g_new0(BlockJobInfo, 1);
     info->type = g_strdup(job_type_str(&job->job));
     info->device = g_strdup(job->job.id);
-    info->busy = qatomic_read(&job->job.busy);
+    info->busy = job->job.busy;
     info->paused = job->job.pause_count > 0;
     info->offset = progress_current;
     info->len = progress_total;
--
2.37.3