The following changes since commit 813bac3d8d70d85cb7835f7945eb9eed84c2d8d0:

  Merge tag '2023q3-bsd-user-pull-request' of https://gitlab.com/bsdimp/qemu into staging (2023-08-29 08:58:00 -0400)

are available in the Git repository at:

  https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to 87ec6f55af38e29be5b2b65a8acf84da73e06d06:

  aio-posix: zero out io_uring sqe user_data (2023-08-30 07:39:59 -0400)

----------------------------------------------------------------
Pull request

v3:
- Drop UFS emulation due to CI failures
- Add "aio-posix: zero out io_uring sqe user_data"

----------------------------------------------------------------

Andrey Drobyshev (3):
  block: add subcluster_size field to BlockDriverInfo
  block/io: align requests to subcluster_size
  tests/qemu-iotests/197: add testcase for CoR with subclusters

Fabiano Rosas (1):
  block-migration: Ensure we don't crash during migration cleanup

Stefan Hajnoczi (1):
  aio-posix: zero out io_uring sqe user_data

 include/block/block-common.h |  5 ++++
 include/block/block-io.h     |  8 +++---
 block.c                      |  7 +++++
 block/io.c                   | 50 ++++++++++++++++++------------------
 block/mirror.c               |  8 +++---
 block/qcow2.c                |  1 +
 migration/block.c            | 11 ++++++--
 util/fdmon-io_uring.c        |  2 ++
 tests/qemu-iotests/197       | 29 +++++++++++++++++++++
 tests/qemu-iotests/197.out   | 24 +++++++++++++++++
 10 files changed, 110 insertions(+), 35 deletions(-)

--
2.41.0
From: Fabiano Rosas <farosas@suse.de>

We can fail the blk_insert_bs() at init_blk_migration(), leaving the
BlkMigDevState without a dirty_bitmap and BlockDriverState. Account
for the possibly missing elements when doing cleanup.

Fix the following crashes:

Thread 1 "qemu-system-x86" received signal SIGSEGV, Segmentation fault.
0x0000555555ec83ef in bdrv_release_dirty_bitmap (bitmap=0x0) at ../block/dirty-bitmap.c:359
359         BlockDriverState *bs = bitmap->bs;
#0  0x0000555555ec83ef in bdrv_release_dirty_bitmap (bitmap=0x0) at ../block/dirty-bitmap.c:359
#1  0x0000555555bba331 in unset_dirty_tracking () at ../migration/block.c:371
#2  0x0000555555bbad98 in block_migration_cleanup_bmds () at ../migration/block.c:681

Thread 1 "qemu-system-x86" received signal SIGSEGV, Segmentation fault.
0x0000555555e971ff in bdrv_op_unblock (bs=0x0, op=BLOCK_OP_TYPE_BACKUP_SOURCE, reason=0x0) at ../block.c:7073
7073        QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
#0  0x0000555555e971ff in bdrv_op_unblock (bs=0x0, op=BLOCK_OP_TYPE_BACKUP_SOURCE, reason=0x0) at ../block.c:7073
#1  0x0000555555e9734a in bdrv_op_unblock_all (bs=0x0, reason=0x0) at ../block.c:7095
#2  0x0000555555bbae13 in block_migration_cleanup_bmds () at ../migration/block.c:690

Signed-off-by: Fabiano Rosas <farosas@suse.de>
Message-id: 20230731203338.27581-1-farosas@suse.de
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 migration/block.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/migration/block.c b/migration/block.c
index XXXXXXX..XXXXXXX 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -XXX,XX +XXX,XX @@ static void unset_dirty_tracking(void)
     BlkMigDevState *bmds;
 
     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
-        bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
+        if (bmds->dirty_bitmap) {
+            bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
+        }
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static int64_t get_remaining_dirty(void)
 static void block_migration_cleanup_bmds(void)
 {
     BlkMigDevState *bmds;
+    BlockDriverState *bs;
     AioContext *ctx;
 
     unset_dirty_tracking();
 
     while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
         QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
-        bdrv_op_unblock_all(blk_bs(bmds->blk), bmds->blocker);
+
+        bs = blk_bs(bmds->blk);
+        if (bs) {
+            bdrv_op_unblock_all(bs, bmds->blocker);
+        }
         error_free(bmds->blocker);
 
         /* Save ctx, because bmds->blk can disappear during blk_unref. */
--
2.41.0
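The fix above is an instance of a general rule: a teardown path must
tolerate whatever state a failed setup path can leave behind. A minimal
standalone sketch of the same guard-before-release idea, using
hypothetical stand-in types (DevState and friends) rather than the real
QEMU structures:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for the QEMU objects involved, not the real API. */
typedef struct { int id; } DirtyBitmap;
typedef struct { int id; } BlockDriverState;

typedef struct {
    DirtyBitmap *dirty_bitmap;  /* stays NULL if dirty tracking never started */
    BlockDriverState *bs;       /* stays NULL if attaching the device failed */
} DevState;

/* Guard every member that a failed setup may have left unset, just as the
 * patch does for bmds->dirty_bitmap and blk_bs(bmds->blk). */
static void dev_cleanup(DevState *dev)
{
    if (dev->dirty_bitmap) {
        free(dev->dirty_bitmap);
        dev->dirty_bitmap = NULL;
    }
    if (dev->bs) {
        free(dev->bs);
        dev->bs = NULL;
    }
}

int main(void)
{
    DevState dev = {0};  /* setup failed before either field was assigned */
    dev_cleanup(&dev);   /* must not dereference NULL */
    puts("cleanup survived partial initialization");
    return 0;
}

Resetting the pointers after freeing also makes the cleanup idempotent,
which matters when it is reachable from more than one error path.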
From: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>

This is going to be used in the subsequent commit as the request
alignment (in particular, during copy-on-read). This value only makes
sense for formats which support subclusters (currently QCOW2 only). If
this field isn't set by the driver's own bdrv_get_info() implementation,
we simply set it equal to the cluster size, thus treating each cluster
as having a single subcluster.

Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230711172553.234055-2-andrey.drobyshev@virtuozzo.com>
---
 include/block/block-common.h | 5 +++++
 block.c                      | 7 +++++++
 block/qcow2.c                | 1 +
 3 files changed, 13 insertions(+)

diff --git a/include/block/block-common.h b/include/block/block-common.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -XXX,XX +XXX,XX @@ typedef struct BlockZoneWps {
 typedef struct BlockDriverInfo {
     /* in bytes, 0 if irrelevant */
     int cluster_size;
+    /*
+     * A fraction of cluster_size, if supported (currently QCOW2 only); if
+     * disabled or unsupported, set equal to cluster_size.
+     */
+    int subcluster_size;
     /* offset at which the VM state can be saved (0 if not possible) */
     int64_t vm_state_offset;
     bool is_dirty;
diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
     }
     memset(bdi, 0, sizeof(*bdi));
     ret = drv->bdrv_co_get_info(bs, bdi);
+    if (bdi->subcluster_size == 0) {
+        /*
+         * If the driver left this unset, subclusters are not supported.
+         * Then it is safe to treat each cluster as having only one subcluster.
+         */
+        bdi->subcluster_size = bdi->cluster_size;
+    }
     if (ret < 0) {
         return ret;
     }
diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ qcow2_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
     BDRVQcow2State *s = bs->opaque;
     bdi->cluster_size = s->cluster_size;
+    bdi->subcluster_size = s->subcluster_size;
     bdi->vm_state_offset = qcow2_vm_state_offset(s);
     bdi->is_dirty = s->incompatible_features & QCOW2_INCOMPAT_DIRTY;
     return 0;
--
2.41.0
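The defaulting rule added to bdrv_co_get_info() can be illustrated with
a small self-contained sketch. The Info struct below is a stand-in, not
the real BlockDriverInfo; the 32-subclusters-per-cluster figure for
qcow2 with extended_l2=on is the format's fixed layout, so a 64K cluster
yields 2K subclusters:

#include <assert.h>
#include <stdio.h>

/* Stand-in for the two BlockDriverInfo fields used here. */
typedef struct {
    int cluster_size;
    int subcluster_size;  /* 0 means the driver did not report it */
} Info;

/* Mirrors the fallback added to bdrv_co_get_info(): if the driver knows
 * nothing about subclusters, treat each cluster as one subcluster. */
static void fill_info(Info *bdi, int cluster_size, int reported_subcluster)
{
    bdi->cluster_size = cluster_size;
    bdi->subcluster_size = reported_subcluster;
    if (bdi->subcluster_size == 0) {
        bdi->subcluster_size = bdi->cluster_size;
    }
}

int main(void)
{
    Info plain, ext_l2;

    /* A format with no subcluster support reports 0. */
    fill_info(&plain, 64 * 1024, 0);
    assert(plain.subcluster_size == 64 * 1024);

    /* qcow2 with extended_l2=on: 32 subclusters per cluster. */
    fill_info(&ext_l2, 64 * 1024, (64 * 1024) / 32);
    assert(ext_l2.subcluster_size == 2 * 1024);

    printf("plain: %d, extended_l2: %d\n",
           plain.subcluster_size, ext_l2.subcluster_size);
    return 0;
}

With this default in place, callers such as the rounding helper in the
next patch never need to special-case drivers that predate the field.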
From: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>

When the target image is using subclusters and we align the request
during copy-on-read, it makes sense to align to subcluster_size rather
than cluster_size. Otherwise we end up with unnecessary allocations.

This commit renames bdrv_round_to_clusters() to
bdrv_round_to_subclusters() and uses the subcluster_size field of
BlockDriverInfo to make the necessary alignments. It affects
copy-on-read as well as the mirror job (which was using
bdrv_round_to_clusters()).

This change also fixes the following bug with a failing assert (covered
by the test in the subsequent commit):

qemu-img create -f qcow2 base.qcow2 64K
qemu-img create -f qcow2 -o extended_l2=on,backing_file=base.qcow2,backing_fmt=qcow2 img.qcow2 64K
qemu-io -c "write -P 0xaa 0 2K" img.qcow2
qemu-io -C -c "read -P 0x00 2K 62K" img.qcow2

qemu-io: ../block/io.c:1236: bdrv_co_do_copy_on_readv: Assertion `skip_bytes < pnum' failed.

Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230711172553.234055-3-andrey.drobyshev@virtuozzo.com>
---
 include/block/block-io.h |  8 +++---
 block/io.c               | 50 ++++++++++++++++++------------------
 block/mirror.c           |  8 +++---
 3 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/include/block/block-io.h b/include/block/block-io.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -XXX,XX +XXX,XX @@ bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs,
                                           Error **errp);
 BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs);
-void bdrv_round_to_clusters(BlockDriverState *bs,
-                            int64_t offset, int64_t bytes,
-                            int64_t *cluster_offset,
-                            int64_t *cluster_bytes);
+void bdrv_round_to_subclusters(BlockDriverState *bs,
+                               int64_t offset, int64_t bytes,
+                               int64_t *cluster_offset,
+                               int64_t *cluster_bytes);
 
 void bdrv_get_backing_filename(BlockDriverState *bs,
                                char *filename, int filename_size);
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
 }
 
 /**
- * Round a region to cluster boundaries
+ * Round a region to subcluster (if supported) or cluster boundaries
  */
 void coroutine_fn GRAPH_RDLOCK
-bdrv_round_to_clusters(BlockDriverState *bs, int64_t offset, int64_t bytes,
-                       int64_t *cluster_offset, int64_t *cluster_bytes)
+bdrv_round_to_subclusters(BlockDriverState *bs, int64_t offset, int64_t bytes,
+                          int64_t *align_offset, int64_t *align_bytes)
 {
     BlockDriverInfo bdi;
     IO_CODE();
-    if (bdrv_co_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
-        *cluster_offset = offset;
-        *cluster_bytes = bytes;
+    if (bdrv_co_get_info(bs, &bdi) < 0 || bdi.subcluster_size == 0) {
+        *align_offset = offset;
+        *align_bytes = bytes;
     } else {
-        int64_t c = bdi.cluster_size;
-        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
-        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
+        int64_t c = bdi.subcluster_size;
+        *align_offset = QEMU_ALIGN_DOWN(offset, c);
+        *align_bytes = QEMU_ALIGN_UP(offset - *align_offset + bytes, c);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
     void *bounce_buffer = NULL;
 
     BlockDriver *drv = bs->drv;
-    int64_t cluster_offset;
-    int64_t cluster_bytes;
+    int64_t align_offset;
+    int64_t align_bytes;
     int64_t skip_bytes;
     int ret;
     int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
@@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
      * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
      * is one reason we loop rather than doing it all at once.
      */
-    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
-    skip_bytes = offset - cluster_offset;
+    bdrv_round_to_subclusters(bs, offset, bytes, &align_offset, &align_bytes);
+    skip_bytes = offset - align_offset;
 
     trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
-                                   cluster_offset, cluster_bytes);
+                                   align_offset, align_bytes);
 
-    while (cluster_bytes) {
+    while (align_bytes) {
         int64_t pnum;
 
         if (skip_write) {
             ret = 1; /* "already allocated", so nothing will be copied */
-            pnum = MIN(cluster_bytes, max_transfer);
+            pnum = MIN(align_bytes, max_transfer);
         } else {
-            ret = bdrv_is_allocated(bs, cluster_offset,
-                                    MIN(cluster_bytes, max_transfer), &pnum);
+            ret = bdrv_is_allocated(bs, align_offset,
+                                    MIN(align_bytes, max_transfer), &pnum);
             if (ret < 0) {
                 /*
                  * Safe to treat errors in querying allocation as if
                  * unallocated; we'll probably fail again soon on the
                  * read, but at least that will set a decent errno.
                  */
-                pnum = MIN(cluster_bytes, max_transfer);
+                pnum = MIN(align_bytes, max_transfer);
             }
 
             /* Stop at EOF if the image ends in the middle of the cluster */
@@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
             /* Must copy-on-read; use the bounce buffer */
             pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
             if (!bounce_buffer) {
-                int64_t max_we_need = MAX(pnum, cluster_bytes - pnum);
+                int64_t max_we_need = MAX(pnum, align_bytes - pnum);
                 int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER);
                 int64_t bounce_buffer_len = MIN(max_we_need, max_allowed);
 
@@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
             }
             qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);
 
-            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
+            ret = bdrv_driver_preadv(bs, align_offset, pnum,
                                      &local_qiov, 0, 0);
             if (ret < 0) {
                 goto err;
@@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
                 /* FIXME: Should we (perhaps conditionally) be setting
                  * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
                  * that still correctly reads as zero? */
-                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
+                ret = bdrv_co_do_pwrite_zeroes(bs, align_offset, pnum,
                                                BDRV_REQ_WRITE_UNCHANGED);
             } else {
                 /* This does not change the data on the disk, it is not
                  * necessary to flush even in cache=writethrough mode.
                  */
-                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
+                ret = bdrv_driver_pwritev(bs, align_offset, pnum,
                                           &local_qiov, 0,
                                           BDRV_REQ_WRITE_UNCHANGED);
             }
@@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
             }
         }
 
-        cluster_offset += pnum;
-        cluster_bytes -= pnum;
+        align_offset += pnum;
+        align_bytes -= pnum;
         progress += pnum - skip_bytes;
         skip_bytes = 0;
     }
diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn mirror_cow_align(MirrorBlockJob *s, int64_t *offset,
     need_cow |= !test_bit((*offset + *bytes - 1) / s->granularity,
                           s->cow_bitmap);
     if (need_cow) {
-        bdrv_round_to_clusters(blk_bs(s->target), *offset, *bytes,
-                               &align_offset, &align_bytes);
+        bdrv_round_to_subclusters(blk_bs(s->target), *offset, *bytes,
+                                  &align_offset, &align_bytes);
     }
 
     if (align_bytes > max_bytes) {
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
         int64_t target_offset;
         int64_t target_bytes;
         WITH_GRAPH_RDLOCK_GUARD() {
-            bdrv_round_to_clusters(blk_bs(s->target), offset, io_bytes,
-                                   &target_offset, &target_bytes);
+            bdrv_round_to_subclusters(blk_bs(s->target), offset, io_bytes,
+                                      &target_offset, &target_bytes);
         }
         if (target_offset == offset &&
             target_bytes == io_bytes) {
--
2.41.0
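Plugging the reproducer's request (a 62K copy-on-read starting at
offset 2K) into the rounding arithmetic shows the difference. Below is
a standalone sketch with local macros standing in for
QEMU_ALIGN_DOWN()/QEMU_ALIGN_UP(). With 64K (cluster) rounding the
window is widened to [0, 64K) and skip_bytes becomes 2048, the size of
the already-allocated head, which lines up with the quoted
`skip_bytes < pnum' failure; with 2K (subcluster) rounding skip_bytes
stays 0:

#include <inttypes.h>
#include <stdio.h>

/* Local equivalents of QEMU's alignment macros. */
#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
#define ALIGN_UP(n, m)   ALIGN_DOWN((n) + (m) - 1, (m))

/* Same arithmetic as bdrv_round_to_subclusters() in the patch above. */
static void round_request(int64_t offset, int64_t bytes, int64_t c,
                          int64_t *align_offset, int64_t *align_bytes)
{
    *align_offset = ALIGN_DOWN(offset, c);
    *align_bytes = ALIGN_UP(offset - *align_offset + bytes, c);
}

int main(void)
{
    int64_t offset = 2 * 1024;   /* the reproducer reads 62K at 2K */
    int64_t bytes = 62 * 1024;
    int64_t ao, ab;

    round_request(offset, bytes, 64 * 1024, &ao, &ab);
    printf("cluster rounding:    [%" PRId64 ", +%" PRId64 "), skip_bytes=%" PRId64 "\n",
           ao, ab, offset - ao);   /* [0, +65536), skip_bytes=2048 */

    round_request(offset, bytes, 2 * 1024, &ao, &ab);
    printf("subcluster rounding: [%" PRId64 ", +%" PRId64 "), skip_bytes=%" PRId64 "\n",
           ao, ab, offset - ao);   /* [2048, +63488), skip_bytes=0 */
    return 0;
}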
Deleted patch | |||
---|---|---|---|
1 | nvme_process_completion() explicitly checks cid so the assertion that | ||
2 | follows is always true: | ||
3 | 1 | ||
4 | if (cid == 0 || cid > NVME_QUEUE_SIZE) { | ||
5 | ... | ||
6 | continue; | ||
7 | } | ||
8 | assert(cid <= NVME_QUEUE_SIZE); | ||
9 | |||
10 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
11 | Reviewed-by: Sergio Lopez <slp@redhat.com> | ||
12 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
13 | Message-id: 20200617132201.1832152-3-stefanha@redhat.com | ||
14 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
15 | --- | ||
16 | block/nvme.c | 1 - | ||
17 | 1 file changed, 1 deletion(-) | ||
18 | |||
19 | diff --git a/block/nvme.c b/block/nvme.c | ||
20 | index XXXXXXX..XXXXXXX 100644 | ||
21 | --- a/block/nvme.c | ||
22 | +++ b/block/nvme.c | ||
23 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | ||
24 | cid); | ||
25 | continue; | ||
26 | } | ||
27 | - assert(cid <= NVME_QUEUE_SIZE); | ||
28 | trace_nvme_complete_command(s, q->index, cid); | ||
29 | preq = &q->reqs[cid - 1]; | ||
30 | req = *preq; | ||
31 | -- | ||
32 | 2.26.2 | ||
Deleted patch | |||
---|---|---|---|
1 | Do not access a CQE after incrementing q->cq.head and releasing q->lock. | ||
2 | It is unlikely that this causes problems in practice but it's a latent | ||
3 | bug. | ||
4 | 1 | ||
5 | The reason why it should be safe at the moment is that completion | ||
6 | processing is not re-entrant and the CQ doorbell isn't written until the | ||
7 | end of nvme_process_completion(). | ||
8 | |||
9 | Make this change now because later patches will make completion | ||
10 | processing re-entrant, invalidating the assumption above. | ||
11 | |||
12 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
13 | Reviewed-by: Sergio Lopez <slp@redhat.com> | ||
14 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
15 | Message-id: 20200617132201.1832152-4-stefanha@redhat.com | ||
16 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
17 | --- | ||
18 | block/nvme.c | 5 ++++- | ||
19 | 1 file changed, 4 insertions(+), 1 deletion(-) | ||
20 | |||
21 | diff --git a/block/nvme.c b/block/nvme.c | ||
22 | index XXXXXXX..XXXXXXX 100644 | ||
23 | --- a/block/nvme.c | ||
24 | +++ b/block/nvme.c | ||
25 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | ||
26 | q->busy = true; | ||
27 | assert(q->inflight >= 0); | ||
28 | while (q->inflight) { | ||
29 | + int ret; | ||
30 | int16_t cid; | ||
31 | + | ||
32 | c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES]; | ||
33 | if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) { | ||
34 | break; | ||
35 | } | ||
36 | + ret = nvme_translate_error(c); | ||
37 | q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE; | ||
38 | if (!q->cq.head) { | ||
39 | q->cq_phase = !q->cq_phase; | ||
40 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | ||
41 | preq->busy = false; | ||
42 | preq->cb = preq->opaque = NULL; | ||
43 | qemu_mutex_unlock(&q->lock); | ||
44 | - req.cb(req.opaque, nvme_translate_error(c)); | ||
45 | + req.cb(req.opaque, ret); | ||
46 | qemu_mutex_lock(&q->lock); | ||
47 | q->inflight--; | ||
48 | progress = true; | ||
49 | -- | ||
50 | 2.26.2 | ||
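The pattern above generalizes to any ring buffer with a single consumer: copy everything you need out of a slot before publishing the new head, because once the new head is visible the slot may be overwritten. A minimal sketch under that assumption (illustrative C, not the QEMU code):

    #include <stdint.h>

    #define RING_SIZE 128

    struct ring_slot {
        int value;
    };

    struct ring {
        struct ring_slot slots[RING_SIZE];
        uint32_t head;
    };

    /*
     * Snapshot the slot before advancing head: after the new head is
     * published, the producer may legitimately reuse slots[old_head],
     * so all later work must operate on the local copy.
     */
    static int ring_consume(struct ring *r)
    {
        struct ring_slot copy = r->slots[r->head];
        r->head = (r->head + 1) % RING_SIZE;
        return copy.value;
    }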
1 | Existing users access free_req_queue under q->lock. Document this. | 1 | From: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com> |
---|---|---|---|
2 | 2 | ||
3 | Add a testcase which checks that allocations during copy-on-read are | ||
4 | performed on a subcluster basis when subclusters are enabled in the | ||
5 | target image. | ||
6 | |||
7 | Without the previous commit applied, this testcase also triggers the | ||
8 | following assertion, so we check for that as well: | ||
9 | |||
10 | qemu-io: ../block/io.c:1236: bdrv_co_do_copy_on_readv: Assertion `skip_bytes < pnum' failed. | ||
11 | |||
12 | Reviewed-by: Eric Blake <eblake@redhat.com> | ||
13 | Reviewed-by: Denis V. Lunev <den@openvz.org> | ||
14 | Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com> | ||
15 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru> | ||
3 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 16 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
4 | Reviewed-by: Sergio Lopez <slp@redhat.com> | 17 | Message-ID: <20230711172553.234055-4-andrey.drobyshev@virtuozzo.com> |
5 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
6 | Message-id: 20200617132201.1832152-6-stefanha@redhat.com | ||
7 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
8 | --- | 18 | --- |
9 | block/nvme.c | 2 +- | 19 | tests/qemu-iotests/197 | 29 +++++++++++++++++++++++++++++ |
10 | 1 file changed, 1 insertion(+), 1 deletion(-) | 20 | tests/qemu-iotests/197.out | 24 ++++++++++++++++++++++++ |
21 | 2 files changed, 53 insertions(+) | ||
11 | 22 | ||
12 | diff --git a/block/nvme.c b/block/nvme.c | 23 | diff --git a/tests/qemu-iotests/197 b/tests/qemu-iotests/197 |
24 | index XXXXXXX..XXXXXXX 100755 | ||
25 | --- a/tests/qemu-iotests/197 | ||
26 | +++ b/tests/qemu-iotests/197 | ||
27 | @@ -XXX,XX +XXX,XX @@ $QEMU_IO -f qcow2 -C -c 'read 0 1024' "$TEST_WRAP" | _filter_qemu_io | ||
28 | $QEMU_IO -f qcow2 -c map "$TEST_WRAP" | ||
29 | _check_test_img | ||
30 | |||
31 | +echo | ||
32 | +echo '=== Copy-on-read with subclusters ===' | ||
33 | +echo | ||
34 | + | ||
35 | +# Create base and top images of 64K (1 cluster) each. Enable subclusters | ||
36 | +# for the top image. | ||
37 | +_make_test_img 64K | ||
38 | +IMGPROTO=file IMGFMT=qcow2 TEST_IMG_FILE="$TEST_WRAP" \ | ||
39 | + _make_test_img --no-opts -o extended_l2=true -F "$IMGFMT" -b "$TEST_IMG" \ | ||
40 | + 64K | _filter_img_create | ||
41 | + | ||
42 | +$QEMU_IO -c "write -P 0xaa 0 64k" "$TEST_IMG" | _filter_qemu_io | ||
43 | + | ||
44 | +# Allocate individual subclusters in the top image, rather than whole clusters | ||
45 | +$QEMU_IO -c "write -P 0xbb 28K 2K" -c "write -P 0xcc 34K 2K" "$TEST_WRAP" \ | ||
46 | + | _filter_qemu_io | ||
47 | + | ||
48 | +# Only 2 subclusters should be allocated in the top image at this point | ||
49 | +$QEMU_IMG map "$TEST_WRAP" | _filter_qemu_img_map | ||
50 | + | ||
51 | +# Actual copy-on-read operation | ||
52 | +$QEMU_IO -C -c "read -P 0xaa 30K 4K" "$TEST_WRAP" | _filter_qemu_io | ||
53 | + | ||
54 | +# And here we should have 4 subclusters allocated right in the middle of the | ||
56 | +# top image. Make sure the rest of the cluster remains unallocated | ||
56 | +$QEMU_IMG map "$TEST_WRAP" | _filter_qemu_img_map | ||
57 | + | ||
58 | +_check_test_img | ||
59 | + | ||
60 | # success, all done | ||
61 | echo '*** done' | ||
62 | status=0 | ||
63 | diff --git a/tests/qemu-iotests/197.out b/tests/qemu-iotests/197.out | ||
13 | index XXXXXXX..XXXXXXX 100644 | 64 | index XXXXXXX..XXXXXXX 100644 |
14 | --- a/block/nvme.c | 65 | --- a/tests/qemu-iotests/197.out |
15 | +++ b/block/nvme.c | 66 | +++ b/tests/qemu-iotests/197.out |
16 | @@ -XXX,XX +XXX,XX @@ typedef struct { | 67 | @@ -XXX,XX +XXX,XX @@ read 1024/1024 bytes at offset 0 |
17 | } NVMeRequest; | 68 | 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) |
18 | 69 | 1 KiB (0x400) bytes allocated at offset 0 bytes (0x0) | |
19 | typedef struct { | 70 | No errors were found on the image. |
20 | - CoQueue free_req_queue; | 71 | + |
21 | QemuMutex lock; | 72 | +=== Copy-on-read with subclusters === |
22 | 73 | + | |
23 | /* Fields protected by BQL */ | 74 | +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65536 |
24 | @@ -XXX,XX +XXX,XX @@ typedef struct { | 75 | +Formatting 'TEST_DIR/t.wrap.IMGFMT', fmt=IMGFMT size=65536 backing_file=TEST_DIR/t.IMGFMT backing_fmt=IMGFMT |
25 | uint8_t *prp_list_pages; | 76 | +wrote 65536/65536 bytes at offset 0 |
26 | 77 | +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | |
27 | /* Fields protected by @lock */ | 78 | +wrote 2048/2048 bytes at offset 28672 |
28 | + CoQueue free_req_queue; | 79 | +2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) |
29 | NVMeQueue sq, cq; | 80 | +wrote 2048/2048 bytes at offset 34816 |
30 | int cq_phase; | 81 | +2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) |
31 | int free_req_head; | 82 | +Offset Length File |
83 | +0 0x7000 TEST_DIR/t.IMGFMT | ||
84 | +0x7000 0x800 TEST_DIR/t.wrap.IMGFMT | ||
85 | +0x7800 0x1000 TEST_DIR/t.IMGFMT | ||
86 | +0x8800 0x800 TEST_DIR/t.wrap.IMGFMT | ||
87 | +0x9000 0x7000 TEST_DIR/t.IMGFMT | ||
88 | +read 4096/4096 bytes at offset 30720 | ||
89 | +4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | ||
90 | +Offset Length File | ||
91 | +0 0x7000 TEST_DIR/t.IMGFMT | ||
92 | +0x7000 0x2000 TEST_DIR/t.wrap.IMGFMT | ||
93 | +0x9000 0x7000 TEST_DIR/t.IMGFMT | ||
94 | +No errors were found on the image. | ||
95 | *** done | ||
32 | -- | 96 | -- |
33 | 2.26.2 | 97 | 2.41.0 |
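The offsets in the expected map output above follow directly from the subcluster geometry: a 64 KiB cluster with extended_l2=true is split by qcow2 into 32 subclusters of 2 KiB each. A small self-contained sketch of that arithmetic (illustrative C only, not part of the test):

    #include <inttypes.h>
    #include <stdio.h>

    enum {
        CLUSTER_SIZE    = 64 * 1024,
        SUBCLUSTERS     = 32,                        /* extended_l2=true */
        SUBCLUSTER_SIZE = CLUSTER_SIZE / SUBCLUSTERS /* 2 KiB */
    };

    /* Print the range of subclusters touched by a byte range */
    static void subclusters(const char *label, uint64_t offset,
                            uint64_t bytes)
    {
        printf("%-10s -> subclusters %" PRIu64 "..%" PRIu64 "\n", label,
               offset / SUBCLUSTER_SIZE,
               (offset + bytes - 1) / SUBCLUSTER_SIZE);
    }

    int main(void)
    {
        subclusters("0xbb write", 28 * 1024, 2 * 1024); /* 14 (0x7000) */
        subclusters("0xcc write", 34 * 1024, 2 * 1024); /* 17 (0x8800) */
        subclusters("COR read",   30 * 1024, 4 * 1024); /* 15..16 */
        return 0;
    }

Before the read, only subclusters 14 and 17 live in the top image (the 0x7000-0x7800 and 0x8800-0x9000 extents); copy-on-read fills subclusters 15 and 16, which is why the second map shows one merged 0x7000-0x9000 extent while the rest of the cluster still comes from the base image.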
1 | There are three issues with the current NVMeRequest->busy field: | 1 | liburing does not clear sqe->user_data. We must do it ourselves to avoid |
---|---|---|---|
2 | 1. The busy field is accidentally accessed outside q->lock when request | 2 | undefined behavior in process_cqe() when user_data is used. |
3 | submission fails. | ||
4 | 2. Waiters on free_req_queue are not woken when a request is returned | ||
5 | early due to submission failure. | ||
6 | 3. Finding a free request involves scanning all requests. This makes | 5 | that does not affect users. Let's merge this fix now to make it easier
7 | request submission O(n^2). | ||
8 | 3 | ||
9 | Switch to an O(1) freelist that is always accessed under the lock. | 4 | Note that fdmon-io_uring is currently disabled, so this is a latent bug |
10 | 5 | that does not affect users. Let's merge this fix now to make it easier | |
11 | Also differentiate between NVME_QUEUE_SIZE, the actual SQ/CQ size, and | 6 | to enable fdmon-io_uring in the future (and I'm working on that). |
12 | NVME_NUM_REQS, the number of usable requests. This makes the code | ||
13 | simpler than using NVME_QUEUE_SIZE everywhere and having to keep in mind | ||
14 | that one slot is reserved. | ||
15 | 7 | ||
16 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 8 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
17 | Reviewed-by: Sergio Lopez <slp@redhat.com> | 9 | Message-ID: <20230426212639.82310-1-stefanha@redhat.com> |
18 | Message-id: 20200617132201.1832152-5-stefanha@redhat.com | ||
19 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
20 | --- | 10 | --- |
21 | block/nvme.c | 81 ++++++++++++++++++++++++++++++++++------------------ | 11 | util/fdmon-io_uring.c | 2 ++ |
22 | 1 file changed, 54 insertions(+), 27 deletions(-) | 12 | 1 file changed, 2 insertions(+) |
23 | 13 | ||
24 | diff --git a/block/nvme.c b/block/nvme.c | 14 | diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c |
25 | index XXXXXXX..XXXXXXX 100644 | 15 | index XXXXXXX..XXXXXXX 100644 |
26 | --- a/block/nvme.c | 16 | --- a/util/fdmon-io_uring.c |
27 | +++ b/block/nvme.c | 17 | +++ b/util/fdmon-io_uring.c |
28 | @@ -XXX,XX +XXX,XX @@ | 18 | @@ -XXX,XX +XXX,XX @@ static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node) |
29 | #define NVME_QUEUE_SIZE 128 | 19 | #else |
30 | #define NVME_BAR_SIZE 8192 | 20 | io_uring_prep_poll_remove(sqe, node); |
31 | 21 | #endif | |
32 | +/* | 22 | + io_uring_sqe_set_data(sqe, NULL); |
33 | + * We have to leave one slot empty as that is the full queue case where | ||
34 | + * head == tail + 1. | ||
35 | + */ | ||
36 | +#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1) | ||
37 | + | ||
38 | typedef struct { | ||
39 | int32_t head, tail; | ||
40 | uint8_t *queue; | ||
41 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
42 | int cid; | ||
43 | void *prp_list_page; | ||
44 | uint64_t prp_list_iova; | ||
45 | - bool busy; | ||
46 | + int free_req_next; /* q->reqs[] index of next free req */ | ||
47 | } NVMeRequest; | ||
48 | |||
49 | typedef struct { | ||
50 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
51 | /* Fields protected by @lock */ | ||
52 | NVMeQueue sq, cq; | ||
53 | int cq_phase; | ||
54 | - NVMeRequest reqs[NVME_QUEUE_SIZE]; | ||
55 | + int free_req_head; | ||
56 | + NVMeRequest reqs[NVME_NUM_REQS]; | ||
57 | bool busy; | ||
58 | int need_kick; | ||
59 | int inflight; | ||
60 | @@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs, | ||
61 | qemu_mutex_init(&q->lock); | ||
62 | q->index = idx; | ||
63 | qemu_co_queue_init(&q->free_req_queue); | ||
64 | - q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE); | ||
65 | + q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS); | ||
66 | r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, | ||
67 | - s->page_size * NVME_QUEUE_SIZE, | ||
68 | + s->page_size * NVME_NUM_REQS, | ||
69 | false, &prp_list_iova); | ||
70 | if (r) { | ||
71 | goto fail; | ||
72 | } | ||
73 | - for (i = 0; i < NVME_QUEUE_SIZE; i++) { | ||
74 | + q->free_req_head = -1; | ||
75 | + for (i = 0; i < NVME_NUM_REQS; i++) { | ||
76 | NVMeRequest *req = &q->reqs[i]; | ||
77 | req->cid = i + 1; | ||
78 | + req->free_req_next = q->free_req_head; | ||
79 | + q->free_req_head = i; | ||
80 | req->prp_list_page = q->prp_list_pages + i * s->page_size; | ||
81 | req->prp_list_iova = prp_list_iova + i * s->page_size; | ||
82 | } | ||
83 | + | ||
84 | nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err); | ||
85 | if (local_err) { | ||
86 | error_propagate(errp, local_err); | ||
87 | @@ -XXX,XX +XXX,XX @@ static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q) | ||
88 | */ | ||
89 | static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q) | ||
90 | { | ||
91 | - int i; | ||
92 | - NVMeRequest *req = NULL; | ||
93 | + NVMeRequest *req; | ||
94 | |||
95 | qemu_mutex_lock(&q->lock); | ||
96 | - while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) { | ||
97 | - /* We have to leave one slot empty as that is the full queue case (head | ||
98 | - * == tail + 1). */ | ||
99 | + | ||
100 | + while (q->free_req_head == -1) { | ||
101 | if (qemu_in_coroutine()) { | ||
102 | trace_nvme_free_req_queue_wait(q); | ||
103 | qemu_co_queue_wait(&q->free_req_queue, &q->lock); | ||
104 | @@ -XXX,XX +XXX,XX @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q) | ||
105 | return NULL; | ||
106 | } | ||
107 | } | ||
108 | - for (i = 0; i < NVME_QUEUE_SIZE; i++) { | ||
109 | - if (!q->reqs[i].busy) { | ||
110 | - q->reqs[i].busy = true; | ||
111 | - req = &q->reqs[i]; | ||
112 | - break; | ||
113 | - } | ||
114 | - } | ||
115 | - /* We have checked inflight and need_kick while holding q->lock, so one | ||
116 | - * free req must be available. */ | ||
117 | - assert(req); | ||
118 | + | ||
119 | + req = &q->reqs[q->free_req_head]; | ||
120 | + q->free_req_head = req->free_req_next; | ||
121 | + req->free_req_next = -1; | ||
122 | + | ||
123 | qemu_mutex_unlock(&q->lock); | ||
124 | return req; | ||
125 | } | 23 | } |
126 | 24 | ||
127 | +/* With q->lock */ | 25 | /* Add a timeout that self-cancels when another cqe becomes ready */ |
128 | +static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req) | 26 | @@ -XXX,XX +XXX,XX @@ static void add_timeout_sqe(AioContext *ctx, int64_t ns) |
129 | +{ | 27 | |
130 | + req->free_req_next = q->free_req_head; | 28 | sqe = get_sqe(ctx); |
131 | + q->free_req_head = req - q->reqs; | 29 | io_uring_prep_timeout(sqe, &ts, 1, 0); |
132 | +} | 30 | + io_uring_sqe_set_data(sqe, NULL); |
133 | + | 31 | } |
134 | +/* With q->lock */ | 32 | |
135 | +static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q) | 33 | /* Add sqes from ctx->submit_list for submission */ |
136 | +{ | ||
137 | + if (!qemu_co_queue_empty(&q->free_req_queue)) { | ||
138 | + replay_bh_schedule_oneshot_event(s->aio_context, | ||
139 | + nvme_free_req_queue_cb, q); | ||
140 | + } | ||
141 | +} | ||
142 | + | ||
143 | +/* Insert a request in the freelist and wake waiters */ | ||
144 | +static void nvme_put_free_req_and_wake(BDRVNVMeState *s, NVMeQueuePair *q, | ||
145 | + NVMeRequest *req) | ||
146 | +{ | ||
147 | + qemu_mutex_lock(&q->lock); | ||
148 | + nvme_put_free_req_locked(q, req); | ||
149 | + nvme_wake_free_req_locked(s, q); | ||
150 | + qemu_mutex_unlock(&q->lock); | ||
151 | +} | ||
152 | + | ||
153 | static inline int nvme_translate_error(const NvmeCqe *c) | ||
154 | { | ||
155 | uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF; | ||
156 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | ||
157 | req = *preq; | ||
158 | assert(req.cid == cid); | ||
159 | assert(req.cb); | ||
160 | - preq->busy = false; | ||
161 | + nvme_put_free_req_locked(q, preq); | ||
162 | preq->cb = preq->opaque = NULL; | ||
163 | qemu_mutex_unlock(&q->lock); | ||
164 | req.cb(req.opaque, ret); | ||
165 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | ||
166 | /* Notify the device so it can post more completions. */ | ||
167 | smp_mb_release(); | ||
168 | *q->cq.doorbell = cpu_to_le32(q->cq.head); | ||
169 | - if (!qemu_co_queue_empty(&q->free_req_queue)) { | ||
170 | - replay_bh_schedule_oneshot_event(s->aio_context, | ||
171 | - nvme_free_req_queue_cb, q); | ||
172 | - } | ||
173 | + nvme_wake_free_req_locked(s, q); | ||
174 | } | ||
175 | q->busy = false; | ||
176 | return progress; | ||
177 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs, | ||
178 | r = nvme_cmd_map_qiov(bs, &cmd, req, qiov); | ||
179 | qemu_co_mutex_unlock(&s->dma_map_lock); | ||
180 | if (r) { | ||
181 | - req->busy = false; | ||
182 | + nvme_put_free_req_and_wake(s, ioq, req); | ||
183 | return r; | ||
184 | } | ||
185 | nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data); | ||
186 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, | ||
187 | qemu_co_mutex_unlock(&s->dma_map_lock); | ||
188 | |||
189 | if (ret) { | ||
190 | - req->busy = false; | ||
191 | + nvme_put_free_req_and_wake(s, ioq, req); | ||
192 | goto out; | ||
193 | } | ||
194 | |||
195 | -- | 34 | -- |
196 | 2.26.2 | 35 | 2.41.0 |
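The freelist on the left-hand side of this diff is a classic intrusive, index-based free list: every free element stores the index of the next free element, so both allocation and release are O(1) with no scanning. A standalone sketch of the idea (illustrative C, not the QEMU code; locking and the coroutine wakeups are omitted):

    #include <assert.h>

    #define NUM_REQS 127 /* like NVME_NUM_REQS: one queue slot stays empty */

    struct req {
        int free_next; /* index of the next free req, or -1 at list end */
        /* ... request payload would live here ... */
    };

    struct req_pool {
        struct req reqs[NUM_REQS];
        int free_head; /* index of the first free req, or -1 if exhausted */
    };

    static void pool_init(struct req_pool *p)
    {
        p->free_head = -1;
        for (int i = 0; i < NUM_REQS; i++) {
            p->reqs[i].free_next = p->free_head; /* push slot i */
            p->free_head = i;
        }
    }

    /* O(1) allocate: pop the list head; callers must handle exhaustion */
    static struct req *pool_get(struct req_pool *p)
    {
        assert(p->free_head != -1);
        struct req *r = &p->reqs[p->free_head];
        p->free_head = r->free_next;
        r->free_next = -1;
        return r;
    }

    /* O(1) release: push the req back; the index is recovered from the
     * pointer, which is why reqs[] must be one contiguous array */
    static void pool_put(struct req_pool *p, struct req *r)
    {
        r->free_next = p->free_head;
        p->free_head = (int)(r - p->reqs);
    }

In the patch the same push/pop runs under q->lock, and releasing a request additionally wakes coroutines waiting on free_req_queue. The io_uring fix on the right-hand side needs no such machinery: the io_uring_prep_*() helpers leave sqe->user_data untouched, so sqes whose completions should be ignored must clear it explicitly via io_uring_sqe_set_data(sqe, NULL).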