Series comparison

-[PULL 00/12] Block patches
+[PULL v2 00/28] Block patches
-The following changes since commit 171199f56f5f9bdf1e5d670d09ef1351d8f01bae:
+The following changes since commit ac793156f650ae2d77834932d72224175ee69086:
-  Merge remote-tracking branch 'remotes/alistair/tags/pull-riscv-to-apply-20200619-3' into staging (2020-06-22 14:45:25 +0100)
+  Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20201020-1' into staging (2020-10-20 21:11:35 +0100)
 are available in the Git repository at:
-  https://github.com/stefanha/qemu.git tags/block-pull-request
+  https://gitlab.com/stefanha/qemu.git tags/block-pull-request
-for you to fetch changes up to 7838c67f22a81fcf669785cd6c0876438422071a:
+for you to fetch changes up to 32a3fd65e7e3551337fd26bfc0e2f899d70c028c:
-  block/nvme: support nested aio_poll() (2020-06-23 15:46:08 +0100)
+  iotests: add commit top->base cases to 274 (2020-10-22 09:55:39 +0100)
 ----------------------------------------------------------------
 Pull request
+v2:
+ * Fix format string issues on 32-bit hosts [Peter]
+ * Fix qemu-nbd.c CONFIG_POSIX ifdef issue [Eric]
+ * Fix missing eventfd.h header on macOS [Peter]
+ * Drop unreliable vhost-user-blk test (will send a new patch when ready) [Peter]
+This pull request contains the vhost-user-blk server by Coiby Xu along with my
+additions, block/nvme.c alignment and hardware error statistics by Philippe
+Mathieu-Daudé, and bdrv_co_block_status_above() fixes by Vladimir
+Sementsov-Ogievskiy.
 ----------------------------------------------------------------
-Daniele Buono (4):
+Coiby Xu (6):
-  coroutine: support SafeStack in ucontext backend
+  libvhost-user: Allow vu_message_read to be replaced
-  coroutine: add check for SafeStack in sigaltstack
+  libvhost-user: remove watch for kick_fd when de-initialize vu-dev
-  configure: add flags to support SafeStack
+  util/vhost-user-server: generic vhost user server
-  check-block: enable iotests with SafeStack
+  block: move logical block size check function to a common utility
     function
   block/export: vhost-user block device backend server
   MAINTAINERS: Add vhost-user block device backend server maintainer
-Stefan Hajnoczi (8):
+Philippe Mathieu-Daudé (1):
-  minikconf: explicitly set encoding to UTF-8
+  block/nvme: Add driver statistics for access alignment and hw errors
   block/nvme: poll queues without q->lock
   block/nvme: drop tautologous assertion
   block/nvme: don't access CQE after moving cq.head
   block/nvme: switch to a NVMeRequest freelist
   block/nvme: clarify that free_req_queue is protected by q->lock
   block/nvme: keep BDRVNVMeState pointer in NVMeQueuePair
   block/nvme: support nested aio_poll()
- configure                    |  73 ++++++++++++
+Stefan Hajnoczi (16):
- include/qemu/coroutine_int.h |   5 +
+  util/vhost-user-server: s/fileds/fields/ typo fix
- block/nvme.c                 | 220 +++++++++++++++++++++++++----------
+  util/vhost-user-server: drop unnecessary QOM cast
- util/coroutine-sigaltstack.c |   4 +
+  util/vhost-user-server: drop unnecessary watch deletion
- util/coroutine-ucontext.c    |  28 +++++
+  block/export: consolidate request structs into VuBlockReq
- block/trace-events           |   2 +-
+  util/vhost-user-server: drop unused DevicePanicNotifier
- scripts/minikconf.py         |   6 +-
+  util/vhost-user-server: fix memory leak in vu_message_read()
- tests/check-block.sh         |  12 +-
+  util/vhost-user-server: check EOF when reading payload
-files changed, 284 insertions(+), 66 deletions(-)
+  util/vhost-user-server: rework vu_client_trip() coroutine lifecycle
   block/export: report flush errors
   block/export: convert vhost-user-blk server to block export API
   util/vhost-user-server: move header to include/
   util/vhost-user-server: use static library in meson.build
   qemu-storage-daemon: avoid compiling blockdev_ss twice
   block: move block exports to libblockdev
   block/export: add iothread and fixed-iothread options
   block/export: add vhost-user-blk multi-queue support
 Vladimir Sementsov-Ogievskiy (5):
   block/io: fix bdrv_co_block_status_above
   block/io: bdrv_common_block_status_above: support include_base
   block/io: bdrv_common_block_status_above: support bs == base
   block/io: fix bdrv_is_allocated_above
   iotests: add commit top->base cases to 274
  MAINTAINERS                                |   9 +
  qapi/block-core.json                       |  24 +-
  qapi/block-export.json                     |  36 +-
  block/coroutines.h                         |   2 +
  block/export/vhost-user-blk-server.h       |  19 +
  contrib/libvhost-user/libvhost-user.h      |  21 +
  include/qemu/vhost-user-server.h           |  65 +++
  util/block-helpers.h                       |  19 +
  block/export/export.c                      |  37 +-
  block/export/vhost-user-blk-server.c       | 431 ++++++++++++++++++++
  block/io.c                                 | 132 +++---
  block/nvme.c                               |  27 ++
  block/qcow2.c                              |  16 +-
  contrib/libvhost-user/libvhost-user-glib.c |   2 +-
  contrib/libvhost-user/libvhost-user.c      |  15 +-
  hw/core/qdev-properties-system.c           |  31 +-
  nbd/server.c                               |   2 -
  qemu-nbd.c                                 |  21 +-
  softmmu/vl.c                               |   4 +
  stubs/blk-exp-close-all.c                  |   7 +
  tests/vhost-user-bridge.c                  |   2 +
  tools/virtiofsd/fuse_virtio.c              |   4 +-
  util/block-helpers.c                       |  46 +++
  util/vhost-user-server.c                   | 446 +++++++++++++++++++++
  block/export/meson.build                   |   3 +-
  contrib/libvhost-user/meson.build          |   1 +
  meson.build                                |  22 +-
  nbd/meson.build                            |   2 +
  storage-daemon/meson.build                 |   3 +-
  stubs/meson.build                          |   1 +
  tests/qemu-iotests/274                     |  20 +
  tests/qemu-iotests/274.out                 |  68 ++++
  util/meson.build                           |   4 +
 files changed, 1420 insertions(+), 122 deletions(-)
  create mode 100644 block/export/vhost-user-blk-server.h
  create mode 100644 include/qemu/vhost-user-server.h
  create mode 100644 util/block-helpers.h
  create mode 100644 block/export/vhost-user-blk-server.c
  create mode 100644 stubs/blk-exp-close-all.c
  create mode 100644 util/block-helpers.c
  create mode 100644 util/vhost-user-server.c
 --
 .26.2

-[PULL 08/12] block/nvme: don't access CQE after moving cq.head
+[PULL v2 01/28] block/nvme: Add driver statistics for access alignment and hw errors
-Do not access a CQE after incrementing q->cq.head and releasing q->lock.
+From: Philippe Mathieu-Daudé <philmd@redhat.com>
 It is unlikely that this causes problems in practice but it's a latent
 bug.
-The reason why it should be safe at the moment is that completion
+Keep statistics of some hardware errors, and number of
-processing is not re-entrant and the CQ doorbell isn't written until the
+aligned/unaligned I/O accesses.
 end of nvme_process_completion().
-Make this change now because QEMU expects completion processing to be
+QMP example booting a full RHEL 8.3 aarch64 guest:
 re-entrant and later patches will do that.
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+{ "execute": "query-blockstats" }
-Reviewed-by: Sergio Lopez <slp@redhat.com>
+{
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+    "return": [
-Message-id: 20200617132201.1832152-4-stefanha@redhat.com
+        {
             "device": "",
             "node-name": "drive0",
             "stats": {
                 "flush_total_time_ns": 6026948,
                 "wr_highest_offset": 3383991230464,
                 "wr_total_time_ns": 807450995,
                 "failed_wr_operations": 0,
                 "failed_rd_operations": 0,
                 "wr_merged": 3,
                 "wr_bytes": 50133504,
                 "failed_unmap_operations": 0,
                 "failed_flush_operations": 0,
                 "account_invalid": false,
                 "rd_total_time_ns": 1846979900,
                 "flush_operations": 130,
                 "wr_operations": 659,
                 "rd_merged": 1192,
                 "rd_bytes": 218244096,
                 "account_failed": false,
                 "idle_time_ns": 2678641497,
                 "rd_operations": 7406
             },
             "driver-specific": {
                 "driver": "nvme",
                 "completion-errors": 0,
                 "unaligned-accesses": 2959,
                 "aligned-accesses": 4477
             },
             "qdev": "/machine/peripheral-anon/device[0]/virtio-backend"
         }
     ]
 }
 Suggested-by: Stefan Hajnoczi <stefanha@gmail.com>
 Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Acked-by: Markus Armbruster <armbru@redhat.com>
 Message-id: 20201001162939.1567915-1-philmd@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- block/nvme.c | 5 ++++-
+ qapi/block-core.json | 24 +++++++++++++++++++++++-
-file changed, 4 insertions(+), 1 deletion(-)
+ block/nvme.c         | 27 +++++++++++++++++++++++++++
 files changed, 50 insertions(+), 1 deletion(-)
+diff --git a/qapi/block-core.json b/qapi/block-core.json
+index XXXXXXX..XXXXXXX 100644
+--- a/qapi/block-core.json
++++ b/qapi/block-core.json
+@@ -XXX,XX +XXX,XX @@
+       'discard-nb-failed': 'uint64',
+       'discard-bytes-ok': 'uint64' } }
++##
++# @BlockStatsSpecificNvme:
++#
++# NVMe driver statistics
++#
++# @completion-errors: The number of completion errors.
++#
++# @aligned-accesses: The number of aligned accesses performed by
++#                    the driver.
++#
++# @unaligned-accesses: The number of unaligned accesses performed by
++#                      the driver.
++#
++# Since: 5.2
++##
++{ 'struct': 'BlockStatsSpecificNvme',
++  'data': {
++      'completion-errors': 'uint64',
++      'aligned-accesses': 'uint64',
++      'unaligned-accesses': 'uint64' } }
++
+ ##
+ # @BlockStatsSpecific:
+ #
+@@ -XXX,XX +XXX,XX @@
+   'discriminator': 'driver',
+   'data': {
+       'file': 'BlockStatsSpecificFile',
+-      'host_device': 'BlockStatsSpecificFile' } }
++      'host_device': 'BlockStatsSpecificFile',
++      'nvme': 'BlockStatsSpecificNvme' } }
+ ##
+ # @BlockStats:
 diff --git a/block/nvme.c b/block/nvme.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/nvme.c
 +++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
+@@ -XXX,XX +XXX,XX @@ struct BDRVNVMeState {
-     q->busy = true;
-     assert(q->inflight >= 0);
+     /* PCI address (required for nvme_refresh_filename()) */
-     while (q->inflight) {
+     char *device;
 +        int ret;
          int16_t cid;
 +
-         c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES];
++    struct {
-         if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
++        uint64_t completion_errors;
 +        uint64_t aligned_accesses;
 +        uint64_t unaligned_accesses;
 +    } stats;
  };
  #define NVME_BLOCK_OPT_DEVICE "device"
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q)
              break;
          }
-+        ret = nvme_translate_error(c);
+         ret = nvme_translate_error(c);
 +        if (ret) {
 +            s->stats.completion_errors++;
 +        }
          q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
          if (!q->cq.head) {
              q->cq_phase = !q->cq_phase;
-@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
+@@ -XXX,XX +XXX,XX @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-         preq->busy = false;
+     assert(QEMU_IS_ALIGNED(bytes, s->page_size));
-         preq->cb = preq->opaque = NULL;
+     assert(bytes <= s->max_transfer);
-         qemu_mutex_unlock(&q->lock);
+     if (nvme_qiov_aligned(bs, qiov)) {
--        req.cb(req.opaque, nvme_translate_error(c));
++        s->stats.aligned_accesses++;
-+        req.cb(req.opaque, ret);
+         return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags);
-         qemu_mutex_lock(&q->lock);
+     }
-         q->inflight--;
++    s->stats.unaligned_accesses++;
-         progress = true;
+     trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
      buf = qemu_try_memalign(s->page_size, bytes);
@@ -XXX,XX +XXX,XX @@ static void nvme_unregister_buf(BlockDriverState *bs, void *host)
      qemu_vfio_dma_unmap(s->vfio, host);
  }
 +static BlockStatsSpecific *nvme_get_specific_stats(BlockDriverState *bs)
 +{
 +    BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
 +    BDRVNVMeState *s = bs->opaque;
 +
 +    stats->driver = BLOCKDEV_DRIVER_NVME;
 +    stats->u.nvme = (BlockStatsSpecificNvme) {
 +        .completion_errors = s->stats.completion_errors,
 +        .aligned_accesses = s->stats.aligned_accesses,
 +        .unaligned_accesses = s->stats.unaligned_accesses,
 +    };
 +
 +    return stats;
 +}
 +
  static const char *const nvme_strong_runtime_opts[] = {
      NVME_BLOCK_OPT_DEVICE,
      NVME_BLOCK_OPT_NAMESPACE,
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_nvme = {
      .bdrv_refresh_filename    = nvme_refresh_filename,
      .bdrv_refresh_limits      = nvme_refresh_limits,
      .strong_runtime_opts      = nvme_strong_runtime_opts,
 +    .bdrv_get_specific_stats  = nvme_get_specific_stats,
      .bdrv_detach_aio_context  = nvme_detach_aio_context,
      .bdrv_attach_aio_context  = nvme_attach_aio_context,
 --
 .26.2

-[PULL 11/12] block/nvme: keep BDRVNVMeState pointer in NVMeQueuePair
+[PULL v2 02/28] libvhost-user: Allow vu_message_read to be replaced
-Passing around both BDRVNVMeState and NVMeQueuePair is unwieldy. Reduce
+From: Coiby Xu <coiby.xu@gmail.com>
 the number of function arguments by keeping the BDRVNVMeState pointer in
 NVMeQueuePair. This will come in handy when a BH is introduced in a
 later patch and only one argument can be passed to it.
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Allow vu_message_read to be replaced by one which will make use of the
-Reviewed-by: Sergio Lopez <slp@redhat.com>
+QIOChannel functions. Thus reading vhost-user message won't stall the
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+guest. For slave channel, we still use the default vu_message_read.
-Message-id: 20200617132201.1832152-7-stefanha@redhat.com
 Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
 Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Message-id: 20200918080912.321299-2-coiby.xu@gmail.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- block/nvme.c | 70 ++++++++++++++++++++++++++++------------------------
+ contrib/libvhost-user/libvhost-user.h      | 21 +++++++++++++++++++++
-file changed, 38 insertions(+), 32 deletions(-)
+ contrib/libvhost-user/libvhost-user-glib.c |  2 +-
  contrib/libvhost-user/libvhost-user.c      | 14 +++++++-------
  tests/vhost-user-bridge.c                  |  2 ++
  tools/virtiofsd/fuse_virtio.c              |  4 ++--
 files changed, 33 insertions(+), 10 deletions(-)
-diff --git a/block/nvme.c b/block/nvme.c
+diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
 index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
+--- a/contrib/libvhost-user/libvhost-user.h
-+++ b/block/nvme.c
++++ b/contrib/libvhost-user/libvhost-user.h
 @@ -XXX,XX +XXX,XX @@
   */
- #define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)
+ #define VHOST_USER_MAX_RAM_SLOTS 32
-+typedef struct BDRVNVMeState BDRVNVMeState;
++#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
 +
- typedef struct {
+ typedef enum VhostSetConfigType {
-     int32_t  head, tail;
+     VHOST_SET_CONFIG_TYPE_MASTER = 0,
-     uint8_t  *queue;
+     VHOST_SET_CONFIG_TYPE_MIGRATION = 1,
-@@ -XXX,XX +XXX,XX @@ typedef struct {
+@@ -XXX,XX +XXX,XX @@ typedef uint64_t (*vu_get_features_cb) (VuDev *dev);
- typedef struct {
+ typedef void (*vu_set_features_cb) (VuDev *dev, uint64_t features);
-     QemuMutex   lock;
+ typedef int (*vu_process_msg_cb) (VuDev *dev, VhostUserMsg *vmsg,
+                                   int *do_reply);
-+    /* Read from I/O code path, initialized under BQL */
++typedef bool (*vu_read_msg_cb) (VuDev *dev, int sock, VhostUserMsg *vmsg);
-+    BDRVNVMeState   *s;
+ typedef void (*vu_queue_set_started_cb) (VuDev *dev, int qidx, bool started);
-+    int             index;
+ typedef bool (*vu_queue_is_processed_in_order_cb) (VuDev *dev, int qidx);
-+
+ typedef int (*vu_get_config_cb) (VuDev *dev, uint8_t *config, uint32_t len);
-     /* Fields protected by BQL */
+@@ -XXX,XX +XXX,XX @@ struct VuDev {
--    int         index;
+     bool broken;
-     uint8_t     *prp_list_pages;
+     uint16_t max_queues;
-     /* Fields protected by @lock */
++    /* @read_msg: custom method to read vhost-user message
-@@ -XXX,XX +XXX,XX @@ typedef volatile struct {
++     *
++     * Read data from vhost_user socket fd and fill up
- QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000);
++     * the passed VhostUserMsg *vmsg struct.
++     *
--typedef struct {
++     * If reading fails, it should close the received set of file
-+struct BDRVNVMeState {
++     * descriptors as socket message's auxiliary data.
-     AioContext *aio_context;
++     *
-     QEMUVFIOState *vfio;
++     * For the details, please refer to vu_message_read in libvhost-user.c
-     NVMeRegs *regs;
++     * which will be used by default if no custom method is provided when
-@@ -XXX,XX +XXX,XX @@ typedef struct {
++     * calling vu_init
++     *
-     /* PCI address (required for nvme_refresh_filename()) */
++     * Returns: true if vhost-user message successfully received,
-     char *device;
++     *          otherwise return false.
--} BDRVNVMeState;
++     *
-+};
++     */
++    vu_read_msg_cb read_msg;
- #define NVME_BLOCK_OPT_DEVICE "device"
+     /* @set_watch: add or update the given fd to the watch set,
- #define NVME_BLOCK_OPT_NAMESPACE "namespace"
+      * call cb when condition is met */
-@@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,
+     vu_set_watch_cb set_watch;
@@ -XXX,XX +XXX,XX @@ bool vu_init(VuDev *dev,
               uint16_t max_queues,
               int socket,
               vu_panic_cb panic,
 +             vu_read_msg_cb read_msg,
               vu_set_watch_cb set_watch,
               vu_remove_watch_cb remove_watch,
               const VuDevIface *iface);
 diff --git a/contrib/libvhost-user/libvhost-user-glib.c b/contrib/libvhost-user/libvhost-user-glib.c
 index XXXXXXX..XXXXXXX 100644
 --- a/contrib/libvhost-user/libvhost-user-glib.c
 +++ b/contrib/libvhost-user/libvhost-user-glib.c
@@ -XXX,XX +XXX,XX @@ vug_init(VugDev *dev, uint16_t max_queues, int socket,
      g_assert(dev);
      g_assert(iface);
 -    if (!vu_init(&dev->parent, max_queues, socket, panic, set_watch,
 +    if (!vu_init(&dev->parent, max_queues, socket, panic, NULL, set_watch,
                   remove_watch, iface)) {
          return false;
      }
+diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
+index XXXXXXX..XXXXXXX 100644
+--- a/contrib/libvhost-user/libvhost-user.c
++++ b/contrib/libvhost-user/libvhost-user.c
+@@ -XXX,XX +XXX,XX @@
+ /* The version of inflight buffer */
+ #define INFLIGHT_VERSION 1
+-#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
+-
+ /* The version of the protocol we support */
+ #define VHOST_USER_VERSION 1
+ #define LIBVHOST_USER_DEBUG 0
+@@ -XXX,XX +XXX,XX @@ have_userfault(void)
  }
--static void nvme_free_queue_pair(BlockDriverState *bs, NVMeQueuePair *q)
+ static bool
-+static void nvme_free_queue_pair(NVMeQueuePair *q)
+-vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
 +vu_message_read_default(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
  {
-     qemu_vfree(q->prp_list_pages);
+     char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {};
-     qemu_vfree(q->sq.queue);
+     struct iovec iov = {
-@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
+@@ -XXX,XX +XXX,XX @@ vu_process_message_reply(VuDev *dev, const VhostUserMsg *vmsg)
-     uint64_t prp_list_iova;
+         goto out;
+     }
-     qemu_mutex_init(&q->lock);
-+    q->s = s;
+-    if (!vu_message_read(dev, dev->slave_fd, &msg_reply)) {
-     q->index = idx;
++    if (!vu_message_read_default(dev, dev->slave_fd, &msg_reply)) {
-     qemu_co_queue_init(&q->free_req_queue);
+         goto out;
-     q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
+     }
-@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
+@@ -XXX,XX +XXX,XX @@ vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
-     return q;
+     /* Wait for QEMU to confirm that it's registered the handler for the
- fail:
+      * faults.
--    nvme_free_queue_pair(bs, q);
+      */
-+    nvme_free_queue_pair(q);
+-    if (!vu_message_read(dev, dev->sock, vmsg) ||
-     return NULL;
++    if (!dev->read_msg(dev, dev->sock, vmsg) ||
- }
+         vmsg->size != sizeof(vmsg->payload.u64) ||
+         vmsg->payload.u64 != 0) {
- /* With q->lock */
+         vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table");
--static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q)
+@@ -XXX,XX +XXX,XX @@ vu_dispatch(VuDev *dev)
-+static void nvme_kick(NVMeQueuePair *q)
+     int reply_requested;
- {
+     bool need_reply, success = false;
-+    BDRVNVMeState *s = q->s;
-+
+-    if (!vu_message_read(dev, dev->sock, &vmsg)) {
-     if (s->plugged || !q->need_kick) {
++    if (!dev->read_msg(dev, dev->sock, &vmsg)) {
          goto end;
      }
@@ -XXX,XX +XXX,XX @@ vu_init(VuDev *dev,
          uint16_t max_queues,
          int socket,
          vu_panic_cb panic,
 +        vu_read_msg_cb read_msg,
          vu_set_watch_cb set_watch,
          vu_remove_watch_cb remove_watch,
          const VuDevIface *iface)
@@ -XXX,XX +XXX,XX @@ vu_init(VuDev *dev,
      dev->sock = socket;
      dev->panic = panic;
 +    dev->read_msg = read_msg ? read_msg : vu_message_read_default;
      dev->set_watch = set_watch;
      dev->remove_watch = remove_watch;
      dev->iface = iface;
@@ -XXX,XX +XXX,XX @@ static void _vu_queue_notify(VuDev *dev, VuVirtq *vq, bool sync)
          vu_message_write(dev, dev->slave_fd, &vmsg);
          if (ack) {
 -            vu_message_read(dev, dev->slave_fd, &vmsg);
 +            vu_message_read_default(dev, dev->slave_fd, &vmsg);
          }
          return;
      }
-@@ -XXX,XX +XXX,XX @@ static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
+diff --git a/tests/vhost-user-bridge.c b/tests/vhost-user-bridge.c
- }
+index XXXXXXX..XXXXXXX 100644
+--- a/tests/vhost-user-bridge.c
- /* With q->lock */
++++ b/tests/vhost-user-bridge.c
--static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q)
+@@ -XXX,XX +XXX,XX @@ vubr_accept_cb(int sock, void *ctx)
-+static void nvme_wake_free_req_locked(NVMeQueuePair *q)
+                  VHOST_USER_BRIDGE_MAX_QUEUES,
- {
+                  conn_fd,
-     if (!qemu_co_queue_empty(&q->free_req_queue)) {
+                  vubr_panic,
--        replay_bh_schedule_oneshot_event(s->aio_context,
++                 NULL,
-+        replay_bh_schedule_oneshot_event(q->s->aio_context,
+                  vubr_set_watch,
-                 nvme_free_req_queue_cb, q);
+                  vubr_remove_watch,
-     }
+                  &vuiface)) {
- }
+@@ -XXX,XX +XXX,XX @@ vubr_new(const char *path, bool client)
+                      VHOST_USER_BRIDGE_MAX_QUEUES,
- /* Insert a request in the freelist and wake waiters */
+                      dev->sock,
--static void nvme_put_free_req_and_wake(BDRVNVMeState *s,  NVMeQueuePair *q,
+                      vubr_panic,
--                                       NVMeRequest *req)
++                     NULL,
-+static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req)
+                      vubr_set_watch,
- {
+                      vubr_remove_watch,
-     qemu_mutex_lock(&q->lock);
+                      &vuiface)) {
-     nvme_put_free_req_locked(q, req);
+diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c
--    nvme_wake_free_req_locked(s, q);
+index XXXXXXX..XXXXXXX 100644
-+    nvme_wake_free_req_locked(q);
+--- a/tools/virtiofsd/fuse_virtio.c
-     qemu_mutex_unlock(&q->lock);
++++ b/tools/virtiofsd/fuse_virtio.c
- }
+@@ -XXX,XX +XXX,XX @@ int virtio_session_mount(struct fuse_session *se)
+     se->vu_socketfd = data_sock;
-@@ -XXX,XX +XXX,XX @@ static inline int nvme_translate_error(const NvmeCqe *c)
+     se->virtio_dev->se = se;
- }
+     pthread_rwlock_init(&se->virtio_dev->vu_dispatch_rwlock, NULL);
+-    vu_init(&se->virtio_dev->dev, 2, se->vu_socketfd, fv_panic, fv_set_watch,
- /* With q->lock */
+-            fv_remove_watch, &fv_iface);
--static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
++    vu_init(&se->virtio_dev->dev, 2, se->vu_socketfd, fv_panic, NULL,
-+static bool nvme_process_completion(NVMeQueuePair *q)
++            fv_set_watch, fv_remove_watch, &fv_iface);
- {
-+    BDRVNVMeState *s = q->s;
+     return 0;
      bool progress = false;
      NVMeRequest *preq;
      NVMeRequest req;
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
          /* Notify the device so it can post more completions. */
          smp_mb_release();
          *q->cq.doorbell = cpu_to_le32(q->cq.head);
 -        nvme_wake_free_req_locked(s, q);
 +        nvme_wake_free_req_locked(q);
      }
      q->busy = false;
      return progress;
@@ -XXX,XX +XXX,XX @@ static void nvme_trace_command(const NvmeCmd *cmd)
      }
  }
 -static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q,
 -                                NVMeRequest *req,
 +static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
                                  NvmeCmd *cmd, BlockCompletionFunc cb,
                                  void *opaque)
  {
@@ -XXX,XX +XXX,XX @@ static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q,
      req->opaque = opaque;
      cmd->cid = cpu_to_le32(req->cid);
 -    trace_nvme_submit_command(s, q->index, req->cid);
 +    trace_nvme_submit_command(q->s, q->index, req->cid);
      nvme_trace_command(cmd);
      qemu_mutex_lock(&q->lock);
      memcpy((uint8_t *)q->sq.queue +
             q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
      q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
      q->need_kick++;
 -    nvme_kick(s, q);
 -    nvme_process_completion(s, q);
 +    nvme_kick(q);
 +    nvme_process_completion(q);
      qemu_mutex_unlock(&q->lock);
  }
@@ -XXX,XX +XXX,XX @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
                           NvmeCmd *cmd)
  {
      NVMeRequest *req;
 -    BDRVNVMeState *s = bs->opaque;
      int ret = -EINPROGRESS;
      req = nvme_get_free_req(q);
      if (!req) {
          return -EBUSY;
      }
 -    nvme_submit_command(s, q, req, cmd, nvme_cmd_sync_cb, &ret);
 +    nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret);
      BDRV_POLL_WHILE(bs, ret == -EINPROGRESS);
      return ret;
@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queues(BDRVNVMeState *s)
          }
          qemu_mutex_lock(&q->lock);
 -        while (nvme_process_completion(s, q)) {
 +        while (nvme_process_completion(q)) {
              /* Keep polling */
              progress = true;
          }
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
      };
      if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
          error_setg(errp, "Failed to create io queue [%d]", n);
 -        nvme_free_queue_pair(bs, q);
 +        nvme_free_queue_pair(q);
          return false;
      }
      cmd = (NvmeCmd) {
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
      };
      if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
          error_setg(errp, "Failed to create io queue [%d]", n);
 -        nvme_free_queue_pair(bs, q);
 +        nvme_free_queue_pair(q);
          return false;
      }
      s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
@@ -XXX,XX +XXX,XX @@ static void nvme_close(BlockDriverState *bs)
      BDRVNVMeState *s = bs->opaque;
      for (i = 0; i < s->nr_queues; ++i) {
 -        nvme_free_queue_pair(bs, s->queues[i]);
 +        nvme_free_queue_pair(s->queues[i]);
      }
      g_free(s->queues);
      aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
      r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
      qemu_co_mutex_unlock(&s->dma_map_lock);
      if (r) {
 -        nvme_put_free_req_and_wake(s, ioq, req);
 +        nvme_put_free_req_and_wake(ioq, req);
          return r;
      }
 -    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
 +    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
      data.co = qemu_coroutine_self();
      while (data.ret == -EINPROGRESS) {
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
      assert(s->nr_queues > 1);
      req = nvme_get_free_req(ioq);
      assert(req);
 -    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
 +    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
      data.co = qemu_coroutine_self();
      if (data.ret == -EINPROGRESS) {
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
      req = nvme_get_free_req(ioq);
      assert(req);
 -    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
 +    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
      data.co = qemu_coroutine_self();
      while (data.ret == -EINPROGRESS) {
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
      qemu_co_mutex_unlock(&s->dma_map_lock);
      if (ret) {
 -        nvme_put_free_req_and_wake(s, ioq, req);
 +        nvme_put_free_req_and_wake(ioq, req);
          goto out;
      }
      trace_nvme_dsm(s, offset, bytes);
 -    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
 +    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
      data.co = qemu_coroutine_self();
      while (data.ret == -EINPROGRESS) {
@@ -XXX,XX +XXX,XX @@ static void nvme_aio_unplug(BlockDriverState *bs)
      for (i = 1; i < s->nr_queues; i++) {
          NVMeQueuePair *q = s->queues[i];
          qemu_mutex_lock(&q->lock);
 -        nvme_kick(s, q);
 -        nvme_process_completion(s, q);
 +        nvme_kick(q);
 +        nvme_process_completion(q);
          qemu_mutex_unlock(&q->lock);
      }
  }
 --
 2.26.2

-New patch
+[PULL v2 03/28] libvhost-user: remove watch for kick_fd when de-initialize vu-dev
+From: Coiby Xu <coiby.xu@gmail.com>
+When the client is running in gdb and quit command is run in gdb,
+QEMU will still dispatch the event, which will cause a segmentation fault in
+the callback function.
+Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
+Message-id: 20200918080912.321299-3-coiby.xu@gmail.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ contrib/libvhost-user/libvhost-user.c | 1 +
+ 1 file changed, 1 insertion(+)
+diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
+index XXXXXXX..XXXXXXX 100644
+--- a/contrib/libvhost-user/libvhost-user.c
++++ b/contrib/libvhost-user/libvhost-user.c
+@@ -XXX,XX +XXX,XX @@ vu_deinit(VuDev *dev)
+         }
+         if (vq->kick_fd != -1) {
++            dev->remove_watch(dev, vq->kick_fd);
+             close(vq->kick_fd);
+             vq->kick_fd = -1;
+         }
+--
+2.26.2

-New patch
+[PULL v2 04/28] util/vhost-user-server: generic vhost user server
+From: Coiby Xu <coiby.xu@gmail.com>
+Sharing QEMU devices via vhost-user protocol.
+Only one vhost-user client can connect to the server at a time.
+Suggested-by: Kevin Wolf <kwolf@redhat.com>
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
+Message-id: 20200918080912.321299-4-coiby.xu@gmail.com
+[Fixed size_t %lu -> %zu format string compiler error.
+--Stefan]
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ util/vhost-user-server.h |  65 ++++++
+ util/vhost-user-server.c | 428 +++++++++++++++++++++++++++++++++++++++
+ util/meson.build         |   1 +
+ 3 files changed, 494 insertions(+)
+ create mode 100644 util/vhost-user-server.h
+ create mode 100644 util/vhost-user-server.c
+diff --git a/util/vhost-user-server.h b/util/vhost-user-server.h
+new file mode 100644
+index XXXXXXX..XXXXXXX
+--- /dev/null
++++ b/util/vhost-user-server.h
+@@ -XXX,XX +XXX,XX @@
++/*
++ * Sharing QEMU devices via vhost-user protocol
++ *
++ * Copyright (c) Coiby Xu <coiby.xu@gmail.com>.
++ * Copyright (c) 2020 Red Hat, Inc.
++ *
++ * This work is licensed under the terms of the GNU GPL, version 2 or
++ * later.  See the COPYING file in the top-level directory.
++ */
++
++#ifndef VHOST_USER_SERVER_H
++#define VHOST_USER_SERVER_H
++
++#include "contrib/libvhost-user/libvhost-user.h"
++#include "io/channel-socket.h"
++#include "io/channel-file.h"
++#include "io/net-listener.h"
++#include "qemu/error-report.h"
++#include "qapi/error.h"
++#include "standard-headers/linux/virtio_blk.h"
++
++typedef struct VuFdWatch {
++    VuDev *vu_dev;
++    int fd; /*kick fd*/
++    void *pvt;
++    vu_watch_cb cb;
++    bool processing;
++    QTAILQ_ENTRY(VuFdWatch) next;
++} VuFdWatch;
++
++typedef struct VuServer VuServer;
++typedef void DevicePanicNotifierFn(VuServer *server);
++
++struct VuServer {
++    QIONetListener *listener;
++    AioContext *ctx;
++    DevicePanicNotifierFn *device_panic_notifier;
++    int max_queues;
++    const VuDevIface *vu_iface;
++    VuDev vu_dev;
++    QIOChannel *ioc; /* The I/O channel with the client */
++    QIOChannelSocket *sioc; /* The underlying data channel with the client */
++    /* IOChannel for fd provided via VHOST_USER_SET_SLAVE_REQ_FD */
++    QIOChannel *ioc_slave;
++    QIOChannelSocket *sioc_slave;
++    Coroutine *co_trip; /* coroutine for processing VhostUserMsg */
++    QTAILQ_HEAD(, VuFdWatch) vu_fd_watches;
++    /* restart coroutine co_trip if AIOContext is changed */
++    bool aio_context_changed;
++    bool processing_msg;
++};
++
++bool vhost_user_server_start(VuServer *server,
++                             SocketAddress *unix_socket,
++                             AioContext *ctx,
++                             uint16_t max_queues,
++                             DevicePanicNotifierFn *device_panic_notifier,
++                             const VuDevIface *vu_iface,
++                             Error **errp);
++
++void vhost_user_server_stop(VuServer *server);
++
++void vhost_user_server_set_aio_context(VuServer *server, AioContext *ctx);
++
++#endif /* VHOST_USER_SERVER_H */
+diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c
+new file mode 100644
+index XXXXXXX..XXXXXXX
+--- /dev/null
++++ b/util/vhost-user-server.c
+@@ -XXX,XX +XXX,XX @@
++/*
++ * Sharing QEMU devices via vhost-user protocol
++ *
++ * Copyright (c) Coiby Xu <coiby.xu@gmail.com>.
++ * Copyright (c) 2020 Red Hat, Inc.
++ *
++ * This work is licensed under the terms of the GNU GPL, version 2 or
++ * later.  See the COPYING file in the top-level directory.
++ */
++#include "qemu/osdep.h"
++#include "qemu/main-loop.h"
++#include "vhost-user-server.h"
++
++static void vmsg_close_fds(VhostUserMsg *vmsg)
++{
++    int i;
++    for (i = 0; i < vmsg->fd_num; i++) {
++        close(vmsg->fds[i]);
++    }
++}
++
++static void vmsg_unblock_fds(VhostUserMsg *vmsg)
++{
++    int i;
++    for (i = 0; i < vmsg->fd_num; i++) {
++        qemu_set_nonblock(vmsg->fds[i]);
++    }
++}
++
++static void vu_accept(QIONetListener *listener, QIOChannelSocket *sioc,
++                      gpointer opaque);
++
++static void close_client(VuServer *server)
++{
++    /*
++     * Before closing the client
++     *
++     * 1. Let vu_client_trip stop processing new vhost-user msg
++     *
++     * 2. remove kick_handler
++     *
++     * 3. wait for the kick handler to be finished
++     *
++     * 4. wait for the current vhost-user msg to be finished processing
++     */
++
++    QIOChannelSocket *sioc = server->sioc;
++    /* When this is set vu_client_trip will stop new processing vhost-user message */
++    server->sioc = NULL;
++
++    VuFdWatch *vu_fd_watch, *next;
++    QTAILQ_FOREACH_SAFE(vu_fd_watch, &server->vu_fd_watches, next, next) {
++        aio_set_fd_handler(server->ioc->ctx, vu_fd_watch->fd, true, NULL,
++                           NULL, NULL, NULL);
++    }
++
++    while (!QTAILQ_EMPTY(&server->vu_fd_watches)) {
++        QTAILQ_FOREACH_SAFE(vu_fd_watch, &server->vu_fd_watches, next, next) {
++            if (!vu_fd_watch->processing) {
++                QTAILQ_REMOVE(&server->vu_fd_watches, vu_fd_watch, next);
++                g_free(vu_fd_watch);
++            }
++        }
++    }
++
++    while (server->processing_msg) {
++        if (server->ioc->read_coroutine) {
++            server->ioc->read_coroutine = NULL;
++            qio_channel_set_aio_fd_handler(server->ioc, server->ioc->ctx, NULL,
++                                           NULL, server->ioc);
++            server->processing_msg = false;
++        }
++    }
++
++    vu_deinit(&server->vu_dev);
++    object_unref(OBJECT(sioc));
++    object_unref(OBJECT(server->ioc));
++}
++
++static void panic_cb(VuDev *vu_dev, const char *buf)
++{
++    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
++
++    /* avoid while loop in close_client */
++    server->processing_msg = false;
++
++    if (buf) {
++        error_report("vu_panic: %s", buf);
++    }
++
++    if (server->sioc) {
++        close_client(server);
++    }
++
++    if (server->device_panic_notifier) {
++        server->device_panic_notifier(server);
++    }
++
++    /*
++     * Set the callback function for network listener so another
++     * vhost-user client can connect to this server
++     */
++    qio_net_listener_set_client_func(server->listener,
++                                     vu_accept,
++                                     server,
++                                     NULL);
++}
++
++static bool coroutine_fn
++vu_message_read(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg)
++{
++    struct iovec iov = {
++        .iov_base = (char *)vmsg,
++        .iov_len = VHOST_USER_HDR_SIZE,
++    };
++    int rc, read_bytes = 0;
++    Error *local_err = NULL;
++    /*
++     * Store fds/nfds returned from qio_channel_readv_full into
++     * temporary variables.
++     *
++     * VhostUserMsg is a packed structure, gcc will complain about passing
++     * pointer to a packed structure member if we pass &VhostUserMsg.fd_num
++     * and &VhostUserMsg.fds directly when calling qio_channel_readv_full,
++     * thus two temporary variables nfds and fds are used here.
++     */
++    size_t nfds = 0, nfds_t = 0;
++    const size_t max_fds = G_N_ELEMENTS(vmsg->fds);
++    int *fds_t = NULL;
++    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
++    QIOChannel *ioc = server->ioc;
++
++    if (!ioc) {
++        error_report_err(local_err);
++        goto fail;
++    }
++
++    assert(qemu_in_coroutine());
++    do {
++        /*
++         * qio_channel_readv_full may have short reads, keeping calling it
++         * until getting VHOST_USER_HDR_SIZE or 0 bytes in total
++         */
++        rc = qio_channel_readv_full(ioc, &iov, 1, &fds_t, &nfds_t, &local_err);
++        if (rc < 0) {
++            if (rc == QIO_CHANNEL_ERR_BLOCK) {
++                qio_channel_yield(ioc, G_IO_IN);
++                continue;
++            } else {
++                error_report_err(local_err);
++                return false;
++            }
++        }
++        read_bytes += rc;
++        if (nfds_t > 0) {
++            if (nfds + nfds_t > max_fds) {
++                error_report("A maximum of %zu fds are allowed, "
++                             "however got %zu fds now",
++                             max_fds, nfds + nfds_t);
++                goto fail;
++            }
++            memcpy(vmsg->fds + nfds, fds_t,
++                   nfds_t *sizeof(vmsg->fds[0]));
++            nfds += nfds_t;
++            g_free(fds_t);
++        }
++        if (read_bytes == VHOST_USER_HDR_SIZE || rc == 0) {
++            break;
++        }
++        iov.iov_base = (char *)vmsg + read_bytes;
++        iov.iov_len = VHOST_USER_HDR_SIZE - read_bytes;
++    } while (true);
++
++    vmsg->fd_num = nfds;
++    /* qio_channel_readv_full will make socket fds blocking, unblock them */
++    vmsg_unblock_fds(vmsg);
++    if (vmsg->size > sizeof(vmsg->payload)) {
++        error_report("Error: too big message request: %d, "
++                     "size: vmsg->size: %u, "
++                     "while sizeof(vmsg->payload) = %zu",
++                     vmsg->request, vmsg->size, sizeof(vmsg->payload));
++        goto fail;
++    }
++
++    struct iovec iov_payload = {
++        .iov_base = (char *)&vmsg->payload,
++        .iov_len = vmsg->size,
++    };
++    if (vmsg->size) {
++        rc = qio_channel_readv_all_eof(ioc, &iov_payload, 1, &local_err);
++        if (rc == -1) {
++            error_report_err(local_err);
++            goto fail;
++        }
++    }
++
++    return true;
++
++fail:
++    vmsg_close_fds(vmsg);
++
++    return false;
++}
++
++
++static void vu_client_start(VuServer *server);
++static coroutine_fn void vu_client_trip(void *opaque)
++{
++    VuServer *server = opaque;
++
++    while (!server->aio_context_changed && server->sioc) {
++        server->processing_msg = true;
++        vu_dispatch(&server->vu_dev);
++        server->processing_msg = false;
++    }
++
++    if (server->aio_context_changed && server->sioc) {
++        server->aio_context_changed = false;
++        vu_client_start(server);
++    }
++}
++
++static void vu_client_start(VuServer *server)
++{
++    server->co_trip = qemu_coroutine_create(vu_client_trip, server);
++    aio_co_enter(server->ctx, server->co_trip);
++}
++
++/*
++ * a wrapper for vu_kick_cb
++ *
++ * since aio_dispatch can only pass one user data pointer to the
++ * callback function, pack VuDev and pvt into a struct. Then unpack it
++ * and pass them to vu_kick_cb
++ */
++static void kick_handler(void *opaque)
++{
++    VuFdWatch *vu_fd_watch = opaque;
++    vu_fd_watch->processing = true;
++    vu_fd_watch->cb(vu_fd_watch->vu_dev, 0, vu_fd_watch->pvt);
++    vu_fd_watch->processing = false;
++}
++
++
++static VuFdWatch *find_vu_fd_watch(VuServer *server, int fd)
++{
++
++    VuFdWatch *vu_fd_watch, *next;
++    QTAILQ_FOREACH_SAFE(vu_fd_watch, &server->vu_fd_watches, next, next) {
++        if (vu_fd_watch->fd == fd) {
++            return vu_fd_watch;
++        }
++    }
++    return NULL;
++}
++
++static void
++set_watch(VuDev *vu_dev, int fd, int vu_evt,
++          vu_watch_cb cb, void *pvt)
++{
++
++    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
++    g_assert(vu_dev);
++    g_assert(fd >= 0);
++    g_assert(cb);
++
++    VuFdWatch *vu_fd_watch = find_vu_fd_watch(server, fd);
++
++    if (!vu_fd_watch) {
++        VuFdWatch *vu_fd_watch = g_new0(VuFdWatch, 1);
++
++        QTAILQ_INSERT_TAIL(&server->vu_fd_watches, vu_fd_watch, next);
++
++        vu_fd_watch->fd = fd;
++        vu_fd_watch->cb = cb;
++        qemu_set_nonblock(fd);
++        aio_set_fd_handler(server->ioc->ctx, fd, true, kick_handler,
++                           NULL, NULL, vu_fd_watch);
++        vu_fd_watch->vu_dev = vu_dev;
++        vu_fd_watch->pvt = pvt;
++    }
++}
++
++
++static void remove_watch(VuDev *vu_dev, int fd)
++{
++    VuServer *server;
++    g_assert(vu_dev);
++    g_assert(fd >= 0);
++
++    server = container_of(vu_dev, VuServer, vu_dev);
++
++    VuFdWatch *vu_fd_watch = find_vu_fd_watch(server, fd);
++
++    if (!vu_fd_watch) {
++        return;
++    }
++    aio_set_fd_handler(server->ioc->ctx, fd, true, NULL, NULL, NULL, NULL);
++
++    QTAILQ_REMOVE(&server->vu_fd_watches, vu_fd_watch, next);
++    g_free(vu_fd_watch);
++}
++
++
++static void vu_accept(QIONetListener *listener, QIOChannelSocket *sioc,
++                      gpointer opaque)
++{
++    VuServer *server = opaque;
++
++    if (server->sioc) {
++        warn_report("Only one vhost-user client is allowed to "
++                    "connect the server one time");
++        return;
++    }
++
++    if (!vu_init(&server->vu_dev, server->max_queues, sioc->fd, panic_cb,
++                 vu_message_read, set_watch, remove_watch, server->vu_iface)) {
++        error_report("Failed to initialize libvhost-user");
++        return;
++    }
++
++    /*
++     * Unset the callback function for network listener to make another
++     * vhost-user client keeping waiting until this client disconnects
++     */
++    qio_net_listener_set_client_func(server->listener,
++                                     NULL,
++                                     NULL,
++                                     NULL);
++    server->sioc = sioc;
++    /*
++     * Increase the object reference, so sioc will not freed by
++     * qio_net_listener_channel_func which will call object_unref(OBJECT(sioc))
++     */
++    object_ref(OBJECT(server->sioc));
++    qio_channel_set_name(QIO_CHANNEL(sioc), "vhost-user client");
++    server->ioc = QIO_CHANNEL(sioc);
++    object_ref(OBJECT(server->ioc));
++    qio_channel_attach_aio_context(server->ioc, server->ctx);
++    qio_channel_set_blocking(QIO_CHANNEL(server->sioc), false, NULL);
++    vu_client_start(server);
++}
++
++
++void vhost_user_server_stop(VuServer *server)
++{
++    if (server->sioc) {
++        close_client(server);
++    }
++
++    if (server->listener) {
++        qio_net_listener_disconnect(server->listener);
++        object_unref(OBJECT(server->listener));
++    }
++
++}
++
++void vhost_user_server_set_aio_context(VuServer *server, AioContext *ctx)
++{
++    VuFdWatch *vu_fd_watch, *next;
++    void *opaque = NULL;
++    IOHandler *io_read = NULL;
++    bool attach;
++
++    server->ctx = ctx ? ctx : qemu_get_aio_context();
++
++    if (!server->sioc) {
++        /* not yet serving any client*/
++        return;
++    }
++
++    if (ctx) {
++        qio_channel_attach_aio_context(server->ioc, ctx);
++        server->aio_context_changed = true;
++        io_read = kick_handler;
++        attach = true;
++    } else {
++        qio_channel_detach_aio_context(server->ioc);
++        /* server->ioc->ctx keeps the old AioConext */
++        ctx = server->ioc->ctx;
++        attach = false;
++    }
++
++    QTAILQ_FOREACH_SAFE(vu_fd_watch, &server->vu_fd_watches, next, next) {
++        if (vu_fd_watch->cb) {
++            opaque = attach ? vu_fd_watch : NULL;
++            aio_set_fd_handler(ctx, vu_fd_watch->fd, true,
++                               io_read, NULL, NULL,
++                               opaque);
++        }
++    }
++}
++
++
++bool vhost_user_server_start(VuServer *server,
++                             SocketAddress *socket_addr,
++                             AioContext *ctx,
++                             uint16_t max_queues,
++                             DevicePanicNotifierFn *device_panic_notifier,
++                             const VuDevIface *vu_iface,
++                             Error **errp)
++{
++    QIONetListener *listener = qio_net_listener_new();
++    if (qio_net_listener_open_sync(listener, socket_addr, 1,
++                                   errp) < 0) {
++        object_unref(OBJECT(listener));
++        return false;
++    }
++
++    /* zero out unspecified fields */
++    *server = (VuServer) {
++        .listener              = listener,
++        .vu_iface              = vu_iface,
++        .max_queues            = max_queues,
++        .ctx                   = ctx,
++        .device_panic_notifier = device_panic_notifier,
++    };
++
++    qio_net_listener_set_name(server->listener, "vhost-user-backend-listener");
++
++    qio_net_listener_set_client_func(server->listener,
++                                     vu_accept,
++                                     server,
++                                     NULL);
++
++    QTAILQ_INIT(&server->vu_fd_watches);
++    return true;
++}
+diff --git a/util/meson.build b/util/meson.build
+index XXXXXXX..XXXXXXX 100644
+--- a/util/meson.build
++++ b/util/meson.build
+@@ -XXX,XX +XXX,XX @@ if have_block
+   util_ss.add(files('main-loop.c'))
+   util_ss.add(files('nvdimm-utils.c'))
+   util_ss.add(files('qemu-coroutine.c', 'qemu-coroutine-lock.c', 'qemu-coroutine-io.c'))
++  util_ss.add(when: 'CONFIG_LINUX', if_true: files('vhost-user-server.c'))
+   util_ss.add(files('qemu-coroutine-sleep.c'))
+   util_ss.add(files('qemu-co-shared-resource.c'))
+   util_ss.add(files('thread-pool.c', 'qemu-timer.c'))
+--
+2.26.2

-New patch
+[PULL v2 05/28] block: move logical block size check function to a common utility function
+From: Coiby Xu <coiby.xu@gmail.com>
+Move the constants from hw/core/qdev-properties.c to
+util/block-helpers.h so that knowledge of the min/max values is
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
+Acked-by: Eduardo Habkost <ehabkost@redhat.com>
+Message-id: 20200918080912.321299-5-coiby.xu@gmail.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ util/block-helpers.h             | 19 +++++++++++++
+ hw/core/qdev-properties-system.c | 31 ++++-----------------
+ util/block-helpers.c             | 46 ++++++++++++++++++++++++++++++++
+ util/meson.build                 |  1 +
+ 4 files changed, 71 insertions(+), 26 deletions(-)
+ create mode 100644 util/block-helpers.h
+ create mode 100644 util/block-helpers.c
+diff --git a/util/block-helpers.h b/util/block-helpers.h
+new file mode 100644
+index XXXXXXX..XXXXXXX
+--- /dev/null
++++ b/util/block-helpers.h
+@@ -XXX,XX +XXX,XX @@
++#ifndef BLOCK_HELPERS_H
++#define BLOCK_HELPERS_H
++
++#include "qemu/units.h"
++
++/* lower limit is sector size */
++#define MIN_BLOCK_SIZE          INT64_C(512)
++#define MIN_BLOCK_SIZE_STR      "512 B"
++/*
++ * upper limit is arbitrary, 2 MiB looks sufficient for all sensible uses, and
++ * matches qcow2 cluster size limit
++ */
++#define MAX_BLOCK_SIZE          (2 * MiB)
++#define MAX_BLOCK_SIZE_STR      "2 MiB"
++
++void check_block_size(const char *id, const char *name, int64_t value,
++                      Error **errp);
++
++#endif /* BLOCK_HELPERS_H */
+diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/core/qdev-properties-system.c
++++ b/hw/core/qdev-properties-system.c
+@@ -XXX,XX +XXX,XX @@
+ #include "sysemu/blockdev.h"
+ #include "net/net.h"
+ #include "hw/pci/pci.h"
++#include "util/block-helpers.h"
+ static bool check_prop_still_unset(DeviceState *dev, const char *name,
+                                    const void *old_val, const char *new_val,
+@@ -XXX,XX +XXX,XX @@ const PropertyInfo qdev_prop_losttickpolicy = {
+ /* --- blocksize --- */
+-/* lower limit is sector size */
+-#define MIN_BLOCK_SIZE          512
+-#define MIN_BLOCK_SIZE_STR      "512 B"
+-/*
+- * upper limit is arbitrary, 2 MiB looks sufficient for all sensible uses, and
+- * matches qcow2 cluster size limit
+- */
+-#define MAX_BLOCK_SIZE          (2 * MiB)
+-#define MAX_BLOCK_SIZE_STR      "2 MiB"
+-
+ static void set_blocksize(Object *obj, Visitor *v, const char *name,
+                           void *opaque, Error **errp)
+ {
+@@ -XXX,XX +XXX,XX @@ static void set_blocksize(Object *obj, Visitor *v, const char *name,
+     Property *prop = opaque;
+     uint32_t *ptr = qdev_get_prop_ptr(dev, prop);
+     uint64_t value;
++    Error *local_err = NULL;
+     if (dev->realized) {
+         qdev_prop_set_after_realize(dev, name, errp);
+@@ -XXX,XX +XXX,XX @@ static void set_blocksize(Object *obj, Visitor *v, const char *name,
+     if (!visit_type_size(v, name, &value, errp)) {
+         return;
+     }
+-    /* value of 0 means "unset" */
+-    if (value && (value < MIN_BLOCK_SIZE || value > MAX_BLOCK_SIZE)) {
+-        error_setg(errp,
+-                   "Property %s.%s doesn't take value %" PRIu64
+-                   " (minimum: " MIN_BLOCK_SIZE_STR
+-                   ", maximum: " MAX_BLOCK_SIZE_STR ")",
+-                   dev->id ? : "", name, value);
++    check_block_size(dev->id ? : "", name, value, &local_err);
++    if (local_err) {
++        error_propagate(errp, local_err);
+         return;
+     }
+-
+-    /* We rely on power-of-2 blocksizes for bitmasks */
+-    if ((value & (value - 1)) != 0) {
+-        error_setg(errp,
+-                  "Property %s.%s doesn't take value '%" PRId64 "', "
+-                  "it's not a power of 2", dev->id ?: "", name, (int64_t)value);
+-        return;
+-    }
+-
+     *ptr = value;
+ }
+diff --git a/util/block-helpers.c b/util/block-helpers.c
+new file mode 100644
+index XXXXXXX..XXXXXXX
+--- /dev/null
++++ b/util/block-helpers.c
+@@ -XXX,XX +XXX,XX @@
++/*
++ * Block utility functions
++ *
++ * Copyright IBM, Corp. 2011
++ * Copyright (c) 2020 Coiby Xu <coiby.xu@gmail.com>
++ *
++ * This work is licensed under the terms of the GNU GPL, version 2 or later.
++ * See the COPYING file in the top-level directory.
++ */
++
++#include "qemu/osdep.h"
++#include "qapi/error.h"
++#include "qapi/qmp/qerror.h"
++#include "block-helpers.h"
++
++/**
++ * check_block_size:
++ * @id: The unique ID of the object
++ * @name: The name of the property being validated
++ * @value: The block size in bytes
++ * @errp: A pointer to an area to store an error
++ *
++ * This function checks that the block size meets the following conditions:
++ * 1. At least MIN_BLOCK_SIZE
++ * 2. No larger than MAX_BLOCK_SIZE
++ * 3. A power of 2
++ */
++void check_block_size(const char *id, const char *name, int64_t value,
++                      Error **errp)
++{
++    /* value of 0 means "unset" */
++    if (value && (value < MIN_BLOCK_SIZE || value > MAX_BLOCK_SIZE)) {
++        error_setg(errp, QERR_PROPERTY_VALUE_OUT_OF_RANGE,
++                   id, name, value, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE);
++        return;
++    }
++
++    /* We rely on power-of-2 blocksizes for bitmasks */
++    if ((value & (value - 1)) != 0) {
++        error_setg(errp,
++                   "Property %s.%s doesn't take value '%" PRId64
++                   "', it's not a power of 2",
++                   id, name, value);
++        return;
++    }
++}
+diff --git a/util/meson.build b/util/meson.build
+index XXXXXXX..XXXXXXX 100644
+--- a/util/meson.build
++++ b/util/meson.build
+@@ -XXX,XX +XXX,XX @@ if have_block
+   util_ss.add(files('nvdimm-utils.c'))
+   util_ss.add(files('qemu-coroutine.c', 'qemu-coroutine-lock.c', 'qemu-coroutine-io.c'))
+   util_ss.add(when: 'CONFIG_LINUX', if_true: files('vhost-user-server.c'))
++  util_ss.add(files('block-helpers.c'))
+   util_ss.add(files('qemu-coroutine-sleep.c'))
+   util_ss.add(files('qemu-co-shared-resource.c'))
+   util_ss.add(files('thread-pool.c', 'qemu-timer.c'))
+--
+2.26.2

-[PULL 04/12] configure: add flags to support SafeStack
+[PULL v2 06/28] block/export: vhost-user block device backend server
-From: Daniele Buono <dbuono@linux.vnet.ibm.com>
+From: Coiby Xu <coiby.xu@gmail.com>
-This patch adds a flag to enable/disable the SafeStack instrumentation
+By making use of libvhost-user, block device drive can be shared to
-provided by LLVM.
+the connected vhost-user client. Only one client can connect to the
 server at a time.
-On enable, make sure that the compiler supports the flags, and that we
+Since vhost-user-server needs a block drive to be created first, delay
-are using the proper coroutine implementation (coroutine-ucontext).
+the creation of this object.
 On disable, explicitly disable the option if it was enabled by default.
-While SafeStack is supported only on Linux, NetBSD, FreeBSD and macOS,
+Suggested-by: Kevin Wolf <kwolf@redhat.com>
-we are not checking for the O.S. since this is already done by LLVM.
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
-Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Message-id: 20200529205122.714-4-dbuono@linux.vnet.ibm.com
+Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
 Message-id: 20200918080912.321299-6-coiby.xu@gmail.com
 [Shorten "vhost_user_blk_server" string to "vhost_user_blk" to avoid the
 following compiler warning:
 ../block/export/vhost-user-blk-server.c:178:50: error: ‘%s’ directive output truncated writing 21 bytes into a region of size 20 [-Werror=format-truncation=]
 and fix "Invalid size %ld ..." ssize_t format string arguments for
 32-bit hosts.
 --Stefan]
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- configure | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ block/export/vhost-user-blk-server.h |  36 ++
- 1 file changed, 73 insertions(+)
+ block/export/vhost-user-blk-server.c | 661 +++++++++++++++++++++++++++
  softmmu/vl.c                         |   4 +
  block/meson.build                    |   1 +
  4 files changed, 702 insertions(+)
  create mode 100644 block/export/vhost-user-blk-server.h
  create mode 100644 block/export/vhost-user-blk-server.c
-diff --git a/configure b/configure
+diff --git a/block/export/vhost-user-blk-server.h b/block/export/vhost-user-blk-server.h
-index XXXXXXX..XXXXXXX 100755
+new file mode 100644
---- a/configure
+index XXXXXXX..XXXXXXX
-+++ b/configure
+--- /dev/null
-@@ -XXX,XX +XXX,XX @@ audio_win_int=""
++++ b/block/export/vhost-user-blk-server.h
- libs_qga=""
+@@ -XXX,XX +XXX,XX @@
- debug_info="yes"
++/*
- stack_protector=""
++ * Sharing QEMU block devices via vhost-user protocol
-+safe_stack=""
++ *
- use_containers="yes"
++ * Copyright (c) Coiby Xu <coiby.xu@gmail.com>.
- gdb_bin=$(command -v "gdb-multiarch" || command -v "gdb")
++ * Copyright (c) 2020 Red Hat, Inc.
++ *
-@@ -XXX,XX +XXX,XX @@ for opt do
++ * This work is licensed under the terms of the GNU GPL, version 2 or
-   ;;
++ * later.  See the COPYING file in the top-level directory.
-   --disable-stack-protector) stack_protector="no"
++ */
-   ;;
++
-+  --enable-safe-stack) safe_stack="yes"
++#ifndef VHOST_USER_BLK_SERVER_H
-+  ;;
++#define VHOST_USER_BLK_SERVER_H
-+  --disable-safe-stack) safe_stack="no"
++#include "util/vhost-user-server.h"
-+  ;;
++
-   --disable-curses) curses="no"
++typedef struct VuBlockDev VuBlockDev;
-   ;;
++#define TYPE_VHOST_USER_BLK_SERVER "vhost-user-blk-server"
-   --enable-curses) curses="yes"
++#define VHOST_USER_BLK_SERVER(obj) \
-@@ -XXX,XX +XXX,XX @@ disabled with --disable-FEATURE, default is enabled if available:
++   OBJECT_CHECK(VuBlockDev, obj, TYPE_VHOST_USER_BLK_SERVER)
-   debug-tcg       TCG debugging (default is disabled)
++
-   debug-info      debugging information
++/* vhost user block device */
-   sparse          sparse checker
++struct VuBlockDev {
-+  safe-stack      SafeStack Stack Smash Protection. Depends on
++    Object parent_obj;
-+                  clang/llvm >= 3.7 and requires coroutine backend ucontext.
++    char *node_name;
++    SocketAddress *addr;
-   gnutls          GNUTLS cryptography support
++    AioContext *ctx;
-   nettle          nettle cryptography support
++    VuServer vu_server;
-@@ -XXX,XX +XXX,XX @@ if test "$debug_stack_usage" = "yes"; then
++    bool running;
-   fi
++    uint32_t blk_size;
- fi
++    BlockBackend *backend;
++    QIOChannelSocket *sioc;
-+##################################################
++    QTAILQ_ENTRY(VuBlockDev) next;
-+# SafeStack
++    struct virtio_blk_config blkcfg;
-+
++    bool writable;
-+
++};
-+if test "$safe_stack" = "yes"; then
++
-+cat > $TMPC << EOF
++#endif /* VHOST_USER_BLK_SERVER_H */
-+int main(int argc, char *argv[])
+diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
-+{
+new file mode 100644
-+#if ! __has_feature(safe_stack)
+index XXXXXXX..XXXXXXX
-+#error SafeStack Disabled
+--- /dev/null
-+#endif
++++ b/block/export/vhost-user-blk-server.c
@@ -XXX,XX +XXX,XX @@
 +/*
 + * Sharing QEMU block devices via vhost-user protocol
 + *
 + * Parts of the code based on nbd/server.c.
 + *
 + * Copyright (c) Coiby Xu <coiby.xu@gmail.com>.
 + * Copyright (c) 2020 Red Hat, Inc.
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or
 + * later.  See the COPYING file in the top-level directory.
 + */
 +#include "qemu/osdep.h"
 +#include "block/block.h"
 +#include "vhost-user-blk-server.h"
 +#include "qapi/error.h"
 +#include "qom/object_interfaces.h"
 +#include "sysemu/block-backend.h"
 +#include "util/block-helpers.h"
 +
 +enum {
 +    VHOST_USER_BLK_MAX_QUEUES = 1,
 +};
 +struct virtio_blk_inhdr {
 +    unsigned char status;
 +};
 +
 +typedef struct VuBlockReq {
 +    VuVirtqElement *elem;
 +    int64_t sector_num;
 +    size_t size;
 +    struct virtio_blk_inhdr *in;
 +    struct virtio_blk_outhdr out;
 +    VuServer *server;
 +    struct VuVirtq *vq;
 +} VuBlockReq;
 +
 +static void vu_block_req_complete(VuBlockReq *req)
 +{
 +    VuDev *vu_dev = &req->server->vu_dev;
 +
 +    /* IO size with 1 extra status byte */
 +    vu_queue_push(vu_dev, req->vq, req->elem, req->size + 1);
 +    vu_queue_notify(vu_dev, req->vq);
 +
 +    if (req->elem) {
 +        free(req->elem);
 +    }
 +
 +    g_free(req);
 +}
 +
 +static VuBlockDev *get_vu_block_device_by_server(VuServer *server)
 +{
 +    return container_of(server, VuBlockDev, vu_server);
 +}
 +
 +static int coroutine_fn
 +vu_block_discard_write_zeroes(VuBlockReq *req, struct iovec *iov,
 +                              uint32_t iovcnt, uint32_t type)
 +{
 +    struct virtio_blk_discard_write_zeroes desc;
 +    ssize_t size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc));
 +    if (unlikely(size != sizeof(desc))) {
 +        error_report("Invalid size %zd, expect %zu", size, sizeof(desc));
 +        return -EINVAL;
 +    }
 +
 +    VuBlockDev *vdev_blk = get_vu_block_device_by_server(req->server);
 +    uint64_t range[2] = { le64_to_cpu(desc.sector) << 9,
 +                          le32_to_cpu(desc.num_sectors) << 9 };
 +    if (type == VIRTIO_BLK_T_DISCARD) {
 +        if (blk_co_pdiscard(vdev_blk->backend, range[0], range[1]) == 0) {
 +            return 0;
 +        }
 +    } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
 +        if (blk_co_pwrite_zeroes(vdev_blk->backend,
 +                                 range[0], range[1], 0) == 0) {
 +            return 0;
 +        }
 +    }
 +
 +    return -EINVAL;
 +}
 +
 +static void coroutine_fn vu_block_flush(VuBlockReq *req)
 +{
 +    VuBlockDev *vdev_blk = get_vu_block_device_by_server(req->server);
 +    BlockBackend *backend = vdev_blk->backend;
 +    blk_co_flush(backend);
 +}
 +
 +struct req_data {
 +    VuServer *server;
 +    VuVirtq *vq;
 +    VuVirtqElement *elem;
 +};
 +
 +static void coroutine_fn vu_block_virtio_process_req(void *opaque)
 +{
 +    struct req_data *data = opaque;
 +    VuServer *server = data->server;
 +    VuVirtq *vq = data->vq;
 +    VuVirtqElement *elem = data->elem;
 +    uint32_t type;
 +    VuBlockReq *req;
 +
 +    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
 +    BlockBackend *backend = vdev_blk->backend;
 +
 +    struct iovec *in_iov = elem->in_sg;
 +    struct iovec *out_iov = elem->out_sg;
 +    unsigned in_num = elem->in_num;
 +    unsigned out_num = elem->out_num;
 +    /* refer to hw/block/virtio_blk.c */
 +    if (elem->out_num < 1 || elem->in_num < 1) {
 +        error_report("virtio-blk request missing headers");
 +        free(elem);
 +        return;
 +    }
 +
 +    req = g_new0(VuBlockReq, 1);
 +    req->server = server;
 +    req->vq = vq;
 +    req->elem = elem;
 +
 +    if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
 +                            sizeof(req->out)) != sizeof(req->out))) {
 +        error_report("virtio-blk request outhdr too short");
 +        goto err;
 +    }
 +
 +    iov_discard_front(&out_iov, &out_num, sizeof(req->out));
 +
 +    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
 +        error_report("virtio-blk request inhdr too short");
 +        goto err;
 +    }
 +
 +    /* We always touch the last byte, so just see how big in_iov is.  */
 +    req->in = (void *)in_iov[in_num - 1].iov_base
 +              + in_iov[in_num - 1].iov_len
 +              - sizeof(struct virtio_blk_inhdr);
 +    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
 +
 +    type = le32_to_cpu(req->out.type);
 +    switch (type & ~VIRTIO_BLK_T_BARRIER) {
 +    case VIRTIO_BLK_T_IN:
 +    case VIRTIO_BLK_T_OUT: {
 +        ssize_t ret = 0;
 +        bool is_write = type & VIRTIO_BLK_T_OUT;
 +        req->sector_num = le64_to_cpu(req->out.sector);
 +
 +        int64_t offset = req->sector_num * vdev_blk->blk_size;
 +        QEMUIOVector qiov;
 +        if (is_write) {
 +            qemu_iovec_init_external(&qiov, out_iov, out_num);
 +            ret = blk_co_pwritev(backend, offset, qiov.size,
 +                                 &qiov, 0);
 +        } else {
 +            qemu_iovec_init_external(&qiov, in_iov, in_num);
 +            ret = blk_co_preadv(backend, offset, qiov.size,
 +                                &qiov, 0);
 +        }
 +        if (ret >= 0) {
 +            req->in->status = VIRTIO_BLK_S_OK;
 +        } else {
 +            req->in->status = VIRTIO_BLK_S_IOERR;
 +        }
 +        break;
 +    }
 +    case VIRTIO_BLK_T_FLUSH:
 +        vu_block_flush(req);
 +        req->in->status = VIRTIO_BLK_S_OK;
 +        break;
 +    case VIRTIO_BLK_T_GET_ID: {
 +        size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
 +                          VIRTIO_BLK_ID_BYTES);
 +        snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");
 +        req->in->status = VIRTIO_BLK_S_OK;
 +        req->size = elem->in_sg[0].iov_len;
 +        break;
 +    }
 +    case VIRTIO_BLK_T_DISCARD:
 +    case VIRTIO_BLK_T_WRITE_ZEROES: {
 +        int rc;
 +        rc = vu_block_discard_write_zeroes(req, &elem->out_sg[1],
 +                                           out_num, type);
 +        if (rc == 0) {
 +            req->in->status = VIRTIO_BLK_S_OK;
 +        } else {
 +            req->in->status = VIRTIO_BLK_S_IOERR;
 +        }
 +        break;
 +    }
 +    default:
 +        req->in->status = VIRTIO_BLK_S_UNSUPP;
 +        break;
 +    }
 +
 +    vu_block_req_complete(req);
 +    return;
 +
 +err:
 +    free(elem);
 +    g_free(req);
 +    return;
 +}
 +
 +static void vu_block_process_vq(VuDev *vu_dev, int idx)
 +{
 +    VuServer *server;
 +    VuVirtq *vq;
 +    struct req_data *req_data;
 +
 +    server = container_of(vu_dev, VuServer, vu_dev);
 +    assert(server);
 +
 +    vq = vu_get_queue(vu_dev, idx);
 +    assert(vq);
 +    VuVirtqElement *elem;
 +    while (1) {
 +        elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) +
 +                                    sizeof(VuBlockReq));
 +        if (elem) {
 +            req_data = g_new0(struct req_data, 1);
 +            req_data->server = server;
 +            req_data->vq = vq;
 +            req_data->elem = elem;
 +            Coroutine *co = qemu_coroutine_create(vu_block_virtio_process_req,
 +                                                  req_data);
 +            aio_co_enter(server->ioc->ctx, co);
 +        } else {
 +            break;
 +        }
 +    }
 +}
 +
 +static void vu_block_queue_set_started(VuDev *vu_dev, int idx, bool started)
 +{
 +    VuVirtq *vq;
 +
 +    assert(vu_dev);
 +
 +    vq = vu_get_queue(vu_dev, idx);
 +    vu_set_queue_handler(vu_dev, vq, started ? vu_block_process_vq : NULL);
 +}
 +
 +static uint64_t vu_block_get_features(VuDev *dev)
 +{
 +    uint64_t features;
 +    VuServer *server = container_of(dev, VuServer, vu_dev);
 +    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
 +    features = 1ull << VIRTIO_BLK_F_SIZE_MAX |
 +               1ull << VIRTIO_BLK_F_SEG_MAX |
 +               1ull << VIRTIO_BLK_F_TOPOLOGY |
 +               1ull << VIRTIO_BLK_F_BLK_SIZE |
 +               1ull << VIRTIO_BLK_F_FLUSH |
 +               1ull << VIRTIO_BLK_F_DISCARD |
 +               1ull << VIRTIO_BLK_F_WRITE_ZEROES |
 +               1ull << VIRTIO_BLK_F_CONFIG_WCE |
 +               1ull << VIRTIO_F_VERSION_1 |
 +               1ull << VIRTIO_RING_F_INDIRECT_DESC |
 +               1ull << VIRTIO_RING_F_EVENT_IDX |
 +               1ull << VHOST_USER_F_PROTOCOL_FEATURES;
 +
 +    if (!vdev_blk->writable) {
 +        features |= 1ull << VIRTIO_BLK_F_RO;
 +    }
 +
 +    return features;
 +}
 +
 +static uint64_t vu_block_get_protocol_features(VuDev *dev)
 +{
 +    return 1ull << VHOST_USER_PROTOCOL_F_CONFIG |
 +           1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD;
 +}
 +
 +static int
 +vu_block_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
 +{
 +    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
 +    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
 +    memcpy(config, &vdev_blk->blkcfg, len);
 +
 +    return 0;
 +}
-+EOF
++
-+  flag="-fsanitize=safe-stack"
++static int
-+  # Check that safe-stack is supported and enabled.
++vu_block_set_config(VuDev *vu_dev, const uint8_t *data,
-+  if compile_prog "-Werror $flag" "$flag"; then
++                    uint32_t offset, uint32_t size, uint32_t flags)
-+    # Flag needed both at compilation and at linking
++{
-+    QEMU_CFLAGS="$QEMU_CFLAGS $flag"
++    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
-+    QEMU_LDFLAGS="$QEMU_LDFLAGS $flag"
++    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
-+  else
++    uint8_t wce;
-+    error_exit "SafeStack not supported by your compiler"
++
-+  fi
++    /* don't support live migration */
-+  if test "$coroutine" != "ucontext"; then
++    if (flags != VHOST_SET_CONFIG_TYPE_MASTER) {
-+    error_exit "SafeStack is only supported by the coroutine backend ucontext"
++        return -EINVAL;
-+  fi
++    }
-+else
++
-+cat > $TMPC << EOF
++    if (offset != offsetof(struct virtio_blk_config, wce) ||
-+int main(int argc, char *argv[])
++        size != 1) {
-+{
++        return -EINVAL;
-+#if defined(__has_feature)
++    }
-+#if __has_feature(safe_stack)
++
-+#error SafeStack Enabled
++    wce = *data;
-+#endif
++    vdev_blk->blkcfg.wce = wce;
-+#endif
++    blk_set_enable_write_cache(vdev_blk->backend, wce);
 +    return 0;
 +}
-+EOF
++
-+if test "$safe_stack" = "no"; then
++/*
-+  # Make sure that safe-stack is disabled
++ * When the client disconnects, it sends a VHOST_USER_NONE request
-+  if ! compile_prog "-Werror" ""; then
++ * and vu_process_message will simply call exit, which causes the VM
-+    # SafeStack was already enabled, try to explicitly remove the feature
++ * to exit abruptly.
-+    flag="-fno-sanitize=safe-stack"
++ * To avoid this issue, process the VHOST_USER_NONE request ahead
-+    if ! compile_prog "-Werror $flag" "$flag"; then
++ * of vu_process_message.
-+      error_exit "Configure cannot disable SafeStack"
++ *
-+    fi
++ */
-+    QEMU_CFLAGS="$QEMU_CFLAGS $flag"
++static int vu_block_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply)
-+    QEMU_LDFLAGS="$QEMU_LDFLAGS $flag"
++{
-+  fi
++    if (vmsg->request == VHOST_USER_NONE) {
-+else # "$safe_stack" = ""
++        dev->panic(dev, "disconnect");
-+  # Set safe_stack to yes or no based on pre-existing flags
++        return true;
-+  if compile_prog "-Werror" ""; then
++    }
-+    safe_stack="no"
++    return false;
-+  else
++}
-+    safe_stack="yes"
++
-+    if test "$coroutine" != "ucontext"; then
++static const VuDevIface vu_block_iface = {
-+      error_exit "SafeStack is only supported by the coroutine backend ucontext"
++    .get_features          = vu_block_get_features,
-+    fi
++    .queue_set_started     = vu_block_queue_set_started,
-+  fi
++    .get_protocol_features = vu_block_get_protocol_features,
-+fi
++    .get_config            = vu_block_get_config,
-+fi
++    .set_config            = vu_block_set_config,
++    .process_msg           = vu_block_process_msg,
- ##########################################
++};
- # check if we have open_by_handle_at
++
-@@ -XXX,XX +XXX,XX @@ echo "sparse enabled    $sparse"
++static void blk_aio_attached(AioContext *ctx, void *opaque)
- echo "strip binaries    $strip_opt"
++{
- echo "profiler          $profiler"
++    VuBlockDev *vub_dev = opaque;
- echo "static build      $static"
++    aio_context_acquire(ctx);
-+echo "safe stack        $safe_stack"
++    vhost_user_server_set_aio_context(&vub_dev->vu_server, ctx);
- if test "$darwin" = "yes" ; then
++    aio_context_release(ctx);
-     echo "Cocoa support     $cocoa"
++}
- fi
++
-@@ -XXX,XX +XXX,XX @@ if test "$ccache_cpp2" = "yes"; then
++static void blk_aio_detach(void *opaque)
-   echo "export CCACHE_CPP2=y" >> $config_host_mak
++{
- fi
++    VuBlockDev *vub_dev = opaque;
++    AioContext *ctx = vub_dev->vu_server.ctx;
-+if test "$safe_stack" = "yes"; then
++    aio_context_acquire(ctx);
-+  echo "CONFIG_SAFESTACK=y" >> $config_host_mak
++    vhost_user_server_set_aio_context(&vub_dev->vu_server, NULL);
-+fi
++    aio_context_release(ctx);
-+
++}
- # If we're using a separate build tree, set it up now.
++
- # DIRS are directories which we simply mkdir in the build tree;
++static void
- # LINKS are things to symlink back into the source tree
++vu_block_initialize_config(BlockDriverState *bs,
 +                           struct virtio_blk_config *config, uint32_t blk_size)
 +{
 +    config->capacity = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
 +    config->blk_size = blk_size;
 +    config->size_max = 0;
 +    config->seg_max = 128 - 2;
 +    config->min_io_size = 1;
 +    config->opt_io_size = 1;
 +    config->num_queues = VHOST_USER_BLK_MAX_QUEUES;
 +    config->max_discard_sectors = 32768;
 +    config->max_discard_seg = 1;
 +    config->discard_sector_alignment = config->blk_size >> 9;
 +    config->max_write_zeroes_sectors = 32768;
 +    config->max_write_zeroes_seg = 1;
 +}
 +
 +static VuBlockDev *vu_block_init(VuBlockDev *vu_block_device, Error **errp)
 +{
 +
 +    BlockBackend *blk;
 +    Error *local_error = NULL;
 +    const char *node_name = vu_block_device->node_name;
 +    bool writable = vu_block_device->writable;
 +    uint64_t perm = BLK_PERM_CONSISTENT_READ;
 +    int ret;
 +
 +    AioContext *ctx;
 +
 +    BlockDriverState *bs = bdrv_lookup_bs(node_name, node_name, &local_error);
 +
 +    if (!bs) {
 +        error_propagate(errp, local_error);
 +        return NULL;
 +    }
 +
 +    if (bdrv_is_read_only(bs)) {
 +        writable = false;
 +    }
 +
 +    if (writable) {
 +        perm |= BLK_PERM_WRITE;
 +    }
 +
 +    ctx = bdrv_get_aio_context(bs);
 +    aio_context_acquire(ctx);
 +    bdrv_invalidate_cache(bs, NULL);
 +    aio_context_release(ctx);
 +
 +    /*
 +     * Don't allow resize while the vhost user server is running,
 +     * otherwise we don't care what happens with the node.
 +     */
 +    blk = blk_new(bdrv_get_aio_context(bs), perm,
 +                  BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
 +                  BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD);
 +    ret = blk_insert_bs(blk, bs, errp);
 +
 +    if (ret < 0) {
 +        goto fail;
 +    }
 +
 +    blk_set_enable_write_cache(blk, false);
 +
 +    blk_set_allow_aio_context_change(blk, true);
 +
 +    vu_block_device->blkcfg.wce = 0;
 +    vu_block_device->backend = blk;
 +    if (!vu_block_device->blk_size) {
 +        vu_block_device->blk_size = BDRV_SECTOR_SIZE;
 +    }
 +    vu_block_device->blkcfg.blk_size = vu_block_device->blk_size;
 +    blk_set_guest_block_size(blk, vu_block_device->blk_size);
 +    vu_block_initialize_config(bs, &vu_block_device->blkcfg,
 +                                   vu_block_device->blk_size);
 +    return vu_block_device;
 +
 +fail:
 +    blk_unref(blk);
 +    return NULL;
 +}
 +
 +static void vu_block_deinit(VuBlockDev *vu_block_device)
 +{
 +    if (vu_block_device->backend) {
 +        blk_remove_aio_context_notifier(vu_block_device->backend, blk_aio_attached,
 +                                        blk_aio_detach, vu_block_device);
 +    }
 +
 +    blk_unref(vu_block_device->backend);
 +}
 +
 +static void vhost_user_blk_server_stop(VuBlockDev *vu_block_device)
 +{
 +    vhost_user_server_stop(&vu_block_device->vu_server);
 +    vu_block_deinit(vu_block_device);
 +}
 +
 +static void vhost_user_blk_server_start(VuBlockDev *vu_block_device,
 +                                        Error **errp)
 +{
 +    AioContext *ctx;
 +    SocketAddress *addr = vu_block_device->addr;
 +
 +    if (!vu_block_init(vu_block_device, errp)) {
 +        return;
 +    }
 +
 +    ctx = bdrv_get_aio_context(blk_bs(vu_block_device->backend));
 +
 +    if (!vhost_user_server_start(&vu_block_device->vu_server, addr, ctx,
 +                                 VHOST_USER_BLK_MAX_QUEUES,
 +                                 NULL, &vu_block_iface,
 +                                 errp)) {
 +        goto error;
 +    }
 +
 +    blk_add_aio_context_notifier(vu_block_device->backend, blk_aio_attached,
 +                                 blk_aio_detach, vu_block_device);
 +    vu_block_device->running = true;
 +    return;
 +
 + error:
 +    vu_block_deinit(vu_block_device);
 +}
 +
 +static bool vu_prop_modifiable(VuBlockDev *vus, Error **errp)
 +{
 +    if (vus->running) {
 +            error_setg(errp, "The property can't be modified "
 +                       "while the server is running");
 +            return false;
 +    }
 +    return true;
 +}
 +
 +static void vu_set_node_name(Object *obj, const char *value, Error **errp)
 +{
 +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
 +
 +    if (!vu_prop_modifiable(vus, errp)) {
 +        return;
 +    }
 +
 +    if (vus->node_name) {
 +        g_free(vus->node_name);
 +    }
 +
 +    vus->node_name = g_strdup(value);
 +}
 +
 +static char *vu_get_node_name(Object *obj, Error **errp)
 +{
 +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
 +    return g_strdup(vus->node_name);
 +}
 +
 +static void free_socket_addr(SocketAddress *addr)
 +{
 +        g_free(addr->u.q_unix.path);
 +        g_free(addr);
 +}
 +
 +static void vu_set_unix_socket(Object *obj, const char *value,
 +                               Error **errp)
 +{
 +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
 +
 +    if (!vu_prop_modifiable(vus, errp)) {
 +        return;
 +    }
 +
 +    if (vus->addr) {
 +        free_socket_addr(vus->addr);
 +    }
 +
 +    SocketAddress *addr = g_new0(SocketAddress, 1);
 +    addr->type = SOCKET_ADDRESS_TYPE_UNIX;
 +    addr->u.q_unix.path = g_strdup(value);
 +    vus->addr = addr;
 +}
 +
 +static char *vu_get_unix_socket(Object *obj, Error **errp)
 +{
 +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
 +    return g_strdup(vus->addr->u.q_unix.path);
 +}
 +
 +static bool vu_get_block_writable(Object *obj, Error **errp)
 +{
 +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
 +    return vus->writable;
 +}
 +
 +static void vu_set_block_writable(Object *obj, bool value, Error **errp)
 +{
 +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
 +
 +    if (!vu_prop_modifiable(vus, errp)) {
 +            return;
 +    }
 +
 +    vus->writable = value;
 +}
 +
 +static void vu_get_blk_size(Object *obj, Visitor *v, const char *name,
 +                            void *opaque, Error **errp)
 +{
 +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
 +    uint32_t value = vus->blk_size;
 +
 +    visit_type_uint32(v, name, &value, errp);
 +}
 +
 +static void vu_set_blk_size(Object *obj, Visitor *v, const char *name,
 +                            void *opaque, Error **errp)
 +{
 +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
 +
 +    Error *local_err = NULL;
 +    uint32_t value;
 +
 +    if (!vu_prop_modifiable(vus, errp)) {
 +            return;
 +    }
 +
 +    visit_type_uint32(v, name, &value, &local_err);
 +    if (local_err) {
 +        goto out;
 +    }
 +
 +    check_block_size(object_get_typename(obj), name, value, &local_err);
 +    if (local_err) {
 +        goto out;
 +    }
 +
 +    vus->blk_size = value;
 +
 +out:
 +    error_propagate(errp, local_err);
 +}
 +
 +static void vhost_user_blk_server_instance_finalize(Object *obj)
 +{
 +    VuBlockDev *vub = VHOST_USER_BLK_SERVER(obj);
 +
 +    vhost_user_blk_server_stop(vub);
 +
 +    /*
 +     * Unlike object_property_add_str, object_class_property_add_str
 +     * doesn't have a release method. Thus manual memory freeing is
 +     * needed.
 +     */
 +    free_socket_addr(vub->addr);
 +    g_free(vub->node_name);
 +}
 +
 +static void vhost_user_blk_server_complete(UserCreatable *obj, Error **errp)
 +{
 +    VuBlockDev *vub = VHOST_USER_BLK_SERVER(obj);
 +
 +    vhost_user_blk_server_start(vub, errp);
 +}
 +
 +static void vhost_user_blk_server_class_init(ObjectClass *klass,
 +                                             void *class_data)
 +{
 +    UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass);
 +    ucc->complete = vhost_user_blk_server_complete;
 +
 +    object_class_property_add_bool(klass, "writable",
 +                                   vu_get_block_writable,
 +                                   vu_set_block_writable);
 +
 +    object_class_property_add_str(klass, "node-name",
 +                                  vu_get_node_name,
 +                                  vu_set_node_name);
 +
 +    object_class_property_add_str(klass, "unix-socket",
 +                                  vu_get_unix_socket,
 +                                  vu_set_unix_socket);
 +
 +    object_class_property_add(klass, "logical-block-size", "uint32",
 +                              vu_get_blk_size, vu_set_blk_size,
 +                              NULL, NULL);
 +}
 +
 +static const TypeInfo vhost_user_blk_server_info = {
 +    .name = TYPE_VHOST_USER_BLK_SERVER,
 +    .parent = TYPE_OBJECT,
 +    .instance_size = sizeof(VuBlockDev),
 +    .instance_finalize = vhost_user_blk_server_instance_finalize,
 +    .class_init = vhost_user_blk_server_class_init,
 +    .interfaces = (InterfaceInfo[]) {
 +        {TYPE_USER_CREATABLE},
 +        {}
 +    },
 +};
 +
 +static void vhost_user_blk_server_register_types(void)
 +{
 +    type_register_static(&vhost_user_blk_server_info);
 +}
 +
 +type_init(vhost_user_blk_server_register_types)
 diff --git a/softmmu/vl.c b/softmmu/vl.c
 index XXXXXXX..XXXXXXX 100644
 --- a/softmmu/vl.c
 +++ b/softmmu/vl.c
@@ -XXX,XX +XXX,XX @@ static bool object_create_initial(const char *type, QemuOpts *opts)
      }
  #endif
 +    /* Reason: vhost-user-blk-server property "node-name" */
 +    if (g_str_equal(type, "vhost-user-blk-server")) {
 +        return false;
 +    }
      /*
       * Reason: filter-* property "netdev" etc.
       */
 diff --git a/block/meson.build b/block/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/block/meson.build
 +++ b/block/meson.build
@@ -XXX,XX +XXX,XX @@ block_ss.add(when: 'CONFIG_WIN32', if_true: files('file-win32.c', 'win32-aio.c')
  block_ss.add(when: 'CONFIG_POSIX', if_true: [files('file-posix.c'), coref, iokit])
  block_ss.add(when: 'CONFIG_LIBISCSI', if_true: files('iscsi-opts.c'))
  block_ss.add(when: 'CONFIG_LINUX', if_true: files('nvme.c'))
 +block_ss.add(when: 'CONFIG_LINUX', if_true: files('export/vhost-user-blk-server.c', '../contrib/libvhost-user/libvhost-user.c'))
  block_ss.add(when: 'CONFIG_REPLICATION', if_true: files('replication.c'))
  block_ss.add(when: 'CONFIG_SHEEPDOG', if_true: files('sheepdog.c'))
  block_ss.add(when: ['CONFIG_LINUX_AIO', libaio], if_true: files('linux-aio.c'))
 --
 .26.2

-New patch
+[PULL v2 07/28] MAINTAINERS: Add vhost-user block device backend server maintainer
+From: Coiby Xu <coiby.xu@gmail.com>
+Suggested-by: Stefano Garzarella <sgarzare@redhat.com>
+Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
+Message-id: 20200918080912.321299-8-coiby.xu@gmail.com
+[Removed reference to vhost-user-blk-test.c, it will be sent in a
+separate pull request.
+--Stefan]
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ MAINTAINERS | 7 +++++++
+file changed, 7 insertions(+)
+diff --git a/MAINTAINERS b/MAINTAINERS
+index XXXXXXX..XXXXXXX 100644
+--- a/MAINTAINERS
++++ b/MAINTAINERS
+@@ -XXX,XX +XXX,XX @@ L: qemu-block@nongnu.org
+ S: Supported
+ F: tests/image-fuzzer/
++Vhost-user block device backend server
++M: Coiby Xu <Coiby.Xu@gmail.com>
++S: Maintained
++F: block/export/vhost-user-blk-server.c
++F: util/vhost-user-server.c
++F: tests/qtest/libqos/vhost-user-blk.c
++
+ Replication
+ M: Wen Congyang <wencongyang2@huawei.com>
+ M: Xie Changlong <xiechanglong.d@gmail.com>
+--
+.26.2

-New patch
+[PULL v2 08/28] util/vhost-user-server: s/fileds/fields/ typo fix
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Message-id: 20200924151549.913737-3-stefanha@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ util/vhost-user-server.c | 2 +-
+file changed, 1 insertion(+), 1 deletion(-)
+diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/vhost-user-server.c
++++ b/util/vhost-user-server.c
+@@ -XXX,XX +XXX,XX @@ bool vhost_user_server_start(VuServer *server,
+         return false;
+     }
+-    /* zero out unspecified fileds */
++    /* zero out unspecified fields */
+     *server = (VuServer) {
+         .listener              = listener,
+         .vu_iface              = vu_iface,
+--
+.26.2

-[PULL 10/12] block/nvme: clarify that free_req_queue is protected by q->lock
+[PULL v2 09/28] util/vhost-user-server: drop unnecessary QOM cast
-Existing users access free_req_queue under q->lock. Document this.
+We already have access to the value with the correct type (ioc and sioc
 are the same QIOChannel).
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Reviewed-by: Sergio Lopez <slp@redhat.com>
+Message-id: 20200924151549.913737-4-stefanha@redhat.com
 Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Message-id: 20200617132201.1832152-6-stefanha@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- block/nvme.c | 2 +-
+ util/vhost-user-server.c | 2 +-
 file changed, 1 insertion(+), 1 deletion(-)
-diff --git a/block/nvme.c b/block/nvme.c
+diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
+--- a/util/vhost-user-server.c
-+++ b/block/nvme.c
++++ b/util/vhost-user-server.c
-@@ -XXX,XX +XXX,XX @@ typedef struct {
+@@ -XXX,XX +XXX,XX @@ static void vu_accept(QIONetListener *listener, QIOChannelSocket *sioc,
- } NVMeRequest;
+     server->ioc = QIO_CHANNEL(sioc);
+     object_ref(OBJECT(server->ioc));
- typedef struct {
+     qio_channel_attach_aio_context(server->ioc, server->ctx);
--    CoQueue     free_req_queue;
+-    qio_channel_set_blocking(QIO_CHANNEL(server->sioc), false, NULL);
-     QemuMutex   lock;
++    qio_channel_set_blocking(server->ioc, false, NULL);
+     vu_client_start(server);
-     /* Fields protected by BQL */
+ }
-@@ -XXX,XX +XXX,XX @@ typedef struct {
      uint8_t     *prp_list_pages;
      /* Fields protected by @lock */
 +    CoQueue     free_req_queue;
      NVMeQueue   sq, cq;
      int         cq_phase;
      int         free_req_head;
 --
 .26.2

-New patch
+[PULL v2 10/28] util/vhost-user-server: drop unnecessary watch deletion
+Explicitly deleting watches is not necessary since libvhost-user calls
+remove_watch() during vu_deinit(). Add an assertion to check this
+though.
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Message-id: 20200924151549.913737-5-stefanha@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ util/vhost-user-server.c | 19 ++++---------------
+file changed, 4 insertions(+), 15 deletions(-)
+diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/vhost-user-server.c
++++ b/util/vhost-user-server.c
+@@ -XXX,XX +XXX,XX @@ static void close_client(VuServer *server)
+     /* When this is set vu_client_trip will stop new processing vhost-user message */
+     server->sioc = NULL;
+-    VuFdWatch *vu_fd_watch, *next;
+-    QTAILQ_FOREACH_SAFE(vu_fd_watch, &server->vu_fd_watches, next, next) {
+-        aio_set_fd_handler(server->ioc->ctx, vu_fd_watch->fd, true, NULL,
+-                           NULL, NULL, NULL);
+-    }
+-
+-    while (!QTAILQ_EMPTY(&server->vu_fd_watches)) {
+-        QTAILQ_FOREACH_SAFE(vu_fd_watch, &server->vu_fd_watches, next, next) {
+-            if (!vu_fd_watch->processing) {
+-                QTAILQ_REMOVE(&server->vu_fd_watches, vu_fd_watch, next);
+-                g_free(vu_fd_watch);
+-            }
+-        }
+-    }
+-
+     while (server->processing_msg) {
+         if (server->ioc->read_coroutine) {
+             server->ioc->read_coroutine = NULL;
+@@ -XXX,XX +XXX,XX @@ static void close_client(VuServer *server)
+     }
+     vu_deinit(&server->vu_dev);
++
++    /* vu_deinit() should have called remove_watch() */
++    assert(QTAILQ_EMPTY(&server->vu_fd_watches));
++
+     object_unref(OBJECT(sioc));
+     object_unref(OBJECT(server->ioc));
+ }
+--
+.26.2

-New patch
+[PULL v2 11/28] block/export: consolidate request structs into VuBlockReq
+Only one struct is needed per request. Drop req_data and the separate
+VuBlockReq instance. Instead let vu_queue_pop() allocate everything at
+once.
+This fixes the req_data memory leak in vu_block_virtio_process_req().
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Message-id: 20200924151549.913737-6-stefanha@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ block/export/vhost-user-blk-server.c | 68 +++++++++-------------------
+file changed, 21 insertions(+), 47 deletions(-)
+diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/export/vhost-user-blk-server.c
++++ b/block/export/vhost-user-blk-server.c
+@@ -XXX,XX +XXX,XX @@ struct virtio_blk_inhdr {
+ };
+ typedef struct VuBlockReq {
+-    VuVirtqElement *elem;
++    VuVirtqElement elem;
+     int64_t sector_num;
+     size_t size;
+     struct virtio_blk_inhdr *in;
+@@ -XXX,XX +XXX,XX @@ static void vu_block_req_complete(VuBlockReq *req)
+     VuDev *vu_dev = &req->server->vu_dev;
+     /* IO size with 1 extra status byte */
+-    vu_queue_push(vu_dev, req->vq, req->elem, req->size + 1);
++    vu_queue_push(vu_dev, req->vq, &req->elem, req->size + 1);
+     vu_queue_notify(vu_dev, req->vq);
+-    if (req->elem) {
+-        free(req->elem);
+-    }
+-
+-    g_free(req);
++    free(req);
+ }
+ static VuBlockDev *get_vu_block_device_by_server(VuServer *server)
+@@ -XXX,XX +XXX,XX @@ static void coroutine_fn vu_block_flush(VuBlockReq *req)
+     blk_co_flush(backend);
+ }
+-struct req_data {
+-    VuServer *server;
+-    VuVirtq *vq;
+-    VuVirtqElement *elem;
+-};
+-
+ static void coroutine_fn vu_block_virtio_process_req(void *opaque)
+ {
+-    struct req_data *data = opaque;
+-    VuServer *server = data->server;
+-    VuVirtq *vq = data->vq;
+-    VuVirtqElement *elem = data->elem;
++    VuBlockReq *req = opaque;
++    VuServer *server = req->server;
++    VuVirtqElement *elem = &req->elem;
+     uint32_t type;
+-    VuBlockReq *req;
+     VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
+     BlockBackend *backend = vdev_blk->backend;
+@@ -XXX,XX +XXX,XX @@ static void coroutine_fn vu_block_virtio_process_req(void *opaque)
+     struct iovec *out_iov = elem->out_sg;
+     unsigned in_num = elem->in_num;
+     unsigned out_num = elem->out_num;
++
+     /* refer to hw/block/virtio_blk.c */
+     if (elem->out_num < 1 || elem->in_num < 1) {
+         error_report("virtio-blk request missing headers");
+-        free(elem);
+-        return;
++        goto err;
+     }
+-    req = g_new0(VuBlockReq, 1);
+-    req->server = server;
+-    req->vq = vq;
+-    req->elem = elem;
+-
+     if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
+                             sizeof(req->out)) != sizeof(req->out))) {
+         error_report("virtio-blk request outhdr too short");
+@@ -XXX,XX +XXX,XX @@ static void coroutine_fn vu_block_virtio_process_req(void *opaque)
+ err:
+     free(elem);
+-    g_free(req);
+-    return;
+ }
+ static void vu_block_process_vq(VuDev *vu_dev, int idx)
+ {
+-    VuServer *server;
+-    VuVirtq *vq;
+-    struct req_data *req_data;
++    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
++    VuVirtq *vq = vu_get_queue(vu_dev, idx);
+-    server = container_of(vu_dev, VuServer, vu_dev);
+-    assert(server);
+-
+-    vq = vu_get_queue(vu_dev, idx);
+-    assert(vq);
+-    VuVirtqElement *elem;
+     while (1) {
+-        elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) +
+-                                    sizeof(VuBlockReq));
+-        if (elem) {
+-            req_data = g_new0(struct req_data, 1);
+-            req_data->server = server;
+-            req_data->vq = vq;
+-            req_data->elem = elem;
+-            Coroutine *co = qemu_coroutine_create(vu_block_virtio_process_req,
+-                                                  req_data);
+-            aio_co_enter(server->ioc->ctx, co);
+-        } else {
++        VuBlockReq *req;
++
++        req = vu_queue_pop(vu_dev, vq, sizeof(VuBlockReq));
++        if (!req) {
+             break;
+         }
++
++        req->server = server;
++        req->vq = vq;
++
++        Coroutine *co =
++            qemu_coroutine_create(vu_block_virtio_process_req, req);
++        qemu_coroutine_enter(co);
+     }
+ }
+--
+.26.2

-New patch
+[PULL v2 12/28] util/vhost-user-server: drop unused DevicePanicNotifier
+The device panic notifier callback is not used. Drop it.
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Message-id: 20200924151549.913737-7-stefanha@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ util/vhost-user-server.h             | 3 ---
+ block/export/vhost-user-blk-server.c | 3 +--
+ util/vhost-user-server.c             | 6 ------
+3 files changed, 1 insertion(+), 11 deletions(-)
+diff --git a/util/vhost-user-server.h b/util/vhost-user-server.h
+index XXXXXXX..XXXXXXX 100644
+--- a/util/vhost-user-server.h
++++ b/util/vhost-user-server.h
+@@ -XXX,XX +XXX,XX @@ typedef struct VuFdWatch {
+ } VuFdWatch;
+ typedef struct VuServer VuServer;
+-typedef void DevicePanicNotifierFn(VuServer *server);
+ struct VuServer {
+     QIONetListener *listener;
+     AioContext *ctx;
+-    DevicePanicNotifierFn *device_panic_notifier;
+     int max_queues;
+     const VuDevIface *vu_iface;
+     VuDev vu_dev;
+@@ -XXX,XX +XXX,XX @@ bool vhost_user_server_start(VuServer *server,
+                              SocketAddress *unix_socket,
+                              AioContext *ctx,
+                              uint16_t max_queues,
+-                             DevicePanicNotifierFn *device_panic_notifier,
+                              const VuDevIface *vu_iface,
+                              Error **errp);
+diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/export/vhost-user-blk-server.c
++++ b/block/export/vhost-user-blk-server.c
+@@ -XXX,XX +XXX,XX @@ static void vhost_user_blk_server_start(VuBlockDev *vu_block_device,
+     ctx = bdrv_get_aio_context(blk_bs(vu_block_device->backend));
+     if (!vhost_user_server_start(&vu_block_device->vu_server, addr, ctx,
+-                                 VHOST_USER_BLK_MAX_QUEUES,
+-                                 NULL, &vu_block_iface,
++                                 VHOST_USER_BLK_MAX_QUEUES, &vu_block_iface,
+                                  errp)) {
+         goto error;
+     }
+diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/vhost-user-server.c
++++ b/util/vhost-user-server.c
+@@ -XXX,XX +XXX,XX @@ static void panic_cb(VuDev *vu_dev, const char *buf)
+         close_client(server);
+     }
+-    if (server->device_panic_notifier) {
+-        server->device_panic_notifier(server);
+-    }
+-
+     /*
+      * Set the callback function for network listener so another
+      * vhost-user client can connect to this server
+@@ -XXX,XX +XXX,XX @@ bool vhost_user_server_start(VuServer *server,
+                              SocketAddress *socket_addr,
+                              AioContext *ctx,
+                              uint16_t max_queues,
+-                             DevicePanicNotifierFn *device_panic_notifier,
+                              const VuDevIface *vu_iface,
+                              Error **errp)
+ {
+@@ -XXX,XX +XXX,XX @@ bool vhost_user_server_start(VuServer *server,
+         .vu_iface              = vu_iface,
+         .max_queues            = max_queues,
+         .ctx                   = ctx,
+-        .device_panic_notifier = device_panic_notifier,
+     };
+     qio_net_listener_set_name(server->listener, "vhost-user-backend-listener");
+--
+.26.2

-New patch
+[PULL v2 13/28] util/vhost-user-server: fix memory leak in vu_message_read()
+fds[] is leaked when qio_channel_readv_full() fails.
+Use vmsg->fds[] instead of keeping a local fds[] array. Then we can
+reuse goto fail to clean up fds. vmsg->fd_num must be zeroed before the
+loop to make this safe.
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Message-id: 20200924151549.913737-8-stefanha@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ util/vhost-user-server.c | 50 ++++++++++++++++++----------------------
+1 file changed, 23 insertions(+), 27 deletions(-)
+diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/vhost-user-server.c
++++ b/util/vhost-user-server.c
+@@ -XXX,XX +XXX,XX @@ vu_message_read(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg)
+     };
+     int rc, read_bytes = 0;
+     Error *local_err = NULL;
+-    /*
+-     * Store fds/nfds returned from qio_channel_readv_full into
+-     * temporary variables.
+-     *
+-     * VhostUserMsg is a packed structure, gcc will complain about passing
+-     * pointer to a packed structure member if we pass &VhostUserMsg.fd_num
+-     * and &VhostUserMsg.fds directly when calling qio_channel_readv_full,
+-     * thus two temporary variables nfds and fds are used here.
+-     */
+-    size_t nfds = 0, nfds_t = 0;
+     const size_t max_fds = G_N_ELEMENTS(vmsg->fds);
+-    int *fds_t = NULL;
+     VuServer *server = container_of(vu_dev, VuServer, vu_dev);
+     QIOChannel *ioc = server->ioc;
++    vmsg->fd_num = 0;
+     if (!ioc) {
+         error_report_err(local_err);
+         goto fail;
+@@ -XXX,XX +XXX,XX @@ vu_message_read(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg)
+     assert(qemu_in_coroutine());
+     do {
++        size_t nfds = 0;
++        int *fds = NULL;
++
+         /*
+          * qio_channel_readv_full may have short reads, keeping calling it
+          * until getting VHOST_USER_HDR_SIZE or 0 bytes in total
+          */
+-        rc = qio_channel_readv_full(ioc, &iov, 1, &fds_t, &nfds_t, &local_err);
++        rc = qio_channel_readv_full(ioc, &iov, 1, &fds, &nfds, &local_err);
+         if (rc < 0) {
+             if (rc == QIO_CHANNEL_ERR_BLOCK) {
++                assert(local_err == NULL);
+                 qio_channel_yield(ioc, G_IO_IN);
+                 continue;
+             } else {
+                 error_report_err(local_err);
+-                return false;
++                goto fail;
+             }
+         }
+-        read_bytes += rc;
+-        if (nfds_t > 0) {
+-            if (nfds + nfds_t > max_fds) {
++
++        if (nfds > 0) {
++            if (vmsg->fd_num + nfds > max_fds) {
+                 error_report("A maximum of %zu fds are allowed, "
+                              "however got %zu fds now",
+-                             max_fds, nfds + nfds_t);
++                             max_fds, vmsg->fd_num + nfds);
++                g_free(fds);
+                 goto fail;
+             }
+-            memcpy(vmsg->fds + nfds, fds_t,
+-                   nfds_t *sizeof(vmsg->fds[0]));
+-            nfds += nfds_t;
+-            g_free(fds_t);
++            memcpy(vmsg->fds + vmsg->fd_num, fds, nfds * sizeof(vmsg->fds[0]));
++            vmsg->fd_num += nfds;
++            g_free(fds);
+         }
+-        if (read_bytes == VHOST_USER_HDR_SIZE || rc == 0) {
+-            break;
++
++        if (rc == 0) { /* socket closed */
++            goto fail;
+         }
+-        iov.iov_base = (char *)vmsg + read_bytes;
+-        iov.iov_len = VHOST_USER_HDR_SIZE - read_bytes;
+-    } while (true);
+-    vmsg->fd_num = nfds;
++        iov.iov_base += rc;
++        iov.iov_len -= rc;
++        read_bytes += rc;
++    } while (read_bytes != VHOST_USER_HDR_SIZE);
++
+     /* qio_channel_readv_full will make socket fds blocking, unblock them */
+     vmsg_unblock_fds(vmsg);
+     if (vmsg->size > sizeof(vmsg->payload)) {
+--
+.26.2

-New patch
+[PULL v2 14/28] util/vhost-user-server: check EOF when reading payload
+Unexpected EOF is an error that must be reported.
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Message-id: 20200924151549.913737-9-stefanha@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ util/vhost-user-server.c | 6 ++++--
+1 file changed, 4 insertions(+), 2 deletions(-)
+diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/vhost-user-server.c
++++ b/util/vhost-user-server.c
+@@ -XXX,XX +XXX,XX @@ vu_message_read(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg)
+     };
+     if (vmsg->size) {
+         rc = qio_channel_readv_all_eof(ioc, &iov_payload, 1, &local_err);
+-        if (rc == -1) {
+-            error_report_err(local_err);
++        if (rc != 1) {
++            if (local_err) {
++                error_report_err(local_err);
++            }
+             goto fail;
+         }
+     }
+--
+.26.2

-[PULL 09/12] block/nvme: switch to a NVMeRequest freelist
+[PULL v2 15/28] util/vhost-user-server: rework vu_client_trip() coroutine lifecycle
-There are three issues with the current NVMeRequest->busy field:
+The vu_client_trip() coroutine is leaked during AioContext switching. It
-1. The busy field is accidentally accessed outside q->lock when request
+is also unsafe to destroy the vu_dev in panic_cb() since its callers
-   submission fails.
+still access it in some cases.
 2. Waiters on free_req_queue are not woken when a request is returned
    early due to submission failure.
 3. Finding a free request involves scanning all requests. This makes
    request submission O(n^2).
-Switch to an O(1) freelist that is always accessed under the lock.
+Rework the lifecycle to solve these safety issues.
 Also differentiate between NVME_QUEUE_SIZE, the actual SQ/CQ size, and
 NVME_NUM_REQS, the number of usable requests. This makes the code
 simpler than using NVME_QUEUE_SIZE everywhere and having to keep in mind
 that one slot is reserved.
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Reviewed-by: Sergio Lopez <slp@redhat.com>
+Message-id: 20200924151549.913737-10-stefanha@redhat.com
 Message-id: 20200617132201.1832152-5-stefanha@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- block/nvme.c | 81 ++++++++++++++++++++++++++++++++++------------------
+ util/vhost-user-server.h             |  29 ++--
-file changed, 54 insertions(+), 27 deletions(-)
+ block/export/vhost-user-blk-server.c |   9 +-
  util/vhost-user-server.c             | 245 +++++++++++++++------------
 3 files changed, 155 insertions(+), 128 deletions(-)
-diff --git a/block/nvme.c b/block/nvme.c
+diff --git a/util/vhost-user-server.h b/util/vhost-user-server.h
 index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
+--- a/util/vhost-user-server.h
-+++ b/block/nvme.c
++++ b/util/vhost-user-server.h
 @@ -XXX,XX +XXX,XX @@
- #define NVME_QUEUE_SIZE 128
+ #include "qapi/error.h"
- #define NVME_BAR_SIZE 8192
+ #include "standard-headers/linux/virtio_blk.h"
 +/* A kick fd that we monitor on behalf of libvhost-user */
  typedef struct VuFdWatch {
      VuDev *vu_dev;
      int fd; /*kick fd*/
      void *pvt;
      vu_watch_cb cb;
 -    bool processing;
      QTAILQ_ENTRY(VuFdWatch) next;
  } VuFdWatch;
 -typedef struct VuServer VuServer;
 -
 -struct VuServer {
 +/**
 + * VuServer:
 + * A vhost-user server instance with user-defined VuDevIface callbacks.
 + * Vhost-user device backends can be implemented using VuServer. VuDevIface
 + * callbacks and virtqueue kicks run in the given AioContext.
 + */
 +typedef struct {
      QIONetListener *listener;
 +    QEMUBH *restart_listener_bh;
      AioContext *ctx;
      int max_queues;
      const VuDevIface *vu_iface;
 +
 +    /* Protected by ctx lock */
      VuDev vu_dev;
      QIOChannel *ioc; /* The I/O channel with the client */
      QIOChannelSocket *sioc; /* The underlying data channel with the client */
 -    /* IOChannel for fd provided via VHOST_USER_SET_SLAVE_REQ_FD */
 -    QIOChannel *ioc_slave;
 -    QIOChannelSocket *sioc_slave;
 -    Coroutine *co_trip; /* coroutine for processing VhostUserMsg */
      QTAILQ_HEAD(, VuFdWatch) vu_fd_watches;
 -    /* restart coroutine co_trip if AIOContext is changed */
 -    bool aio_context_changed;
 -    bool processing_msg;
 -};
 +
 +    Coroutine *co_trip; /* coroutine for processing VhostUserMsg */
 +} VuServer;
  bool vhost_user_server_start(VuServer *server,
                               SocketAddress *unix_socket,
@@ -XXX,XX +XXX,XX @@ bool vhost_user_server_start(VuServer *server,
  void vhost_user_server_stop(VuServer *server);
 -void vhost_user_server_set_aio_context(VuServer *server, AioContext *ctx);
 +void vhost_user_server_attach_aio_context(VuServer *server, AioContext *ctx);
 +void vhost_user_server_detach_aio_context(VuServer *server);
  #endif /* VHOST_USER_SERVER_H */
 diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/export/vhost-user-blk-server.c
 +++ b/block/export/vhost-user-blk-server.c
@@ -XXX,XX +XXX,XX @@ static const VuDevIface vu_block_iface = {
  static void blk_aio_attached(AioContext *ctx, void *opaque)
  {
      VuBlockDev *vub_dev = opaque;
 -    aio_context_acquire(ctx);
 -    vhost_user_server_set_aio_context(&vub_dev->vu_server, ctx);
 -    aio_context_release(ctx);
 +    vhost_user_server_attach_aio_context(&vub_dev->vu_server, ctx);
  }
  static void blk_aio_detach(void *opaque)
  {
      VuBlockDev *vub_dev = opaque;
 -    AioContext *ctx = vub_dev->vu_server.ctx;
 -    aio_context_acquire(ctx);
 -    vhost_user_server_set_aio_context(&vub_dev->vu_server, NULL);
 -    aio_context_release(ctx);
 +    vhost_user_server_detach_aio_context(&vub_dev->vu_server);
  }
  static void
 diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/vhost-user-server.c
 +++ b/util/vhost-user-server.c
@@ -XXX,XX +XXX,XX @@
   */
  #include "qemu/osdep.h"
  #include "qemu/main-loop.h"
 +#include "block/aio-wait.h"
  #include "vhost-user-server.h"
 +/*
-+ * We have to leave one slot empty as that is the full queue case where
++ * Theory of operation:
-+ * head == tail + 1.
++ *
 + * VuServer is started and stopped by vhost_user_server_start() and
 + * vhost_user_server_stop() from the main loop thread. Starting the server
 + * opens a vhost-user UNIX domain socket and listens for incoming connections.
 + * Only one connection is allowed at a time.
 + *
 + * The connection is handled by the vu_client_trip() coroutine in the
 + * VuServer->ctx AioContext. The coroutine consists of a vu_dispatch() loop
 + * where libvhost-user calls vu_message_read() to receive the next vhost-user
 + * protocol messages over the UNIX domain socket.
 + *
 + * When virtqueues are set up libvhost-user calls set_watch() to monitor kick
 + * fds. These fds are also handled in the VuServer->ctx AioContext.
 + *
 + * Both vu_client_trip() and kick fd monitoring can be stopped by shutting down
 + * the socket connection. Shutting down the socket connection causes
 + * vu_message_read() to fail since no more data can be received from the socket.
 + * After vu_dispatch() fails, vu_client_trip() calls vu_deinit() to stop
 + * libvhost-user before terminating the coroutine. vu_deinit() calls
 + * remove_watch() to stop monitoring kick fds and this stops virtqueue
 + * processing.
 + *
 + * When vu_client_trip() has finished cleaning up it schedules a BH in the main
 + * loop thread to accept the next client connection.
 + *
 + * When libvhost-user detects an error it calls panic_cb() and sets the
 + * dev->broken flag. Both vu_client_trip() and kick fd processing stop when
 + * the dev->broken flag is set.
 + *
 + * It is possible to switch AioContexts using
 + * vhost_user_server_detach_aio_context() and
 + * vhost_user_server_attach_aio_context(). They stop monitoring fds in the old
 + * AioContext and resume monitoring in the new AioContext. The vu_client_trip()
 + * coroutine remains in a yielded state during the switch. This is made
 + * possible by QIOChannel's support for spurious coroutine re-entry in
 + * qio_channel_yield(). The coroutine will restart I/O when re-entered from the
 + * new AioContext.
 + */
-+#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)
++
-+
+ static void vmsg_close_fds(VhostUserMsg *vmsg)
- typedef struct {
+ {
-     int32_t  head, tail;
+     int i;
-     uint8_t  *queue;
+@@ -XXX,XX +XXX,XX @@ static void vmsg_unblock_fds(VhostUserMsg *vmsg)
-@@ -XXX,XX +XXX,XX @@ typedef struct {
+     }
-     int cid;
+ }
-     void *prp_list_page;
-     uint64_t prp_list_iova;
+-static void vu_accept(QIONetListener *listener, QIOChannelSocket *sioc,
--    bool busy;
+-                      gpointer opaque);
-+    int free_req_next; /* q->reqs[] index of next free req */
+-
- } NVMeRequest;
+-static void close_client(VuServer *server)
+-{
- typedef struct {
+-    /*
-@@ -XXX,XX +XXX,XX @@ typedef struct {
+-     * Before closing the client
-     /* Fields protected by @lock */
+-     *
-     NVMeQueue   sq, cq;
+-     * 1. Let vu_client_trip stop processing new vhost-user msg
-     int         cq_phase;
+-     *
--    NVMeRequest reqs[NVME_QUEUE_SIZE];
+-     * 2. remove kick_handler
-+    int         free_req_head;
+-     *
-+    NVMeRequest reqs[NVME_NUM_REQS];
+-     * 3. wait for the kick handler to be finished
-     bool        busy;
+-     *
-     int         need_kick;
+-     * 4. wait for the current vhost-user msg to be finished processing
-     int         inflight;
+-     */
-@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
+-
-     qemu_mutex_init(&q->lock);
+-    QIOChannelSocket *sioc = server->sioc;
-     q->index = idx;
+-    /* When this is set vu_client_trip will stop new processing vhost-user message */
-     qemu_co_queue_init(&q->free_req_queue);
+-    server->sioc = NULL;
--    q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE);
+-
-+    q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
+-    while (server->processing_msg) {
-     r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
+-        if (server->ioc->read_coroutine) {
--                          s->page_size * NVME_QUEUE_SIZE,
+-            server->ioc->read_coroutine = NULL;
-+                          s->page_size * NVME_NUM_REQS,
+-            qio_channel_set_aio_fd_handler(server->ioc, server->ioc->ctx, NULL,
-                           false, &prp_list_iova);
+-                                           NULL, server->ioc);
-     if (r) {
+-            server->processing_msg = false;
          goto fail;
      }
 -    for (i = 0; i < NVME_QUEUE_SIZE; i++) {
 +    q->free_req_head = -1;
 +    for (i = 0; i < NVME_NUM_REQS; i++) {
          NVMeRequest *req = &q->reqs[i];
          req->cid = i + 1;
 +        req->free_req_next = q->free_req_head;
 +        q->free_req_head = i;
          req->prp_list_page = q->prp_list_pages + i * s->page_size;
          req->prp_list_iova = prp_list_iova + i * s->page_size;
      }
 +
      nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
      if (local_err) {
          error_propagate(errp, local_err);
@@ -XXX,XX +XXX,XX @@ static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q)
   */
  static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
  {
 -    int i;
 -    NVMeRequest *req = NULL;
 +    NVMeRequest *req;
      qemu_mutex_lock(&q->lock);
 -    while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) {
 -        /* We have to leave one slot empty as that is the full queue case (head
 -         * == tail + 1). */
 +
 +    while (q->free_req_head == -1) {
          if (qemu_in_coroutine()) {
              trace_nvme_free_req_queue_wait(q);
              qemu_co_queue_wait(&q->free_req_queue, &q->lock);
@@ -XXX,XX +XXX,XX @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
              return NULL;
          }
      }
 -    for (i = 0; i < NVME_QUEUE_SIZE; i++) {
 -        if (!q->reqs[i].busy) {
 -            q->reqs[i].busy = true;
 -            req = &q->reqs[i];
 -            break;
 -        }
 -    }
--    /* We have checked inflight and need_kick while holding q->lock, so one
+-
--     * free req must be available. */
+-    vu_deinit(&server->vu_dev);
--    assert(req);
+-
-+
+-    /* vu_deinit() should have called remove_watch() */
-+    req = &q->reqs[q->free_req_head];
+-    assert(QTAILQ_EMPTY(&server->vu_fd_watches));
-+    q->free_req_head = req->free_req_next;
+-
-+    req->free_req_next = -1;
+-    object_unref(OBJECT(sioc));
-+
+-    object_unref(OBJECT(server->ioc));
-     qemu_mutex_unlock(&q->lock);
+-}
-     return req;
+-
- }
+ static void panic_cb(VuDev *vu_dev, const char *buf)
+ {
-+/* With q->lock */
+-    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
-+static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
+-
 -    /* avoid while loop in close_client */
 -    server->processing_msg = false;
 -
 -    if (buf) {
 -        error_report("vu_panic: %s", buf);
 -    }
 -
 -    if (server->sioc) {
 -        close_client(server);
 -    }
 -
 -    /*
 -     * Set the callback function for network listener so another
 -     * vhost-user client can connect to this server
 -     */
 -    qio_net_listener_set_client_func(server->listener,
 -                                     vu_accept,
 -                                     server,
 -                                     NULL);
 +    error_report("vu_panic: %s", buf);
  }
  static bool coroutine_fn
@@ -XXX,XX +XXX,XX @@ fail:
      return false;
  }
 -
 -static void vu_client_start(VuServer *server);
  static coroutine_fn void vu_client_trip(void *opaque)
  {
      VuServer *server = opaque;
 +    VuDev *vu_dev = &server->vu_dev;
 -    while (!server->aio_context_changed && server->sioc) {
 -        server->processing_msg = true;
 -        vu_dispatch(&server->vu_dev);
 -        server->processing_msg = false;
 +    while (!vu_dev->broken && vu_dispatch(vu_dev)) {
 +        /* Keep running */
      }
 -    if (server->aio_context_changed && server->sioc) {
 -        server->aio_context_changed = false;
 -        vu_client_start(server);
 -    }
 -}
 +    vu_deinit(vu_dev);
 +
 +    /* vu_deinit() should have called remove_watch() */
 +    assert(QTAILQ_EMPTY(&server->vu_fd_watches));
 +
 +    object_unref(OBJECT(server->sioc));
 +    server->sioc = NULL;
 -static void vu_client_start(VuServer *server)
 -{
 -    server->co_trip = qemu_coroutine_create(vu_client_trip, server);
 -    aio_co_enter(server->ctx, server->co_trip);
 +    object_unref(OBJECT(server->ioc));
 +    server->ioc = NULL;
 +
 +    server->co_trip = NULL;
 +    if (server->restart_listener_bh) {
 +        qemu_bh_schedule(server->restart_listener_bh);
 +    }
 +    aio_wait_kick();
  }
  /*
@@ -XXX,XX +XXX,XX @@ static void vu_client_start(VuServer *server)
  static void kick_handler(void *opaque)
  {
      VuFdWatch *vu_fd_watch = opaque;
 -    vu_fd_watch->processing = true;
 -    vu_fd_watch->cb(vu_fd_watch->vu_dev, 0, vu_fd_watch->pvt);
 -    vu_fd_watch->processing = false;
 +    VuDev *vu_dev = vu_fd_watch->vu_dev;
 +
 +    vu_fd_watch->cb(vu_dev, 0, vu_fd_watch->pvt);
 +
 +    /* Stop vu_client_trip() if an error occurred in vu_fd_watch->cb() */
 +    if (vu_dev->broken) {
 +        VuServer *server = container_of(vu_dev, VuServer, vu_dev);
 +
 +        qio_channel_shutdown(server->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
 +    }
  }
 -
  static VuFdWatch *find_vu_fd_watch(VuServer *server, int fd)
  {
@@ -XXX,XX +XXX,XX @@ static void vu_accept(QIONetListener *listener, QIOChannelSocket *sioc,
      qio_channel_set_name(QIO_CHANNEL(sioc), "vhost-user client");
      server->ioc = QIO_CHANNEL(sioc);
      object_ref(OBJECT(server->ioc));
 -    qio_channel_attach_aio_context(server->ioc, server->ctx);
 +
 +    /* TODO vu_message_write() spins if non-blocking! */
      qio_channel_set_blocking(server->ioc, false, NULL);
 -    vu_client_start(server);
 +
 +    server->co_trip = qemu_coroutine_create(vu_client_trip, server);
 +
 +    aio_context_acquire(server->ctx);
 +    vhost_user_server_attach_aio_context(server, server->ctx);
 +    aio_context_release(server->ctx);
  }
 -
  void vhost_user_server_stop(VuServer *server)
  {
 +    aio_context_acquire(server->ctx);
 +
 +    qemu_bh_delete(server->restart_listener_bh);
 +    server->restart_listener_bh = NULL;
 +
      if (server->sioc) {
 -        close_client(server);
 +        VuFdWatch *vu_fd_watch;
 +
 +        QTAILQ_FOREACH(vu_fd_watch, &server->vu_fd_watches, next) {
 +            aio_set_fd_handler(server->ctx, vu_fd_watch->fd, true,
 +                               NULL, NULL, NULL, vu_fd_watch);
 +        }
 +
 +        qio_channel_shutdown(server->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
 +
 +        AIO_WAIT_WHILE(server->ctx, server->co_trip);
      }
 +    aio_context_release(server->ctx);
 +
      if (server->listener) {
          qio_net_listener_disconnect(server->listener);
          object_unref(OBJECT(server->listener));
      }
 +}
 +
 +/*
 + * Allow the next client to connect to the server. Called from a BH in the main
 + * loop.
 + */
 +static void restart_listener_bh(void *opaque)
 +{
-+    req->free_req_next = q->free_req_head;
++    VuServer *server = opaque;
-+    q->free_req_head = req - q->reqs;
 +    qio_net_listener_set_client_func(server->listener, vu_accept, server,
 +                                     NULL);
  }
 -void vhost_user_server_set_aio_context(VuServer *server, AioContext *ctx)
 +/* Called with ctx acquired */
 +void vhost_user_server_attach_aio_context(VuServer *server, AioContext *ctx)
  {
 -    VuFdWatch *vu_fd_watch, *next;
 -    void *opaque = NULL;
 -    IOHandler *io_read = NULL;
 -    bool attach;
 +    VuFdWatch *vu_fd_watch;
 -    server->ctx = ctx ? ctx : qemu_get_aio_context();
 +    server->ctx = ctx;
      if (!server->sioc) {
 -        /* not yet serving any client*/
          return;
      }
 -    if (ctx) {
 -        qio_channel_attach_aio_context(server->ioc, ctx);
 -        server->aio_context_changed = true;
 -        io_read = kick_handler;
 -        attach = true;
 -    } else {
 +    qio_channel_attach_aio_context(server->ioc, ctx);
 +
 +    QTAILQ_FOREACH(vu_fd_watch, &server->vu_fd_watches, next) {
 +        aio_set_fd_handler(ctx, vu_fd_watch->fd, true, kick_handler, NULL,
 +                           NULL, vu_fd_watch);
 +    }
 +
 +    aio_co_schedule(ctx, server->co_trip);
 +}
 +
-+/* With q->lock */
++/* Called with server->ctx acquired */
-+static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q)
++void vhost_user_server_detach_aio_context(VuServer *server)
 +{
-+    if (!qemu_co_queue_empty(&q->free_req_queue)) {
++    if (server->sioc) {
-+        replay_bh_schedule_oneshot_event(s->aio_context,
++        VuFdWatch *vu_fd_watch;
-+                nvme_free_req_queue_cb, q);
++
-+    }
++        QTAILQ_FOREACH(vu_fd_watch, &server->vu_fd_watches, next) {
-+}
++            aio_set_fd_handler(server->ctx, vu_fd_watch->fd, true,
-+
++                               NULL, NULL, NULL, vu_fd_watch);
-+/* Insert a request in the freelist and wake waiters */
++        }
-+static void nvme_put_free_req_and_wake(BDRVNVMeState *s,  NVMeQueuePair *q,
++
-+                                       NVMeRequest *req)
+         qio_channel_detach_aio_context(server->ioc);
-+{
+-        /* server->ioc->ctx keeps the old AioConext */
-+    qemu_mutex_lock(&q->lock);
+-        ctx = server->ioc->ctx;
-+    nvme_put_free_req_locked(q, req);
+-        attach = false;
-+    nvme_wake_free_req_locked(s, q);
+     }
-+    qemu_mutex_unlock(&q->lock);
-+}
+-    QTAILQ_FOREACH_SAFE(vu_fd_watch, &server->vu_fd_watches, next, next) {
-+
+-        if (vu_fd_watch->cb) {
- static inline int nvme_translate_error(const NvmeCqe *c)
+-            opaque = attach ? vu_fd_watch : NULL;
- {
+-            aio_set_fd_handler(ctx, vu_fd_watch->fd, true,
-     uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
+-                               io_read, NULL, NULL,
-@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
+-                               opaque);
          req = *preq;
          assert(req.cid == cid);
          assert(req.cb);
 -        preq->busy = false;
 +        nvme_put_free_req_locked(q, preq);
          preq->cb = preq->opaque = NULL;
          qemu_mutex_unlock(&q->lock);
          req.cb(req.opaque, ret);
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
          /* Notify the device so it can post more completions. */
          smp_mb_release();
          *q->cq.doorbell = cpu_to_le32(q->cq.head);
 -        if (!qemu_co_queue_empty(&q->free_req_queue)) {
 -            replay_bh_schedule_oneshot_event(s->aio_context,
 -                                             nvme_free_req_queue_cb, q);
 -        }
-+        nvme_wake_free_req_locked(s, q);
+-    }
-     }
++    server->ctx = NULL;
-     q->busy = false;
+ }
-     return progress;
-@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
+-
-     r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
+ bool vhost_user_server_start(VuServer *server,
-     qemu_co_mutex_unlock(&s->dma_map_lock);
+                              SocketAddress *socket_addr,
-     if (r) {
+                              AioContext *ctx,
--        req->busy = false;
+@@ -XXX,XX +XXX,XX @@ bool vhost_user_server_start(VuServer *server,
-+        nvme_put_free_req_and_wake(s, ioq, req);
+                              const VuDevIface *vu_iface,
-         return r;
+                              Error **errp)
-     }
+ {
-     nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
++    QEMUBH *bh;
-@@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
+     QIONetListener *listener = qio_net_listener_new();
-     qemu_co_mutex_unlock(&s->dma_map_lock);
+     if (qio_net_listener_open_sync(listener, socket_addr, 1,
+                                    errp) < 0) {
-     if (ret) {
+@@ -XXX,XX +XXX,XX @@ bool vhost_user_server_start(VuServer *server,
--        req->busy = false;
+         return false;
-+        nvme_put_free_req_and_wake(s, ioq, req);
+     }
-         goto out;
-     }
++    bh = qemu_bh_new(restart_listener_bh, server);
++
      /* zero out unspecified fields */
      *server = (VuServer) {
          .listener              = listener,
 +        .restart_listener_bh   = bh,
          .vu_iface              = vu_iface,
          .max_queues            = max_queues,
          .ctx                   = ctx,
 --
 2.26.2

-New patch
+[PULL v2 16/28] block/export: report flush errors
+Propagate the flush return value since errors are possible.
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Message-id: 20200924151549.913737-11-stefanha@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ block/export/vhost-user-blk-server.c | 11 +++++++----
+ 1 file changed, 7 insertions(+), 4 deletions(-)
+diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/export/vhost-user-blk-server.c
++++ b/block/export/vhost-user-blk-server.c
+@@ -XXX,XX +XXX,XX @@ vu_block_discard_write_zeroes(VuBlockReq *req, struct iovec *iov,
+     return -EINVAL;
+ }
+-static void coroutine_fn vu_block_flush(VuBlockReq *req)
++static int coroutine_fn vu_block_flush(VuBlockReq *req)
+ {
+     VuBlockDev *vdev_blk = get_vu_block_device_by_server(req->server);
+     BlockBackend *backend = vdev_blk->backend;
+-    blk_co_flush(backend);
++    return blk_co_flush(backend);
+ }
+ static void coroutine_fn vu_block_virtio_process_req(void *opaque)
+@@ -XXX,XX +XXX,XX @@ static void coroutine_fn vu_block_virtio_process_req(void *opaque)
+         break;
+     }
+     case VIRTIO_BLK_T_FLUSH:
+-        vu_block_flush(req);
+-        req->in->status = VIRTIO_BLK_S_OK;
++        if (vu_block_flush(req) == 0) {
++            req->in->status = VIRTIO_BLK_S_OK;
++        } else {
++            req->in->status = VIRTIO_BLK_S_IOERR;
++        }
+         break;
+     case VIRTIO_BLK_T_GET_ID: {
+         size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
+--
+2.26.2

-[PULL 06/12] block/nvme: poll queues without q->lock
+[PULL v2 17/28] block/export: convert vhost-user-blk server to block export API
-A lot of CPU time is spent simply locking/unlocking q->lock during
+Use the new QAPI block exports API instead of defining our own QOM
-polling. Check for completion outside the lock to make q->lock disappear
+objects.
-from the profile.
 This is a large change because the lifecycle of VuBlockDev needs to
 follow BlockExportDriver. QOM properties are replaced by QAPI options
 objects.
 VuBlockDev is renamed VuBlkExport and contains a BlockExport field.
 Several fields can be dropped since BlockExport already has equivalents.
 The file names and meson build integration will be adjusted in a future
 patch. libvhost-user should probably be built as a static library that
 is linked into QEMU instead of as a .c file that results in duplicate
 compilation.
 The new command-line syntax is:
   $ qemu-storage-daemon \
       --blockdev file,node-name=drive0,filename=test.img \
       --export vhost-user-blk,node-name=drive0,id=export0,unix-socket=/tmp/vhost-user-blk.sock
 Note that unix-socket is optional because we may wish to accept chardevs
 too in the future.
 Markus noted that supported address families are not explicit in the
 QAPI schema. It is unlikely that support for more address families will
 be added since file descriptor passing is required and few address
 families support it. If a new address family needs to be added, then the
 QAPI 'features' syntax can be used to advertize them.
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Reviewed-by: Sergio Lopez <slp@redhat.com>
+Acked-by: Markus Armbruster <armbru@redhat.com>
-Message-id: 20200617132201.1832152-2-stefanha@redhat.com
+Message-id: 20200924151549.913737-12-stefanha@redhat.com
 [Skip test on big-endian host architectures because this device doesn't
 support them yet (as already mentioned in a code comment).
 --Stefan]
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- block/nvme.c | 12 ++++++++++++
+ qapi/block-export.json               |  21 +-
- 1 file changed, 12 insertions(+)
+ block/export/vhost-user-blk-server.h |  23 +-
+ block/export/export.c                |   6 +
-diff --git a/block/nvme.c b/block/nvme.c
+ block/export/vhost-user-blk-server.c | 452 +++++++--------------------
  util/vhost-user-server.c             |  10 +-
  block/export/meson.build             |   1 +
  block/meson.build                    |   1 -
  7 files changed, 156 insertions(+), 358 deletions(-)
 diff --git a/qapi/block-export.json b/qapi/block-export.json
 index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
+--- a/qapi/block-export.json
-+++ b/block/nvme.c
++++ b/qapi/block-export.json
-@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queues(BDRVNVMeState *s)
+@@ -XXX,XX +XXX,XX @@
+   'data': { '*name': 'str', '*description': 'str',
-     for (i = 0; i < s->nr_queues; i++) {
+             '*bitmap': 'str' } }
-         NVMeQueuePair *q = s->queues[i];
-+        const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
++##
-+        NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
++# @BlockExportOptionsVhostUserBlk:
 +#
 +# A vhost-user-blk block export.
 +#
 +# @addr: The vhost-user socket on which to listen. Both 'unix' and 'fd'
 +#        SocketAddress types are supported. Passed fds must be UNIX domain
 +#        sockets.
 +# @logical-block-size: Logical block size in bytes. Defaults to 512 bytes.
 +#
 +# Since: 5.2
 +##
 +{ 'struct': 'BlockExportOptionsVhostUserBlk',
 +  'data': { 'addr': 'SocketAddress', '*logical-block-size': 'size' } }
 +
-+        /*
+ ##
-+         * Do an early check for completions. q->lock isn't needed because
+ # @NbdServerAddOptions:
-+         * nvme_process_completion() only runs in the event loop thread and
+ #
-+         * cannot race with itself.
+@@ -XXX,XX +XXX,XX @@
-+         */
+ # An enumeration of block export types
-+        if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) {
+ #
-+            continue;
+ # @nbd: NBD export
 +# @vhost-user-blk: vhost-user-blk export (since 5.2)
  #
  # Since: 4.2
  ##
  { 'enum': 'BlockExportType',
 -  'data': [ 'nbd' ] }
 +  'data': [ 'nbd', 'vhost-user-blk' ] }
  ##
  # @BlockExportOptions:
@@ -XXX,XX +XXX,XX @@
              '*writethrough': 'bool' },
    'discriminator': 'type',
    'data': {
 -      'nbd': 'BlockExportOptionsNbd'
 +      'nbd': 'BlockExportOptionsNbd',
 +      'vhost-user-blk': 'BlockExportOptionsVhostUserBlk'
     } }
  ##
 diff --git a/block/export/vhost-user-blk-server.h b/block/export/vhost-user-blk-server.h
 index XXXXXXX..XXXXXXX 100644
 --- a/block/export/vhost-user-blk-server.h
 +++ b/block/export/vhost-user-blk-server.h
@@ -XXX,XX +XXX,XX @@
  #ifndef VHOST_USER_BLK_SERVER_H
  #define VHOST_USER_BLK_SERVER_H
 -#include "util/vhost-user-server.h"
 -typedef struct VuBlockDev VuBlockDev;
 -#define TYPE_VHOST_USER_BLK_SERVER "vhost-user-blk-server"
 -#define VHOST_USER_BLK_SERVER(obj) \
 -   OBJECT_CHECK(VuBlockDev, obj, TYPE_VHOST_USER_BLK_SERVER)
 +#include "block/export.h"
 -/* vhost user block device */
 -struct VuBlockDev {
 -    Object parent_obj;
 -    char *node_name;
 -    SocketAddress *addr;
 -    AioContext *ctx;
 -    VuServer vu_server;
 -    bool running;
 -    uint32_t blk_size;
 -    BlockBackend *backend;
 -    QIOChannelSocket *sioc;
 -    QTAILQ_ENTRY(VuBlockDev) next;
 -    struct virtio_blk_config blkcfg;
 -    bool writable;
 -};
 +/* For block/export/export.c */
 +extern const BlockExportDriver blk_exp_vhost_user_blk;
  #endif /* VHOST_USER_BLK_SERVER_H */
 diff --git a/block/export/export.c b/block/export/export.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/export/export.c
 +++ b/block/export/export.c
@@ -XXX,XX +XXX,XX @@
  #include "sysemu/block-backend.h"
  #include "block/export.h"
  #include "block/nbd.h"
 +#if CONFIG_LINUX
 +#include "block/export/vhost-user-blk-server.h"
 +#endif
  #include "qapi/error.h"
  #include "qapi/qapi-commands-block-export.h"
  #include "qapi/qapi-events-block-export.h"
@@ -XXX,XX +XXX,XX @@
  static const BlockExportDriver *blk_exp_drivers[] = {
      &blk_exp_nbd,
 +#if CONFIG_LINUX
 +    &blk_exp_vhost_user_blk,
 +#endif
  };
  /* Only accessed from the main thread */
 diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/export/vhost-user-blk-server.c
 +++ b/block/export/vhost-user-blk-server.c
@@ -XXX,XX +XXX,XX @@
   */
  #include "qemu/osdep.h"
  #include "block/block.h"
 +#include "contrib/libvhost-user/libvhost-user.h"
 +#include "standard-headers/linux/virtio_blk.h"
 +#include "util/vhost-user-server.h"
  #include "vhost-user-blk-server.h"
  #include "qapi/error.h"
  #include "qom/object_interfaces.h"
@@ -XXX,XX +XXX,XX @@ struct virtio_blk_inhdr {
      unsigned char status;
  };
 -typedef struct VuBlockReq {
 +typedef struct VuBlkReq {
      VuVirtqElement elem;
      int64_t sector_num;
      size_t size;
@@ -XXX,XX +XXX,XX @@ typedef struct VuBlockReq {
      struct virtio_blk_outhdr out;
      VuServer *server;
      struct VuVirtq *vq;
 -} VuBlockReq;
 +} VuBlkReq;
 -static void vu_block_req_complete(VuBlockReq *req)
 +/* vhost user block device */
 +typedef struct {
 +    BlockExport export;
 +    VuServer vu_server;
 +    uint32_t blk_size;
 +    QIOChannelSocket *sioc;
 +    struct virtio_blk_config blkcfg;
 +    bool writable;
 +} VuBlkExport;
 +
 +static void vu_blk_req_complete(VuBlkReq *req)
  {
      VuDev *vu_dev = &req->server->vu_dev;
@@ -XXX,XX +XXX,XX @@ static void vu_block_req_complete(VuBlockReq *req)
      free(req);
  }
 -static VuBlockDev *get_vu_block_device_by_server(VuServer *server)
 -{
 -    return container_of(server, VuBlockDev, vu_server);
 -}
 -
  static int coroutine_fn
 -vu_block_discard_write_zeroes(VuBlockReq *req, struct iovec *iov,
 -                              uint32_t iovcnt, uint32_t type)
 +vu_blk_discard_write_zeroes(BlockBackend *blk, struct iovec *iov,
 +                            uint32_t iovcnt, uint32_t type)
  {
      struct virtio_blk_discard_write_zeroes desc;
      ssize_t size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc));
@@ -XXX,XX +XXX,XX @@ vu_block_discard_write_zeroes(VuBlockReq *req, struct iovec *iov,
          return -EINVAL;
      }
 -    VuBlockDev *vdev_blk = get_vu_block_device_by_server(req->server);
      uint64_t range[2] = { le64_to_cpu(desc.sector) << 9,
                            le32_to_cpu(desc.num_sectors) << 9 };
      if (type == VIRTIO_BLK_T_DISCARD) {
 -        if (blk_co_pdiscard(vdev_blk->backend, range[0], range[1]) == 0) {
 +        if (blk_co_pdiscard(blk, range[0], range[1]) == 0) {
              return 0;
          }
      } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
 -        if (blk_co_pwrite_zeroes(vdev_blk->backend,
 -                                 range[0], range[1], 0) == 0) {
 +        if (blk_co_pwrite_zeroes(blk, range[0], range[1], 0) == 0) {
              return 0;
          }
      }
@@ -XXX,XX +XXX,XX @@ vu_block_discard_write_zeroes(VuBlockReq *req, struct iovec *iov,
      return -EINVAL;
  }
 -static int coroutine_fn vu_block_flush(VuBlockReq *req)
 +static void coroutine_fn vu_blk_virtio_process_req(void *opaque)
  {
 -    VuBlockDev *vdev_blk = get_vu_block_device_by_server(req->server);
 -    BlockBackend *backend = vdev_blk->backend;
 -    return blk_co_flush(backend);
 -}
 -
 -static void coroutine_fn vu_block_virtio_process_req(void *opaque)
 -{
 -    VuBlockReq *req = opaque;
 +    VuBlkReq *req = opaque;
      VuServer *server = req->server;
      VuVirtqElement *elem = &req->elem;
      uint32_t type;
 -    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
 -    BlockBackend *backend = vdev_blk->backend;
 +    VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
 +    BlockBackend *blk = vexp->export.blk;
      struct iovec *in_iov = elem->in_sg;
      struct iovec *out_iov = elem->out_sg;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn vu_block_virtio_process_req(void *opaque)
          bool is_write = type & VIRTIO_BLK_T_OUT;
          req->sector_num = le64_to_cpu(req->out.sector);
 -        int64_t offset = req->sector_num * vdev_blk->blk_size;
 +        if (is_write && !vexp->writable) {
 +            req->in->status = VIRTIO_BLK_S_IOERR;
 +            break;
 +        }
 +
-         qemu_mutex_lock(&q->lock);
++        int64_t offset = req->sector_num * vexp->blk_size;
-         while (nvme_process_completion(s, q)) {
+         QEMUIOVector qiov;
-             /* Keep polling */
+         if (is_write) {
              qemu_iovec_init_external(&qiov, out_iov, out_num);
 -            ret = blk_co_pwritev(backend, offset, qiov.size,
 -                                 &qiov, 0);
 +            ret = blk_co_pwritev(blk, offset, qiov.size, &qiov, 0);
          } else {
              qemu_iovec_init_external(&qiov, in_iov, in_num);
 -            ret = blk_co_preadv(backend, offset, qiov.size,
 -                                &qiov, 0);
 +            ret = blk_co_preadv(blk, offset, qiov.size, &qiov, 0);
          }
          if (ret >= 0) {
              req->in->status = VIRTIO_BLK_S_OK;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn vu_block_virtio_process_req(void *opaque)
          break;
      }
      case VIRTIO_BLK_T_FLUSH:
 -        if (vu_block_flush(req) == 0) {
 +        if (blk_co_flush(blk) == 0) {
              req->in->status = VIRTIO_BLK_S_OK;
          } else {
              req->in->status = VIRTIO_BLK_S_IOERR;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn vu_block_virtio_process_req(void *opaque)
      case VIRTIO_BLK_T_DISCARD:
      case VIRTIO_BLK_T_WRITE_ZEROES: {
          int rc;
 -        rc = vu_block_discard_write_zeroes(req, &elem->out_sg[1],
 -                                           out_num, type);
 +
 +        if (!vexp->writable) {
 +            req->in->status = VIRTIO_BLK_S_IOERR;
 +            break;
 +        }
 +
 +        rc = vu_blk_discard_write_zeroes(blk, &elem->out_sg[1], out_num, type);
          if (rc == 0) {
              req->in->status = VIRTIO_BLK_S_OK;
          } else {
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn vu_block_virtio_process_req(void *opaque)
          break;
      }
 -    vu_block_req_complete(req);
 +    vu_blk_req_complete(req);
      return;
  err:
 -    free(elem);
 +    free(req);
  }
 -static void vu_block_process_vq(VuDev *vu_dev, int idx)
 +static void vu_blk_process_vq(VuDev *vu_dev, int idx)
  {
      VuServer *server = container_of(vu_dev, VuServer, vu_dev);
      VuVirtq *vq = vu_get_queue(vu_dev, idx);
      while (1) {
 -        VuBlockReq *req;
 +        VuBlkReq *req;
 -        req = vu_queue_pop(vu_dev, vq, sizeof(VuBlockReq));
 +        req = vu_queue_pop(vu_dev, vq, sizeof(VuBlkReq));
          if (!req) {
              break;
          }
@@ -XXX,XX +XXX,XX @@ static void vu_block_process_vq(VuDev *vu_dev, int idx)
          req->vq = vq;
          Coroutine *co =
 -            qemu_coroutine_create(vu_block_virtio_process_req, req);
 +            qemu_coroutine_create(vu_blk_virtio_process_req, req);
          qemu_coroutine_enter(co);
      }
  }
 -static void vu_block_queue_set_started(VuDev *vu_dev, int idx, bool started)
 +static void vu_blk_queue_set_started(VuDev *vu_dev, int idx, bool started)
  {
      VuVirtq *vq;
      assert(vu_dev);
      vq = vu_get_queue(vu_dev, idx);
 -    vu_set_queue_handler(vu_dev, vq, started ? vu_block_process_vq : NULL);
 +    vu_set_queue_handler(vu_dev, vq, started ? vu_blk_process_vq : NULL);
  }
 -static uint64_t vu_block_get_features(VuDev *dev)
 +static uint64_t vu_blk_get_features(VuDev *dev)
  {
      uint64_t features;
      VuServer *server = container_of(dev, VuServer, vu_dev);
 -    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
 +    VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
      features = 1ull << VIRTIO_BLK_F_SIZE_MAX |
                 1ull << VIRTIO_BLK_F_SEG_MAX |
                 1ull << VIRTIO_BLK_F_TOPOLOGY |
@@ -XXX,XX +XXX,XX @@ static uint64_t vu_block_get_features(VuDev *dev)
                 1ull << VIRTIO_RING_F_EVENT_IDX |
                 1ull << VHOST_USER_F_PROTOCOL_FEATURES;
 -    if (!vdev_blk->writable) {
 +    if (!vexp->writable) {
          features |= 1ull << VIRTIO_BLK_F_RO;
      }
      return features;
  }
 -static uint64_t vu_block_get_protocol_features(VuDev *dev)
 +static uint64_t vu_blk_get_protocol_features(VuDev *dev)
  {
      return 1ull << VHOST_USER_PROTOCOL_F_CONFIG |
             1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD;
  }
  static int
 -vu_block_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
 +vu_blk_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
  {
 +    /* TODO blkcfg must be little-endian for VIRTIO 1.0 */
      VuServer *server = container_of(vu_dev, VuServer, vu_dev);
 -    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
 -    memcpy(config, &vdev_blk->blkcfg, len);
 -
 +    VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
 +    memcpy(config, &vexp->blkcfg, len);
      return 0;
  }
  static int
 -vu_block_set_config(VuDev *vu_dev, const uint8_t *data,
 +vu_blk_set_config(VuDev *vu_dev, const uint8_t *data,
                      uint32_t offset, uint32_t size, uint32_t flags)
  {
      VuServer *server = container_of(vu_dev, VuServer, vu_dev);
 -    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
 +    VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
      uint8_t wce;
      /* don't support live migration */
@@ -XXX,XX +XXX,XX @@ vu_block_set_config(VuDev *vu_dev, const uint8_t *data,
      }
      wce = *data;
 -    vdev_blk->blkcfg.wce = wce;
 -    blk_set_enable_write_cache(vdev_blk->backend, wce);
 +    vexp->blkcfg.wce = wce;
 +    blk_set_enable_write_cache(vexp->export.blk, wce);
      return 0;
  }
@@ -XXX,XX +XXX,XX @@ vu_block_set_config(VuDev *vu_dev, const uint8_t *data,
   * of vu_process_message.
   *
   */
 -static int vu_block_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply)
 +static int vu_blk_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply)
  {
      if (vmsg->request == VHOST_USER_NONE) {
          dev->panic(dev, "disconnect");
@@ -XXX,XX +XXX,XX @@ static int vu_block_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply)
      return false;
  }
 -static const VuDevIface vu_block_iface = {
 -    .get_features          = vu_block_get_features,
 -    .queue_set_started     = vu_block_queue_set_started,
 -    .get_protocol_features = vu_block_get_protocol_features,
 -    .get_config            = vu_block_get_config,
 -    .set_config            = vu_block_set_config,
 -    .process_msg           = vu_block_process_msg,
 +static const VuDevIface vu_blk_iface = {
 +    .get_features          = vu_blk_get_features,
 +    .queue_set_started     = vu_blk_queue_set_started,
 +    .get_protocol_features = vu_blk_get_protocol_features,
 +    .get_config            = vu_blk_get_config,
 +    .set_config            = vu_blk_set_config,
 +    .process_msg           = vu_blk_process_msg,
  };
  static void blk_aio_attached(AioContext *ctx, void *opaque)
  {
 -    VuBlockDev *vub_dev = opaque;
 -    vhost_user_server_attach_aio_context(&vub_dev->vu_server, ctx);
 +    VuBlkExport *vexp = opaque;
 +    vhost_user_server_attach_aio_context(&vexp->vu_server, ctx);
  }
  static void blk_aio_detach(void *opaque)
  {
 -    VuBlockDev *vub_dev = opaque;
 -    vhost_user_server_detach_aio_context(&vub_dev->vu_server);
 +    VuBlkExport *vexp = opaque;
 +    vhost_user_server_detach_aio_context(&vexp->vu_server);
  }
  static void
 -vu_block_initialize_config(BlockDriverState *bs,
 +vu_blk_initialize_config(BlockDriverState *bs,
                             struct virtio_blk_config *config, uint32_t blk_size)
  {
      config->capacity = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
@@ -XXX,XX +XXX,XX @@ vu_block_initialize_config(BlockDriverState *bs,
      config->max_write_zeroes_seg = 1;
  }
 -static VuBlockDev *vu_block_init(VuBlockDev *vu_block_device, Error **errp)
 +static void vu_blk_exp_request_shutdown(BlockExport *exp)
  {
 +    VuBlkExport *vexp = container_of(exp, VuBlkExport, export);
 -    BlockBackend *blk;
 -    Error *local_error = NULL;
 -    const char *node_name = vu_block_device->node_name;
 -    bool writable = vu_block_device->writable;
 -    uint64_t perm = BLK_PERM_CONSISTENT_READ;
 -    int ret;
 -
 -    AioContext *ctx;
 -
 -    BlockDriverState *bs = bdrv_lookup_bs(node_name, node_name, &local_error);
 -
 -    if (!bs) {
 -        error_propagate(errp, local_error);
 -        return NULL;
 -    }
 -
 -    if (bdrv_is_read_only(bs)) {
 -        writable = false;
 -    }
 -
 -    if (writable) {
 -        perm |= BLK_PERM_WRITE;
 -    }
 -
 -    ctx = bdrv_get_aio_context(bs);
 -    aio_context_acquire(ctx);
 -    bdrv_invalidate_cache(bs, NULL);
 -    aio_context_release(ctx);
 -
 -    /*
 -     * Don't allow resize while the vhost user server is running,
 -     * otherwise we don't care what happens with the node.
 -     */
 -    blk = blk_new(bdrv_get_aio_context(bs), perm,
 -                  BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
 -                  BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD);
 -    ret = blk_insert_bs(blk, bs, errp);
 -
 -    if (ret < 0) {
 -        goto fail;
 -    }
 -
 -    blk_set_enable_write_cache(blk, false);
 -
 -    blk_set_allow_aio_context_change(blk, true);
 -
 -    vu_block_device->blkcfg.wce = 0;
 -    vu_block_device->backend = blk;
 -    if (!vu_block_device->blk_size) {
 -        vu_block_device->blk_size = BDRV_SECTOR_SIZE;
 -    }
 -    vu_block_device->blkcfg.blk_size = vu_block_device->blk_size;
 -    blk_set_guest_block_size(blk, vu_block_device->blk_size);
 -    vu_block_initialize_config(bs, &vu_block_device->blkcfg,
 -                                   vu_block_device->blk_size);
 -    return vu_block_device;
 -
 -fail:
 -    blk_unref(blk);
 -    return NULL;
 -}
 -
 -static void vu_block_deinit(VuBlockDev *vu_block_device)
 -{
 -    if (vu_block_device->backend) {
 -        blk_remove_aio_context_notifier(vu_block_device->backend, blk_aio_attached,
 -                                        blk_aio_detach, vu_block_device);
 -    }
 -
 -    blk_unref(vu_block_device->backend);
 -}
 -
 -static void vhost_user_blk_server_stop(VuBlockDev *vu_block_device)
 -{
 -    vhost_user_server_stop(&vu_block_device->vu_server);
 -    vu_block_deinit(vu_block_device);
 -}
 -
 -static void vhost_user_blk_server_start(VuBlockDev *vu_block_device,
 -                                        Error **errp)
 -{
 -    AioContext *ctx;
 -    SocketAddress *addr = vu_block_device->addr;
 -
 -    if (!vu_block_init(vu_block_device, errp)) {
 -        return;
 -    }
 -
 -    ctx = bdrv_get_aio_context(blk_bs(vu_block_device->backend));
 -
 -    if (!vhost_user_server_start(&vu_block_device->vu_server, addr, ctx,
 -                                 VHOST_USER_BLK_MAX_QUEUES, &vu_block_iface,
 -                                 errp)) {
 -        goto error;
 -    }
 -
 -    blk_add_aio_context_notifier(vu_block_device->backend, blk_aio_attached,
 -                                 blk_aio_detach, vu_block_device);
 -    vu_block_device->running = true;
 -    return;
 -
 - error:
 -    vu_block_deinit(vu_block_device);
 -}
 -
 -static bool vu_prop_modifiable(VuBlockDev *vus, Error **errp)
 -{
 -    if (vus->running) {
 -            error_setg(errp, "The property can't be modified "
 -                       "while the server is running");
 -            return false;
 -    }
 -    return true;
 -}
 -
 -static void vu_set_node_name(Object *obj, const char *value, Error **errp)
 -{
 -    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
 -
 -    if (!vu_prop_modifiable(vus, errp)) {
 -        return;
 -    }
 -
 -    if (vus->node_name) {
 -        g_free(vus->node_name);
 -    }
 -
 -    vus->node_name = g_strdup(value);
 -}
 -
 -static char *vu_get_node_name(Object *obj, Error **errp)
 -{
 -    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
 -    return g_strdup(vus->node_name);
 -}
 -
 -static void free_socket_addr(SocketAddress *addr)
 -{
 -        g_free(addr->u.q_unix.path);
 -        g_free(addr);
 -}
 -
 -static void vu_set_unix_socket(Object *obj, const char *value,
 -                               Error **errp)
 -{
 -    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
 -
 -    if (!vu_prop_modifiable(vus, errp)) {
 -        return;
 -    }
 -
 -    if (vus->addr) {
 -        free_socket_addr(vus->addr);
 -    }
 -
 -    SocketAddress *addr = g_new0(SocketAddress, 1);
 -    addr->type = SOCKET_ADDRESS_TYPE_UNIX;
 -    addr->u.q_unix.path = g_strdup(value);
 -    vus->addr = addr;
 +    vhost_user_server_stop(&vexp->vu_server);
  }
 -static char *vu_get_unix_socket(Object *obj, Error **errp)
 +static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
 +                             Error **errp)
  {
 -    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
 -    return g_strdup(vus->addr->u.q_unix.path);
 -}
 -
 -static bool vu_get_block_writable(Object *obj, Error **errp)
 -{
 -    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
 -    return vus->writable;
 -}
 -
 -static void vu_set_block_writable(Object *obj, bool value, Error **errp)
 -{
 -    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
 -
 -    if (!vu_prop_modifiable(vus, errp)) {
 -            return;
 -    }
 -
 -    vus->writable = value;
 -}
 -
 -static void vu_get_blk_size(Object *obj, Visitor *v, const char *name,
 -                            void *opaque, Error **errp)
 -{
 -    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
 -    uint32_t value = vus->blk_size;
 -
 -    visit_type_uint32(v, name, &value, errp);
 -}
 -
 -static void vu_set_blk_size(Object *obj, Visitor *v, const char *name,
 -                            void *opaque, Error **errp)
 -{
 -    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
 -
 +    VuBlkExport *vexp = container_of(exp, VuBlkExport, export);
 +    BlockExportOptionsVhostUserBlk *vu_opts = &opts->u.vhost_user_blk;
      Error *local_err = NULL;
 -    uint32_t value;
 +    uint64_t logical_block_size;
 -    if (!vu_prop_modifiable(vus, errp)) {
 -            return;
 -    }
 +    vexp->writable = opts->writable;
 +    vexp->blkcfg.wce = 0;
 -    visit_type_uint32(v, name, &value, &local_err);
 -    if (local_err) {
 -        goto out;
 +    if (vu_opts->has_logical_block_size) {
 +        logical_block_size = vu_opts->logical_block_size;
 +    } else {
 +        logical_block_size = BDRV_SECTOR_SIZE;
      }
 -
 -    check_block_size(object_get_typename(obj), name, value, &local_err);
 +    check_block_size(exp->id, "logical-block-size", logical_block_size,
 +                     &local_err);
      if (local_err) {
 -        goto out;
 +        error_propagate(errp, local_err);
 +        return -EINVAL;
 +    }
 +    vexp->blk_size = logical_block_size;
 +    blk_set_guest_block_size(exp->blk, logical_block_size);
 +    vu_blk_initialize_config(blk_bs(exp->blk), &vexp->blkcfg,
 +                               logical_block_size);
 +
 +    blk_set_allow_aio_context_change(exp->blk, true);
 +    blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
 +                                 vexp);
 +
 +    if (!vhost_user_server_start(&vexp->vu_server, vu_opts->addr, exp->ctx,
 +                                 VHOST_USER_BLK_MAX_QUEUES, &vu_blk_iface,
 +                                 errp)) {
 +        blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
 +                                        blk_aio_detach, vexp);
 +        return -EADDRNOTAVAIL;
      }
 -    vus->blk_size = value;
 -
 -out:
 -    error_propagate(errp, local_err);
 -}
 -
 -static void vhost_user_blk_server_instance_finalize(Object *obj)
 -{
 -    VuBlockDev *vub = VHOST_USER_BLK_SERVER(obj);
 -
 -    vhost_user_blk_server_stop(vub);
 -
 -    /*
 -     * Unlike object_property_add_str, object_class_property_add_str
 -     * doesn't have a release method. Thus manual memory freeing is
 -     * needed.
 -     */
 -    free_socket_addr(vub->addr);
 -    g_free(vub->node_name);
 -}
 -
 -static void vhost_user_blk_server_complete(UserCreatable *obj, Error **errp)
 -{
 -    VuBlockDev *vub = VHOST_USER_BLK_SERVER(obj);
 -
 -    vhost_user_blk_server_start(vub, errp);
 +    return 0;
  }
 -static void vhost_user_blk_server_class_init(ObjectClass *klass,
 -                                             void *class_data)
 +static void vu_blk_exp_delete(BlockExport *exp)
  {
 -    UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass);
 -    ucc->complete = vhost_user_blk_server_complete;
 -
 -    object_class_property_add_bool(klass, "writable",
 -                                   vu_get_block_writable,
 -                                   vu_set_block_writable);
 -
 -    object_class_property_add_str(klass, "node-name",
 -                                  vu_get_node_name,
 -                                  vu_set_node_name);
 -
 -    object_class_property_add_str(klass, "unix-socket",
 -                                  vu_get_unix_socket,
 -                                  vu_set_unix_socket);
 +    VuBlkExport *vexp = container_of(exp, VuBlkExport, export);
 -    object_class_property_add(klass, "logical-block-size", "uint32",
 -                              vu_get_blk_size, vu_set_blk_size,
 -                              NULL, NULL);
 +    blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
 +                                    vexp);
  }
 -static const TypeInfo vhost_user_blk_server_info = {
 -    .name = TYPE_VHOST_USER_BLK_SERVER,
 -    .parent = TYPE_OBJECT,
 -    .instance_size = sizeof(VuBlockDev),
 -    .instance_finalize = vhost_user_blk_server_instance_finalize,
 -    .class_init = vhost_user_blk_server_class_init,
 -    .interfaces = (InterfaceInfo[]) {
 -        {TYPE_USER_CREATABLE},
 -        {}
 -    },
 +const BlockExportDriver blk_exp_vhost_user_blk = {
 +    .type               = BLOCK_EXPORT_TYPE_VHOST_USER_BLK,
 +    .instance_size      = sizeof(VuBlkExport),
 +    .create             = vu_blk_exp_create,
 +    .delete             = vu_blk_exp_delete,
 +    .request_shutdown   = vu_blk_exp_request_shutdown,
  };
 -
 -static void vhost_user_blk_server_register_types(void)
 -{
 -    type_register_static(&vhost_user_blk_server_info);
 -}
 -
 -type_init(vhost_user_blk_server_register_types)
 diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/vhost-user-server.c
 +++ b/util/vhost-user-server.c
@@ -XXX,XX +XXX,XX @@ bool vhost_user_server_start(VuServer *server,
                               Error **errp)
  {
      QEMUBH *bh;
 -    QIONetListener *listener = qio_net_listener_new();
 +    QIONetListener *listener;
 +
 +    if (socket_addr->type != SOCKET_ADDRESS_TYPE_UNIX &&
 +        socket_addr->type != SOCKET_ADDRESS_TYPE_FD) {
 +        error_setg(errp, "Only socket address types 'unix' and 'fd' are supported");
 +        return false;
 +    }
 +
 +    listener = qio_net_listener_new();
      if (qio_net_listener_open_sync(listener, socket_addr, 1,
                                     errp) < 0) {
          object_unref(OBJECT(listener));
 diff --git a/block/export/meson.build b/block/export/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/block/export/meson.build
 +++ b/block/export/meson.build
@@ -1 +1,2 @@
  block_ss.add(files('export.c'))
 +block_ss.add(when: 'CONFIG_LINUX', if_true: files('vhost-user-blk-server.c', '../../contrib/libvhost-user/libvhost-user.c'))
 diff --git a/block/meson.build b/block/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/block/meson.build
 +++ b/block/meson.build
@@ -XXX,XX +XXX,XX @@ block_ss.add(when: 'CONFIG_WIN32', if_true: files('file-win32.c', 'win32-aio.c')
  block_ss.add(when: 'CONFIG_POSIX', if_true: [files('file-posix.c'), coref, iokit])
  block_ss.add(when: 'CONFIG_LIBISCSI', if_true: files('iscsi-opts.c'))
  block_ss.add(when: 'CONFIG_LINUX', if_true: files('nvme.c'))
 -block_ss.add(when: 'CONFIG_LINUX', if_true: files('export/vhost-user-blk-server.c', '../contrib/libvhost-user/libvhost-user.c'))
  block_ss.add(when: 'CONFIG_REPLICATION', if_true: files('replication.c'))
  block_ss.add(when: 'CONFIG_SHEEPDOG', if_true: files('sheepdog.c'))
  block_ss.add(when: ['CONFIG_LINUX_AIO', libaio], if_true: files('linux-aio.c'))
 --
 2.26.2

-New patch
+[PULL v2 18/28] util/vhost-user-server: move header to include/
+Headers used by other subsystems are located in include/. Also add the
+vhost-user-server and vhost-user-blk-server headers to MAINTAINERS.
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Message-id: 20200924151549.913737-13-stefanha@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ MAINTAINERS                                | 4 +++-
+ {util => include/qemu}/vhost-user-server.h | 0
+ block/export/vhost-user-blk-server.c       | 2 +-
+ util/vhost-user-server.c                   | 2 +-
+ 4 files changed, 5 insertions(+), 3 deletions(-)
+ rename {util => include/qemu}/vhost-user-server.h (100%)
+diff --git a/MAINTAINERS b/MAINTAINERS
+index XXXXXXX..XXXXXXX 100644
+--- a/MAINTAINERS
++++ b/MAINTAINERS
+@@ -XXX,XX +XXX,XX @@ Vhost-user block device backend server
+ M: Coiby Xu <Coiby.Xu@gmail.com>
+ S: Maintained
+ F: block/export/vhost-user-blk-server.c
+-F: util/vhost-user-server.c
++F: block/export/vhost-user-blk-server.h
++F: include/qemu/vhost-user-server.h
+ F: tests/qtest/libqos/vhost-user-blk.c
++F: util/vhost-user-server.c
+ Replication
+ M: Wen Congyang <wencongyang2@huawei.com>
+diff --git a/util/vhost-user-server.h b/include/qemu/vhost-user-server.h
+similarity index 100%
+rename from util/vhost-user-server.h
+rename to include/qemu/vhost-user-server.h
+diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/export/vhost-user-blk-server.c
++++ b/block/export/vhost-user-blk-server.c
+@@ -XXX,XX +XXX,XX @@
+ #include "block/block.h"
+ #include "contrib/libvhost-user/libvhost-user.h"
+ #include "standard-headers/linux/virtio_blk.h"
+-#include "util/vhost-user-server.h"
++#include "qemu/vhost-user-server.h"
+ #include "vhost-user-blk-server.h"
+ #include "qapi/error.h"
+ #include "qom/object_interfaces.h"
+diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c
+index XXXXXXX..XXXXXXX 100644
+--- a/util/vhost-user-server.c
++++ b/util/vhost-user-server.c
+@@ -XXX,XX +XXX,XX @@
+  */
+ #include "qemu/osdep.h"
+ #include "qemu/main-loop.h"
++#include "qemu/vhost-user-server.h"
+ #include "block/aio-wait.h"
+-#include "vhost-user-server.h"
+ /*
+  * Theory of operation:
+--
+2.26.2

-New patch
+[PULL v2 19/28] util/vhost-user-server: use static library in meson.build
+Don't compile contrib/libvhost-user/libvhost-user.c again. Instead build
+the static library once and then reuse it throughout QEMU.
+Also switch from CONFIG_LINUX to CONFIG_VHOST_USER, which is what the
+vhost-user tools (vhost-user-gpu, etc) do.
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Message-id: 20200924151549.913737-14-stefanha@redhat.com
+[Added CONFIG_LINUX again because libvhost-user doesn't build on macOS.
+--Stefan]
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ block/export/export.c             | 8 ++++----
+ block/export/meson.build          | 2 +-
+ contrib/libvhost-user/meson.build | 1 +
+ meson.build                       | 6 +++++-
+ util/meson.build                  | 4 +++-
+ 5 files changed, 14 insertions(+), 7 deletions(-)
+diff --git a/block/export/export.c b/block/export/export.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/export/export.c
++++ b/block/export/export.c
+@@ -XXX,XX +XXX,XX @@
+ #include "sysemu/block-backend.h"
+ #include "block/export.h"
+ #include "block/nbd.h"
+-#if CONFIG_LINUX
+-#include "block/export/vhost-user-blk-server.h"
+-#endif
+ #include "qapi/error.h"
+ #include "qapi/qapi-commands-block-export.h"
+ #include "qapi/qapi-events-block-export.h"
+ #include "qemu/id.h"
++#ifdef CONFIG_VHOST_USER
++#include "vhost-user-blk-server.h"
++#endif
+ static const BlockExportDriver *blk_exp_drivers[] = {
+     &blk_exp_nbd,
+-#if CONFIG_LINUX
++#ifdef CONFIG_VHOST_USER
+     &blk_exp_vhost_user_blk,
+ #endif
+ };
+diff --git a/block/export/meson.build b/block/export/meson.build
+index XXXXXXX..XXXXXXX 100644
+--- a/block/export/meson.build
++++ b/block/export/meson.build
+@@ -XXX,XX +XXX,XX @@
+ block_ss.add(files('export.c'))
+-block_ss.add(when: 'CONFIG_LINUX', if_true: files('vhost-user-blk-server.c', '../../contrib/libvhost-user/libvhost-user.c'))
++block_ss.add(when: ['CONFIG_LINUX', 'CONFIG_VHOST_USER'], if_true: files('vhost-user-blk-server.c'))
+diff --git a/contrib/libvhost-user/meson.build b/contrib/libvhost-user/meson.build
+index XXXXXXX..XXXXXXX 100644
+--- a/contrib/libvhost-user/meson.build
++++ b/contrib/libvhost-user/meson.build
+@@ -XXX,XX +XXX,XX @@
+ libvhost_user = static_library('vhost-user',
+                                files('libvhost-user.c', 'libvhost-user-glib.c'),
+                                build_by_default: false)
++vhost_user = declare_dependency(link_with: libvhost_user)
+diff --git a/meson.build b/meson.build
+index XXXXXXX..XXXXXXX 100644
+--- a/meson.build
++++ b/meson.build
+@@ -XXX,XX +XXX,XX @@ trace_events_subdirs += [
+   'util',
+ ]
++vhost_user = not_found
++if 'CONFIG_VHOST_USER' in config_host
++  subdir('contrib/libvhost-user')
++endif
++
+ subdir('qapi')
+ subdir('qobject')
+ subdir('stubs')
+@@ -XXX,XX +XXX,XX @@ if have_tools
+              install: true)
+   if 'CONFIG_VHOST_USER' in config_host
+-    subdir('contrib/libvhost-user')
+     subdir('contrib/vhost-user-blk')
+     subdir('contrib/vhost-user-gpu')
+     subdir('contrib/vhost-user-input')
+diff --git a/util/meson.build b/util/meson.build
+index XXXXXXX..XXXXXXX 100644
+--- a/util/meson.build
++++ b/util/meson.build
+@@ -XXX,XX +XXX,XX @@ if have_block
+   util_ss.add(files('main-loop.c'))
+   util_ss.add(files('nvdimm-utils.c'))
+   util_ss.add(files('qemu-coroutine.c', 'qemu-coroutine-lock.c', 'qemu-coroutine-io.c'))
+-  util_ss.add(when: 'CONFIG_LINUX', if_true: files('vhost-user-server.c'))
++  util_ss.add(when: ['CONFIG_LINUX', 'CONFIG_VHOST_USER'], if_true: [
++    files('vhost-user-server.c'), vhost_user
++  ])
+   util_ss.add(files('block-helpers.c'))
+   util_ss.add(files('qemu-coroutine-sleep.c'))
+   util_ss.add(files('qemu-co-shared-resource.c'))
+--
+2.26.2

-New patch
+[PULL v2 20/28] qemu-storage-daemon: avoid compiling blockdev_ss twice
+Introduce libblockdev.fa to avoid compiling blockdev_ss twice.
+Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Message-id: 20200929125516.186715-3-stefanha@redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ meson.build                | 12 ++++++++++--
+ storage-daemon/meson.build |  3 +--
+ 2 files changed, 11 insertions(+), 4 deletions(-)
+diff --git a/meson.build b/meson.build
+index XXXXXXX..XXXXXXX 100644
+--- a/meson.build
++++ b/meson.build
+@@ -XXX,XX +XXX,XX @@ blockdev_ss.add(files(
+ # os-win32.c does not
+ blockdev_ss.add(when: 'CONFIG_POSIX', if_true: files('os-posix.c'))
+ softmmu_ss.add(when: 'CONFIG_WIN32', if_true: [files('os-win32.c')])
+-softmmu_ss.add_all(blockdev_ss)
+ common_ss.add(files('cpus-common.c'))
+@@ -XXX,XX +XXX,XX @@ block = declare_dependency(link_whole: [libblock],
+                            link_args: '@block.syms',
+                            dependencies: [crypto, io])
++blockdev_ss = blockdev_ss.apply(config_host, strict: false)
++libblockdev = static_library('blockdev', blockdev_ss.sources() + genh,
++                             dependencies: blockdev_ss.dependencies(),
++                             name_suffix: 'fa',
++                             build_by_default: false)
++
++blockdev = declare_dependency(link_whole: [libblockdev],
++                              dependencies: [block])
++
+ qmp_ss = qmp_ss.apply(config_host, strict: false)
+ libqmp = static_library('qmp', qmp_ss.sources() + genh,
+                         dependencies: qmp_ss.dependencies(),
+@@ -XXX,XX +XXX,XX @@ foreach m : block_mods + softmmu_mods
+                 install_dir: config_host['qemu_moddir'])
+ endforeach
+-softmmu_ss.add(authz, block, chardev, crypto, io, qmp)
++softmmu_ss.add(authz, blockdev, chardev, crypto, io, qmp)
+ common_ss.add(qom, qemuutil)
+ common_ss.add_all(when: 'CONFIG_SOFTMMU', if_true: [softmmu_ss])
+diff --git a/storage-daemon/meson.build b/storage-daemon/meson.build
+index XXXXXXX..XXXXXXX 100644
+--- a/storage-daemon/meson.build
++++ b/storage-daemon/meson.build
+@@ -XXX,XX +XXX,XX @@
+ qsd_ss = ss.source_set()
+ qsd_ss.add(files('qemu-storage-daemon.c'))
+-qsd_ss.add(block, chardev, qmp, qom, qemuutil)
+-qsd_ss.add_all(blockdev_ss)
++qsd_ss.add(blockdev, chardev, qmp, qom, qemuutil)
+ subdir('qapi')
+--
+2.26.2

-New patch
+[PULL v2 21/28] block: move block exports to libblockdev
+Block exports are used by softmmu, qemu-storage-daemon, and qemu-nbd.
+They are not used by other programs and are not otherwise needed in
+libblock.
+Undo the recent move of blockdev-nbd.c from blockdev_ss into block_ss.
+Since bdrv_close_all() (libblock) calls blk_exp_close_all()
+(libblockdev), a stub function is required.
+Make qemu-nbd.c use signal handling utility functions instead of
+duplicating the code. This helps because os-posix.c is in libblockdev
+and it depends on a qemu_system_killed() symbol that qemu-nbd.c lacks.
+Once we use the signal handling utility functions we also end up
+providing the necessary symbol.
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: Eric Blake <eblake@redhat.com>
+Message-id: 20200929125516.186715-4-stefanha@redhat.com
+[Fixed s/ndb/nbd/ typo in commit description as suggested by Eric Blake
+--Stefan]
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ qemu-nbd.c                | 21 ++++++++-------------
+ stubs/blk-exp-close-all.c |  7 +++++++
+ block/export/meson.build  |  4 ++--
+ meson.build               |  4 ++--
+ nbd/meson.build           |  2 ++
+ stubs/meson.build         |  1 +
+ 6 files changed, 22 insertions(+), 17 deletions(-)
+ create mode 100644 stubs/blk-exp-close-all.c
+diff --git a/qemu-nbd.c b/qemu-nbd.c
+index XXXXXXX..XXXXXXX 100644
+--- a/qemu-nbd.c
++++ b/qemu-nbd.c
+@@ -XXX,XX +XXX,XX @@
+ #include "qapi/error.h"
+ #include "qemu/cutils.h"
+ #include "sysemu/block-backend.h"
++#include "sysemu/runstate.h" /* for qemu_system_killed() prototype */
+ #include "block/block_int.h"
+ #include "block/nbd.h"
+ #include "qemu/main-loop.h"
+@@ -XXX,XX +XXX,XX @@ QEMU_COPYRIGHT "\n"
+ }
+ #ifdef CONFIG_POSIX
+-static void termsig_handler(int signum)
++/*
++ * The client thread uses SIGTERM to interrupt the server.  A signal
++ * handler ensures that "qemu-nbd -v -c" exits with a nice status code.
++ */
++void qemu_system_killed(int signum, pid_t pid)
+ {
+     qatomic_cmpxchg(&state, RUNNING, TERMINATE);
+     qemu_notify_event();
+@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
+     BlockExportOptions *export_opts;
+ #ifdef CONFIG_POSIX
+-    /*
+-     * Exit gracefully on various signals, which includes SIGTERM used
+-     * by 'qemu-nbd -v -c'.
+-     */
+-    struct sigaction sa_sigterm;
+-    memset(&sa_sigterm, 0, sizeof(sa_sigterm));
+-    sa_sigterm.sa_handler = termsig_handler;
+-    sigaction(SIGTERM, &sa_sigterm, NULL);
+-    sigaction(SIGINT, &sa_sigterm, NULL);
+-    sigaction(SIGHUP, &sa_sigterm, NULL);
+-
+-    signal(SIGPIPE, SIG_IGN);
++    os_setup_early_signal_handling();
++    os_setup_signal_handling();
+ #endif
+     socket_init();
+diff --git a/stubs/blk-exp-close-all.c b/stubs/blk-exp-close-all.c
+new file mode 100644
+index XXXXXXX..XXXXXXX
+--- /dev/null
++++ b/stubs/blk-exp-close-all.c
+@@ -XXX,XX +XXX,XX @@
++#include "qemu/osdep.h"
++#include "block/export.h"
++
++/* Only used in programs that support block exports (libblockdev.fa) */
++void blk_exp_close_all(void)
++{
++}
+diff --git a/block/export/meson.build b/block/export/meson.build
+index XXXXXXX..XXXXXXX 100644
+--- a/block/export/meson.build
++++ b/block/export/meson.build
+@@ -XXX,XX +XXX,XX @@
+-block_ss.add(files('export.c'))
+-block_ss.add(when: ['CONFIG_LINUX', 'CONFIG_VHOST_USER'], if_true: files('vhost-user-blk-server.c'))
++blockdev_ss.add(files('export.c'))
++blockdev_ss.add(when: ['CONFIG_LINUX', 'CONFIG_VHOST_USER'], if_true: files('vhost-user-blk-server.c'))
+diff --git a/meson.build b/meson.build
+index XXXXXXX..XXXXXXX 100644
+--- a/meson.build
++++ b/meson.build
+@@ -XXX,XX +XXX,XX @@ subdir('dump')
+ block_ss.add(files(
+   'block.c',
+-  'blockdev-nbd.c',
+   'blockjob.c',
+   'job.c',
+   'qemu-io-cmds.c',
+@@ -XXX,XX +XXX,XX @@ subdir('block')
+ blockdev_ss.add(files(
+   'blockdev.c',
++  'blockdev-nbd.c',
+   'iothread.c',
+   'job-qmp.c',
+ ))
+@@ -XXX,XX +XXX,XX @@ if have_tools
+   qemu_io = executable('qemu-io', files('qemu-io.c'),
+              dependencies: [block, qemuutil], install: true)
+   qemu_nbd = executable('qemu-nbd', files('qemu-nbd.c'),
+-               dependencies: [block, qemuutil], install: true)
++               dependencies: [blockdev, qemuutil], install: true)
+   subdir('storage-daemon')
+   subdir('contrib/rdmacm-mux')
+diff --git a/nbd/meson.build b/nbd/meson.build
+index XXXXXXX..XXXXXXX 100644
+--- a/nbd/meson.build
++++ b/nbd/meson.build
+@@ -XXX,XX +XXX,XX @@
+ block_ss.add(files(
+   'client.c',
+   'common.c',
++))
++blockdev_ss.add(files(
+   'server.c',
+ ))
+diff --git a/stubs/meson.build b/stubs/meson.build
+index XXXXXXX..XXXXXXX 100644
+--- a/stubs/meson.build
++++ b/stubs/meson.build
+@@ -XXX,XX +XXX,XX @@
+ stub_ss.add(files('arch_type.c'))
+ stub_ss.add(files('bdrv-next-monitor-owned.c'))
+ stub_ss.add(files('blk-commit-all.c'))
++stub_ss.add(files('blk-exp-close-all.c'))
+ stub_ss.add(files('blockdev-close-all-bdrv-states.c'))
+ stub_ss.add(files('change-state-handler.c'))
+ stub_ss.add(files('cmos.c'))
+--
+2.26.2

-[PULL 12/12] block/nvme: support nested aio_poll()
+[PULL v2 22/28] block/export: add iothread and fixed-iothread options
-QEMU block drivers are supposed to support aio_poll() from I/O
+Make it possible to specify the iothread where the export will run. By
-completion callback functions. This means completion processing must be
+default the block node can be moved to other AioContexts later and the
-re-entrant.
+export will follow. The fixed-iothread option forces strict behavior
+that prevents changing AioContext while the export is active. See the
-The standard approach is to schedule a BH during completion processing
+QAPI docs for details.
 and cancel it at the end of processing. If aio_poll() is invoked by a
 callback function then the BH will run. The BH continues the suspended
 completion processing.
 All of this means that request A's cb() can synchronously wait for
 request B to complete. Previously the nvme block driver would hang
 because it didn't process completions from nested aio_poll().
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Reviewed-by: Sergio Lopez <slp@redhat.com>
+Message-id: 20200929125516.186715-5-stefanha@redhat.com
-Message-id: 20200617132201.1832152-8-stefanha@redhat.com
+[Fix stray '#' character in block-export.json and add missing "(since:
 5.2)" as suggested by Eric Blake.
 --Stefan]
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- block/nvme.c       | 67 ++++++++++++++++++++++++++++++++++++++++------
+ qapi/block-export.json               | 11 ++++++++++
- block/trace-events |  2 +-
+ block/export/export.c                | 31 +++++++++++++++++++++++++++-
- 2 files changed, 60 insertions(+), 9 deletions(-)
+ block/export/vhost-user-blk-server.c |  5 ++++-
  nbd/server.c                         |  2 --
  4 files changed, 45 insertions(+), 4 deletions(-)
-diff --git a/block/nvme.c b/block/nvme.c
+diff --git a/qapi/block-export.json b/qapi/block-export.json
 index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
+--- a/qapi/block-export.json
-+++ b/block/nvme.c
++++ b/qapi/block-export.json
-@@ -XXX,XX +XXX,XX @@ typedef struct {
+@@ -XXX,XX +XXX,XX @@
-     int         cq_phase;
+ #                export before completion is signalled. (since: 5.2;
-     int         free_req_head;
+ #                default: false)
-     NVMeRequest reqs[NVME_NUM_REQS];
+ #
--    bool        busy;
++# @iothread: The name of the iothread object where the export will run. The
-     int         need_kick;
++#            default is to use the thread currently associated with the
-     int         inflight;
++#            block node. (since: 5.2)
 +#
 +# @fixed-iothread: True prevents the block node from being moved to another
 +#                  thread while the export is active. If true and @iothread is
 +#                  given, export creation fails if the block node cannot be
 +#                  moved to the iothread. The default is false. (since: 5.2)
 +#
  # Since: 4.2
  ##
  { 'union': 'BlockExportOptions',
    'base': { 'type': 'BlockExportType',
              'id': 'str',
 +        '*fixed-iothread': 'bool',
 +        '*iothread': 'str',
              'node-name': 'str',
              '*writable': 'bool',
              '*writethrough': 'bool' },
 diff --git a/block/export/export.c b/block/export/export.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/export/export.c
 +++ b/block/export/export.c
@@ -XXX,XX +XXX,XX @@
  #include "block/block.h"
  #include "sysemu/block-backend.h"
 +#include "sysemu/iothread.h"
  #include "block/export.h"
  #include "block/nbd.h"
  #include "qapi/error.h"
@@ -XXX,XX +XXX,XX @@ static const BlockExportDriver *blk_exp_find_driver(BlockExportType type)
  BlockExport *blk_exp_add(BlockExportOptions *export, Error **errp)
  {
 +    bool fixed_iothread = export->has_fixed_iothread && export->fixed_iothread;
      const BlockExportDriver *drv;
      BlockExport *exp = NULL;
      BlockDriverState *bs;
 -    BlockBackend *blk;
 +    BlockBackend *blk = NULL;
      AioContext *ctx;
      uint64_t perm;
      int ret;
@@ -XXX,XX +XXX,XX @@ BlockExport *blk_exp_add(BlockExportOptions *export, Error **errp)
      ctx = bdrv_get_aio_context(bs);
      aio_context_acquire(ctx);
 +    if (export->has_iothread) {
 +        IOThread *iothread;
 +        AioContext *new_ctx;
 +
-+    /* Thread-safe, no lock necessary */
++        iothread = iothread_by_id(export->iothread);
-+    QEMUBH      *completion_bh;
++        if (!iothread) {
- } NVMeQueuePair;
++            error_setg(errp, "iothread \"%s\" not found", export->iothread);
++            goto fail;
- /* Memory mapped registers */
++        }
@@ -XXX,XX +XXX,XX @@ struct BDRVNVMeState {
  #define NVME_BLOCK_OPT_DEVICE "device"
  #define NVME_BLOCK_OPT_NAMESPACE "namespace"
 +static void nvme_process_completion_bh(void *opaque);
 +
- static QemuOptsList runtime_opts = {
++        new_ctx = iothread_get_aio_context(iothread);
      .name = "nvme",
      .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
@@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,
  static void nvme_free_queue_pair(NVMeQueuePair *q)
  {
 +    if (q->completion_bh) {
 +        qemu_bh_delete(q->completion_bh);
 +    }
      qemu_vfree(q->prp_list_pages);
      qemu_vfree(q->sq.queue);
      qemu_vfree(q->cq.queue);
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
      q->index = idx;
      qemu_co_queue_init(&q->free_req_queue);
      q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
 +    q->completion_bh = aio_bh_new(bdrv_get_aio_context(bs),
 +                                  nvme_process_completion_bh, q);
      r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
                            s->page_size * NVME_NUM_REQS,
                            false, &prp_list_iova);
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q)
      NvmeCqe *c;
      trace_nvme_process_completion(s, q->index, q->inflight);
 -    if (q->busy || s->plugged) {
 -        trace_nvme_process_completion_queue_busy(s, q->index);
 +    if (s->plugged) {
 +        trace_nvme_process_completion_queue_plugged(s, q->index);
          return false;
      }
 -    q->busy = true;
 +
-+    /*
++        ret = bdrv_try_set_aio_context(bs, new_ctx, errp);
-+     * Support re-entrancy when a request cb() function invokes aio_poll().
++        if (ret == 0) {
-+     * Pending completions must be visible to aio_poll() so that a cb()
++            aio_context_release(ctx);
-+     * function can wait for the completion of another request.
++            aio_context_acquire(new_ctx);
-+     *
++            ctx = new_ctx;
-+     * The aio_poll() loop will execute our BH and we'll resume completion
++        } else if (fixed_iothread) {
-+     * processing there.
++            goto fail;
-+     */
++        }
 +    qemu_bh_schedule(q->completion_bh);
 +
      assert(q->inflight >= 0);
      while (q->inflight) {
          int ret;
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q)
          assert(req.cb);
          nvme_put_free_req_locked(q, preq);
          preq->cb = preq->opaque = NULL;
 -        qemu_mutex_unlock(&q->lock);
 -        req.cb(req.opaque, ret);
 -        qemu_mutex_lock(&q->lock);
          q->inflight--;
 +        qemu_mutex_unlock(&q->lock);
 +        req.cb(req.opaque, ret);
 +        qemu_mutex_lock(&q->lock);
          progress = true;
      }
      if (progress) {
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q)
          *q->cq.doorbell = cpu_to_le32(q->cq.head);
          nvme_wake_free_req_locked(q);
      }
 -    q->busy = false;
 +
 +    qemu_bh_cancel(q->completion_bh);
 +
      return progress;
  }
 +static void nvme_process_completion_bh(void *opaque)
 +{
 +    NVMeQueuePair *q = opaque;
 +
 +    /*
 +     * We're being invoked because a nvme_process_completion() cb() function
 +     * called aio_poll(). The callback may be waiting for further completions
 +     * so notify the device that it has space to fill in more completions now.
 +     */
 +    smp_mb_release();
 +    *q->cq.doorbell = cpu_to_le32(q->cq.head);
 +    nvme_wake_free_req_locked(q);
 +
 +    nvme_process_completion(q);
 +}
 +
  static void nvme_trace_command(const NvmeCmd *cmd)
  {
      int i;
@@ -XXX,XX +XXX,XX @@ static void nvme_detach_aio_context(BlockDriverState *bs)
  {
      BDRVNVMeState *s = bs->opaque;
 +    for (int i = 0; i < s->nr_queues; i++) {
 +        NVMeQueuePair *q = s->queues[i];
 +
 +        qemu_bh_delete(q->completion_bh);
 +        q->completion_bh = NULL;
 +    }
 +
-     aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
+     /*
-                            false, NULL, NULL);
+      * Block exports are used for non-shared storage migration. Make sure
       * that BDRV_O_INACTIVE is cleared and the image is ready for write
@@ -XXX,XX +XXX,XX @@ BlockExport *blk_exp_add(BlockExportOptions *export, Error **errp)
      }
      blk = blk_new(ctx, perm, BLK_PERM_ALL);
 +
 +    if (!fixed_iothread) {
 +        blk_set_allow_aio_context_change(blk, true);
 +    }
 +
      ret = blk_insert_bs(blk, bs, errp);
      if (ret < 0) {
          goto fail;
 diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/export/vhost-user-blk-server.c
 +++ b/block/export/vhost-user-blk-server.c
@@ -XXX,XX +XXX,XX @@ static const VuDevIface vu_blk_iface = {
  static void blk_aio_attached(AioContext *ctx, void *opaque)
  {
      VuBlkExport *vexp = opaque;
 +
 +    vexp->export.ctx = ctx;
      vhost_user_server_attach_aio_context(&vexp->vu_server, ctx);
  }
-@@ -XXX,XX +XXX,XX @@ static void nvme_attach_aio_context(BlockDriverState *bs,
-     s->aio_context = new_context;
+ static void blk_aio_detach(void *opaque)
-     aio_set_event_notifier(new_context, &s->irq_notifier,
+ {
-                            false, nvme_handle_event, nvme_poll_cb);
+     VuBlkExport *vexp = opaque;
 +
-+    for (int i = 0; i < s->nr_queues; i++) {
+     vhost_user_server_detach_aio_context(&vexp->vu_server);
-+        NVMeQueuePair *q = s->queues[i];
++    vexp->export.ctx = NULL;
 +
 +        q->completion_bh =
 +            aio_bh_new(new_context, nvme_process_completion_bh, q);
 +    }
  }
- static void nvme_aio_plug(BlockDriverState *bs)
+ static void
-diff --git a/block/trace-events b/block/trace-events
+@@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
      vu_blk_initialize_config(blk_bs(exp->blk), &vexp->blkcfg,
                                 logical_block_size);
 -    blk_set_allow_aio_context_change(exp->blk, true);
      blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
                                   vexp);
 diff --git a/nbd/server.c b/nbd/server.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/trace-events
+--- a/nbd/server.c
-+++ b/block/trace-events
++++ b/nbd/server.c
-@@ -XXX,XX +XXX,XX @@ nvme_kick(void *s, int queue) "s %p queue %d"
+@@ -XXX,XX +XXX,XX @@ static int nbd_export_create(BlockExport *blk_exp, BlockExportOptions *exp_args,
- nvme_dma_flush_queue_wait(void *s) "s %p"
+         return ret;
- nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
+     }
- nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d"
--nvme_process_completion_queue_busy(void *s, int index) "s %p queue %d"
+-    blk_set_allow_aio_context_change(blk, true);
-+nvme_process_completion_queue_plugged(void *s, int index) "s %p queue %d"
+-
- nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d"
+     QTAILQ_INIT(&exp->clients);
- nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
+     exp->name = g_strdup(arg->name);
- nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
+     exp->description = g_strdup(arg->description);
 --
 2.26.2

-[PULL 07/12] block/nvme: drop tautologous assertion
+[PULL v2 23/28] block/export: add vhost-user-blk multi-queue support
-nvme_process_completion() explicitly checks cid so the assertion that
+Allow the number of queues to be configured using --export
-follows is always true:
+vhost-user-blk,num-queues=N. This setting should match the QEMU --device
 vhost-user-blk-pci,num-queues=N setting but QEMU vhost-user-blk.c lowers
 its own value if the vhost-user-blk backend offers fewer queues than
 QEMU.
-  if (cid == 0 || cid > NVME_QUEUE_SIZE) {
+The vhost-user-blk-server.c code is already capable of multi-queue. All
-      ...
+virtqueue processing runs in the same AioContext. No new locking is
-      continue;
+needed.
-  }
-  assert(cid <= NVME_QUEUE_SIZE);
+Add the num-queues=N option and set the VIRTIO_BLK_F_MQ feature bit.
 Note that the feature bit only announces the presence of the num_queues
 configuration space field. It does not promise that there is more than 1
 virtqueue, so we can set it unconditionally.
 I tested multi-queue by running a random read fio test with numjobs=4 on
 an -smp 4 guest. After the benchmark finished the guest /proc/interrupts
 file showed activity on all 4 virtio-blk MSI-X. The /sys/block/vda/mq/
 directory shows that Linux blk-mq has 4 queues configured.
 An automated test is included in the next commit.
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Reviewed-by: Sergio Lopez <slp@redhat.com>
+Acked-by: Markus Armbruster <armbru@redhat.com>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20201001144604.559733-2-stefanha@redhat.com
-Message-id: 20200617132201.1832152-3-stefanha@redhat.com
+[Fixed accidental tab characters as suggested by Markus Armbruster
 --Stefan]
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- block/nvme.c | 1 -
+ qapi/block-export.json               | 10 +++++++---
- 1 file changed, 1 deletion(-)
+ block/export/vhost-user-blk-server.c | 24 ++++++++++++++++++------
  2 files changed, 25 insertions(+), 9 deletions(-)
-diff --git a/block/nvme.c b/block/nvme.c
+diff --git a/qapi/block-export.json b/qapi/block-export.json
 index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
+--- a/qapi/block-export.json
-+++ b/block/nvme.c
++++ b/qapi/block-export.json
-@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
+@@ -XXX,XX +XXX,XX @@
-                     cid);
+ #        SocketAddress types are supported. Passed fds must be UNIX domain
-             continue;
+ #        sockets.
-         }
+ # @logical-block-size: Logical block size in bytes. Defaults to 512 bytes.
--        assert(cid <= NVME_QUEUE_SIZE);
++# @num-queues: Number of request virtqueues. Must be greater than 0. Defaults
-         trace_nvme_complete_command(s, q->index, cid);
++#              to 1.
-         preq = &q->reqs[cid - 1];
+ #
-         req = *preq;
+ # Since: 5.2
  ##
  { 'struct': 'BlockExportOptionsVhostUserBlk',
 -  'data': { 'addr': 'SocketAddress', '*logical-block-size': 'size' } }
 +  'data': { 'addr': 'SocketAddress',
 +        '*logical-block-size': 'size',
 +            '*num-queues': 'uint16'} }
  ##
  # @NbdServerAddOptions:
@@ -XXX,XX +XXX,XX @@
  { 'union': 'BlockExportOptions',
    'base': { 'type': 'BlockExportType',
              'id': 'str',
 -        '*fixed-iothread': 'bool',
 -        '*iothread': 'str',
 +            '*fixed-iothread': 'bool',
 +            '*iothread': 'str',
              'node-name': 'str',
              '*writable': 'bool',
              '*writethrough': 'bool' },
 diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
 index XXXXXXX..XXXXXXX 100644
 --- a/block/export/vhost-user-blk-server.c
 +++ b/block/export/vhost-user-blk-server.c
@@ -XXX,XX +XXX,XX @@
  #include "util/block-helpers.h"
  enum {
 -    VHOST_USER_BLK_MAX_QUEUES = 1,
 +    VHOST_USER_BLK_NUM_QUEUES_DEFAULT = 1,
  };
  struct virtio_blk_inhdr {
      unsigned char status;
@@ -XXX,XX +XXX,XX @@ static uint64_t vu_blk_get_features(VuDev *dev)
 ull << VIRTIO_BLK_F_DISCARD |
 ull << VIRTIO_BLK_F_WRITE_ZEROES |
 ull << VIRTIO_BLK_F_CONFIG_WCE |
 +               1ull << VIRTIO_BLK_F_MQ |
 ull << VIRTIO_F_VERSION_1 |
 ull << VIRTIO_RING_F_INDIRECT_DESC |
 ull << VIRTIO_RING_F_EVENT_IDX |
@@ -XXX,XX +XXX,XX @@ static void blk_aio_detach(void *opaque)
  static void
  vu_blk_initialize_config(BlockDriverState *bs,
 -                           struct virtio_blk_config *config, uint32_t blk_size)
 +                         struct virtio_blk_config *config,
 +                         uint32_t blk_size,
 +                         uint16_t num_queues)
  {
      config->capacity = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
      config->blk_size = blk_size;
@@ -XXX,XX +XXX,XX @@ vu_blk_initialize_config(BlockDriverState *bs,
      config->seg_max = 128 - 2;
      config->min_io_size = 1;
      config->opt_io_size = 1;
 -    config->num_queues = VHOST_USER_BLK_MAX_QUEUES;
 +    config->num_queues = num_queues;
      config->max_discard_sectors = 32768;
      config->max_discard_seg = 1;
      config->discard_sector_alignment = config->blk_size >> 9;
@@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
      BlockExportOptionsVhostUserBlk *vu_opts = &opts->u.vhost_user_blk;
      Error *local_err = NULL;
      uint64_t logical_block_size;
 +    uint16_t num_queues = VHOST_USER_BLK_NUM_QUEUES_DEFAULT;
      vexp->writable = opts->writable;
      vexp->blkcfg.wce = 0;
@@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
      }
      vexp->blk_size = logical_block_size;
      blk_set_guest_block_size(exp->blk, logical_block_size);
 +
 +    if (vu_opts->has_num_queues) {
 +        num_queues = vu_opts->num_queues;
 +    }
 +    if (num_queues == 0) {
 +        error_setg(errp, "num-queues must be greater than 0");
 +        return -EINVAL;
 +    }
 +
      vu_blk_initialize_config(blk_bs(exp->blk), &vexp->blkcfg,
 -                               logical_block_size);
 +                             logical_block_size, num_queues);
      blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
                                   vexp);
      if (!vhost_user_server_start(&vexp->vu_server, vu_opts->addr, exp->ctx,
 -                                 VHOST_USER_BLK_MAX_QUEUES, &vu_blk_iface,
 -                                 errp)) {
 +                                 num_queues, &vu_blk_iface, errp)) {
          blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
                                          blk_aio_detach, vexp);
          return -EADDRNOTAVAIL;
 --
 2.26.2

-New patch
+[PULL v2 24/28] block/io: fix bdrv_co_block_status_above
+From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
+bdrv_co_block_status_above has several design problems with handling
+short backing files:
+1. With want_zero=true, it may return ret with BDRV_BLOCK_ZERO but
+without BDRV_BLOCK_ALLOCATED flag, when actually short backing file
+which produces these after-EOF zeros is inside requested backing
+sequence.
+2. With want_zero=false, it may return pnum=0 prior to actual EOF,
+because of EOF of short backing file.
+Fix these things, making logic about short backing files clearer.
+With fixed bdrv_block_status_above we also have to improve is_zero in
+qcow2 code, otherwise iotest 154 will fail, because with this patch we
+stop merging zeros of different types (produced by regions that are fully
+unallocated in the whole backing chain vs produced by short backing files).
+Note also, that this patch leaves for another day the general problem
+around block-status: misuse of BDRV_BLOCK_ALLOCATED as is-fs-allocated
+vs go-to-backing.
+Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
+Reviewed-by: Alberto Garcia <berto@igalia.com>
+Reviewed-by: Eric Blake <eblake@redhat.com>
+Message-id: 20200924194003.22080-2-vsementsov@virtuozzo.com
+[Fix s/comes/come/ as suggested by Eric Blake
+--Stefan]
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ block/io.c    | 68 ++++++++++++++++++++++++++++++++++++++++-----------
+ block/qcow2.c | 16 ++++++++++--
+ 2 files changed, 68 insertions(+), 16 deletions(-)
+diff --git a/block/io.c b/block/io.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/io.c
++++ b/block/io.c
+@@ -XXX,XX +XXX,XX @@ bdrv_co_common_block_status_above(BlockDriverState *bs,
+                                   int64_t *map,
+                                   BlockDriverState **file)
+ {
++    int ret;
+     BlockDriverState *p;
+-    int ret = 0;
+-    bool first = true;
++    int64_t eof = 0;
+     assert(bs != base);
+-    for (p = bs; p != base; p = bdrv_filter_or_cow_bs(p)) {
++
++    ret = bdrv_co_block_status(bs, want_zero, offset, bytes, pnum, map, file);
++    if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED) {
++        return ret;
++    }
++
++    if (ret & BDRV_BLOCK_EOF) {
++        eof = offset + *pnum;
++    }
++
++    assert(*pnum <= bytes);
++    bytes = *pnum;
++
++    for (p = bdrv_filter_or_cow_bs(bs); p != base;
++         p = bdrv_filter_or_cow_bs(p))
++    {
+         ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
+                                    file);
+         if (ret < 0) {
+-            break;
++            return ret;
+         }
+-        if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
++        if (*pnum == 0) {
+             /*
+-             * Reading beyond the end of the file continues to read
+-             * zeroes, but we can only widen the result to the
+-             * unallocated length we learned from an earlier
+-             * iteration.
++             * The top layer deferred to this layer, and because this layer is
++             * short, any zeroes that we synthesize beyond EOF behave as if they
++             * were allocated at this layer.
++             *
++             * We don't include BDRV_BLOCK_EOF into ret, as upper layer may be
++             * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see
++             * below.
+              */
++            assert(ret & BDRV_BLOCK_EOF);
+             *pnum = bytes;
++            if (file) {
++                *file = p;
++            }
++            ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_ALLOCATED;
++            break;
+         }
+-        if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
++        if (ret & BDRV_BLOCK_ALLOCATED) {
++            /*
++             * We've found the node and the status, we must break.
++             *
++             * Drop BDRV_BLOCK_EOF, as it's not for upper layer, which may be
++             * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see
++             * below.
++             */
++            ret &= ~BDRV_BLOCK_EOF;
+             break;
+         }
+-        /* [offset, pnum] unallocated on this layer, which could be only
+-         * the first part of [offset, bytes].  */
+-        bytes = MIN(bytes, *pnum);
+-        first = false;
++
++        /*
++         * OK, [offset, offset + *pnum) region is unallocated on this layer,
++         * let's continue the diving.
++         */
++        assert(*pnum <= bytes);
++        bytes = *pnum;
++    }
++
++    if (offset + *pnum == eof) {
++        ret |= BDRV_BLOCK_EOF;
+     }
++
+     return ret;
+ }
+diff --git a/block/qcow2.c b/block/qcow2.c
+index XXXXXXX..XXXXXXX 100644
+--- a/block/qcow2.c
++++ b/block/qcow2.c
+@@ -XXX,XX +XXX,XX @@ static bool is_zero(BlockDriverState *bs, int64_t offset, int64_t bytes)
+     if (!bytes) {
+         return true;
+     }
+-    res = bdrv_block_status_above(bs, NULL, offset, bytes, &nr, NULL, NULL);
+-    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == bytes;
++
++    /*
++     * bdrv_block_status_above doesn't merge different types of zeros, for
++     * example, zeros which come from the region which is unallocated in
++     * the whole backing chain, and zeros which come because of a short
++     * backing file. So, we need a loop.
++     */
++    do {
++        res = bdrv_block_status_above(bs, NULL, offset, bytes, &nr, NULL, NULL);
++        offset += nr;
++        bytes -= nr;
++    } while (res >= 0 && (res & BDRV_BLOCK_ZERO) && nr && bytes);
++
++    return res >= 0 && (res & BDRV_BLOCK_ZERO) && bytes == 0;
+ }
+ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
+--
+2.26.2

-[PULL 05/12] check-block: enable iotests with SafeStack
+[PULL v2 25/28] block/io: bdrv_common_block_status_above: support include_base
-From: Daniele Buono <dbuono@linux.vnet.ibm.com>
+From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
-SafeStack is a stack protection technique implemented in llvm. It is
+In order to reuse bdrv_common_block_status_above in
-enabled with a -fsanitize flag.
+bdrv_is_allocated_above, let's support include_base parameter.
 iotests are currently disabled when any -fsanitize option is used,
 because such options tend to produce additional warnings and false
 positives.
-While common -fsanitize options are used to verify the code and not
+Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
-added in production, SafeStack's main use is in production environments
+Reviewed-by: Alberto Garcia <berto@igalia.com>
-to protect against stack smashing.
+Reviewed-by: Eric Blake <eblake@redhat.com>
+Message-id: 20200924194003.22080-3-vsementsov@virtuozzo.com
 Since SafeStack does not print any warning or false positive, enable
 iotests when SafeStack is the only -fsanitize option used.
 This is likely going to be a production binary and we want to make sure
 it works correctly.
 Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com>
 Message-id: 20200529205122.714-5-dbuono@linux.vnet.ibm.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- tests/check-block.sh | 12 +++++++++++-
+ block/coroutines.h |  2 ++
-file changed, 11 insertions(+), 1 deletion(-)
+ block/io.c         | 21 ++++++++++++++-------
  2 files changed, 16 insertions(+), 7 deletions(-)
-diff --git a/tests/check-block.sh b/tests/check-block.sh
+diff --git a/block/coroutines.h b/block/coroutines.h
-index XXXXXXX..XXXXXXX 100755
+index XXXXXXX..XXXXXXX 100644
---- a/tests/check-block.sh
+--- a/block/coroutines.h
-+++ b/tests/check-block.sh
++++ b/block/coroutines.h
-@@ -XXX,XX +XXX,XX @@ if grep -q "CONFIG_GPROF=y" config-host.mak 2>/dev/null ; then
+@@ -XXX,XX +XXX,XX @@ bdrv_pwritev(BdrvChild *child, int64_t offset, unsigned int bytes,
-     exit 0
+ int coroutine_fn
- fi
+ bdrv_co_common_block_status_above(BlockDriverState *bs,
+                                   BlockDriverState *base,
--if grep -q "CFLAGS.*-fsanitize" config-host.mak 2>/dev/null ; then
++                                  bool include_base,
-+# Disable tests with any sanitizer except for SafeStack
+                                   bool want_zero,
-+CFLAGS=$( grep "CFLAGS.*-fsanitize" config-host.mak 2>/dev/null )
+                                   int64_t offset,
-+SANITIZE_FLAGS=""
+                                   int64_t bytes,
-+#Remove all occurrencies of -fsanitize=safe-stack
+@@ -XXX,XX +XXX,XX @@ bdrv_co_common_block_status_above(BlockDriverState *bs,
-+for i in ${CFLAGS}; do
+ int generated_co_wrapper
-+        if [ "${i}" != "-fsanitize=safe-stack" ]; then
+ bdrv_common_block_status_above(BlockDriverState *bs,
-+                SANITIZE_FLAGS="${SANITIZE_FLAGS} ${i}"
+                                BlockDriverState *base,
-+        fi
++                               bool include_base,
-+done
+                                bool want_zero,
-+if echo ${SANITIZE_FLAGS} | grep -q "\-fsanitize" 2>/dev/null; then
+                                int64_t offset,
-+    # Have a sanitize flag that is not allowed, stop
+                                int64_t bytes,
-     echo "Sanitizers are enabled ==> Not running the qemu-iotests."
+diff --git a/block/io.c b/block/io.c
-     exit 0
+index XXXXXXX..XXXXXXX 100644
- fi
+--- a/block/io.c
 +++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ early_out:
  int coroutine_fn
  bdrv_co_common_block_status_above(BlockDriverState *bs,
                                    BlockDriverState *base,
 +                                  bool include_base,
                                    bool want_zero,
                                    int64_t offset,
                                    int64_t bytes,
@@ -XXX,XX +XXX,XX @@ bdrv_co_common_block_status_above(BlockDriverState *bs,
      BlockDriverState *p;
      int64_t eof = 0;
 -    assert(bs != base);
 +    assert(include_base || bs != base);
 +    assert(!include_base || base); /* Can't include NULL base */
      ret = bdrv_co_block_status(bs, want_zero, offset, bytes, pnum, map, file);
 -    if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED) {
 +    if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED || bs == base) {
          return ret;
      }
@@ -XXX,XX +XXX,XX @@ bdrv_co_common_block_status_above(BlockDriverState *bs,
      assert(*pnum <= bytes);
      bytes = *pnum;
 -    for (p = bdrv_filter_or_cow_bs(bs); p != base;
 +    for (p = bdrv_filter_or_cow_bs(bs); include_base || p != base;
           p = bdrv_filter_or_cow_bs(p))
      {
          ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
@@ -XXX,XX +XXX,XX @@ bdrv_co_common_block_status_above(BlockDriverState *bs,
              break;
          }
 +        if (p == base) {
 +            assert(include_base);
 +            break;
 +        }
 +
          /*
           * OK, [offset, offset + *pnum) region is unallocated on this layer,
           * let's continue the diving.
@@ -XXX,XX +XXX,XX @@ int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
                              int64_t offset, int64_t bytes, int64_t *pnum,
                              int64_t *map, BlockDriverState **file)
  {
 -    return bdrv_common_block_status_above(bs, base, true, offset, bytes,
 +    return bdrv_common_block_status_above(bs, base, false, true, offset, bytes,
                                            pnum, map, file);
  }
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
      int ret;
      int64_t dummy;
 -    ret = bdrv_common_block_status_above(bs, bdrv_filter_or_cow_bs(bs), false,
 -                                         offset, bytes, pnum ? pnum : &dummy,
 -                                         NULL, NULL);
 +    ret = bdrv_common_block_status_above(bs, bs, true, false, offset,
 +                                         bytes, pnum ? pnum : &dummy, NULL,
 +                                         NULL);
      if (ret < 0) {
          return ret;
      }
 --
 2.26.2

-[PULL 03/12] coroutine: add check for SafeStack in sigaltstack
+[PULL v2 26/28] block/io: bdrv_common_block_status_above: support bs == base
-From: Daniele Buono <dbuono@linux.vnet.ibm.com>
+From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
-Current implementation of LLVM's SafeStack is not compatible with
+We are going to reuse bdrv_common_block_status_above in
-code that uses an alternate stack created with sigaltstack().
+bdrv_is_allocated_above. bdrv_is_allocated_above may be called with
-Since coroutine-sigaltstack relies on sigaltstack(), it is not
+include_base == false and still bs == base (for ex. from img_rebase()).
 compatible with SafeStack. The resulting binary is incorrect, with
 different coroutines sharing the same unsafe stack and producing
 undefined behavior at runtime.
-In the future LLVM may provide a SafeStack implementation compatible with
+So, support this corner case.
 sigaltstack(). In the meantime, if SafeStack is desired, the coroutine
 implementation from coroutine-ucontext should be used.
 As a safety check, add a control in coroutine-sigaltstack to throw a
 preprocessor #error if SafeStack is enabled and we are trying to
 use coroutine-sigaltstack to implement coroutines.
-Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com>
+Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
-Message-id: 20200529205122.714-3-dbuono@linux.vnet.ibm.com
+Reviewed-by: Kevin Wolf <kwolf@redhat.com>
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Alberto Garcia <berto@igalia.com>
 Message-id: 20200924194003.22080-4-vsementsov@virtuozzo.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- util/coroutine-sigaltstack.c | 4 ++++
+ block/io.c | 6 +++++-
-file changed, 4 insertions(+)
+ 1 file changed, 5 insertions(+), 1 deletion(-)
-diff --git a/util/coroutine-sigaltstack.c b/util/coroutine-sigaltstack.c
+diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
---- a/util/coroutine-sigaltstack.c
+--- a/block/io.c
-+++ b/util/coroutine-sigaltstack.c
++++ b/block/io.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ bdrv_co_common_block_status_above(BlockDriverState *bs,
- #include "qemu-common.h"
+     BlockDriverState *p;
- #include "qemu/coroutine_int.h"
+     int64_t eof = 0;
-+#ifdef CONFIG_SAFESTACK
+-    assert(include_base || bs != base);
-+#error "SafeStack is not compatible with code run in alternate signal stacks"
+     assert(!include_base || base); /* Can't include NULL base */
-+#endif
 +    if (!include_base && bs == base) {
 +        *pnum = bytes;
 +        return 0;
 +    }
 +
- typedef struct {
+     ret = bdrv_co_block_status(bs, want_zero, offset, bytes, pnum, map, file);
-     Coroutine base;
+     if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED || bs == base) {
-     void *stack;
+         return ret;
 --
 2.26.2

-[PULL 02/12] coroutine: support SafeStack in ucontext backend
+[PULL v2 27/28] block/io: fix bdrv_is_allocated_above
-From: Daniele Buono <dbuono@linux.vnet.ibm.com>
+From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
-LLVM's SafeStack instrumentation does not yet support programs that make
+bdrv_is_allocated_above wrongly handles short backing files: it reports
-use of the APIs in ucontext.h
+after-EOF space as UNALLOCATED which is wrong, as on read the data is
-With the current implementation of coroutine-ucontext, the resulting
+generated on the level of short backing file (if all overlays have
-binary is incorrect, with different coroutines sharing the same unsafe
+unallocated areas at that place).
 stack and producing undefined behavior at runtime.
 This fix allocates an additional unsafe stack area for each coroutine,
 and sets the new unsafe stack pointer before calling swapcontext() in
 qemu_coroutine_new.
 This is the only place where the pointer needs to be manually updated,
 since sigsetjmp/siglongjmp are already instrumented by LLVM to properly
 support SafeStack.
 The additional stack is then freed in qemu_coroutine_delete.
-Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com>
+Reusing bdrv_common_block_status_above fixes the issue and unifies code
-Message-id: 20200529205122.714-2-dbuono@linux.vnet.ibm.com
+path.
 Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Alberto Garcia <berto@igalia.com>
 Message-id: 20200924194003.22080-5-vsementsov@virtuozzo.com
 [Fix s/has/have/ as suggested by Eric Blake. Fix s/area/areas/.
 --Stefan]
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- include/qemu/coroutine_int.h |  5 +++++
+ block/io.c | 43 +++++--------------------------------------
- util/coroutine-ucontext.c    | 28 ++++++++++++++++++++++++++++
+file changed, 5 insertions(+), 38 deletions(-)
 files changed, 33 insertions(+)
-diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h
+diff --git a/block/io.c b/block/io.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/qemu/coroutine_int.h
+--- a/block/io.c
-+++ b/include/qemu/coroutine_int.h
++++ b/block/io.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
- #include "qemu/queue.h"
+  * at 'offset + *pnum' may return the same allocation status (in other
- #include "qemu/coroutine.h"
+  * words, the result is not necessarily the maximum possible range);
+  * but 'pnum' will only be 0 when end of file is reached.
-+#ifdef CONFIG_SAFESTACK
+- *
-+/* Pointer to the unsafe stack, defined by the compiler */
+  */
-+extern __thread void *__safestack_unsafe_stack_ptr;
+ int bdrv_is_allocated_above(BlockDriverState *top,
-+#endif
+                             BlockDriverState *base,
-+
+                             bool include_base, int64_t offset,
- #define COROUTINE_STACK_SIZE (1 << 20)
+                             int64_t bytes, int64_t *pnum)
+ {
- typedef enum {
+-    BlockDriverState *intermediate;
-diff --git a/util/coroutine-ucontext.c b/util/coroutine-ucontext.c
+-    int ret;
-index XXXXXXX..XXXXXXX 100644
+-    int64_t n = bytes;
---- a/util/coroutine-ucontext.c
+-
-+++ b/util/coroutine-ucontext.c
+-    assert(base || !include_base);
-@@ -XXX,XX +XXX,XX @@ typedef struct {
+-
-     Coroutine base;
+-    intermediate = top;
-     void *stack;
+-    while (include_base || intermediate != base) {
-     size_t stack_size;
+-        int64_t pnum_inter;
-+#ifdef CONFIG_SAFESTACK
+-        int64_t size_inter;
-+    /* Need an unsafe stack for each coroutine */
+-
-+    void *unsafe_stack;
+-        assert(intermediate);
-+    size_t unsafe_stack_size;
+-        ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
-+#endif
+-        if (ret < 0) {
-     sigjmp_buf env;
+-            return ret;
+-        }
-     void *tsan_co_fiber;
+-        if (ret) {
-@@ -XXX,XX +XXX,XX @@ Coroutine *qemu_coroutine_new(void)
+-            *pnum = pnum_inter;
-     co = g_malloc0(sizeof(*co));
+-            return 1;
-     co->stack_size = COROUTINE_STACK_SIZE;
+-        }
-     co->stack = qemu_alloc_stack(&co->stack_size);
+-
-+#ifdef CONFIG_SAFESTACK
+-        size_inter = bdrv_getlength(intermediate);
-+    co->unsafe_stack_size = COROUTINE_STACK_SIZE;
+-        if (size_inter < 0) {
-+    co->unsafe_stack = qemu_alloc_stack(&co->unsafe_stack_size);
+-            return size_inter;
-+#endif
+-        }
-     co->base.entry_arg = &old_env; /* stash away our jmp_buf */
+-        if (n > pnum_inter &&
+-            (intermediate == top || offset + pnum_inter < size_inter)) {
-     uc.uc_link = &old_uc;
+-            n = pnum_inter;
-@@ -XXX,XX +XXX,XX @@ Coroutine *qemu_coroutine_new(void)
+-        }
-             COROUTINE_YIELD,
+-
-             &fake_stack_save,
+-        if (intermediate == base) {
-             co->stack, co->stack_size, co->tsan_co_fiber);
+-            break;
-+
+-        }
-+#ifdef CONFIG_SAFESTACK
+-
-+        /*
+-        intermediate = bdrv_filter_or_cow_bs(intermediate);
-+         * Before we swap the context, set the new unsafe stack
++    int ret = bdrv_common_block_status_above(top, base, include_base, false,
-+         * The unsafe stack grows just like the normal stack, so start from
++                                             offset, bytes, pnum, NULL, NULL);
-+         * the last usable location of the memory area.
++    if (ret < 0) {
-+         * NOTE: we don't have to re-set the usp afterwards because we are
++        return ret;
 +         * coming back to this context through a siglongjmp.
 +         * The compiler already wrapped the corresponding sigsetjmp call with
 +         * code that saves the usp on the (safe) stack before the call, and
 +         * restores it right after (which is where we return with siglongjmp).
 +         */
 +        void *usp = co->unsafe_stack + co->unsafe_stack_size;
 +        __safestack_unsafe_stack_ptr = usp;
 +#endif
 +
          swapcontext(&old_uc, &uc);
      }
-@@ -XXX,XX +XXX,XX @@ void qemu_coroutine_delete(Coroutine *co_)
+-    *pnum = n;
- #endif
+-    return 0;
++    return !!(ret & BDRV_BLOCK_ALLOCATED);
      qemu_free_stack(co->stack, co->stack_size);
 +#ifdef CONFIG_SAFESTACK
 +    qemu_free_stack(co->unsafe_stack, co->unsafe_stack_size);
 +#endif
      g_free(co);
  }
+ int coroutine_fn
 --
 2.26.2

-[PULL 01/12] minikconf: explicitly set encoding to UTF-8
+[PULL v2 28/28] iotests: add commit top->base cases to 274
-QEMU currently only has ASCII Kconfig files but Linux actually uses
+From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 UTF-8. Explicitly specify the encoding and that we're doing text file
 I/O.
-It's unclear whether or not QEMU will ever need Unicode in its Kconfig
+These cases are fixed by previous patches around block_status and
-files. If we start using the help text then it will become an issue
+is_allocated.
 sooner or later. Make this change now for consistency with Linux
 Kconfig.
-Reported-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+Reviewed-by: Eric Blake <eblake@redhat.com>
-Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Reviewed-by: Alberto Garcia <berto@igalia.com>
-Message-id: 20200521153616.307100-1-stefanha@redhat.com
+Message-id: 20200924194003.22080-6-vsementsov@virtuozzo.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- scripts/minikconf.py | 6 +++---
+ tests/qemu-iotests/274     | 20 +++++++++++
-file changed, 3 insertions(+), 3 deletions(-)
+ tests/qemu-iotests/274.out | 68 ++++++++++++++++++++++++++++++++++++++
  2 files changed, 88 insertions(+)
-diff --git a/scripts/minikconf.py b/scripts/minikconf.py
+diff --git a/tests/qemu-iotests/274 b/tests/qemu-iotests/274
 index XXXXXXX..XXXXXXX 100755
---- a/scripts/minikconf.py
+--- a/tests/qemu-iotests/274
-+++ b/scripts/minikconf.py
++++ b/tests/qemu-iotests/274
-@@ -XXX,XX +XXX,XX @@ class KconfigParser:
+@@ -XXX,XX +XXX,XX @@ with iotests.FilePath('base') as base, \
-         if incl_abs_fname in self.data.previously_included:
+     iotests.qemu_io_log('-c', 'read -P 1 0 %d' % size_short, mid)
-             return
+     iotests.qemu_io_log('-c', 'read -P 0 %d %d' % (size_short, size_diff), mid)
-         try:
--            fp = open(incl_abs_fname, 'r')
++    iotests.log('=== Testing qemu-img commit (top -> base) ===')
-+            fp = open(incl_abs_fname, 'rt', encoding='utf-8')
++
-         except IOError as e:
++    create_chain()
-             raise KconfigParserError(self,
++    iotests.qemu_img_log('commit', '-b', base, top)
-                                 '%s: %s' % (e.strerror, include))
++    iotests.img_info_log(base)
-@@ -XXX,XX +XXX,XX @@ if __name__ == '__main__':
++    iotests.qemu_io_log('-c', 'read -P 1 0 %d' % size_short, base)
-             parser.do_assignment(name, value == 'y')
++    iotests.qemu_io_log('-c', 'read -P 0 %d %d' % (size_short, size_diff), base)
-             external_vars.add(name[7:])
++
-         else:
++    iotests.log('=== Testing QMP active commit (top -> base) ===')
--            fp = open(arg, 'r')
++
-+            fp = open(arg, 'rt', encoding='utf-8')
++    create_chain()
-             parser.parse_file(fp)
++    with create_vm() as vm:
-             fp.close()
++        vm.launch()
++        vm.qmp_log('block-commit', device='top', base_node='base',
-@@ -XXX,XX +XXX,XX @@ if __name__ == '__main__':
++                   job_id='job0', auto_dismiss=False)
-         if key not in external_vars and config[key]:
++        vm.run_job('job0', wait=5)
-             print ('CONFIG_%s=y' % key)
++
++    iotests.img_info_log(mid)
--    deps = open(argv[2], 'w')
++    iotests.qemu_io_log('-c', 'read -P 1 0 %d' % size_short, base)
-+    deps = open(argv[2], 'wt', encoding='utf-8')
++    iotests.qemu_io_log('-c', 'read -P 0 %d %d' % (size_short, size_diff), base)
-     for fname in data.previously_included:
-         print ('%s: %s' % (argv[1], fname), file=deps)
+     iotests.log('== Resize tests ==')
-     deps.close()
 diff --git a/tests/qemu-iotests/274.out b/tests/qemu-iotests/274.out
 index XXXXXXX..XXXXXXX 100644
 --- a/tests/qemu-iotests/274.out
 +++ b/tests/qemu-iotests/274.out
@@ -XXX,XX +XXX,XX @@ read 1048576/1048576 bytes at offset 0
  read 1048576/1048576 bytes at offset 1048576
  1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +=== Testing qemu-img commit (top -> base) ===
 +Formatting 'TEST_DIR/PID-base', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=2097152 lazy_refcounts=off refcount_bits=16
 +
 +Formatting 'TEST_DIR/PID-mid', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1048576 backing_file=TEST_DIR/PID-base backing_fmt=qcow2 lazy_refcounts=off refcount_bits=16
 +
 +Formatting 'TEST_DIR/PID-top', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=2097152 backing_file=TEST_DIR/PID-mid backing_fmt=qcow2 lazy_refcounts=off refcount_bits=16
 +
 +wrote 2097152/2097152 bytes at offset 0
 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +
 +Image committed.
 +
 +image: TEST_IMG
 +file format: IMGFMT
 +virtual size: 2 MiB (2097152 bytes)
 +cluster_size: 65536
 +Format specific information:
 +    compat: 1.1
 +    compression type: zlib
 +    lazy refcounts: false
 +    refcount bits: 16
 +    corrupt: false
 +    extended l2: false
 +
 +read 1048576/1048576 bytes at offset 0
 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +
 +read 1048576/1048576 bytes at offset 1048576
 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +
 +=== Testing QMP active commit (top -> base) ===
 +Formatting 'TEST_DIR/PID-base', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=2097152 lazy_refcounts=off refcount_bits=16
 +
 +Formatting 'TEST_DIR/PID-mid', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1048576 backing_file=TEST_DIR/PID-base backing_fmt=qcow2 lazy_refcounts=off refcount_bits=16
 +
 +Formatting 'TEST_DIR/PID-top', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=2097152 backing_file=TEST_DIR/PID-mid backing_fmt=qcow2 lazy_refcounts=off refcount_bits=16
 +
 +wrote 2097152/2097152 bytes at offset 0
 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +
 +{"execute": "block-commit", "arguments": {"auto-dismiss": false, "base-node": "base", "device": "top", "job-id": "job0"}}
 +{"return": {}}
 +{"execute": "job-complete", "arguments": {"id": "job0"}}
 +{"return": {}}
 +{"data": {"device": "job0", "len": 1048576, "offset": 1048576, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_READY", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
 +{"data": {"device": "job0", "len": 1048576, "offset": 1048576, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
 +{"execute": "job-dismiss", "arguments": {"id": "job0"}}
 +{"return": {}}
 +image: TEST_IMG
 +file format: IMGFMT
 +virtual size: 1 MiB (1048576 bytes)
 +cluster_size: 65536
 +backing file: TEST_DIR/PID-base
 +backing file format: IMGFMT
 +Format specific information:
 +    compat: 1.1
 +    compression type: zlib
 +    lazy refcounts: false
 +    refcount bits: 16
 +    corrupt: false
 +    extended l2: false
 +
 +read 1048576/1048576 bytes at offset 0
 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +
 +read 1048576/1048576 bytes at offset 1048576
 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +
  == Resize tests ==
  === preallocation=off ===
  Formatting 'TEST_DIR/PID-base', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=6442450944 lazy_refcounts=off refcount_bits=16
 --
 2.26.2

The following changes since commit 171199f56f5f9bdf1e5d670d09ef1351d8f01bae:

Merge remote-tracking branch 'remotes/alistair/tags/pull-riscv-to-apply-20200619-3' into staging (2020-06-22 14:45:25 +0100)

are available in the Git repository at:

https://github.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to 7838c67f22a81fcf669785cd6c0876438422071a:

block/nvme: support nested aio_poll() (2020-06-23 15:46:08 +0100)

----------------------------------------------------------------
Pull request

----------------------------------------------------------------

Daniele Buono (4):
  coroutine: support SafeStack in ucontext backend
  coroutine: add check for SafeStack in sigaltstack
  configure: add flags to support SafeStack
  check-block: enable iotests with SafeStack

Stefan Hajnoczi (8):
  minikconf: explicitly set encoding to UTF-8
  block/nvme: poll queues without q->lock
  block/nvme: drop tautologous assertion
  block/nvme: don't access CQE after moving cq.head
  block/nvme: switch to a NVMeRequest freelist
  block/nvme: clarify that free_req_queue is protected by q->lock
  block/nvme: keep BDRVNVMeState pointer in NVMeQueuePair
  block/nvme: support nested aio_poll()

-- 
2.26.2

QEMU currently only has ASCII Kconfig files but Linux actually uses
UTF-8. Explicitly specify the encoding and that we're doing text file
I/O.

It's unclear whether or not QEMU will ever need Unicode in its Kconfig
files. If we start using the help text then it will become an issue
sooner or later. Make this change now for consistency with Linux
Kconfig.

Reported-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200521153616.307100-1-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 scripts/minikconf.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/minikconf.py b/scripts/minikconf.py
index XXXXXXX..XXXXXXX 100755
--- a/scripts/minikconf.py
+++ b/scripts/minikconf.py
@@ -XXX,XX +XXX,XX @@ class KconfigParser:
         if incl_abs_fname in self.data.previously_included:
             return
         try:
-            fp = open(incl_abs_fname, 'r')
+            fp = open(incl_abs_fname, 'rt', encoding='utf-8')
         except IOError as e:
             raise KconfigParserError(self,
                                 '%s: %s' % (e.strerror, include))
@@ -XXX,XX +XXX,XX @@ if __name__ == '__main__':
             parser.do_assignment(name, value == 'y')
             external_vars.add(name[7:])
         else:
-            fp = open(arg, 'r')
+            fp = open(arg, 'rt', encoding='utf-8')
             parser.parse_file(fp)
             fp.close()
 
@@ -XXX,XX +XXX,XX @@ if __name__ == '__main__':
         if key not in external_vars and config[key]:
             print ('CONFIG_%s=y' % key)
 
-    deps = open(argv[2], 'w')
+    deps = open(argv[2], 'wt', encoding='utf-8')
     for fname in data.previously_included:
         print ('%s: %s' % (argv[1], fname), file=deps)
     deps.close()
-- 
2.26.2

From: Daniele Buono <dbuono@linux.vnet.ibm.com>

LLVM's SafeStack instrumentation does not yet support programs that make
use of the APIs in ucontext.h.
With the current implementation of coroutine-ucontext, the resulting
binary is incorrect, with different coroutines sharing the same unsafe
stack and producing undefined behavior at runtime.
This fix allocates an additional unsafe stack area for each coroutine,
and sets the new unsafe stack pointer before calling swapcontext() in
qemu_coroutine_new.
This is the only place where the pointer needs to be manually updated,
since sigsetjmp/siglongjmp are already instrumented by LLVM to properly
support SafeStack.
The additional stack is then freed in qemu_coroutine_delete.

Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com>
Message-id: 20200529205122.714-2-dbuono@linux.vnet.ibm.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/coroutine_int.h |  5 +++++
 util/coroutine-ucontext.c    | 28 ++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine_int.h
+++ b/include/qemu/coroutine_int.h
@@ -XXX,XX +XXX,XX @@
 #include "qemu/queue.h"
 #include "qemu/coroutine.h"
 
+#ifdef CONFIG_SAFESTACK
+/* Pointer to the unsafe stack, defined by the compiler */
+extern __thread void *__safestack_unsafe_stack_ptr;
+#endif
+
 #define COROUTINE_STACK_SIZE (1 << 20)
 
 typedef enum {
diff --git a/util/coroutine-ucontext.c b/util/coroutine-ucontext.c
index XXXXXXX..XXXXXXX 100644
--- a/util/coroutine-ucontext.c
+++ b/util/coroutine-ucontext.c
@@ -XXX,XX +XXX,XX @@ typedef struct {
     Coroutine base;
     void *stack;
     size_t stack_size;
+#ifdef CONFIG_SAFESTACK
+    /* Need an unsafe stack for each coroutine */
+    void *unsafe_stack;
+    size_t unsafe_stack_size;
+#endif
     sigjmp_buf env;
 
     void *tsan_co_fiber;
@@ -XXX,XX +XXX,XX @@ Coroutine *qemu_coroutine_new(void)
     co = g_malloc0(sizeof(*co));
     co->stack_size = COROUTINE_STACK_SIZE;
     co->stack = qemu_alloc_stack(&co->stack_size);
+#ifdef CONFIG_SAFESTACK
+    co->unsafe_stack_size = COROUTINE_STACK_SIZE;
+    co->unsafe_stack = qemu_alloc_stack(&co->unsafe_stack_size);
+#endif
     co->base.entry_arg = &old_env; /* stash away our jmp_buf */
 
     uc.uc_link = &old_uc;
@@ -XXX,XX +XXX,XX @@ Coroutine *qemu_coroutine_new(void)
             COROUTINE_YIELD,
             &fake_stack_save,
             co->stack, co->stack_size, co->tsan_co_fiber);
+
+#ifdef CONFIG_SAFESTACK
+        /*
+         * Before we swap the context, set the new unsafe stack
+         * The unsafe stack grows just like the normal stack, so start from
+         * the last usable location of the memory area.
+         * NOTE: we don't have to re-set the usp afterwards because we are
+         * coming back to this context through a siglongjmp.
+         * The compiler already wrapped the corresponding sigsetjmp call with
+         * code that saves the usp on the (safe) stack before the call, and
+         * restores it right after (which is where we return with siglongjmp).
+         */
+        void *usp = co->unsafe_stack + co->unsafe_stack_size;
+        __safestack_unsafe_stack_ptr = usp;
+#endif
+
         swapcontext(&old_uc, &uc);
     }
 
@@ -XXX,XX +XXX,XX @@ void qemu_coroutine_delete(Coroutine *co_)
 #endif
 
     qemu_free_stack(co->stack, co->stack_size);
+#ifdef CONFIG_SAFESTACK
+    qemu_free_stack(co->unsafe_stack, co->unsafe_stack_size);
+#endif
     g_free(co);
 }
 
-- 
2.26.2

From: Daniele Buono <dbuono@linux.vnet.ibm.com>

Current implementation of LLVM's SafeStack is not compatible with
code that uses an alternate stack created with sigaltstack().
Since coroutine-sigaltstack relies on sigaltstack(), it is not
compatible with SafeStack. The resulting binary is incorrect, with
different coroutines sharing the same unsafe stack and producing
undefined behavior at runtime.

In the future LLVM may provide a SafeStack implementation compatible with
sigaltstack(). In the meantime, if SafeStack is desired, the coroutine
implementation from coroutine-ucontext should be used.
As a safety check, add a control in coroutine-sigaltstack to throw a
preprocessor #error if SafeStack is enabled and we are trying to
use coroutine-sigaltstack to implement coroutines.

Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com>
Message-id: 20200529205122.714-3-dbuono@linux.vnet.ibm.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 util/coroutine-sigaltstack.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/util/coroutine-sigaltstack.c b/util/coroutine-sigaltstack.c
index XXXXXXX..XXXXXXX 100644
--- a/util/coroutine-sigaltstack.c
+++ b/util/coroutine-sigaltstack.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu-common.h"
 #include "qemu/coroutine_int.h"
 
+#ifdef CONFIG_SAFESTACK
+#error "SafeStack is not compatible with code run in alternate signal stacks"
+#endif
+
 typedef struct {
     Coroutine base;
     void *stack;
-- 
2.26.2

From: Daniele Buono <dbuono@linux.vnet.ibm.com>

This patch adds a flag to enable/disable the SafeStack instrumentation
provided by LLVM.

On enable, make sure that the compiler supports the flags, and that we
are using the proper coroutine implementation (coroutine-ucontext).
On disable, explicitly disable the option if it was enabled by default.

While SafeStack is supported only on Linux, NetBSD, FreeBSD and macOS,
we are not checking for the O.S. since this is already done by LLVM.

Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com>
Message-id: 20200529205122.714-4-dbuono@linux.vnet.ibm.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 configure | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/configure b/configure
index XXXXXXX..XXXXXXX 100755
--- a/configure
+++ b/configure
@@ -XXX,XX +XXX,XX @@ audio_win_int=""
 libs_qga=""
 debug_info="yes"
 stack_protector=""
+safe_stack=""
 use_containers="yes"
 gdb_bin=$(command -v "gdb-multiarch" || command -v "gdb")
 
@@ -XXX,XX +XXX,XX @@ for opt do
   ;;
   --disable-stack-protector) stack_protector="no"
   ;;
+  --enable-safe-stack) safe_stack="yes"
+  ;;
+  --disable-safe-stack) safe_stack="no"
+  ;;
   --disable-curses) curses="no"
   ;;
   --enable-curses) curses="yes"
@@ -XXX,XX +XXX,XX @@ disabled with --disable-FEATURE, default is enabled if available:
   debug-tcg       TCG debugging (default is disabled)
   debug-info      debugging information
   sparse          sparse checker
+  safe-stack      SafeStack Stack Smash Protection. Depends on
+                  clang/llvm >= 3.7 and requires coroutine backend ucontext.
 
   gnutls          GNUTLS cryptography support
   nettle          nettle cryptography support
@@ -XXX,XX +XXX,XX @@ if test "$debug_stack_usage" = "yes"; then
   fi
 fi
 
+##################################################
+# SafeStack
+
+
+if test "$safe_stack" = "yes"; then
+cat > $TMPC << EOF
+int main(int argc, char *argv[])
+{
+#if ! __has_feature(safe_stack)
+#error SafeStack Disabled
+#endif
+    return 0;
+}
+EOF
+  flag="-fsanitize=safe-stack"
+  # Check that safe-stack is supported and enabled.
+  if compile_prog "-Werror $flag" "$flag"; then
+    # Flag needed both at compilation and at linking
+    QEMU_CFLAGS="$QEMU_CFLAGS $flag"
+    QEMU_LDFLAGS="$QEMU_LDFLAGS $flag"
+  else
+    error_exit "SafeStack not supported by your compiler"
+  fi
+  if test "$coroutine" != "ucontext"; then
+    error_exit "SafeStack is only supported by the coroutine backend ucontext"
+  fi
+else
+cat > $TMPC << EOF
+int main(int argc, char *argv[])
+{
+#if defined(__has_feature)
+#if __has_feature(safe_stack)
+#error SafeStack Enabled
+#endif
+#endif
+    return 0;
+}
+EOF
+if test "$safe_stack" = "no"; then
+  # Make sure that safe-stack is disabled
+  if ! compile_prog "-Werror" ""; then
+    # SafeStack was already enabled, try to explicitly remove the feature
+    flag="-fno-sanitize=safe-stack"
+    if ! compile_prog "-Werror $flag" "$flag"; then
+      error_exit "Configure cannot disable SafeStack"
+    fi
+    QEMU_CFLAGS="$QEMU_CFLAGS $flag"
+    QEMU_LDFLAGS="$QEMU_LDFLAGS $flag"
+  fi
+else # "$safe_stack" = ""
+  # Set safe_stack to yes or no based on pre-existing flags
+  if compile_prog "-Werror" ""; then
+    safe_stack="no"
+  else
+    safe_stack="yes"
+    if test "$coroutine" != "ucontext"; then
+      error_exit "SafeStack is only supported by the coroutine backend ucontext"
+    fi
+  fi
+fi
+fi
 
 ##########################################
 # check if we have open_by_handle_at
@@ -XXX,XX +XXX,XX @@ echo "sparse enabled    $sparse"
 echo "strip binaries    $strip_opt"
 echo "profiler          $profiler"
 echo "static build      $static"
+echo "safe stack        $safe_stack"
 if test "$darwin" = "yes" ; then
     echo "Cocoa support     $cocoa"
 fi
@@ -XXX,XX +XXX,XX @@ if test "$ccache_cpp2" = "yes"; then
   echo "export CCACHE_CPP2=y" >> $config_host_mak
 fi
 
+if test "$safe_stack" = "yes"; then
+  echo "CONFIG_SAFESTACK=y" >> $config_host_mak
+fi
+
 # If we're using a separate build tree, set it up now.
 # DIRS are directories which we simply mkdir in the build tree;
 # LINKS are things to symlink back into the source tree
-- 
2.26.2

From: Daniele Buono <dbuono@linux.vnet.ibm.com>

SafeStack is a stack protection technique implemented in llvm. It is
enabled with a -fsanitize flag.
iotests are currently disabled when any -fsanitize option is used,
because such options tend to produce additional warnings and false
positives.

While common -fsanitize options are used to verify the code and not
added in production, SafeStack's main use is in production environments
to protect against stack smashing.

Since SafeStack does not print any warning or false positive, enable
iotests when SafeStack is the only -fsanitize option used.
This is likely going to be a production binary and we want to make sure
it works correctly.

Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com>
Message-id: 20200529205122.714-5-dbuono@linux.vnet.ibm.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/check-block.sh | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tests/check-block.sh b/tests/check-block.sh
index XXXXXXX..XXXXXXX 100755
--- a/tests/check-block.sh
+++ b/tests/check-block.sh
@@ -XXX,XX +XXX,XX @@ if grep -q "CONFIG_GPROF=y" config-host.mak 2>/dev/null ; then
     exit 0
 fi
 
-if grep -q "CFLAGS.*-fsanitize" config-host.mak 2>/dev/null ; then
+# Disable tests with any sanitizer except for SafeStack
+CFLAGS=$( grep "CFLAGS.*-fsanitize" config-host.mak 2>/dev/null )
+SANITIZE_FLAGS=""
+#Remove all occurrencies of -fsanitize=safe-stack
+for i in ${CFLAGS}; do
+        if [ "${i}" != "-fsanitize=safe-stack" ]; then
+                SANITIZE_FLAGS="${SANITIZE_FLAGS} ${i}"
+        fi
+done
+if echo ${SANITIZE_FLAGS} | grep -q "\-fsanitize" 2>/dev/null; then
+    # Have a sanitize flag that is not allowed, stop
     echo "Sanitizers are enabled ==> Not running the qemu-iotests."
     exit 0
 fi
-- 
2.26.2

A lot of CPU time is spent simply locking/unlocking q->lock during
polling. Check for completion outside the lock to make q->lock disappear
from the profile.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Sergio Lopez <slp@redhat.com>
Message-id: 20200617132201.1832152-2-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queues(BDRVNVMeState *s)
 
     for (i = 0; i < s->nr_queues; i++) {
         NVMeQueuePair *q = s->queues[i];
+        const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
+        NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
+
+        /*
+         * Do an early check for completions. q->lock isn't needed because
+         * nvme_process_completion() only runs in the event loop thread and
+         * cannot race with itself.
+         */
+        if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) {
+            continue;
+        }
+
         qemu_mutex_lock(&q->lock);
         while (nvme_process_completion(s, q)) {
             /* Keep polling */
-- 
2.26.2

Do not access a CQE after incrementing q->cq.head and releasing q->lock.
It is unlikely that this causes problems in practice but it's a latent
bug.

The reason why it should be safe at the moment is that completion
processing is not re-entrant and the CQ doorbell isn't written until the
end of nvme_process_completion().

Make this change now because QEMU expects completion processing to be
re-entrant and later patches will do that.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Sergio Lopez <slp@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20200617132201.1832152-4-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
     q->busy = true;
     assert(q->inflight >= 0);
     while (q->inflight) {
+        int ret;
         int16_t cid;
+
         c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES];
         if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
             break;
         }
+        ret = nvme_translate_error(c);
         q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
         if (!q->cq.head) {
             q->cq_phase = !q->cq_phase;
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
         preq->busy = false;
         preq->cb = preq->opaque = NULL;
         qemu_mutex_unlock(&q->lock);
-        req.cb(req.opaque, nvme_translate_error(c));
+        req.cb(req.opaque, ret);
         qemu_mutex_lock(&q->lock);
         q->inflight--;
         progress = true;
-- 
2.26.2

There are three issues with the current NVMeRequest->busy field:
1. The busy field is accidentally accessed outside q->lock when request
   submission fails.
2. Waiters on free_req_queue are not woken when a request is returned
   early due to submission failure.
3. Finding a free request involves scanning all requests. This makes
   request submission O(n^2).

Switch to an O(1) freelist that is always accessed under the lock.

Also differentiate between NVME_QUEUE_SIZE, the actual SQ/CQ size, and
NVME_NUM_REQS, the number of usable requests. This makes the code
simpler than using NVME_QUEUE_SIZE everywhere and having to keep in mind
that one slot is reserved.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Sergio Lopez <slp@redhat.com>
Message-id: 20200617132201.1832152-5-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c | 81 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 27 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@
 #define NVME_QUEUE_SIZE 128
 #define NVME_BAR_SIZE 8192
 
+/*
+ * We have to leave one slot empty as that is the full queue case where
+ * head == tail + 1.
+ */
+#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)
+
 typedef struct {
     int32_t  head, tail;
     uint8_t  *queue;
@@ -XXX,XX +XXX,XX @@ typedef struct {
     int cid;
     void *prp_list_page;
     uint64_t prp_list_iova;
-    bool busy;
+    int free_req_next; /* q->reqs[] index of next free req */
 } NVMeRequest;
 
 typedef struct {
@@ -XXX,XX +XXX,XX @@ typedef struct {
     /* Fields protected by @lock */
     NVMeQueue   sq, cq;
     int         cq_phase;
-    NVMeRequest reqs[NVME_QUEUE_SIZE];
+    int         free_req_head;
+    NVMeRequest reqs[NVME_NUM_REQS];
     bool        busy;
     int         need_kick;
     int         inflight;
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
     qemu_mutex_init(&q->lock);
     q->index = idx;
     qemu_co_queue_init(&q->free_req_queue);
-    q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE);
+    q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
     r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
-                          s->page_size * NVME_QUEUE_SIZE,
+                          s->page_size * NVME_NUM_REQS,
                           false, &prp_list_iova);
     if (r) {
         goto fail;
     }
-    for (i = 0; i < NVME_QUEUE_SIZE; i++) {
+    q->free_req_head = -1;
+    for (i = 0; i < NVME_NUM_REQS; i++) {
         NVMeRequest *req = &q->reqs[i];
         req->cid = i + 1;
+        req->free_req_next = q->free_req_head;
+        q->free_req_head = i;
         req->prp_list_page = q->prp_list_pages + i * s->page_size;
         req->prp_list_iova = prp_list_iova + i * s->page_size;
     }
+
     nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
@@ -XXX,XX +XXX,XX @@ static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q)
  */
 static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
 {
-    int i;
-    NVMeRequest *req = NULL;
+    NVMeRequest *req;
 
     qemu_mutex_lock(&q->lock);
-    while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) {
-        /* We have to leave one slot empty as that is the full queue case (head
-         * == tail + 1). */
+
+    while (q->free_req_head == -1) {
         if (qemu_in_coroutine()) {
             trace_nvme_free_req_queue_wait(q);
             qemu_co_queue_wait(&q->free_req_queue, &q->lock);
@@ -XXX,XX +XXX,XX @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
             return NULL;
         }
     }
-    for (i = 0; i < NVME_QUEUE_SIZE; i++) {
-        if (!q->reqs[i].busy) {
-            q->reqs[i].busy = true;
-            req = &q->reqs[i];
-            break;
-        }
-    }
-    /* We have checked inflight and need_kick while holding q->lock, so one
-     * free req must be available. */
-    assert(req);
+
+    req = &q->reqs[q->free_req_head];
+    q->free_req_head = req->free_req_next;
+    req->free_req_next = -1;
+
     qemu_mutex_unlock(&q->lock);
     return req;
 }
 
+/* With q->lock */
+static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
+{
+    req->free_req_next = q->free_req_head;
+    q->free_req_head = req - q->reqs;
+}
+
+/* With q->lock */
+static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q)
+{
+    if (!qemu_co_queue_empty(&q->free_req_queue)) {
+        replay_bh_schedule_oneshot_event(s->aio_context,
+                nvme_free_req_queue_cb, q);
+    }
+}
+
+/* Insert a request in the freelist and wake waiters */
+static void nvme_put_free_req_and_wake(BDRVNVMeState *s,  NVMeQueuePair *q,
+                                       NVMeRequest *req)
+{
+    qemu_mutex_lock(&q->lock);
+    nvme_put_free_req_locked(q, req);
+    nvme_wake_free_req_locked(s, q);
+    qemu_mutex_unlock(&q->lock);
+}
+
 static inline int nvme_translate_error(const NvmeCqe *c)
 {
     uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
         req = *preq;
         assert(req.cid == cid);
         assert(req.cb);
-        preq->busy = false;
+        nvme_put_free_req_locked(q, preq);
         preq->cb = preq->opaque = NULL;
         qemu_mutex_unlock(&q->lock);
         req.cb(req.opaque, ret);
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
         /* Notify the device so it can post more completions. */
         smp_mb_release();
         *q->cq.doorbell = cpu_to_le32(q->cq.head);
-        if (!qemu_co_queue_empty(&q->free_req_queue)) {
-            replay_bh_schedule_oneshot_event(s->aio_context,
-                                             nvme_free_req_queue_cb, q);
-        }
+        nvme_wake_free_req_locked(s, q);
     }
     q->busy = false;
     return progress;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
     r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
     qemu_co_mutex_unlock(&s->dma_map_lock);
     if (r) {
-        req->busy = false;
+        nvme_put_free_req_and_wake(s, ioq, req);
         return r;
     }
     nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
     qemu_co_mutex_unlock(&s->dma_map_lock);
 
     if (ret) {
-        req->busy = false;
+        nvme_put_free_req_and_wake(s, ioq, req);
         goto out;
     }
 
-- 
2.26.2

Passing around both BDRVNVMeState and NVMeQueuePair is unwieldy. Reduce
the number of function arguments by keeping the BDRVNVMeState pointer in
NVMeQueuePair. This will come in handy when a BH is introduced in a
later patch and only one argument can be passed to it.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Sergio Lopez <slp@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20200617132201.1832152-7-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c | 70 ++++++++++++++++++++++++++++------------------------
 1 file changed, 38 insertions(+), 32 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@
  */
 #define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)
 
+typedef struct BDRVNVMeState BDRVNVMeState;
+
 typedef struct {
     int32_t  head, tail;
     uint8_t  *queue;
@@ -XXX,XX +XXX,XX @@ typedef struct {
 typedef struct {
     QemuMutex   lock;
 
+    /* Read from I/O code path, initialized under BQL */
+    BDRVNVMeState   *s;
+    int             index;
+
     /* Fields protected by BQL */
-    int         index;
     uint8_t     *prp_list_pages;
 
     /* Fields protected by @lock */
@@ -XXX,XX +XXX,XX @@ typedef volatile struct {
 
 QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000);
 
-typedef struct {
+struct BDRVNVMeState {
     AioContext *aio_context;
     QEMUVFIOState *vfio;
     NVMeRegs *regs;
@@ -XXX,XX +XXX,XX @@ typedef struct {
 
     /* PCI address (required for nvme_refresh_filename()) */
     char *device;
-} BDRVNVMeState;
+};
 
 #define NVME_BLOCK_OPT_DEVICE "device"
 #define NVME_BLOCK_OPT_NAMESPACE "namespace"
@@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,
     }
 }
 
-static void nvme_free_queue_pair(BlockDriverState *bs, NVMeQueuePair *q)
+static void nvme_free_queue_pair(NVMeQueuePair *q)
 {
     qemu_vfree(q->prp_list_pages);
     qemu_vfree(q->sq.queue);
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
     uint64_t prp_list_iova;
 
     qemu_mutex_init(&q->lock);
+    q->s = s;
     q->index = idx;
     qemu_co_queue_init(&q->free_req_queue);
     q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
 
     return q;
 fail:
-    nvme_free_queue_pair(bs, q);
+    nvme_free_queue_pair(q);
     return NULL;
 }
 
 /* With q->lock */
-static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q)
+static void nvme_kick(NVMeQueuePair *q)
 {
+    BDRVNVMeState *s = q->s;
+
     if (s->plugged || !q->need_kick) {
         return;
     }
@@ -XXX,XX +XXX,XX @@ static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
 }
 
 /* With q->lock */
-static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q)
+static void nvme_wake_free_req_locked(NVMeQueuePair *q)
 {
     if (!qemu_co_queue_empty(&q->free_req_queue)) {
-        replay_bh_schedule_oneshot_event(s->aio_context,
+        replay_bh_schedule_oneshot_event(q->s->aio_context,
                 nvme_free_req_queue_cb, q);
     }
 }
 
 /* Insert a request in the freelist and wake waiters */
-static void nvme_put_free_req_and_wake(BDRVNVMeState *s,  NVMeQueuePair *q,
-                                       NVMeRequest *req)
+static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req)
 {
     qemu_mutex_lock(&q->lock);
     nvme_put_free_req_locked(q, req);
-    nvme_wake_free_req_locked(s, q);
+    nvme_wake_free_req_locked(q);
     qemu_mutex_unlock(&q->lock);
 }
 
@@ -XXX,XX +XXX,XX @@ static inline int nvme_translate_error(const NvmeCqe *c)
 }
 
 /* With q->lock */
-static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
+static bool nvme_process_completion(NVMeQueuePair *q)
 {
+    BDRVNVMeState *s = q->s;
     bool progress = false;
     NVMeRequest *preq;
     NVMeRequest req;
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
         /* Notify the device so it can post more completions. */
         smp_mb_release();
         *q->cq.doorbell = cpu_to_le32(q->cq.head);
-        nvme_wake_free_req_locked(s, q);
+        nvme_wake_free_req_locked(q);
     }
     q->busy = false;
     return progress;
@@ -XXX,XX +XXX,XX @@ static void nvme_trace_command(const NvmeCmd *cmd)
     }
 }
 
-static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q,
-                                NVMeRequest *req,
+static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
                                 NvmeCmd *cmd, BlockCompletionFunc cb,
                                 void *opaque)
 {
@@ -XXX,XX +XXX,XX @@ static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q,
     req->opaque = opaque;
     cmd->cid = cpu_to_le32(req->cid);
 
-    trace_nvme_submit_command(s, q->index, req->cid);
+    trace_nvme_submit_command(q->s, q->index, req->cid);
     nvme_trace_command(cmd);
     qemu_mutex_lock(&q->lock);
     memcpy((uint8_t *)q->sq.queue +
            q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
     q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
     q->need_kick++;
-    nvme_kick(s, q);
-    nvme_process_completion(s, q);
+    nvme_kick(q);
+    nvme_process_completion(q);
     qemu_mutex_unlock(&q->lock);
 }
 
@@ -XXX,XX +XXX,XX @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
                          NvmeCmd *cmd)
 {
     NVMeRequest *req;
-    BDRVNVMeState *s = bs->opaque;
     int ret = -EINPROGRESS;
     req = nvme_get_free_req(q);
     if (!req) {
         return -EBUSY;
     }
-    nvme_submit_command(s, q, req, cmd, nvme_cmd_sync_cb, &ret);
+    nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret);
 
     BDRV_POLL_WHILE(bs, ret == -EINPROGRESS);
     return ret;
@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queues(BDRVNVMeState *s)
         }
 
         qemu_mutex_lock(&q->lock);
-        while (nvme_process_completion(s, q)) {
+        while (nvme_process_completion(q)) {
             /* Keep polling */
             progress = true;
         }
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
     };
     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
         error_setg(errp, "Failed to create io queue [%d]", n);
-        nvme_free_queue_pair(bs, q);
+        nvme_free_queue_pair(q);
         return false;
     }
     cmd = (NvmeCmd) {
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
     };
     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
         error_setg(errp, "Failed to create io queue [%d]", n);
-        nvme_free_queue_pair(bs, q);
+        nvme_free_queue_pair(q);
         return false;
     }
     s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
@@ -XXX,XX +XXX,XX @@ static void nvme_close(BlockDriverState *bs)
     BDRVNVMeState *s = bs->opaque;
 
     for (i = 0; i < s->nr_queues; ++i) {
-        nvme_free_queue_pair(bs, s->queues[i]);
+        nvme_free_queue_pair(s->queues[i]);
     }
     g_free(s->queues);
     aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
     r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
     qemu_co_mutex_unlock(&s->dma_map_lock);
     if (r) {
-        nvme_put_free_req_and_wake(s, ioq, req);
+        nvme_put_free_req_and_wake(ioq, req);
         return r;
     }
-    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
 
     data.co = qemu_coroutine_self();
     while (data.ret == -EINPROGRESS) {
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
     assert(s->nr_queues > 1);
     req = nvme_get_free_req(ioq);
     assert(req);
-    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
 
     data.co = qemu_coroutine_self();
     if (data.ret == -EINPROGRESS) {
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
     req = nvme_get_free_req(ioq);
     assert(req);
 
-    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
 
     data.co = qemu_coroutine_self();
     while (data.ret == -EINPROGRESS) {
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
     qemu_co_mutex_unlock(&s->dma_map_lock);
 
     if (ret) {
-        nvme_put_free_req_and_wake(s, ioq, req);
+        nvme_put_free_req_and_wake(ioq, req);
         goto out;
     }
 
     trace_nvme_dsm(s, offset, bytes);
 
-    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
 
     data.co = qemu_coroutine_self();
     while (data.ret == -EINPROGRESS) {
@@ -XXX,XX +XXX,XX @@ static void nvme_aio_unplug(BlockDriverState *bs)
     for (i = 1; i < s->nr_queues; i++) {
         NVMeQueuePair *q = s->queues[i];
         qemu_mutex_lock(&q->lock);
-        nvme_kick(s, q);
-        nvme_process_completion(s, q);
+        nvme_kick(q);
+        nvme_process_completion(q);
         qemu_mutex_unlock(&q->lock);
     }
 }
-- 
2.26.2

QEMU block drivers are supposed to support aio_poll() from I/O
completion callback functions. This means completion processing must be
re-entrant.

The standard approach is to schedule a BH during completion processing
and cancel it at the end of processing. If aio_poll() is invoked by a
callback function then the BH will run. The BH continues the suspended
completion processing.

All of this means that request A's cb() can synchronously wait for
request B to complete. Previously the nvme block driver would hang
because it didn't process completions from nested aio_poll().

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Sergio Lopez <slp@redhat.com>
Message-id: 20200617132201.1832152-8-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c       | 67 ++++++++++++++++++++++++++++++++++++++++------
 block/trace-events |  2 +-
 2 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ typedef struct {
     int         cq_phase;
     int         free_req_head;
     NVMeRequest reqs[NVME_NUM_REQS];
-    bool        busy;
     int         need_kick;
     int         inflight;
+
+    /* Thread-safe, no lock necessary */
+    QEMUBH      *completion_bh;
 } NVMeQueuePair;
 
 /* Memory mapped registers */
@@ -XXX,XX +XXX,XX @@ struct BDRVNVMeState {
 #define NVME_BLOCK_OPT_DEVICE "device"
 #define NVME_BLOCK_OPT_NAMESPACE "namespace"
 
+static void nvme_process_completion_bh(void *opaque);
+
 static QemuOptsList runtime_opts = {
     .name = "nvme",
     .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
@@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,
 
 static void nvme_free_queue_pair(NVMeQueuePair *q)
 {
+    if (q->completion_bh) {
+        qemu_bh_delete(q->completion_bh);
+    }
     qemu_vfree(q->prp_list_pages);
     qemu_vfree(q->sq.queue);
     qemu_vfree(q->cq.queue);
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
     q->index = idx;
     qemu_co_queue_init(&q->free_req_queue);
     q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
+    q->completion_bh = aio_bh_new(bdrv_get_aio_context(bs),
+                                  nvme_process_completion_bh, q);
     r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
                           s->page_size * NVME_NUM_REQS,
                           false, &prp_list_iova);
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q)
     NvmeCqe *c;
 
     trace_nvme_process_completion(s, q->index, q->inflight);
-    if (q->busy || s->plugged) {
-        trace_nvme_process_completion_queue_busy(s, q->index);
+    if (s->plugged) {
+        trace_nvme_process_completion_queue_plugged(s, q->index);
         return false;
     }
-    q->busy = true;
+
+    /*
+     * Support re-entrancy when a request cb() function invokes aio_poll().
+     * Pending completions must be visible to aio_poll() so that a cb()
+     * function can wait for the completion of another request.
+     *
+     * The aio_poll() loop will execute our BH and we'll resume completion
+     * processing there.
+     */
+    qemu_bh_schedule(q->completion_bh);
+
     assert(q->inflight >= 0);
     while (q->inflight) {
         int ret;
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q)
         assert(req.cb);
         nvme_put_free_req_locked(q, preq);
         preq->cb = preq->opaque = NULL;
-        qemu_mutex_unlock(&q->lock);
-        req.cb(req.opaque, ret);
-        qemu_mutex_lock(&q->lock);
         q->inflight--;
+        qemu_mutex_unlock(&q->lock);
+        req.cb(req.opaque, ret);
+        qemu_mutex_lock(&q->lock);
         progress = true;
     }
     if (progress) {
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q)
         *q->cq.doorbell = cpu_to_le32(q->cq.head);
         nvme_wake_free_req_locked(q);
     }
-    q->busy = false;
+
+    qemu_bh_cancel(q->completion_bh);
+
     return progress;
 }
 
+static void nvme_process_completion_bh(void *opaque)
+{
+    NVMeQueuePair *q = opaque;
+
+    /*
+     * We're being invoked because a nvme_process_completion() cb() function
+     * called aio_poll(). The callback may be waiting for further completions
+     * so notify the device that it has space to fill in more completions now.
+     */
+    smp_mb_release();
+    *q->cq.doorbell = cpu_to_le32(q->cq.head);
+    nvme_wake_free_req_locked(q);
+
+    nvme_process_completion(q);
+}
+
 static void nvme_trace_command(const NvmeCmd *cmd)
 {
     int i;
@@ -XXX,XX +XXX,XX @@ static void nvme_detach_aio_context(BlockDriverState *bs)
 {
     BDRVNVMeState *s = bs->opaque;
 
+    for (int i = 0; i < s->nr_queues; i++) {
+        NVMeQueuePair *q = s->queues[i];
+
+        qemu_bh_delete(q->completion_bh);
+        q->completion_bh = NULL;
+    }
+
     aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
                            false, NULL, NULL);
 }
@@ -XXX,XX +XXX,XX @@ static void nvme_attach_aio_context(BlockDriverState *bs,
     s->aio_context = new_context;
     aio_set_event_notifier(new_context, &s->irq_notifier,
                            false, nvme_handle_event, nvme_poll_cb);
+
+    for (int i = 0; i < s->nr_queues; i++) {
+        NVMeQueuePair *q = s->queues[i];
+
+        q->completion_bh =
+            aio_bh_new(new_context, nvme_process_completion_bh, q);
+    }
 }
 
 static void nvme_aio_plug(BlockDriverState *bs)
diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ nvme_kick(void *s, int queue) "s %p queue %d"
 nvme_dma_flush_queue_wait(void *s) "s %p"
 nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
 nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d"
-nvme_process_completion_queue_busy(void *s, int index) "s %p queue %d"
+nvme_process_completion_queue_plugged(void *s, int index) "s %p queue %d"
 nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d"
 nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
 nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
-- 
2.26.2

The following changes since commit ac793156f650ae2d77834932d72224175ee69086:

Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20201020-1' into staging (2020-10-20 21:11:35 +0100)

are available in the Git repository at:

https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to 32a3fd65e7e3551337fd26bfc0e2f899d70c028c:

iotests: add commit top->base cases to 274 (2020-10-22 09:55:39 +0100)

----------------------------------------------------------------
Pull request

v2:
 * Fix format string issues on 32-bit hosts [Peter]
 * Fix qemu-nbd.c CONFIG_POSIX ifdef issue [Eric]
 * Fix missing eventfd.h header on macOS [Peter]
 * Drop unreliable vhost-user-blk test (will send a new patch when ready) [Peter]

This pull request contains the vhost-user-blk server by Coiby Xu along with my
additions, block/nvme.c alignment and hardware error statistics by Philippe
Mathieu-Daudé, and bdrv_co_block_status_above() fixes by Vladimir
Sementsov-Ogievskiy.

----------------------------------------------------------------

Coiby Xu (6):
  libvhost-user: Allow vu_message_read to be replaced
  libvhost-user: remove watch for kick_fd when de-initialize vu-dev
  util/vhost-user-server: generic vhost user server
  block: move logical block size check function to a common utility
    function
  block/export: vhost-user block device backend server
  MAINTAINERS: Add vhost-user block device backend server maintainer

Philippe Mathieu-Daudé (1):
  block/nvme: Add driver statistics for access alignment and hw errors

Stefan Hajnoczi (16):
  util/vhost-user-server: s/fileds/fields/ typo fix
  util/vhost-user-server: drop unnecessary QOM cast
  util/vhost-user-server: drop unnecessary watch deletion
  block/export: consolidate request structs into VuBlockReq
  util/vhost-user-server: drop unused DevicePanicNotifier
  util/vhost-user-server: fix memory leak in vu_message_read()
  util/vhost-user-server: check EOF when reading payload
  util/vhost-user-server: rework vu_client_trip() coroutine lifecycle
  block/export: report flush errors
  block/export: convert vhost-user-blk server to block export API
  util/vhost-user-server: move header to include/
  util/vhost-user-server: use static library in meson.build
  qemu-storage-daemon: avoid compiling blockdev_ss twice
  block: move block exports to libblockdev
  block/export: add iothread and fixed-iothread options
  block/export: add vhost-user-blk multi-queue support

Vladimir Sementsov-Ogievskiy (5):
  block/io: fix bdrv_co_block_status_above
  block/io: bdrv_common_block_status_above: support include_base
  block/io: bdrv_common_block_status_above: support bs == base
  block/io: fix bdrv_is_allocated_above
  iotests: add commit top->base cases to 274

-- 
2.26.2

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Keep statistics of some hardware errors, and number of
aligned/unaligned I/O accesses.

QMP example, taken after booting a full RHEL 8.3 aarch64 guest:

{ "execute": "query-blockstats" }
{
    "return": [
        {
            "device": "",
            "node-name": "drive0",
            "stats": {
                "flush_total_time_ns": 6026948,
                "wr_highest_offset": 3383991230464,
                "wr_total_time_ns": 807450995,
                "failed_wr_operations": 0,
                "failed_rd_operations": 0,
                "wr_merged": 3,
                "wr_bytes": 50133504,
                "failed_unmap_operations": 0,
                "failed_flush_operations": 0,
                "account_invalid": false,
                "rd_total_time_ns": 1846979900,
                "flush_operations": 130,
                "wr_operations": 659,
                "rd_merged": 1192,
                "rd_bytes": 218244096,
                "account_failed": false,
                "idle_time_ns": 2678641497,
                "rd_operations": 7406
            },
            "driver-specific": {
                "driver": "nvme",
                "completion-errors": 0,
                "unaligned-accesses": 2959,
                "aligned-accesses": 4477
            },
            "qdev": "/machine/peripheral-anon/device[0]/virtio-backend"
        }
    ]
}

Suggested-by: Stefan Hajnoczi <stefanha@gmail.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Acked-by: Markus Armbruster <armbru@redhat.com>
Message-id: 20201001162939.1567915-1-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 qapi/block-core.json | 24 +++++++++++++++++++++++-
 block/nvme.c         | 27 +++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -XXX,XX +XXX,XX @@
       'discard-nb-failed': 'uint64',
       'discard-bytes-ok': 'uint64' } }
 
+##
+# @BlockStatsSpecificNvme:
+#
+# NVMe driver statistics
+#
+# @completion-errors: The number of completion errors.
+#
+# @aligned-accesses: The number of aligned accesses performed by
+#                    the driver.
+#
+# @unaligned-accesses: The number of unaligned accesses performed by
+#                      the driver.
+#
+# Since: 5.2
+##
+{ 'struct': 'BlockStatsSpecificNvme',
+  'data': {
+      'completion-errors': 'uint64',
+      'aligned-accesses': 'uint64',
+      'unaligned-accesses': 'uint64' } }
+
 ##
 # @BlockStatsSpecific:
 #
@@ -XXX,XX +XXX,XX @@
   'discriminator': 'driver',
   'data': {
       'file': 'BlockStatsSpecificFile',
-      'host_device': 'BlockStatsSpecificFile' } }
+      'host_device': 'BlockStatsSpecificFile',
+      'nvme': 'BlockStatsSpecificNvme' } }
 
 ##
 # @BlockStats:
diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ struct BDRVNVMeState {
 
     /* PCI address (required for nvme_refresh_filename()) */
     char *device;
+
+    struct {
+        uint64_t completion_errors;
+        uint64_t aligned_accesses;
+        uint64_t unaligned_accesses;
+    } stats;
 };
 
 #define NVME_BLOCK_OPT_DEVICE "device"
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q)
             break;
         }
         ret = nvme_translate_error(c);
+        if (ret) {
+            s->stats.completion_errors++;
+        }
         q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
         if (!q->cq.head) {
             q->cq_phase = !q->cq_phase;
@@ -XXX,XX +XXX,XX @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
     assert(QEMU_IS_ALIGNED(bytes, s->page_size));
     assert(bytes <= s->max_transfer);
     if (nvme_qiov_aligned(bs, qiov)) {
+        s->stats.aligned_accesses++;
         return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags);
     }
+    s->stats.unaligned_accesses++;
     trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
     buf = qemu_try_memalign(s->page_size, bytes);
 
@@ -XXX,XX +XXX,XX @@ static void nvme_unregister_buf(BlockDriverState *bs, void *host)
     qemu_vfio_dma_unmap(s->vfio, host);
 }
 
+static BlockStatsSpecific *nvme_get_specific_stats(BlockDriverState *bs)
+{
+    BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
+    BDRVNVMeState *s = bs->opaque;
+
+    stats->driver = BLOCKDEV_DRIVER_NVME;
+    stats->u.nvme = (BlockStatsSpecificNvme) {
+        .completion_errors = s->stats.completion_errors,
+        .aligned_accesses = s->stats.aligned_accesses,
+        .unaligned_accesses = s->stats.unaligned_accesses,
+    };
+
+    return stats;
+}
+
 static const char *const nvme_strong_runtime_opts[] = {
     NVME_BLOCK_OPT_DEVICE,
     NVME_BLOCK_OPT_NAMESPACE,
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_nvme = {
     .bdrv_refresh_filename    = nvme_refresh_filename,
     .bdrv_refresh_limits      = nvme_refresh_limits,
     .strong_runtime_opts      = nvme_strong_runtime_opts,
+    .bdrv_get_specific_stats  = nvme_get_specific_stats,
 
     .bdrv_detach_aio_context  = nvme_detach_aio_context,
     .bdrv_attach_aio_context  = nvme_attach_aio_context,
-- 
2.26.2