Series comparison

-[PULL 00/33] Block patches
+[PULL 0/3] Block patches
-The following changes since commit 8507c9d5c9a62de2a0e281b640f995e26eac46af:
+The following changes since commit 9cf289af47bcfae5c75de37d8e5d6fd23705322c:
-  Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging (2020-11-03 15:59:44 +0000)
+  Merge tag 'qga-pull-request' of gitlab.com:marcandre.lureau/qemu into staging (2022-05-04 03:42:49 -0700)
 are available in the Git repository at:
   https://gitlab.com/stefanha/qemu.git tags/block-pull-request
-for you to fetch changes up to fc107d86840b3364e922c26cf7631b7fd38ce523:
+for you to fetch changes up to bef2e050d6a7feb865854c65570c496ac5a8cf53:
-  util/vfio-helpers: Assert offset is aligned to page size (2020-11-03 19:06:23 +0000)
+  util/event-loop-base: Introduce options to set the thread pool size (2022-05-04 17:02:19 +0100)
 ----------------------------------------------------------------
-Pull request for 5.2
+Pull request
-NVMe fixes to solve IOMMU issues on non-x86 and error message/tracing
+Add new thread-pool-min/thread-pool-max parameters to control the thread pool
-improvements. Elena Afanasova's ioeventfd fixes are also included.
+used for async I/O.
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ----------------------------------------------------------------
-Elena Afanasova (2):
+Nicolas Saenz Julienne (3):
-  accel/kvm: add PIO ioeventfds only in case kvm_eventfds_allowed is
+  Introduce event-loop-base abstract class
-    true
+  util/main-loop: Introduce the main loop into QOM
-  softmmu/memory: fix memory_region_ioeventfd_equal()
+  util/event-loop-base: Introduce options to set the thread pool size
-Eric Auger (4):
+ qapi/qom.json                    |  43 ++++++++--
-  block/nvme: Change size and alignment of IDENTIFY response buffer
+ meson.build                      |  26 +++---
-  block/nvme: Change size and alignment of queue
+ include/block/aio.h              |  10 +++
-  block/nvme: Change size and alignment of prp_list_pages
+ include/block/thread-pool.h      |   3 +
-  block/nvme: Align iov's va and size on host page size
+ include/qemu/main-loop.h         |  10 +++
+ include/sysemu/event-loop-base.h |  41 +++++++++
-Philippe Mathieu-Daudé (27):
+ include/sysemu/iothread.h        |   6 +-
-  MAINTAINERS: Cover "block/nvme.h" file
+ event-loop-base.c                | 140 +++++++++++++++++++++++++++++++
-  block/nvme: Use hex format to display offset in trace events
+ iothread.c                       |  68 +++++----------
-  block/nvme: Report warning with warn_report()
+ util/aio-posix.c                 |   1 +
-  block/nvme: Trace controller capabilities
+ util/async.c                     |  20 +++++
-  block/nvme: Trace nvme_poll_queue() per queue
+ util/main-loop.c                 |  65 ++++++++++++++
-  block/nvme: Improve nvme_free_req_queue_wait() trace information
+ util/thread-pool.c               |  55 +++++++++++-
-  block/nvme: Trace queue pair creation/deletion
+files changed, 419 insertions(+), 69 deletions(-)
-  block/nvme: Move definitions before structure declarations
+ create mode 100644 include/sysemu/event-loop-base.h
-  block/nvme: Use unsigned integer for queue counter/size
+ create mode 100644 event-loop-base.c
   block/nvme: Make nvme_identify() return boolean indicating error
   block/nvme: Make nvme_init_queue() return boolean indicating error
   block/nvme: Introduce Completion Queue definitions
   block/nvme: Use definitions instead of magic values in add_io_queue()
   block/nvme: Correctly initialize Admin Queue Attributes
   block/nvme: Simplify ADMIN queue access
   block/nvme: Simplify nvme_cmd_sync()
   block/nvme: Set request_alignment at initialization
   block/nvme: Correct minimum device page size
   block/nvme: Fix use of write-only doorbells page on Aarch64 arch
   block/nvme: Fix nvme_submit_command() on big-endian host
   util/vfio-helpers: Improve reporting unsupported IOMMU type
   util/vfio-helpers: Trace PCI I/O config accesses
   util/vfio-helpers: Trace PCI BAR region info
   util/vfio-helpers: Trace where BARs are mapped
   util/vfio-helpers: Improve DMA trace events
   util/vfio-helpers: Convert vfio_dump_mapping to trace events
   util/vfio-helpers: Assert offset is aligned to page size
  MAINTAINERS          |   2 +
  include/block/nvme.h |  18 ++--
  accel/kvm/kvm-all.c  |   6 +-
  block/nvme.c         | 209 ++++++++++++++++++++++++-------------------
  softmmu/memory.c     |  11 ++-
  util/vfio-helpers.c  |  43 +++++----
  block/trace-events   |  30 ++++---
  util/trace-events    |  10 ++-
 files changed, 195 insertions(+), 134 deletions(-)
 --
-.28.0
+.35.1

-[PULL 01/33] accel/kvm: add PIO ioeventfds only in case kvm_eventfds_allowed is true
+Deleted patch
-From: Elena Afanasova <eafanasova@gmail.com>
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
-Message-Id: <20201017210102.26036-1-eafanasova@gmail.com>
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
----
- accel/kvm/kvm-all.c | 6 ++++--
-file changed, 4 insertions(+), 2 deletions(-)
-diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
-index XXXXXXX..XXXXXXX 100644
---- a/accel/kvm/kvm-all.c
-+++ b/accel/kvm/kvm-all.c
-@@ -XXX,XX +XXX,XX @@ static int kvm_init(MachineState *ms)
-     kvm_memory_listener_register(s, &s->memory_listener,
-                                  &address_space_memory, 0);
--    memory_listener_register(&kvm_io_listener,
--                             &address_space_io);
-+    if (kvm_eventfds_allowed) {
-+        memory_listener_register(&kvm_io_listener,
-+                                 &address_space_io);
-+    }
-     memory_listener_register(&kvm_coalesced_pio_listener,
-                              &address_space_io);
---
-.28.0

-[PULL 32/33] util/vfio-helpers: Convert vfio_dump_mapping to trace events
+[PULL 1/3] Introduce event-loop-base abstract class
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
+From: Nicolas Saenz Julienne <nsaenzju@redhat.com>
-The QEMU_VFIO_DEBUG definition is only modifiable at build-time.
+Introduce the 'event-loop-base' abstract class; it'll hold the
-Trace events can be enabled at run-time. As we prefer the latter,
+properties common to all event loops and provide the necessary hooks for
-convert qemu_vfio_dump_mappings() to use trace events instead
+their creation and maintenance. Then have iothread inherit from it.
-of fprintf().
+EventLoopBaseClass is defined as user creatable and provides a hook for
-Reviewed-by: Fam Zheng <fam@euphon.net>
+its children to attach themselves to the user creatable class 'complete'
 function. It also provides an update_params() callback to propagate
 property changes onto its children.
 The new 'event-loop-base' class will live in the root directory. It is
 built on its own using the 'link_whole' option (there are no direct
 function dependencies between the class and its children, it all happens
 through 'constructor' magic). It also imposes new compilation
 dependencies:
     qom <- event-loop-base <- blockdev (iothread.c)
 And in subsequent patches:
     qom <- event-loop-base <- qemuutil (util/main-loop.c)
 All this forced some amount of reordering in meson.build:
  - Moved qom build definition before qemuutil. Doing it the other way
    around (i.e. moving qemuutil after qom) isn't possible as a lot of
    core libraries that live in between the two depend on it.
  - Process the 'hw' subdir earlier, as it introduces files into the
    'qom' source set.
 No functional changes intended.
 Signed-off-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Acked-by: Markus Armbruster <armbru@redhat.com>
-Message-id: 20201103020733.2303148-7-philmd@redhat.com
+Message-id: 20220425075723.20019-2-nsaenzju@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
 ---
- util/vfio-helpers.c | 19 ++++---------------
+ qapi/qom.json                    |  22 +++++--
- util/trace-events   |  1 +
+ meson.build                      |  23 ++++---
-files changed, 5 insertions(+), 15 deletions(-)
+ include/sysemu/event-loop-base.h |  36 +++++++++++
+ include/sysemu/iothread.h        |   6 +-
-diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
+ event-loop-base.c                | 104 +++++++++++++++++++++++++++++++
  iothread.c                       |  65 ++++++-------------
 files changed, 192 insertions(+), 64 deletions(-)
  create mode 100644 include/sysemu/event-loop-base.h
  create mode 100644 event-loop-base.c
 diff --git a/qapi/qom.json b/qapi/qom.json
 index XXXXXXX..XXXXXXX 100644
---- a/util/vfio-helpers.c
+--- a/qapi/qom.json
-+++ b/util/vfio-helpers.c
++++ b/qapi/qom.json
-@@ -XXX,XX +XXX,XX @@ QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
+@@ -XXX,XX +XXX,XX @@
-     return s;
+             '*repeat': 'bool',
              '*grab-toggle': 'GrabToggleKeys' } }
 +##
 +# @EventLoopBaseProperties:
 +#
 +# Common properties for event loops
 +#
 +# @aio-max-batch: maximum number of requests in a batch for the AIO engine,
 +#                 0 means that the engine will use its default.
 +#                 (default: 0)
 +#
 +# Since: 7.1
 +##
 +{ 'struct': 'EventLoopBaseProperties',
 +  'data': { '*aio-max-batch': 'int' } }
 +
  ##
  # @IothreadProperties:
  #
@@ -XXX,XX +XXX,XX @@
  #               algorithm detects it is spending too long polling without
  #               encountering events. 0 selects a default behaviour (default: 0)
  #
 -# @aio-max-batch: maximum number of requests in a batch for the AIO engine,
 -#                 0 means that the engine will use its default
 -#                 (default:0, since 6.1)
 +# The @aio-max-batch option is available since 6.1.
  #
  # Since: 2.0
  ##
  { 'struct': 'IothreadProperties',
 +  'base': 'EventLoopBaseProperties',
    'data': { '*poll-max-ns': 'int',
              '*poll-grow': 'int',
 -            '*poll-shrink': 'int',
 -            '*aio-max-batch': 'int' } }
 +            '*poll-shrink': 'int' } }
  ##
  # @MemoryBackendProperties:
 diff --git a/meson.build b/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/meson.build
 +++ b/meson.build
@@ -XXX,XX +XXX,XX @@ subdir('qom')
  subdir('authz')
  subdir('crypto')
  subdir('ui')
 +subdir('hw')
  if enable_modules
@@ -XXX,XX +XXX,XX @@ if enable_modules
    modulecommon = declare_dependency(link_whole: libmodulecommon, compile_args: '-DBUILD_DSO')
  endif
 +qom_ss = qom_ss.apply(config_host, strict: false)
 +libqom = static_library('qom', qom_ss.sources() + genh,
 +                        dependencies: [qom_ss.dependencies()],
 +                        name_suffix: 'fa')
 +qom = declare_dependency(link_whole: libqom)
 +
 +event_loop_base = files('event-loop-base.c')
 +event_loop_base = static_library('event-loop-base', sources: event_loop_base + genh,
 +                                 build_by_default: true)
 +event_loop_base = declare_dependency(link_whole: event_loop_base,
 +                                     dependencies: [qom])
 +
  stub_ss = stub_ss.apply(config_all, strict: false)
  util_ss.add_all(trace_ss)
@@ -XXX,XX +XXX,XX @@ subdir('monitor')
  subdir('net')
  subdir('replay')
  subdir('semihosting')
 -subdir('hw')
  subdir('tcg')
  subdir('fpu')
  subdir('accel')
@@ -XXX,XX +XXX,XX @@ qemu_syms = custom_target('qemu.syms', output: 'qemu.syms',
                               capture: true,
                               command: [undefsym, nm, '@INPUT@'])
 -qom_ss = qom_ss.apply(config_host, strict: false)
 -libqom = static_library('qom', qom_ss.sources() + genh,
 -                        dependencies: [qom_ss.dependencies()],
 -                        name_suffix: 'fa')
 -
 -qom = declare_dependency(link_whole: libqom)
 -
  authz_ss = authz_ss.apply(config_host, strict: false)
  libauthz = static_library('authz', authz_ss.sources() + genh,
                            dependencies: [authz_ss.dependencies()],
@@ -XXX,XX +XXX,XX @@ libblockdev = static_library('blockdev', blockdev_ss.sources() + genh,
                               build_by_default: false)
  blockdev = declare_dependency(link_whole: [libblockdev],
 -                              dependencies: [block])
 +                              dependencies: [block, event_loop_base])
  qmp_ss = qmp_ss.apply(config_host, strict: false)
  libqmp = static_library('qmp', qmp_ss.sources() + genh,
 diff --git a/include/sysemu/event-loop-base.h b/include/sysemu/event-loop-base.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/include/sysemu/event-loop-base.h
@@ -XXX,XX +XXX,XX @@
 +/*
 + * QEMU event-loop backend
 + *
 + * Copyright (C) 2022 Red Hat Inc
 + *
 + * Authors:
 + *  Nicolas Saenz Julienne <nsaenzju@redhat.com>
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
 + */
 +#ifndef QEMU_EVENT_LOOP_BASE_H
 +#define QEMU_EVENT_LOOP_BASE_H
 +
 +#include "qom/object.h"
 +#include "block/aio.h"
 +#include "qemu/typedefs.h"
 +
 +#define TYPE_EVENT_LOOP_BASE         "event-loop-base"
 +OBJECT_DECLARE_TYPE(EventLoopBase, EventLoopBaseClass,
 +                    EVENT_LOOP_BASE)
 +
 +struct EventLoopBaseClass {
 +    ObjectClass parent_class;
 +
 +    void (*init)(EventLoopBase *base, Error **errp);
 +    void (*update_params)(EventLoopBase *base, Error **errp);
 +};
 +
 +struct EventLoopBase {
 +    Object parent;
 +
 +    /* AioContext AIO engine parameters */
 +    int64_t aio_max_batch;
 +};
 +#endif
 diff --git a/include/sysemu/iothread.h b/include/sysemu/iothread.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/sysemu/iothread.h
 +++ b/include/sysemu/iothread.h
@@ -XXX,XX +XXX,XX @@
  #include "block/aio.h"
  #include "qemu/thread.h"
  #include "qom/object.h"
 +#include "sysemu/event-loop-base.h"
  #define TYPE_IOTHREAD "iothread"
  struct IOThread {
 -    Object parent_obj;
 +    EventLoopBase parent_obj;
      QemuThread thread;
      AioContext *ctx;
@@ -XXX,XX +XXX,XX @@ struct IOThread {
      int64_t poll_max_ns;
      int64_t poll_grow;
      int64_t poll_shrink;
 -
 -    /* AioContext AIO engine parameters */
 -    int64_t aio_max_batch;
  };
  typedef struct IOThread IOThread;
 diff --git a/event-loop-base.c b/event-loop-base.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/event-loop-base.c
@@ -XXX,XX +XXX,XX @@
 +/*
 + * QEMU event-loop base
 + *
 + * Copyright (C) 2022 Red Hat Inc
 + *
 + * Authors:
 + *  Stefan Hajnoczi <stefanha@redhat.com>
 + *  Nicolas Saenz Julienne <nsaenzju@redhat.com>
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
 + */
 +
 +#include "qemu/osdep.h"
 +#include "qom/object_interfaces.h"
 +#include "qapi/error.h"
 +#include "sysemu/event-loop-base.h"
 +
 +typedef struct {
 +    const char *name;
 +    ptrdiff_t offset; /* field's byte offset in EventLoopBase struct */
 +} EventLoopBaseParamInfo;
 +
 +static EventLoopBaseParamInfo aio_max_batch_info = {
 +    "aio-max-batch", offsetof(EventLoopBase, aio_max_batch),
 +};
 +
 +static void event_loop_base_get_param(Object *obj, Visitor *v,
 +        const char *name, void *opaque, Error **errp)
 +{
 +    EventLoopBase *event_loop_base = EVENT_LOOP_BASE(obj);
 +    EventLoopBaseParamInfo *info = opaque;
 +    int64_t *field = (void *)event_loop_base + info->offset;
 +
 +    visit_type_int64(v, name, field, errp);
 +}
 +
 +static void event_loop_base_set_param(Object *obj, Visitor *v,
 +        const char *name, void *opaque, Error **errp)
 +{
 +    EventLoopBaseClass *bc = EVENT_LOOP_BASE_GET_CLASS(obj);
 +    EventLoopBase *base = EVENT_LOOP_BASE(obj);
 +    EventLoopBaseParamInfo *info = opaque;
 +    int64_t *field = (void *)base + info->offset;
 +    int64_t value;
 +
 +    if (!visit_type_int64(v, name, &value, errp)) {
 +        return;
 +    }
 +
 +    if (value < 0) {
 +        error_setg(errp, "%s value must be in range [0, %" PRId64 "]",
 +                   info->name, INT64_MAX);
 +        return;
 +    }
 +
 +    *field = value;
 +
 +    if (bc->update_params) {
 +        bc->update_params(base, errp);
 +    }
 +
 +    return;
 +}
 +
 +static void event_loop_base_complete(UserCreatable *uc, Error **errp)
 +{
 +    EventLoopBaseClass *bc = EVENT_LOOP_BASE_GET_CLASS(uc);
 +    EventLoopBase *base = EVENT_LOOP_BASE(uc);
 +
 +    if (bc->init) {
 +        bc->init(base, errp);
 +    }
 +}
 +
 +static void event_loop_base_class_init(ObjectClass *klass, void *class_data)
 +{
 +    UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass);
 +    ucc->complete = event_loop_base_complete;
 +
 +    object_class_property_add(klass, "aio-max-batch", "int",
 +                              event_loop_base_get_param,
 +                              event_loop_base_set_param,
 +                              NULL, &aio_max_batch_info);
 +}
 +
 +static const TypeInfo event_loop_base_info = {
 +    .name = TYPE_EVENT_LOOP_BASE,
 +    .parent = TYPE_OBJECT,
 +    .instance_size = sizeof(EventLoopBase),
 +    .class_size = sizeof(EventLoopBaseClass),
 +    .class_init = event_loop_base_class_init,
 +    .abstract = true,
 +    .interfaces = (InterfaceInfo[]) {
 +        { TYPE_USER_CREATABLE },
 +        { }
 +    }
 +};
 +
 +static void register_types(void)
 +{
 +    type_register_static(&event_loop_base_info);
 +}
 +type_init(register_types);
 diff --git a/iothread.c b/iothread.c
 index XXXXXXX..XXXXXXX 100644
 --- a/iothread.c
 +++ b/iothread.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/module.h"
  #include "block/aio.h"
  #include "block/block.h"
 +#include "sysemu/event-loop-base.h"
  #include "sysemu/iothread.h"
  #include "qapi/error.h"
  #include "qapi/qapi-commands-misc.h"
@@ -XXX,XX +XXX,XX @@ static void iothread_init_gcontext(IOThread *iothread)
      iothread->main_loop = g_main_loop_new(iothread->worker_context, TRUE);
  }
--static void qemu_vfio_dump_mapping(IOVAMapping *m)
+-static void iothread_set_aio_context_params(IOThread *iothread, Error **errp)
 +static void iothread_set_aio_context_params(EventLoopBase *base, Error **errp)
  {
 +    IOThread *iothread = IOTHREAD(base);
      ERRP_GUARD();
 +    if (!iothread->ctx) {
 +        return;
 +    }
 +
      aio_context_set_poll_params(iothread->ctx,
                                  iothread->poll_max_ns,
                                  iothread->poll_grow,
@@ -XXX,XX +XXX,XX @@ static void iothread_set_aio_context_params(IOThread *iothread, Error **errp)
      }
      aio_context_set_aio_params(iothread->ctx,
 -                               iothread->aio_max_batch,
 +                               iothread->parent_obj.aio_max_batch,
                                 errp);
  }
 -static void iothread_complete(UserCreatable *obj, Error **errp)
 +
 +static void iothread_init(EventLoopBase *base, Error **errp)
  {
      Error *local_error = NULL;
 -    IOThread *iothread = IOTHREAD(obj);
 +    IOThread *iothread = IOTHREAD(base);
      char *thread_name;
      iothread->stopping = false;
@@ -XXX,XX +XXX,XX @@ static void iothread_complete(UserCreatable *obj, Error **errp)
       */
      iothread_init_gcontext(iothread);
 -    iothread_set_aio_context_params(iothread, &local_error);
 +    iothread_set_aio_context_params(base, &local_error);
      if (local_error) {
          error_propagate(errp, local_error);
          aio_context_unref(iothread->ctx);
@@ -XXX,XX +XXX,XX @@ static void iothread_complete(UserCreatable *obj, Error **errp)
       * to inherit.
       */
      thread_name = g_strdup_printf("IO %s",
 -                        object_get_canonical_path_component(OBJECT(obj)));
 +                        object_get_canonical_path_component(OBJECT(base)));
      qemu_thread_create(&iothread->thread, thread_name, iothread_run,
                         iothread, QEMU_THREAD_JOINABLE);
      g_free(thread_name);
@@ -XXX,XX +XXX,XX @@ static IOThreadParamInfo poll_grow_info = {
  static IOThreadParamInfo poll_shrink_info = {
      "poll-shrink", offsetof(IOThread, poll_shrink),
  };
 -static IOThreadParamInfo aio_max_batch_info = {
 -    "aio-max-batch", offsetof(IOThread, aio_max_batch),
 -};
  static void iothread_get_param(Object *obj, Visitor *v,
          const char *name, IOThreadParamInfo *info, Error **errp)
@@ -XXX,XX +XXX,XX @@ static void iothread_set_poll_param(Object *obj, Visitor *v,
      }
  }
 -static void iothread_get_aio_param(Object *obj, Visitor *v,
 -        const char *name, void *opaque, Error **errp)
 -{
--    if (QEMU_VFIO_DEBUG) {
+-    IOThreadParamInfo *info = opaque;
--        printf("  vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host,
+-
--               (uint64_t)m->size, (uint64_t)m->iova);
+-    iothread_get_param(obj, v, name, info, errp);
 -}
 -
 -static void iothread_set_aio_param(Object *obj, Visitor *v,
 -        const char *name, void *opaque, Error **errp)
 -{
 -    IOThread *iothread = IOTHREAD(obj);
 -    IOThreadParamInfo *info = opaque;
 -
 -    if (!iothread_set_param(obj, v, name, info, errp)) {
 -        return;
 -    }
 -
 -    if (iothread->ctx) {
 -        aio_context_set_aio_params(iothread->ctx,
 -                                   iothread->aio_max_batch,
 -                                   errp);
 -    }
 -}
 -
- static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
+ static void iothread_class_init(ObjectClass *klass, void *class_data)
  {
--    int i;
+-    UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass);
--
+-    ucc->complete = iothread_complete;
--    if (QEMU_VFIO_DEBUG) {
++    EventLoopBaseClass *bc = EVENT_LOOP_BASE_CLASS(klass);
--        printf("vfio mappings\n");
++
--        for (i = 0; i < s->nr_mappings; ++i) {
++    bc->init = iothread_init;
--            qemu_vfio_dump_mapping(&s->mappings[i]);
++    bc->update_params = iothread_set_aio_context_params;
--        }
-+    for (int i = 0; i < s->nr_mappings; ++i) {
+     object_class_property_add(klass, "poll-max-ns", "int",
-+        trace_qemu_vfio_dump_mapping(s->mappings[i].host,
+                               iothread_get_poll_param,
-+                                     s->mappings[i].iova,
+@@ -XXX,XX +XXX,XX @@ static void iothread_class_init(ObjectClass *klass, void *class_data)
-+                                     s->mappings[i].size);
+                               iothread_get_poll_param,
-     }
+                               iothread_set_poll_param,
                                NULL, &poll_shrink_info);
 -    object_class_property_add(klass, "aio-max-batch", "int",
 -                              iothread_get_aio_param,
 -                              iothread_set_aio_param,
 -                              NULL, &aio_max_batch_info);
  }
-diff --git a/util/trace-events b/util/trace-events
+ static const TypeInfo iothread_info = {
-index XXXXXXX..XXXXXXX 100644
+     .name = TYPE_IOTHREAD,
---- a/util/trace-events
+-    .parent = TYPE_OBJECT,
-+++ b/util/trace-events
++    .parent = TYPE_EVENT_LOOP_BASE,
-@@ -XXX,XX +XXX,XX @@ qemu_mutex_unlock(void *mutex, const char *file, const int line) "released mutex
+     .class_init = iothread_class_init,
- qemu_vfio_dma_reset_temporary(void *s) "s %p"
+     .instance_size = sizeof(IOThread),
- qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
+     .instance_init = iothread_instance_init,
- qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
+     .instance_finalize = iothread_instance_finalize,
-+qemu_vfio_dump_mapping(void *host, uint64_t iova, size_t size) "vfio mapping %p to iova 0x%08" PRIx64 " size 0x%zx"
+-    .interfaces = (InterfaceInfo[]) {
- qemu_vfio_find_mapping(void *s, void *p) "s %p host %p"
+-        {TYPE_USER_CREATABLE},
- qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size 0x%zx index %d iova 0x%"PRIx64
+-        {}
- qemu_vfio_do_mapping(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64 " size 0x%zx"
+-    },
  };
  static void iothread_register_types(void)
@@ -XXX,XX +XXX,XX @@ static int query_one_iothread(Object *object, void *opaque)
      info->poll_max_ns = iothread->poll_max_ns;
      info->poll_grow = iothread->poll_grow;
      info->poll_shrink = iothread->poll_shrink;
 -    info->aio_max_batch = iothread->aio_max_batch;
 +    info->aio_max_batch = iothread->parent_obj.aio_max_batch;
      QAPI_LIST_APPEND(*tail, info);
      return 0;
 --
-.28.0
+.35.1

-[PULL 12/33] block/nvme: Make nvme_identify() return boolean indicating error
+[PULL 2/3] util/main-loop: Introduce the main loop into QOM
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
+From: Nicolas Saenz Julienne <nsaenzju@redhat.com>
-Just for consistency, following the example documented since
+'event-loop-base' provides basic property handling for all 'AioContext'
-commit e3fe3988d7 ("error: Document Error API usage rules"),
+based event loops. So let's define a new 'MainLoopClass' that inherits
-return a boolean value indicating an error is set or not.
+from it. This will permit tweaking the main loop's properties through
-Directly pass errp as the local_err is not requested in our
+qapi as well as through the command line using the '-object' keyword[1].
-case.
+Only one instance of 'MainLoopClass' may be created at any time.
-Tested-by: Eric Auger <eric.auger@redhat.com>
+'EventLoopBaseClass' learns a new callback, 'can_be_deleted()', so as to
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+mark 'MainLoop' as non-deletable.
 [1] For example:
       -object main-loop,id=main-loop,aio-max-batch=<value>
 Signed-off-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Message-id: 20201029093306.1063879-11-philmd@redhat.com
+Acked-by: Markus Armbruster <armbru@redhat.com>
 Message-id: 20220425075723.20019-3-nsaenzju@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
 ---
- block/nvme.c | 12 +++++++-----
+ qapi/qom.json                    | 13 ++++++++
-file changed, 7 insertions(+), 5 deletions(-)
+ meson.build                      |  3 +-
+ include/qemu/main-loop.h         | 10 ++++++
-diff --git a/block/nvme.c b/block/nvme.c
+ include/sysemu/event-loop-base.h |  1 +
-index XXXXXXX..XXXXXXX 100644
+ event-loop-base.c                | 13 ++++++++
---- a/block/nvme.c
+ util/main-loop.c                 | 56 ++++++++++++++++++++++++++++++++
-+++ b/block/nvme.c
+files changed, 95 insertions(+), 1 deletion(-)
-@@ -XXX,XX +XXX,XX @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
-     return ret;
+diff --git a/qapi/qom.json b/qapi/qom.json
 index XXXXXXX..XXXXXXX 100644
 --- a/qapi/qom.json
 +++ b/qapi/qom.json
@@ -XXX,XX +XXX,XX @@
              '*poll-grow': 'int',
              '*poll-shrink': 'int' } }
 +##
 +# @MainLoopProperties:
 +#
 +# Properties for the main-loop object.
 +#
 +# Since: 7.1
 +##
 +{ 'struct': 'MainLoopProperties',
 +  'base': 'EventLoopBaseProperties',
 +  'data': {} }
 +
  ##
  # @MemoryBackendProperties:
  #
@@ -XXX,XX +XXX,XX @@
      { 'name': 'input-linux',
        'if': 'CONFIG_LINUX' },
      'iothread',
 +    'main-loop',
      { 'name': 'memory-backend-epc',
        'if': 'CONFIG_LINUX' },
      'memory-backend-file',
@@ -XXX,XX +XXX,XX @@
        'input-linux':                { 'type': 'InputLinuxProperties',
                                        'if': 'CONFIG_LINUX' },
        'iothread':                   'IothreadProperties',
 +      'main-loop':                  'MainLoopProperties',
        'memory-backend-epc':         { 'type': 'MemoryBackendEpcProperties',
                                        'if': 'CONFIG_LINUX' },
        'memory-backend-file':        'MemoryBackendFileProperties',
 diff --git a/meson.build b/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/meson.build
 +++ b/meson.build
@@ -XXX,XX +XXX,XX @@ libqemuutil = static_library('qemuutil',
                               sources: util_ss.sources() + stub_ss.sources() + genh,
                               dependencies: [util_ss.dependencies(), libm, threads, glib, socket, malloc, pixman])
  qemuutil = declare_dependency(link_with: libqemuutil,
 -                              sources: genh + version_res)
 +                              sources: genh + version_res,
 +                              dependencies: [event_loop_base])
  if have_system or have_user
    decodetree = generator(find_program('scripts/decodetree.py'),
 diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/main-loop.h
 +++ b/include/qemu/main-loop.h
@@ -XXX,XX +XXX,XX @@
  #define QEMU_MAIN_LOOP_H
  #include "block/aio.h"
 +#include "qom/object.h"
 +#include "sysemu/event-loop-base.h"
  #define SIG_IPI SIGUSR1
 +#define TYPE_MAIN_LOOP  "main-loop"
 +OBJECT_DECLARE_TYPE(MainLoop, MainLoopClass, MAIN_LOOP)
 +
 +struct MainLoop {
 +    EventLoopBase parent_obj;
 +};
 +typedef struct MainLoop MainLoop;
 +
  /**
   * qemu_init_main_loop: Set up the process so that it can run the main loop.
   *
 diff --git a/include/sysemu/event-loop-base.h b/include/sysemu/event-loop-base.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/sysemu/event-loop-base.h
 +++ b/include/sysemu/event-loop-base.h
@@ -XXX,XX +XXX,XX @@ struct EventLoopBaseClass {
      void (*init)(EventLoopBase *base, Error **errp);
      void (*update_params)(EventLoopBase *base, Error **errp);
 +    bool (*can_be_deleted)(EventLoopBase *base);
  };
  struct EventLoopBase {
 diff --git a/event-loop-base.c b/event-loop-base.c
 index XXXXXXX..XXXXXXX 100644
 --- a/event-loop-base.c
 +++ b/event-loop-base.c
@@ -XXX,XX +XXX,XX @@ static void event_loop_base_complete(UserCreatable *uc, Error **errp)
      }
  }
--static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
++static bool event_loop_base_can_be_deleted(UserCreatable *uc)
-+/* Returns true on success, false on failure. */
++{
-+static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
++    EventLoopBaseClass *bc = EVENT_LOOP_BASE_GET_CLASS(uc);
 +    EventLoopBase *backend = EVENT_LOOP_BASE(uc);
 +
 +    if (bc->can_be_deleted) {
 +        return bc->can_be_deleted(backend);
 +    }
 +
 +    return true;
 +}
 +
  static void event_loop_base_class_init(ObjectClass *klass, void *class_data)
  {
-     BDRVNVMeState *s = bs->opaque;
+     UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass);
-+    bool ret = false;
+     ucc->complete = event_loop_base_complete;
-     union {
++    ucc->can_be_deleted = event_loop_base_can_be_deleted;
-         NvmeIdCtrl ctrl;
-         NvmeIdNs ns;
+     object_class_property_add(klass, "aio-max-batch", "int",
-@@ -XXX,XX +XXX,XX @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
+                               event_loop_base_get_param,
-         goto out;
+diff --git a/util/main-loop.c b/util/main-loop.c
-     }
+index XXXXXXX..XXXXXXX 100644
+--- a/util/main-loop.c
-+    ret = true;
++++ b/util/main-loop.c
-     s->blkshift = lbaf->ds;
+@@ -XXX,XX +XXX,XX @@
- out:
+ #include "qemu/error-report.h"
-     qemu_vfio_dma_unmap(s->vfio, id);
+ #include "qemu/queue.h"
-     qemu_vfree(id);
+ #include "qemu/compiler.h"
-+
++#include "qom/object.h"
-+    return ret;
  #ifndef _WIN32
  #include <sys/wait.h>
@@ -XXX,XX +XXX,XX @@ int qemu_init_main_loop(Error **errp)
      return 0;
  }
- static bool nvme_poll_queue(NVMeQueuePair *q)
++static void main_loop_update_params(EventLoopBase *base, Error **errp)
-@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
++{
-     uint64_t cap;
++    if (!qemu_aio_context) {
-     uint64_t timeout_ms;
++        error_setg(errp, "qemu aio context not ready");
-     uint64_t deadline, now;
++        return;
--    Error *local_err = NULL;
++    }
-     volatile NvmeBar *regs = NULL;
++
++    aio_context_set_aio_params(qemu_aio_context, base->aio_max_batch, errp);
-     qemu_co_mutex_init(&s->dma_map_lock);
++}
-@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
++
-                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
++MainLoop *mloop;
-                            false, nvme_handle_event, nvme_poll_cb);
++
++static void main_loop_init(EventLoopBase *base, Error **errp)
--    nvme_identify(bs, namespace, &local_err);
++{
--    if (local_err) {
++    MainLoop *m = MAIN_LOOP(base);
--        error_propagate(errp, local_err);
++
-+    if (!nvme_identify(bs, namespace, errp)) {
++    if (mloop) {
-         ret = -EIO;
++        error_setg(errp, "only one main-loop instance allowed");
-         goto out;
++        return;
-     }
++    }
 +
 +    main_loop_update_params(base, errp);
 +
 +    mloop = m;
 +    return;
 +}
 +
 +static bool main_loop_can_be_deleted(EventLoopBase *base)
 +{
 +    return false;
 +}
 +
 +static void main_loop_class_init(ObjectClass *oc, void *class_data)
 +{
 +    EventLoopBaseClass *bc = EVENT_LOOP_BASE_CLASS(oc);
 +
 +    bc->init = main_loop_init;
 +    bc->update_params = main_loop_update_params;
 +    bc->can_be_deleted = main_loop_can_be_deleted;
 +}
 +
 +static const TypeInfo main_loop_info = {
 +    .name = TYPE_MAIN_LOOP,
 +    .parent = TYPE_EVENT_LOOP_BASE,
 +    .class_init = main_loop_class_init,
 +    .instance_size = sizeof(MainLoop),
 +};
 +
 +static void main_loop_register_types(void)
 +{
 +    type_register_static(&main_loop_info);
 +}
 +
 +type_init(main_loop_register_types)
 +
  static int max_priority;
  #ifndef _WIN32
 --
-.28.0
+.35.1

-[PULL 02/33] softmmu/memory: fix memory_region_ioeventfd_equal()
+[PULL 3/3] util/event-loop-base: Introduce options to set the thread pool size
-From: Elena Afanasova <eafanasova@gmail.com>
+From: Nicolas Saenz Julienne <nsaenzju@redhat.com>
-Eventfd can be registered with a zero length when fast_mmio is true.
+The thread pool regulates itself: when idle, it kills threads until
-Handle this case properly when dispatching through QEMU.
+empty, when in demand, it creates new threads until full. This behaviour
 doesn't play well with latency sensitive workloads where the price of
 creating a new thread is too high. For example, when paired with qemu's
 '-mlock', or using safety features like SafeStack, creating a new thread
 has been measured to take multiple milliseconds.
-Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
+In order to mitigate this let's introduce a new 'EventLoopBase'
-Message-id: cf71a62eb04e61932ff8ffdd02e0b2aab4f495a0.camel@gmail.com
+property to set the thread pool size. The threads will be created during
 the pool's initialization or upon updating the property's value, remain
 available during its lifetime regardless of demand, and destroyed upon
 freeing it. A properly characterized workload will then be able to
 configure the pool to avoid any latency spikes.
 Signed-off-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Acked-by: Markus Armbruster <armbru@redhat.com>
 Message-id: 20220425075723.20019-4-nsaenzju@redhat.com
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
- softmmu/memory.c | 11 +++++++++--
+ qapi/qom.json                    | 10 +++++-
-file changed, 9 insertions(+), 2 deletions(-)
+ include/block/aio.h              | 10 ++++++
  include/block/thread-pool.h      |  3 ++
  include/sysemu/event-loop-base.h |  4 +++
  event-loop-base.c                | 23 +++++++++++++
  iothread.c                       |  3 ++
  util/aio-posix.c                 |  1 +
  util/async.c                     | 20 ++++++++++++
  util/main-loop.c                 |  9 ++++++
  util/thread-pool.c               | 55 +++++++++++++++++++++++++++++---
 files changed, 133 insertions(+), 5 deletions(-)
-diff --git a/softmmu/memory.c b/softmmu/memory.c
+diff --git a/qapi/qom.json b/qapi/qom.json
 index XXXXXXX..XXXXXXX 100644
---- a/softmmu/memory.c
+--- a/qapi/qom.json
-+++ b/softmmu/memory.c
++++ b/qapi/qom.json
-@@ -XXX,XX +XXX,XX @@ static bool memory_region_ioeventfd_before(MemoryRegionIoeventfd *a,
+@@ -XXX,XX +XXX,XX @@
- static bool memory_region_ioeventfd_equal(MemoryRegionIoeventfd *a,
+ #                 0 means that the engine will use its default.
-                                           MemoryRegionIoeventfd *b)
+ #                 (default: 0)
  #
 +# @thread-pool-min: minimum number of threads reserved in the thread pool
 +#                   (default: 0)
 +#
 +# @thread-pool-max: maximum number of threads the thread pool can contain
 +#                   (default: 64)
 +#
  # Since: 7.1
  ##
  { 'struct': 'EventLoopBaseProperties',
 -  'data': { '*aio-max-batch': 'int' } }
 +  'data': { '*aio-max-batch': 'int',
 +            '*thread-pool-min': 'int',
 +            '*thread-pool-max': 'int' } }
  ##
  # @IothreadProperties:
 diff --git a/include/block/aio.h b/include/block/aio.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/block/aio.h
 +++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ struct AioContext {
      QSLIST_HEAD(, Coroutine) scheduled_coroutines;
      QEMUBH *co_schedule_bh;
 +    int thread_pool_min;
 +    int thread_pool_max;
      /* Thread pool for performing work and receiving completion callbacks.
       * Has its own locking.
       */
@@ -XXX,XX +XXX,XX @@ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
  void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch,
                                  Error **errp);
 +/**
 + * aio_context_set_thread_pool_params:
 + * @ctx: the aio context
 + * @min: min number of threads to have readily available in the thread pool
 + * @max: max number of threads the thread pool can contain
 + */
 +void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min,
 +                                        int64_t max, Error **errp);
  #endif
 diff --git a/include/block/thread-pool.h b/include/block/thread-pool.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/block/thread-pool.h
 +++ b/include/block/thread-pool.h
@@ -XXX,XX +XXX,XX @@
  #include "block/block.h"
 +#define THREAD_POOL_MAX_THREADS_DEFAULT         64
 +
  typedef int ThreadPoolFunc(void *opaque);
  typedef struct ThreadPool ThreadPool;
@@ -XXX,XX +XXX,XX @@ BlockAIOCB *thread_pool_submit_aio(ThreadPool *pool,
  int coroutine_fn thread_pool_submit_co(ThreadPool *pool,
          ThreadPoolFunc *func, void *arg);
  void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg);
 +void thread_pool_update_params(ThreadPool *pool, struct AioContext *ctx);
  #endif
 diff --git a/include/sysemu/event-loop-base.h b/include/sysemu/event-loop-base.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/sysemu/event-loop-base.h
 +++ b/include/sysemu/event-loop-base.h
@@ -XXX,XX +XXX,XX @@ struct EventLoopBase {
      /* AioContext AIO engine parameters */
      int64_t aio_max_batch;
 +
 +    /* AioContext thread pool parameters */
 +    int64_t thread_pool_min;
 +    int64_t thread_pool_max;
  };
  #endif
 diff --git a/event-loop-base.c b/event-loop-base.c
 index XXXXXXX..XXXXXXX 100644
 --- a/event-loop-base.c
 +++ b/event-loop-base.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "qom/object_interfaces.h"
  #include "qapi/error.h"
 +#include "block/thread-pool.h"
  #include "sysemu/event-loop-base.h"
  typedef struct {
@@ -XXX,XX +XXX,XX @@ typedef struct {
      ptrdiff_t offset; /* field's byte offset in EventLoopBase struct */
  } EventLoopBaseParamInfo;
 +static void event_loop_base_instance_init(Object *obj)
 +{
 +    EventLoopBase *base = EVENT_LOOP_BASE(obj);
 +
 +    base->thread_pool_max = THREAD_POOL_MAX_THREADS_DEFAULT;
 +}
 +
  static EventLoopBaseParamInfo aio_max_batch_info = {
      "aio-max-batch", offsetof(EventLoopBase, aio_max_batch),
  };
 +static EventLoopBaseParamInfo thread_pool_min_info = {
 +    "thread-pool-min", offsetof(EventLoopBase, thread_pool_min),
 +};
 +static EventLoopBaseParamInfo thread_pool_max_info = {
 +    "thread-pool-max", offsetof(EventLoopBase, thread_pool_max),
 +};
  static void event_loop_base_get_param(Object *obj, Visitor *v,
          const char *name, void *opaque, Error **errp)
@@ -XXX,XX +XXX,XX @@ static void event_loop_base_class_init(ObjectClass *klass, void *class_data)
                                event_loop_base_get_param,
                                event_loop_base_set_param,
                                NULL, &aio_max_batch_info);
 +    object_class_property_add(klass, "thread-pool-min", "int",
 +                              event_loop_base_get_param,
 +                              event_loop_base_set_param,
 +                              NULL, &thread_pool_min_info);
 +    object_class_property_add(klass, "thread-pool-max", "int",
 +                              event_loop_base_get_param,
 +                              event_loop_base_set_param,
 +                              NULL, &thread_pool_max_info);
  }
  static const TypeInfo event_loop_base_info = {
      .name = TYPE_EVENT_LOOP_BASE,
      .parent = TYPE_OBJECT,
      .instance_size = sizeof(EventLoopBase),
 +    .instance_init = event_loop_base_instance_init,
      .class_size = sizeof(EventLoopBaseClass),
      .class_init = event_loop_base_class_init,
      .abstract = true,
 diff --git a/iothread.c b/iothread.c
 index XXXXXXX..XXXXXXX 100644
 --- a/iothread.c
 +++ b/iothread.c
@@ -XXX,XX +XXX,XX @@ static void iothread_set_aio_context_params(EventLoopBase *base, Error **errp)
      aio_context_set_aio_params(iothread->ctx,
                                 iothread->parent_obj.aio_max_batch,
                                 errp);
 +
 +    aio_context_set_thread_pool_params(iothread->ctx, base->thread_pool_min,
 +                                       base->thread_pool_max, errp);
  }
 diff --git a/util/aio-posix.c b/util/aio-posix.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/aio-posix.c
 +++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "block/block.h"
 +#include "block/thread-pool.h"
  #include "qemu/main-loop.h"
  #include "qemu/rcu.h"
  #include "qemu/rcu_queue.h"
 diff --git a/util/async.c b/util/async.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/async.c
 +++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ AioContext *aio_context_new(Error **errp)
      ctx->aio_max_batch = 0;
 +    ctx->thread_pool_min = 0;
 +    ctx->thread_pool_max = THREAD_POOL_MAX_THREADS_DEFAULT;
 +
      return ctx;
  fail:
      g_source_destroy(&ctx->source);
@@ -XXX,XX +XXX,XX @@ void qemu_set_current_aio_context(AioContext *ctx)
      assert(!get_my_aiocontext());
      set_my_aiocontext(ctx);
  }
 +
 +void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min,
 +                                        int64_t max, Error **errp)
 +{
 +
 +    if (min > max || !max || min > INT_MAX || max > INT_MAX) {
 +        error_setg(errp, "bad thread-pool-min/thread-pool-max values");
 +        return;
 +    }
 +
 +    ctx->thread_pool_min = min;
 +    ctx->thread_pool_max = max;
 +
 +    if (ctx->thread_pool) {
 +        thread_pool_update_params(ctx->thread_pool, ctx);
 +    }
 +}
 diff --git a/util/main-loop.c b/util/main-loop.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/main-loop.c
 +++ b/util/main-loop.c
@@ -XXX,XX +XXX,XX @@
  #include "sysemu/replay.h"
  #include "qemu/main-loop.h"
  #include "block/aio.h"
 +#include "block/thread-pool.h"
  #include "qemu/error-report.h"
  #include "qemu/queue.h"
  #include "qemu/compiler.h"
@@ -XXX,XX +XXX,XX @@ int qemu_init_main_loop(Error **errp)
  static void main_loop_update_params(EventLoopBase *base, Error **errp)
  {
--    return !memory_region_ioeventfd_before(a, b)
++    ERRP_GUARD();
--        && !memory_region_ioeventfd_before(b, a);
++
-+    if (int128_eq(a->addr.start, b->addr.start) &&
+     if (!qemu_aio_context) {
-+        (!int128_nz(a->addr.size) || !int128_nz(b->addr.size) ||
+         error_setg(errp, "qemu aio context not ready");
-+         (int128_eq(a->addr.size, b->addr.size) &&
+         return;
-+          (a->match_data == b->match_data) &&
+     }
-+          ((a->match_data && (a->data == b->data)) || !a->match_data) &&
-+          (a->e == b->e))))
+     aio_context_set_aio_params(qemu_aio_context, base->aio_max_batch, errp);
-+        return true;
++    if (*errp) {
 +        return;
 +    }
 +
 +    aio_context_set_thread_pool_params(qemu_aio_context, base->thread_pool_min,
 +                                       base->thread_pool_max, errp);
  }
  MainLoop *mloop;
 diff --git a/util/thread-pool.c b/util/thread-pool.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/thread-pool.c
 +++ b/util/thread-pool.c
@@ -XXX,XX +XXX,XX @@ struct ThreadPool {
      QemuMutex lock;
      QemuCond worker_stopped;
      QemuSemaphore sem;
 -    int max_threads;
      QEMUBH *new_thread_bh;
      /* The following variables are only accessed from one AioContext. */
@@ -XXX,XX +XXX,XX @@ struct ThreadPool {
      int new_threads;     /* backlog of threads we need to create */
      int pending_threads; /* threads created but not running yet */
      bool stopping;
 +    int min_threads;
 +    int max_threads;
  };
 +static inline bool back_to_sleep(ThreadPool *pool, int ret)
 +{
 +    /*
 +     * The semaphore timed out, we should exit the loop except when:
 +     *  - There is work to do, we raced with the signal.
 +     *  - The max threads threshold just changed, we raced with the signal.
 +     *  - The thread pool forces a minimum number of readily available threads.
 +     */
 +    if (ret == -1 && (!QTAILQ_EMPTY(&pool->request_list) ||
 +            pool->cur_threads > pool->max_threads ||
 +            pool->cur_threads <= pool->min_threads)) {
 +            return true;
 +    }
 +
 +    return false;
- }
++}
++
- /* Range of memory in the global map.  Addresses are absolute. */
+ static void *worker_thread(void *opaque)
  {
      ThreadPool *pool = opaque;
@@ -XXX,XX +XXX,XX @@ static void *worker_thread(void *opaque)
              ret = qemu_sem_timedwait(&pool->sem, 10000);
              qemu_mutex_lock(&pool->lock);
              pool->idle_threads--;
 -        } while (ret == -1 && !QTAILQ_EMPTY(&pool->request_list));
 -        if (ret == -1 || pool->stopping) {
 +        } while (back_to_sleep(pool, ret));
 +        if (ret == -1 || pool->stopping ||
 +            pool->cur_threads > pool->max_threads) {
              break;
          }
@@ -XXX,XX +XXX,XX @@ void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg)
      thread_pool_submit_aio(pool, func, arg, NULL, NULL);
  }
 +void thread_pool_update_params(ThreadPool *pool, AioContext *ctx)
 +{
 +    qemu_mutex_lock(&pool->lock);
 +
 +    pool->min_threads = ctx->thread_pool_min;
 +    pool->max_threads = ctx->thread_pool_max;
 +
 +    /*
 +     * We either have to:
 +     *  - Increase the number of available threads until over the min_threads
 +     *    threshold.
 +     *  - Decrease the number of available threads until under the max_threads
 +     *    threshold.
 +     *  - Do nothing. The current number of threads falls in between the min and
 +     *    max thresholds. We'll let the pool manage itself.
 +     */
 +    for (int i = pool->cur_threads; i < pool->min_threads; i++) {
 +        spawn_thread(pool);
 +    }
 +
 +    for (int i = pool->cur_threads; i > pool->max_threads; i--) {
 +        qemu_sem_post(&pool->sem);
 +    }
 +
 +    qemu_mutex_unlock(&pool->lock);
 +}
 +
  static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx)
  {
      if (!ctx) {
@@ -XXX,XX +XXX,XX @@ static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx)
      qemu_mutex_init(&pool->lock);
      qemu_cond_init(&pool->worker_stopped);
      qemu_sem_init(&pool->sem, 0);
 -    pool->max_threads = 64;
      pool->new_thread_bh = aio_bh_new(ctx, spawn_thread_bh_fn, pool);
      QLIST_INIT(&pool->head);
      QTAILQ_INIT(&pool->request_list);
 +
 +    thread_pool_update_params(pool, ctx);
  }
  ThreadPool *thread_pool_new(AioContext *ctx)
 --
-.28.0
+.35.1

-[PULL 03/33] MAINTAINERS: Cover "block/nvme.h" file
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-The "block/nvme.h" header is shared by both the NVMe block
-driver and the NVMe emulated device. Add the 'F:' entry on
-both sections, so all maintainers/reviewers are notified
-when it is changed.
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
-Message-Id: <20200701140634.25994-1-philmd@redhat.com>
----
- MAINTAINERS | 2 ++
-file changed, 2 insertions(+)
-diff --git a/MAINTAINERS b/MAINTAINERS
-index XXXXXXX..XXXXXXX 100644
---- a/MAINTAINERS
-+++ b/MAINTAINERS
-@@ -XXX,XX +XXX,XX @@ M: Klaus Jensen <its@irrelevant.dk>
- L: qemu-block@nongnu.org
- S: Supported
- F: hw/block/nvme*
-+F: include/block/nvme.h
- F: tests/qtest/nvme-test.c
- F: docs/specs/nvme.txt
- T: git git://git.infradead.org/qemu-nvme.git nvme-next
-@@ -XXX,XX +XXX,XX @@ R: Fam Zheng <fam@euphon.net>
- L: qemu-block@nongnu.org
- S: Supported
- F: block/nvme*
-+F: include/block/nvme.h
- T: git https://github.com/stefanha/qemu.git block
- Bootdevice
---
-.28.0

-[PULL 04/33] block/nvme: Use hex format to display offset in trace events
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-Use the same format used for the hw/vfio/ trace events.
-Suggested-by: Eric Auger <eric.auger@redhat.com>
-Reviewed-by: Eric Auger <eric.auger@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201029093306.1063879-3-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/trace-events | 12 ++++++------
-file changed, 6 insertions(+), 6 deletions(-)
-diff --git a/block/trace-events b/block/trace-events
-index XXXXXXX..XXXXXXX 100644
---- a/block/trace-events
-+++ b/block/trace-events
-@@ -XXX,XX +XXX,XX @@ nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
- nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
- nvme_handle_event(void *s) "s %p"
- nvme_poll_cb(void *s) "s %p"
--nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset %"PRId64" bytes %"PRId64" flags %d niov %d"
--nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset %"PRId64" bytes %"PRId64" flags %d"
-+nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" flags %d niov %d"
-+nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset 0x%"PRIx64" bytes %"PRId64" flags %d"
- nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
--nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
--nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
--nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset %"PRId64" bytes %"PRId64""
--nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset %"PRId64" bytes %"PRId64" ret %d"
-+nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset 0x%"PRIx64" bytes %"PRId64" niov %d is_write %d"
-+nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" ret %d"
-+nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset 0x%"PRIx64" bytes %"PRId64""
-+nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d"
- nvme_dma_map_flush(void *s) "s %p"
- nvme_free_req_queue_wait(void *q) "q %p"
- nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
---
-.28.0

-[PULL 05/33] block/nvme: Report warning with warn_report()
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-Instead of displaying warning on stderr, use warn_report()
-which also displays it on the monitor.
-Reviewed-by: Eric Auger <eric.auger@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201029093306.1063879-4-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c | 4 ++--
-file changed, 2 insertions(+), 2 deletions(-)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q)
-         }
-         cid = le16_to_cpu(c->cid);
-         if (cid == 0 || cid > NVME_QUEUE_SIZE) {
--            fprintf(stderr, "Unexpected CID in completion queue: %" PRIu32 "\n",
--                    cid);
-+            warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", "
-+                        "queue size: %u", cid, NVME_QUEUE_SIZE);
-             continue;
-         }
-         trace_nvme_complete_command(s, q->index, cid);
---
-.28.0

-[PULL 06/33] block/nvme: Trace controller capabilities
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-Controllers have different capabilities and report them in the
-CAP register. We are particularly interested by the page size
-limits.
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Reviewed-by: Eric Auger <eric.auger@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201029093306.1063879-5-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c       | 13 +++++++++++++
- block/trace-events |  2 ++
-files changed, 15 insertions(+)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
-      * Initialization". */
-     cap = le64_to_cpu(regs->cap);
-+    trace_nvme_controller_capability_raw(cap);
-+    trace_nvme_controller_capability("Maximum Queue Entries Supported",
-+                                     1 + NVME_CAP_MQES(cap));
-+    trace_nvme_controller_capability("Contiguous Queues Required",
-+                                     NVME_CAP_CQR(cap));
-+    trace_nvme_controller_capability("Doorbell Stride",
-+                                     2 << (2 + NVME_CAP_DSTRD(cap)));
-+    trace_nvme_controller_capability("Subsystem Reset Supported",
-+                                     NVME_CAP_NSSRS(cap));
-+    trace_nvme_controller_capability("Memory Page Size Minimum",
-+                                     1 << (12 + NVME_CAP_MPSMIN(cap)));
-+    trace_nvme_controller_capability("Memory Page Size Maximum",
-+                                     1 << (12 + NVME_CAP_MPSMAX(cap)));
-     if (!NVME_CAP_CSS(cap)) {
-         error_setg(errp, "Device doesn't support NVMe command set");
-         ret = -EINVAL;
-diff --git a/block/trace-events b/block/trace-events
-index XXXXXXX..XXXXXXX 100644
---- a/block/trace-events
-+++ b/block/trace-events
-@@ -XXX,XX +XXX,XX @@ qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t
- qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
- # nvme.c
-+nvme_controller_capability_raw(uint64_t value) "0x%08"PRIx64
-+nvme_controller_capability(const char *desc, uint64_t value) "%s: %"PRIu64
- nvme_kick(void *s, int queue) "s %p queue %d"
- nvme_dma_flush_queue_wait(void *s) "s %p"
- nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
---
-.28.0

-[PULL 07/33] block/nvme: Trace nvme_poll_queue() per queue
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-As we want to enable multiple queues, report the event
-in each nvme_poll_queue() call, rather than once in
-the callback calling nvme_poll_queues().
-Reviewed-by: Eric Auger <eric.auger@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201029093306.1063879-6-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c       | 2 +-
- block/trace-events | 2 +-
-files changed, 2 insertions(+), 2 deletions(-)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queue(NVMeQueuePair *q)
-     const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
-     NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
-+    trace_nvme_poll_queue(q->s, q->index);
-     /*
-      * Do an early check for completions. q->lock isn't needed because
-      * nvme_process_completion() only runs in the event loop thread and
-@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_cb(void *opaque)
-     BDRVNVMeState *s = container_of(e, BDRVNVMeState,
-                                     irq_notifier[MSIX_SHARED_IRQ_IDX]);
--    trace_nvme_poll_cb(s);
-     return nvme_poll_queues(s);
- }
-diff --git a/block/trace-events b/block/trace-events
-index XXXXXXX..XXXXXXX 100644
---- a/block/trace-events
-+++ b/block/trace-events
-@@ -XXX,XX +XXX,XX @@ nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d"
- nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
- nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
- nvme_handle_event(void *s) "s %p"
--nvme_poll_cb(void *s) "s %p"
-+nvme_poll_queue(void *s, unsigned q_index) "s %p q #%u"
- nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" flags %d niov %d"
- nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset 0x%"PRIx64" bytes %"PRId64" flags %d"
- nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
---
-.28.0

-[PULL 08/33] block/nvme: Improve nvme_free_req_queue_wait() trace information
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-What we want to trace is the block driver state and the queue index.
-Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
-Reviewed-by: Eric Auger <eric.auger@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201029093306.1063879-7-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c       | 2 +-
- block/trace-events | 2 +-
-files changed, 2 insertions(+), 2 deletions(-)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
-     while (q->free_req_head == -1) {
-         if (qemu_in_coroutine()) {
--            trace_nvme_free_req_queue_wait(q);
-+            trace_nvme_free_req_queue_wait(q->s, q->index);
-             qemu_co_queue_wait(&q->free_req_queue, &q->lock);
-         } else {
-             qemu_mutex_unlock(&q->lock);
-diff --git a/block/trace-events b/block/trace-events
-index XXXXXXX..XXXXXXX 100644
---- a/block/trace-events
-+++ b/block/trace-events
-@@ -XXX,XX +XXX,XX @@ nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s
- nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset 0x%"PRIx64" bytes %"PRId64""
- nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d"
- nvme_dma_map_flush(void *s) "s %p"
--nvme_free_req_queue_wait(void *q) "q %p"
-+nvme_free_req_queue_wait(void *s, unsigned q_index) "s %p q #%u"
- nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
- nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64
- nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d"
---
-.28.0

-[PULL 09/33] block/nvme: Trace queue pair creation/deletion
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-Reviewed-by: Eric Auger <eric.auger@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201029093306.1063879-8-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c       | 3 +++
- block/trace-events | 2 ++
-files changed, 5 insertions(+)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
- static void nvme_free_queue_pair(NVMeQueuePair *q)
- {
-+    trace_nvme_free_queue_pair(q->index, q);
-     if (q->completion_bh) {
-         qemu_bh_delete(q->completion_bh);
-     }
-@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
-     if (!q) {
-         return NULL;
-     }
-+    trace_nvme_create_queue_pair(idx, q, size, aio_context,
-+                                 event_notifier_get_fd(s->irq_notifier));
-     q->prp_list_pages = qemu_try_memalign(s->page_size,
-                                           s->page_size * NVME_NUM_REQS);
-     if (!q->prp_list_pages) {
-diff --git a/block/trace-events b/block/trace-events
-index XXXXXXX..XXXXXXX 100644
---- a/block/trace-events
-+++ b/block/trace-events
-@@ -XXX,XX +XXX,XX @@ nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset 0x%"PRIx64" byte
- nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d"
- nvme_dma_map_flush(void *s) "s %p"
- nvme_free_req_queue_wait(void *s, unsigned q_index) "s %p q #%u"
-+nvme_create_queue_pair(unsigned q_index, void *q, unsigned size, void *aio_context, int fd) "index %u q %p size %u aioctx %p fd %d"
-+nvme_free_queue_pair(unsigned q_index, void *q) "index %u q %p"
- nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
- nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64
- nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d"
---
-.28.0

-[PULL 10/33] block/nvme: Move definitions before structure declarations
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-To be able to use some definitions in structure declarations,
-move them earlier. No logical change.
-Reviewed-by: Eric Auger <eric.auger@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201029093306.1063879-9-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c | 19 ++++++++++---------
-file changed, 10 insertions(+), 9 deletions(-)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@
- typedef struct BDRVNVMeState BDRVNVMeState;
-+/* Same index is used for queues and IRQs */
-+#define INDEX_ADMIN     0
-+#define INDEX_IO(n)     (1 + n)
-+
-+/* This driver shares a single MSIX IRQ for the admin and I/O queues */
-+enum {
-+    MSIX_SHARED_IRQ_IDX = 0,
-+    MSIX_IRQ_COUNT = 1
-+};
-+
- typedef struct {
-     int32_t  head, tail;
-     uint8_t  *queue;
-@@ -XXX,XX +XXX,XX @@ typedef struct {
-     QEMUBH      *completion_bh;
- } NVMeQueuePair;
--#define INDEX_ADMIN     0
--#define INDEX_IO(n)     (1 + n)
--
--/* This driver shares a single MSIX IRQ for the admin and I/O queues */
--enum {
--    MSIX_SHARED_IRQ_IDX = 0,
--    MSIX_IRQ_COUNT = 1
--};
--
- struct BDRVNVMeState {
-     AioContext *aio_context;
-     QEMUVFIOState *vfio;
---
-.28.0

-[PULL 11/33] block/nvme: Use unsigned integer for queue counter/size
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-We cannot have a negative queue count/size/index, so use an unsigned type.
-Rename 'nr_queues' as 'queue_count' to match the spec naming.
-Reviewed-by: Eric Auger <eric.auger@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201029093306.1063879-10-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c       | 38 ++++++++++++++++++--------------------
- block/trace-events | 10 +++++-----
-files changed, 23 insertions(+), 25 deletions(-)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ struct BDRVNVMeState {
-      * [1..]: io queues.
-      */
-     NVMeQueuePair **queues;
--    int nr_queues;
-+    unsigned queue_count;
-     size_t page_size;
-     /* How many uint32_t elements does each doorbell entry take. */
-     size_t doorbell_scale;
-@@ -XXX,XX +XXX,XX @@ static QemuOptsList runtime_opts = {
- };
- static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
--                            int nentries, int entry_bytes, Error **errp)
-+                            unsigned nentries, size_t entry_bytes, Error **errp)
- {
-     size_t bytes;
-     int r;
-@@ -XXX,XX +XXX,XX @@ static void nvme_free_req_queue_cb(void *opaque)
- static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
-                                              AioContext *aio_context,
--                                             int idx, int size,
-+                                             unsigned idx, size_t size,
-                                              Error **errp)
- {
-     int i, r;
-@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queues(BDRVNVMeState *s)
-     bool progress = false;
-     int i;
--    for (i = 0; i < s->nr_queues; i++) {
-+    for (i = 0; i < s->queue_count; i++) {
-         if (nvme_poll_queue(s->queues[i])) {
-             progress = true;
-         }
-@@ -XXX,XX +XXX,XX @@ static void nvme_handle_event(EventNotifier *n)
- static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
- {
-     BDRVNVMeState *s = bs->opaque;
--    int n = s->nr_queues;
-+    unsigned n = s->queue_count;
-     NVMeQueuePair *q;
-     NvmeCmd cmd;
--    int queue_size = NVME_QUEUE_SIZE;
-+    unsigned queue_size = NVME_QUEUE_SIZE;
-     q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
-                                n, queue_size, errp);
-@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
-         .cdw11 = cpu_to_le32(0x3),
-     };
-     if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
--        error_setg(errp, "Failed to create CQ io queue [%d]", n);
-+        error_setg(errp, "Failed to create CQ io queue [%u]", n);
-         goto out_error;
-     }
-     cmd = (NvmeCmd) {
-@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
-         .cdw11 = cpu_to_le32(0x1 | (n << 16)),
-     };
-     if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
--        error_setg(errp, "Failed to create SQ io queue [%d]", n);
-+        error_setg(errp, "Failed to create SQ io queue [%u]", n);
-         goto out_error;
-     }
-     s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
-     s->queues[n] = q;
--    s->nr_queues++;
-+    s->queue_count++;
-     return true;
- out_error:
-     nvme_free_queue_pair(q);
-@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
-         ret = -EINVAL;
-         goto out;
-     }
--    s->nr_queues = 1;
-+    s->queue_count = 1;
-     QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
-     regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << AQA_ACQS_SHIFT) |
-                             (NVME_QUEUE_SIZE << AQA_ASQS_SHIFT));
-@@ -XXX,XX +XXX,XX @@ static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
- static void nvme_close(BlockDriverState *bs)
- {
--    int i;
-     BDRVNVMeState *s = bs->opaque;
--    for (i = 0; i < s->nr_queues; ++i) {
-+    for (unsigned i = 0; i < s->queue_count; ++i) {
-         nvme_free_queue_pair(s->queues[i]);
-     }
-     g_free(s->queues);
-@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
-     };
-     trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
--    assert(s->nr_queues > 1);
-+    assert(s->queue_count > 1);
-     req = nvme_get_free_req(ioq);
-     assert(req);
-@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
-         .ret = -EINPROGRESS,
-     };
--    assert(s->nr_queues > 1);
-+    assert(s->queue_count > 1);
-     req = nvme_get_free_req(ioq);
-     assert(req);
-     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
-@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
-     cmd.cdw12 = cpu_to_le32(cdw12);
-     trace_nvme_write_zeroes(s, offset, bytes, flags);
--    assert(s->nr_queues > 1);
-+    assert(s->queue_count > 1);
-     req = nvme_get_free_req(ioq);
-     assert(req);
-@@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
-         return -ENOTSUP;
-     }
--    assert(s->nr_queues > 1);
-+    assert(s->queue_count > 1);
-     buf = qemu_try_memalign(s->page_size, s->page_size);
-     if (!buf) {
-@@ -XXX,XX +XXX,XX @@ static void nvme_detach_aio_context(BlockDriverState *bs)
- {
-     BDRVNVMeState *s = bs->opaque;
--    for (int i = 0; i < s->nr_queues; i++) {
-+    for (unsigned i = 0; i < s->queue_count; i++) {
-         NVMeQueuePair *q = s->queues[i];
-         qemu_bh_delete(q->completion_bh);
-@@ -XXX,XX +XXX,XX @@ static void nvme_attach_aio_context(BlockDriverState *bs,
-     aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
-                            false, nvme_handle_event, nvme_poll_cb);
--    for (int i = 0; i < s->nr_queues; i++) {
-+    for (unsigned i = 0; i < s->queue_count; i++) {
-         NVMeQueuePair *q = s->queues[i];
-         q->completion_bh =
-@@ -XXX,XX +XXX,XX @@ static void nvme_aio_plug(BlockDriverState *bs)
- static void nvme_aio_unplug(BlockDriverState *bs)
- {
--    int i;
-     BDRVNVMeState *s = bs->opaque;
-     assert(s->plugged);
-     s->plugged = false;
--    for (i = INDEX_IO(0); i < s->nr_queues; i++) {
-+    for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
-         NVMeQueuePair *q = s->queues[i];
-         qemu_mutex_lock(&q->lock);
-         nvme_kick(q);
-diff --git a/block/trace-events b/block/trace-events
-index XXXXXXX..XXXXXXX 100644
---- a/block/trace-events
-+++ b/block/trace-events
-@@ -XXX,XX +XXX,XX @@ qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s
- # nvme.c
- nvme_controller_capability_raw(uint64_t value) "0x%08"PRIx64
- nvme_controller_capability(const char *desc, uint64_t value) "%s: %"PRIu64
--nvme_kick(void *s, int queue) "s %p queue %d"
-+nvme_kick(void *s, unsigned q_index) "s %p q #%u"
- nvme_dma_flush_queue_wait(void *s) "s %p"
- nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
--nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d"
--nvme_process_completion_queue_plugged(void *s, int index) "s %p queue %d"
--nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d"
--nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
-+nvme_process_completion(void *s, unsigned q_index, int inflight) "s %p q #%u inflight %d"
-+nvme_process_completion_queue_plugged(void *s, unsigned q_index) "s %p q #%u"
-+nvme_complete_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
-+nvme_submit_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
- nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
- nvme_handle_event(void *s) "s %p"
- nvme_poll_queue(void *s, unsigned q_index) "s %p q #%u"
---
-.28.0

-[PULL 13/33] block/nvme: Make nvme_init_queue() return boolean indicating error
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-Just for consistency, following the example documented since
-commit e3fe3988d7 ("error: Document Error API usage rules"),
-return a boolean value indicating an error is set or not.
-Directly pass errp, as local_err is not needed in our
-case. This simplifies nvme_create_queue_pair() a bit.
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201029093306.1063879-12-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c | 16 +++++++---------
-file changed, 7 insertions(+), 9 deletions(-)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static QemuOptsList runtime_opts = {
-     },
- };
--static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
-+/* Returns true on success, false on failure. */
-+static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
-                             unsigned nentries, size_t entry_bytes, Error **errp)
- {
-     size_t bytes;
-@@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
-     q->queue = qemu_try_memalign(s->page_size, bytes);
-     if (!q->queue) {
-         error_setg(errp, "Cannot allocate queue");
--        return;
-+        return false;
-     }
-     memset(q->queue, 0, bytes);
-     r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova);
-     if (r) {
-         error_setg(errp, "Cannot map queue");
-+        return false;
-     }
-+    return true;
- }
- static void nvme_free_queue_pair(NVMeQueuePair *q)
-@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
-                                              Error **errp)
- {
-     int i, r;
--    Error *local_err = NULL;
-     NVMeQueuePair *q;
-     uint64_t prp_list_iova;
-@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
-         req->prp_list_iova = prp_list_iova + i * s->page_size;
-     }
--    nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
--    if (local_err) {
--        error_propagate(errp, local_err);
-+    if (!nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, errp)) {
-         goto fail;
-     }
-     q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail;
--    nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
--    if (local_err) {
--        error_propagate(errp, local_err);
-+    if (!nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, errp)) {
-         goto fail;
-     }
-     q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head;
---
-.28.0

-[PULL 14/33] block/nvme: Introduce Completion Queue definitions
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-Rename Submission Queue flags with 'Sq' to differentiate
-submission queue flags from completion queue flags, and introduce
-Completion Queue flag definitions.
-Reviewed-by: Eric Auger <eric.auger@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Message-id: 20201029093306.1063879-13-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- include/block/nvme.h | 18 ++++++++++++------
-file changed, 12 insertions(+), 6 deletions(-)
-diff --git a/include/block/nvme.h b/include/block/nvme.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/block/nvme.h
-+++ b/include/block/nvme.h
-@@ -XXX,XX +XXX,XX @@ typedef struct QEMU_PACKED NvmeCreateCq {
- #define NVME_CQ_FLAGS_PC(cq_flags)  (cq_flags & 0x1)
- #define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1)
-+enum NvmeFlagsCq {
-+    NVME_CQ_PC          = 1,
-+    NVME_CQ_IEN         = 2,
-+};
-+
- typedef struct QEMU_PACKED NvmeCreateSq {
-     uint8_t     opcode;
-     uint8_t     flags;
-@@ -XXX,XX +XXX,XX @@ typedef struct QEMU_PACKED NvmeCreateSq {
- #define NVME_SQ_FLAGS_PC(sq_flags)      (sq_flags & 0x1)
- #define NVME_SQ_FLAGS_QPRIO(sq_flags)   ((sq_flags >> 1) & 0x3)
--enum NvmeQueueFlags {
--    NVME_Q_PC           = 1,
--    NVME_Q_PRIO_URGENT  = 0,
--    NVME_Q_PRIO_HIGH    = 1,
--    NVME_Q_PRIO_NORMAL  = 2,
--    NVME_Q_PRIO_LOW     = 3,
-+enum NvmeFlagsSq {
-+    NVME_SQ_PC          = 1,
-+
-+    NVME_SQ_PRIO_URGENT = 0,
-+    NVME_SQ_PRIO_HIGH   = 1,
-+    NVME_SQ_PRIO_NORMAL = 2,
-+    NVME_SQ_PRIO_LOW    = 3,
- };
- typedef struct QEMU_PACKED NvmeIdentify {
---
-.28.0

-[PULL 15/33] block/nvme: Use definitions instead of magic values in add_io_queue()
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-Replace magic values with definitions, and simplify since the
-number of queues will never reach 64K.
-Reviewed-by: Eric Auger <eric.auger@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201029093306.1063879-14-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c | 9 +++++----
-file changed, 5 insertions(+), 4 deletions(-)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
-     NvmeCmd cmd;
-     unsigned queue_size = NVME_QUEUE_SIZE;
-+    assert(n <= UINT16_MAX);
-     q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
-                                n, queue_size, errp);
-     if (!q) {
-@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
-     cmd = (NvmeCmd) {
-         .opcode = NVME_ADM_CMD_CREATE_CQ,
-         .dptr.prp1 = cpu_to_le64(q->cq.iova),
--        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
--        .cdw11 = cpu_to_le32(0x3),
-+        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
-+        .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC),
-     };
-     if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
-         error_setg(errp, "Failed to create CQ io queue [%u]", n);
-@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
-     cmd = (NvmeCmd) {
-         .opcode = NVME_ADM_CMD_CREATE_SQ,
-         .dptr.prp1 = cpu_to_le64(q->sq.iova),
--        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
--        .cdw11 = cpu_to_le32(0x1 | (n << 16)),
-+        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
-+        .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)),
-     };
-     if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
-         error_setg(errp, "Failed to create SQ io queue [%u]", n);
---
-.28.0

-[PULL 16/33] block/nvme: Correctly initialize Admin Queue Attributes
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-From the specification chapter 3.1.8 "AQA - Admin Queue Attributes"
-the Admin Submission Queue Size field is a 0’s based value:
-  Admin Submission Queue Size (ASQS):
-    Defines the size of the Admin Submission Queue in entries.
-    Enabling a controller while this field is cleared to 00h
-    produces undefined results. The minimum size of the Admin
-    Submission Queue is two entries. The maximum size of the
-    Admin Submission Queue is 4096 entries.
-    This is a 0’s based value.
-This bug has never been hit because the device initialization
-uses a single command synchronously :)
-Reviewed-by: Eric Auger <eric.auger@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201029093306.1063879-15-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c | 6 +++---
-file changed, 3 insertions(+), 3 deletions(-)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
-         goto out;
-     }
-     s->queue_count = 1;
--    QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
--    regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << AQA_ACQS_SHIFT) |
--                            (NVME_QUEUE_SIZE << AQA_ASQS_SHIFT));
-+    QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000);
-+    regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) |
-+                            ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT));
-     regs->asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova);
-     regs->acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova);
---
-.28.0

-[PULL 17/33] block/nvme: Simplify ADMIN queue access
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-We don't need to dereference from BDRVNVMeState each time.
-Use an NVMeQueuePair pointer to the admin queue.
-The nvme_init() becomes easier to review, matching the style
-of nvme_add_io_queue().
-Reviewed-by: Eric Auger <eric.auger@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201029093306.1063879-16-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c | 12 ++++++------
-file changed, 6 insertions(+), 6 deletions(-)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
-                      Error **errp)
- {
-     BDRVNVMeState *s = bs->opaque;
-+    NVMeQueuePair *q;
-     AioContext *aio_context = bdrv_get_aio_context(bs);
-     int ret;
-     uint64_t cap;
-@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
-     /* Set up admin queue. */
-     s->queues = g_new(NVMeQueuePair *, 1);
--    s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context, 0,
--                                                          NVME_QUEUE_SIZE,
--                                                          errp);
--    if (!s->queues[INDEX_ADMIN]) {
-+    q = nvme_create_queue_pair(s, aio_context, 0, NVME_QUEUE_SIZE, errp);
-+    if (!q) {
-         ret = -EINVAL;
-         goto out;
-     }
-+    s->queues[INDEX_ADMIN] = q;
-     s->queue_count = 1;
-     QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000);
-     regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) |
-                             ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT));
--    regs->asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova);
--    regs->acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova);
-+    regs->asq = cpu_to_le64(q->sq.iova);
-+    regs->acq = cpu_to_le64(q->cq.iova);
-     /* After setting up all control registers we can enable device now. */
-     regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) |
---
-.28.0

-[PULL 18/33] block/nvme: Simplify nvme_cmd_sync()
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-As all commands use the ADMIN queue, it is pointless to pass
-it as an argument each time. Remove the argument, and rename the
-function to nvme_admin_cmd_sync() to make this new behavior
-clearer.
-Reviewed-by: Eric Auger <eric.auger@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Message-id: 20201029093306.1063879-17-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c | 19 ++++++++++---------
-file changed, 10 insertions(+), 9 deletions(-)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
-     qemu_mutex_unlock(&q->lock);
- }
--static void nvme_cmd_sync_cb(void *opaque, int ret)
-+static void nvme_admin_cmd_sync_cb(void *opaque, int ret)
- {
-     int *pret = opaque;
-     *pret = ret;
-     aio_wait_kick();
- }
--static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
--                         NvmeCmd *cmd)
-+static int nvme_admin_cmd_sync(BlockDriverState *bs, NvmeCmd *cmd)
- {
-+    BDRVNVMeState *s = bs->opaque;
-+    NVMeQueuePair *q = s->queues[INDEX_ADMIN];
-     AioContext *aio_context = bdrv_get_aio_context(bs);
-     NVMeRequest *req;
-     int ret = -EINPROGRESS;
-@@ -XXX,XX +XXX,XX @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
-     if (!req) {
-         return -EBUSY;
-     }
--    nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret);
-+    nvme_submit_command(q, req, cmd, nvme_admin_cmd_sync_cb, &ret);
-     AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS);
-     return ret;
-@@ -XXX,XX +XXX,XX @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
-     memset(id, 0, sizeof(*id));
-     cmd.dptr.prp1 = cpu_to_le64(iova);
--    if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
-+    if (nvme_admin_cmd_sync(bs, &cmd)) {
-         error_setg(errp, "Failed to identify controller");
-         goto out;
-     }
-@@ -XXX,XX +XXX,XX @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
-     memset(id, 0, sizeof(*id));
-     cmd.cdw10 = 0;
-     cmd.nsid = cpu_to_le32(namespace);
--    if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
-+    if (nvme_admin_cmd_sync(bs, &cmd)) {
-         error_setg(errp, "Failed to identify namespace");
-         goto out;
-     }
-@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
-         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
-         .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC),
-     };
--    if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
-+    if (nvme_admin_cmd_sync(bs, &cmd)) {
-         error_setg(errp, "Failed to create CQ io queue [%u]", n);
-         goto out_error;
-     }
-@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
-         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
-         .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)),
-     };
--    if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
-+    if (nvme_admin_cmd_sync(bs, &cmd)) {
-         error_setg(errp, "Failed to create SQ io queue [%u]", n);
-         goto out_error;
-     }
-@@ -XXX,XX +XXX,XX @@ static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
-         .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
-     };
--    ret = nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd);
-+    ret = nvme_admin_cmd_sync(bs, &cmd);
-     if (ret) {
-         error_setg(errp, "Failed to configure NVMe write cache");
-     }
---
-.28.0

-[PULL 19/33] block/nvme: Set request_alignment at initialization
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-Commit bdd6a90a9e5 ("block: Add VFIO based NVMe driver")
-sets the request_alignment in nvme_refresh_limits().
-For consistency, also set it during initialization.
-Reported-by: Stefan Hajnoczi <stefanha@redhat.com>
-Reviewed-by: Eric Auger <eric.auger@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201029093306.1063879-18-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c | 1 +
-file changed, 1 insertion(+)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
-     s->page_size = MAX(4096, 1 << NVME_CAP_MPSMIN(cap));
-     s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t);
-     bs->bl.opt_mem_alignment = s->page_size;
-+    bs->bl.request_alignment = s->page_size;
-     timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000);
-     /* Reset device to get a clean state. */
---
-.28.0

-[PULL 20/33] block/nvme: Correct minimum device page size
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-While trying to simplify the code using a macro, we forgot
-the 12-bit shift... Correct that.
-Fixes: fad1eb68862 ("block/nvme: Use register definitions from 'block/nvme.h'")
-Reported-by: Eric Auger <eric.auger@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Reviewed-by: Eric Auger <eric.auger@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201029093306.1063879-19-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c | 2 +-
-file changed, 1 insertion(+), 1 deletion(-)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
-         goto out;
-     }
--    s->page_size = MAX(4096, 1 << NVME_CAP_MPSMIN(cap));
-+    s->page_size = 1u << (12 + NVME_CAP_MPSMIN(cap));
-     s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t);
-     bs->bl.opt_mem_alignment = s->page_size;
-     bs->bl.request_alignment = s->page_size;
---
-.28.0

-[PULL 21/33] block/nvme: Change size and alignment of IDENTIFY response buffer
+Deleted patch
-From: Eric Auger <eric.auger@redhat.com>
-In preparation for 64kB host page support, let's change the size
-and alignment of the IDENTIFY command response buffer so that
-the VFIO DMA MAP succeeds. We align on the host page size.
-Signed-off-by: Eric Auger <eric.auger@redhat.com>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201029093306.1063879-20-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c | 9 +++++----
-file changed, 5 insertions(+), 4 deletions(-)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
-         .opcode = NVME_ADM_CMD_IDENTIFY,
-         .cdw10 = cpu_to_le32(0x1),
-     };
-+    size_t id_size = QEMU_ALIGN_UP(sizeof(*id), qemu_real_host_page_size);
--    id = qemu_try_memalign(s->page_size, sizeof(*id));
-+    id = qemu_try_memalign(qemu_real_host_page_size, id_size);
-     if (!id) {
-         error_setg(errp, "Cannot allocate buffer for identify response");
-         goto out;
-     }
--    r = qemu_vfio_dma_map(s->vfio, id, sizeof(*id), true, &iova);
-+    r = qemu_vfio_dma_map(s->vfio, id, id_size, true, &iova);
-     if (r) {
-         error_setg(errp, "Cannot map buffer for DMA");
-         goto out;
-     }
--    memset(id, 0, sizeof(*id));
-+    memset(id, 0, id_size);
-     cmd.dptr.prp1 = cpu_to_le64(iova);
-     if (nvme_admin_cmd_sync(bs, &cmd)) {
-         error_setg(errp, "Failed to identify controller");
-@@ -XXX,XX +XXX,XX @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
-     s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES);
-     s->supports_discard = !!(oncs & NVME_ONCS_DSM);
--    memset(id, 0, sizeof(*id));
-+    memset(id, 0, id_size);
-     cmd.cdw10 = 0;
-     cmd.nsid = cpu_to_le32(namespace);
-     if (nvme_admin_cmd_sync(bs, &cmd)) {
---
-.28.0

-[PULL 22/33] block/nvme: Change size and alignment of queue
+Deleted patch
-From: Eric Auger <eric.auger@redhat.com>
-In preparation for 64kB host page support, let's change the size
-and alignment of the queue so that the VFIO DMA MAP succeeds.
-We align on the host page size.
-Signed-off-by: Eric Auger <eric.auger@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201029093306.1063879-21-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c | 4 ++--
-file changed, 2 insertions(+), 2 deletions(-)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
-     size_t bytes;
-     int r;
--    bytes = ROUND_UP(nentries * entry_bytes, s->page_size);
-+    bytes = ROUND_UP(nentries * entry_bytes, qemu_real_host_page_size);
-     q->head = q->tail = 0;
--    q->queue = qemu_try_memalign(s->page_size, bytes);
-+    q->queue = qemu_try_memalign(qemu_real_host_page_size, bytes);
-     if (!q->queue) {
-         error_setg(errp, "Cannot allocate queue");
-         return false;
---
-.28.0

-[PULL 23/33] block/nvme: Change size and alignment of prp_list_pages
+Deleted patch
-From: Eric Auger <eric.auger@redhat.com>
-In preparation for 64kB host page support, let's change the size
-and alignment of the prp_list_pages so that the VFIO DMA MAP succeeds
-with 64kB host page size. We align on the host page size.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Signed-off-by: Eric Auger <eric.auger@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201029093306.1063879-22-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c | 11 ++++++-----
-file changed, 6 insertions(+), 5 deletions(-)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
-     int i, r;
-     NVMeQueuePair *q;
-     uint64_t prp_list_iova;
-+    size_t bytes;
-     q = g_try_new0(NVMeQueuePair, 1);
-     if (!q) {
-@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
-     }
-     trace_nvme_create_queue_pair(idx, q, size, aio_context,
-                                  event_notifier_get_fd(s->irq_notifier));
--    q->prp_list_pages = qemu_try_memalign(s->page_size,
--                                          s->page_size * NVME_NUM_REQS);
-+    bytes = QEMU_ALIGN_UP(s->page_size * NVME_NUM_REQS,
-+                          qemu_real_host_page_size);
-+    q->prp_list_pages = qemu_try_memalign(qemu_real_host_page_size, bytes);
-     if (!q->prp_list_pages) {
-         goto fail;
-     }
--    memset(q->prp_list_pages, 0, s->page_size * NVME_NUM_REQS);
-+    memset(q->prp_list_pages, 0, bytes);
-     qemu_mutex_init(&q->lock);
-     q->s = s;
-     q->index = idx;
-     qemu_co_queue_init(&q->free_req_queue);
-     q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q);
--    r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
--                          s->page_size * NVME_NUM_REQS,
-+    r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, bytes,
-                           false, &prp_list_iova);
-     if (r) {
-         goto fail;
---
-.28.0

-[PULL 24/33] block/nvme: Align iov's va and size on host page size
+Deleted patch
-From: Eric Auger <eric.auger@redhat.com>
-Make sure iov's va and size are properly aligned on the
-host page size.
-Signed-off-by: Eric Auger <eric.auger@redhat.com>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201029093306.1063879-23-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c | 14 ++++++++------
-file changed, 8 insertions(+), 6 deletions(-)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
-     for (i = 0; i < qiov->niov; ++i) {
-         bool retry = true;
-         uint64_t iova;
-+        size_t len = QEMU_ALIGN_UP(qiov->iov[i].iov_len,
-+                                   qemu_real_host_page_size);
- try_map:
-         r = qemu_vfio_dma_map(s->vfio,
-                               qiov->iov[i].iov_base,
--                              qiov->iov[i].iov_len,
--                              true, &iova);
-+                              len, true, &iova);
-         if (r == -ENOMEM && retry) {
-             retry = false;
-             trace_nvme_dma_flush_queue_wait(s);
-@@ -XXX,XX +XXX,XX @@ static inline bool nvme_qiov_aligned(BlockDriverState *bs,
-     BDRVNVMeState *s = bs->opaque;
-     for (i = 0; i < qiov->niov; ++i) {
--        if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, s->page_size) ||
--            !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, s->page_size)) {
-+        if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base,
-+                                 qemu_real_host_page_size) ||
-+            !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, qemu_real_host_page_size)) {
-             trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
-                                       qiov->iov[i].iov_len, s->page_size);
-             return false;
-@@ -XXX,XX +XXX,XX @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-     int r;
-     uint8_t *buf = NULL;
-     QEMUIOVector local_qiov;
--
-+    size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size);
-     assert(QEMU_IS_ALIGNED(offset, s->page_size));
-     assert(QEMU_IS_ALIGNED(bytes, s->page_size));
-     assert(bytes <= s->max_transfer);
-@@ -XXX,XX +XXX,XX @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
-     }
-     s->stats.unaligned_accesses++;
-     trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
--    buf = qemu_try_memalign(s->page_size, bytes);
-+    buf = qemu_try_memalign(qemu_real_host_page_size, len);
-     if (!buf) {
-         return -ENOMEM;
---
-.28.0

-[PULL 25/33] block/nvme: Fix use of write-only doorbells page on Aarch64 arch
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-qemu_vfio_pci_map_bar() calls mmap(), and mmap(2) states:
-  'offset' must be a multiple of the page size as returned
-   by sysconf(_SC_PAGE_SIZE).
-In commit f68453237b9 we started to use an offset of 4K which
-broke this contract on Aarch64 arch.
-Fix by mapping at offset 0, and accessing doorbells at offset=4K.
-Fixes: f68453237b9 ("block/nvme: Map doorbells pages write-only")
-Reported-by: Eric Auger <eric.auger@redhat.com>
-Reviewed-by: Eric Auger <eric.auger@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201029093306.1063879-24-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c | 11 +++++++----
-file changed, 7 insertions(+), 4 deletions(-)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ typedef struct {
- struct BDRVNVMeState {
-     AioContext *aio_context;
-     QEMUVFIOState *vfio;
-+    void *bar0_wo_map;
-     /* Memory mapped registers */
-     volatile struct {
-         uint32_t sq_tail;
-@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
-         }
-     }
--    s->doorbells = qemu_vfio_pci_map_bar(s->vfio, 0, sizeof(NvmeBar),
--                                         NVME_DOORBELL_SIZE, PROT_WRITE, errp);
-+    s->bar0_wo_map = qemu_vfio_pci_map_bar(s->vfio, 0, 0,
-+                                           sizeof(NvmeBar) + NVME_DOORBELL_SIZE,
-+                                           PROT_WRITE, errp);
-+    s->doorbells = (void *)((uintptr_t)s->bar0_wo_map + sizeof(NvmeBar));
-     if (!s->doorbells) {
-         ret = -EINVAL;
-         goto out;
-@@ -XXX,XX +XXX,XX @@ static void nvme_close(BlockDriverState *bs)
-                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
-                            false, NULL, NULL);
-     event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
--    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->doorbells,
--                            sizeof(NvmeBar), NVME_DOORBELL_SIZE);
-+    qemu_vfio_pci_unmap_bar(s->vfio, 0, s->bar0_wo_map,
-+                            0, sizeof(NvmeBar) + NVME_DOORBELL_SIZE);
-     qemu_vfio_close(s->vfio);
-     g_free(s->device);
---
-.28.0

-[PULL 26/33] block/nvme: Fix nvme_submit_command() on big-endian host
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-The Completion Queue Command Identifier is a 16-bit value,
-so nvme_submit_command() is unlikely to work on big-endian
-hosts, as the relevant bits are truncated.
-Fix by using the correct byte-swap function.
-Fixes: bdd6a90a9e5 ("block: Add VFIO based NVMe driver")
-Reported-by: Keith Busch <kbusch@kernel.org>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Message-id: 20201029093306.1063879-25-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- block/nvme.c | 2 +-
-file changed, 1 insertion(+), 1 deletion(-)
-diff --git a/block/nvme.c b/block/nvme.c
-index XXXXXXX..XXXXXXX 100644
---- a/block/nvme.c
-+++ b/block/nvme.c
-@@ -XXX,XX +XXX,XX @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
-     assert(!req->cb);
-     req->cb = cb;
-     req->opaque = opaque;
--    cmd->cid = cpu_to_le32(req->cid);
-+    cmd->cid = cpu_to_le16(req->cid);
-     trace_nvme_submit_command(q->s, q->index, req->cid);
-     nvme_trace_command(cmd);
---
-.28.0

-[PULL 27/33] util/vfio-helpers: Improve reporting unsupported IOMMU type
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-Change the confusing "VFIO IOMMU check failed" error message to
-the explicit "VFIO IOMMU Type1 is not supported" one.
-Example on POWER:
- $ qemu-system-ppc64 -drive if=none,id=nvme0,file=nvme://0001:01:00.0/1,format=raw
- qemu-system-ppc64: -drive if=none,id=nvme0,file=nvme://0001:01:00.0/1,format=raw: VFIO IOMMU Type1 is not supported
-Suggested-by: Alex Williamson <alex.williamson@redhat.com>
-Reviewed-by: Fam Zheng <fam@euphon.net>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201103020733.2303148-2-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- util/vfio-helpers.c | 2 +-
-file changed, 1 insertion(+), 1 deletion(-)
-diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
-index XXXXXXX..XXXXXXX 100644
---- a/util/vfio-helpers.c
-+++ b/util/vfio-helpers.c
-@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
-     }
-     if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
--        error_setg_errno(errp, errno, "VFIO IOMMU check failed");
-+        error_setg_errno(errp, errno, "VFIO IOMMU Type1 is not supported");
-         ret = -EINVAL;
-         goto fail_container;
-     }
---
-.28.0

-[PULL 28/33] util/vfio-helpers: Trace PCI I/O config accesses
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-We sometimes get kernel panics with some devices on Aarch64
-hosts. Alex Williamson suggests it might be a broken PCIe
-root complex. Add trace events to record the latest I/O
-access before crashing. Just in case, assert our accesses are
-aligned.
-Reviewed-by: Fam Zheng <fam@euphon.net>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201103020733.2303148-3-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- util/vfio-helpers.c | 8 ++++++++
- util/trace-events   | 2 ++
-files changed, 10 insertions(+)
-diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
-index XXXXXXX..XXXXXXX 100644
---- a/util/vfio-helpers.c
-+++ b/util/vfio-helpers.c
-@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
- {
-     int ret;
-+    trace_qemu_vfio_pci_read_config(buf, ofs, size,
-+                                    s->config_region_info.offset,
-+                                    s->config_region_info.size);
-+    assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
-     do {
-         ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
-     } while (ret == -1 && errno == EINTR);
-@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
- {
-     int ret;
-+    trace_qemu_vfio_pci_write_config(buf, ofs, size,
-+                                     s->config_region_info.offset,
-+                                     s->config_region_info.size);
-+    assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
-     do {
-         ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
-     } while (ret == -1 && errno == EINTR);
-diff --git a/util/trace-events b/util/trace-events
-index XXXXXXX..XXXXXXX 100644
---- a/util/trace-events
-+++ b/util/trace-events
-@@ -XXX,XX +XXX,XX @@ qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova
- qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %p host %p size 0x%zx iova 0x%"PRIx64
- qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d iova %p"
- qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
-+qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
-+qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
---
-.28.0

-[PULL 29/33] util/vfio-helpers: Trace PCI BAR region info
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-For debugging purposes, trace BAR region info.
-Reviewed-by: Fam Zheng <fam@euphon.net>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201103020733.2303148-4-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- util/vfio-helpers.c | 8 ++++++++
- util/trace-events   | 1 +
-files changed, 9 insertions(+)
-diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
-index XXXXXXX..XXXXXXX 100644
---- a/util/vfio-helpers.c
-+++ b/util/vfio-helpers.c
-@@ -XXX,XX +XXX,XX @@ static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
- static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
- {
-+    g_autofree char *barname = NULL;
-     assert_bar_index_valid(s, index);
-     s->bar_region_info[index] = (struct vfio_region_info) {
-         .index = VFIO_PCI_BAR0_REGION_INDEX + index,
-@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
-         error_setg_errno(errp, errno, "Failed to get BAR region info");
-         return -errno;
-     }
-+    barname = g_strdup_printf("bar[%d]", index);
-+    trace_qemu_vfio_region_info(barname, s->bar_region_info[index].offset,
-+                                s->bar_region_info[index].size,
-+                                s->bar_region_info[index].cap_offset);
-     return 0;
- }
-@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
-         ret = -errno;
-         goto fail;
-     }
-+    trace_qemu_vfio_region_info("config", s->config_region_info.offset,
-+                                s->config_region_info.size,
-+                                s->config_region_info.cap_offset);
-     for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
-         ret = qemu_vfio_pci_init_bar(s, i, errp);
-diff --git a/util/trace-events b/util/trace-events
-index XXXXXXX..XXXXXXX 100644
---- a/util/trace-events
-+++ b/util/trace-events
-@@ -XXX,XX +XXX,XX @@ qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *io
- qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
- qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
- qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
-+qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32
---
-.28.0

-[PULL 30/33] util/vfio-helpers: Trace where BARs are mapped
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-For debugging purposes, trace where a BAR is mapped.
-Reviewed-by: Fam Zheng <fam@euphon.net>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201103020733.2303148-5-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- util/vfio-helpers.c | 2 ++
- util/trace-events   | 1 +
-files changed, 3 insertions(+)
-diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
-index XXXXXXX..XXXXXXX 100644
---- a/util/vfio-helpers.c
-+++ b/util/vfio-helpers.c
-@@ -XXX,XX +XXX,XX @@ void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
-     p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
-              prot, MAP_SHARED,
-              s->device, s->bar_region_info[index].offset + offset);
-+    trace_qemu_vfio_pci_map_bar(index, s->bar_region_info[index].offset ,
-+                                size, offset, p);
-     if (p == MAP_FAILED) {
-         error_setg_errno(errp, errno, "Failed to map BAR region");
-         p = NULL;
-diff --git a/util/trace-events b/util/trace-events
-index XXXXXXX..XXXXXXX 100644
---- a/util/trace-events
-+++ b/util/trace-events
-@@ -XXX,XX +XXX,XX @@ qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
- qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
- qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
- qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32
-+qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"
---
-.28.0

-[PULL 31/33] util/vfio-helpers: Improve DMA trace events
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-For debugging purposes, trace where DMA regions are mapped.
-Reviewed-by: Fam Zheng <fam@euphon.net>
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201103020733.2303148-6-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- util/vfio-helpers.c | 3 ++-
- util/trace-events   | 5 +++--
-files changed, 5 insertions(+), 3 deletions(-)
-diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
-index XXXXXXX..XXXXXXX 100644
---- a/util/vfio-helpers.c
-+++ b/util/vfio-helpers.c
-@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
-         .vaddr = (uintptr_t)host,
-         .size = size,
-     };
--    trace_qemu_vfio_do_mapping(s, host, size, iova);
-+    trace_qemu_vfio_do_mapping(s, host, iova, size);
-     if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
-         error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
-@@ -XXX,XX +XXX,XX @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
-             }
-         }
-     }
-+    trace_qemu_vfio_dma_mapped(s, host, iova0, size);
-     if (iova) {
-         *iova = iova0;
-     }
-diff --git a/util/trace-events b/util/trace-events
-index XXXXXXX..XXXXXXX 100644
---- a/util/trace-events
-+++ b/util/trace-events
-@@ -XXX,XX +XXX,XX @@ qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%z
- qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
- qemu_vfio_find_mapping(void *s, void *p) "s %p host %p"
- qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size 0x%zx index %d iova 0x%"PRIx64
--qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %p host %p size 0x%zx iova 0x%"PRIx64
--qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d iova %p"
-+qemu_vfio_do_mapping(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64 " size 0x%zx"
-+qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d &iova %p"
-+qemu_vfio_dma_mapped(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64" size 0x%zx"
- qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
- qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
- qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
---
-.28.0

-[PULL 33/33] util/vfio-helpers: Assert offset is aligned to page size
+Deleted patch
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
-mmap(2) states:
-  'offset' must be a multiple of the page size as returned
-   by sysconf(_SC_PAGE_SIZE).
-Add an assertion to be sure we don't break this contract.
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Message-id: 20201103020733.2303148-8-philmd@redhat.com
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Eric Auger <eric.auger@redhat.com>
----
- util/vfio-helpers.c | 1 +
-file changed, 1 insertion(+)
-diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
-index XXXXXXX..XXXXXXX 100644
---- a/util/vfio-helpers.c
-+++ b/util/vfio-helpers.c
-@@ -XXX,XX +XXX,XX @@ void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
-                             Error **errp)
- {
-     void *p;
-+    assert(QEMU_IS_ALIGNED(offset, qemu_real_host_page_size));
-     assert_bar_index_valid(s, index);
-     p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
-              prot, MAP_SHARED,
---
-.28.0

The following changes since commit 8507c9d5c9a62de2a0e281b640f995e26eac46af:

Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging (2020-11-03 15:59:44 +0000)

are available in the Git repository at:

https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to fc107d86840b3364e922c26cf7631b7fd38ce523:

util/vfio-helpers: Assert offset is aligned to page size (2020-11-03 19:06:23 +0000)

----------------------------------------------------------------
Pull request for 5.2

NVMe fixes to solve IOMMU issues on non-x86 and error message/tracing
improvements. Elena Afanasova's ioeventfd fixes are also included.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>

----------------------------------------------------------------

Elena Afanasova (2):
  accel/kvm: add PIO ioeventfds only in case kvm_eventfds_allowed is
    true
  softmmu/memory: fix memory_region_ioeventfd_equal()

Eric Auger (4):
  block/nvme: Change size and alignment of IDENTIFY response buffer
  block/nvme: Change size and alignment of queue
  block/nvme: Change size and alignment of prp_list_pages
  block/nvme: Align iov's va and size on host page size

Philippe Mathieu-Daudé (27):
  MAINTAINERS: Cover "block/nvme.h" file
  block/nvme: Use hex format to display offset in trace events
  block/nvme: Report warning with warn_report()
  block/nvme: Trace controller capabilities
  block/nvme: Trace nvme_poll_queue() per queue
  block/nvme: Improve nvme_free_req_queue_wait() trace information
  block/nvme: Trace queue pair creation/deletion
  block/nvme: Move definitions before structure declarations
  block/nvme: Use unsigned integer for queue counter/size
  block/nvme: Make nvme_identify() return boolean indicating error
  block/nvme: Make nvme_init_queue() return boolean indicating error
  block/nvme: Introduce Completion Queue definitions
  block/nvme: Use definitions instead of magic values in add_io_queue()
  block/nvme: Correctly initialize Admin Queue Attributes
  block/nvme: Simplify ADMIN queue access
  block/nvme: Simplify nvme_cmd_sync()
  block/nvme: Set request_alignment at initialization
  block/nvme: Correct minimum device page size
  block/nvme: Fix use of write-only doorbells page on Aarch64 arch
  block/nvme: Fix nvme_submit_command() on big-endian host
  util/vfio-helpers: Improve reporting unsupported IOMMU type
  util/vfio-helpers: Trace PCI I/O config accesses
  util/vfio-helpers: Trace PCI BAR region info
  util/vfio-helpers: Trace where BARs are mapped
  util/vfio-helpers: Improve DMA trace events
  util/vfio-helpers: Convert vfio_dump_mapping to trace events
  util/vfio-helpers: Assert offset is aligned to page size

-- 
2.28.0

From: Elena Afanasova <eafanasova@gmail.com>

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
Message-Id: <20201017210102.26036-1-eafanasova@gmail.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 accel/kvm/kvm-all.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -XXX,XX +XXX,XX @@ static int kvm_init(MachineState *ms)
 
     kvm_memory_listener_register(s, &s->memory_listener,
                                  &address_space_memory, 0);
-    memory_listener_register(&kvm_io_listener,
-                             &address_space_io);
+    if (kvm_eventfds_allowed) {
+        memory_listener_register(&kvm_io_listener,
+                                 &address_space_io);
+    }
     memory_listener_register(&kvm_coalesced_pio_listener,
                              &address_space_io);
 
-- 
2.28.0

From: Elena Afanasova <eafanasova@gmail.com>

Eventfd can be registered with a zero length when fast_mmio is true.
Handle this case properly when dispatching through QEMU.

Signed-off-by: Elena Afanasova <eafanasova@gmail.com>
Message-id: cf71a62eb04e61932ff8ffdd02e0b2aab4f495a0.camel@gmail.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 softmmu/memory.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/softmmu/memory.c b/softmmu/memory.c
index XXXXXXX..XXXXXXX 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -XXX,XX +XXX,XX @@ static bool memory_region_ioeventfd_before(MemoryRegionIoeventfd *a,
 static bool memory_region_ioeventfd_equal(MemoryRegionIoeventfd *a,
                                           MemoryRegionIoeventfd *b)
 {
-    return !memory_region_ioeventfd_before(a, b)
-        && !memory_region_ioeventfd_before(b, a);
+    if (int128_eq(a->addr.start, b->addr.start) &&
+        (!int128_nz(a->addr.size) || !int128_nz(b->addr.size) ||
+         (int128_eq(a->addr.size, b->addr.size) &&
+          (a->match_data == b->match_data) &&
+          ((a->match_data && (a->data == b->data)) || !a->match_data) &&
+          (a->e == b->e))))
+        return true;
+
+    return false;
 }
 
 /* Range of memory in the global map.  Addresses are absolute. */
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

The "block/nvme.h" header is shared by both the NVMe block
driver and the NVMe emulated device. Add the 'F:' entry on
both sections, so all maintainers/reviewers are notified
when it is changed.

Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Message-Id: <20200701140634.25994-1-philmd@redhat.com>
---
 MAINTAINERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ M: Klaus Jensen <its@irrelevant.dk>
 L: qemu-block@nongnu.org
 S: Supported
 F: hw/block/nvme*
+F: include/block/nvme.h
 F: tests/qtest/nvme-test.c
 F: docs/specs/nvme.txt
 T: git git://git.infradead.org/qemu-nvme.git nvme-next
@@ -XXX,XX +XXX,XX @@ R: Fam Zheng <fam@euphon.net>
 L: qemu-block@nongnu.org
 S: Supported
 F: block/nvme*
+F: include/block/nvme.h
 T: git https://github.com/stefanha/qemu.git block
 
 Bootdevice
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Use the same format used for the hw/vfio/ trace events.

Suggested-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-3-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/trace-events | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
 nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
 nvme_handle_event(void *s) "s %p"
 nvme_poll_cb(void *s) "s %p"
-nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset %"PRId64" bytes %"PRId64" flags %d niov %d"
-nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset %"PRId64" bytes %"PRId64" flags %d"
+nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" flags %d niov %d"
+nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset 0x%"PRIx64" bytes %"PRId64" flags %d"
 nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
-nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
-nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
-nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset %"PRId64" bytes %"PRId64""
-nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset %"PRId64" bytes %"PRId64" ret %d"
+nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset 0x%"PRIx64" bytes %"PRId64" niov %d is_write %d"
+nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" ret %d"
+nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset 0x%"PRIx64" bytes %"PRId64""
+nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d"
 nvme_dma_map_flush(void *s) "s %p"
 nvme_free_req_queue_wait(void *q) "q %p"
 nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Instead of displaying warning on stderr, use warn_report()
which also displays it on the monitor.

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-4-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q)
         }
         cid = le16_to_cpu(c->cid);
         if (cid == 0 || cid > NVME_QUEUE_SIZE) {
-            fprintf(stderr, "Unexpected CID in completion queue: %" PRIu32 "\n",
-                    cid);
+            warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", "
+                        "queue size: %u", cid, NVME_QUEUE_SIZE);
             continue;
         }
         trace_nvme_complete_command(s, q->index, cid);
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Controllers have different capabilities and report them in the
CAP register. We are particularly interested in the page size
limits.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-5-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c       | 13 +++++++++++++
 block/trace-events |  2 ++
 2 files changed, 15 insertions(+)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
      * Initialization". */
 
     cap = le64_to_cpu(regs->cap);
+    trace_nvme_controller_capability_raw(cap);
+    trace_nvme_controller_capability("Maximum Queue Entries Supported",
+                                     1 + NVME_CAP_MQES(cap));
+    trace_nvme_controller_capability("Contiguous Queues Required",
+                                     NVME_CAP_CQR(cap));
+    trace_nvme_controller_capability("Doorbell Stride",
+                                     2 << (2 + NVME_CAP_DSTRD(cap)));
+    trace_nvme_controller_capability("Subsystem Reset Supported",
+                                     NVME_CAP_NSSRS(cap));
+    trace_nvme_controller_capability("Memory Page Size Minimum",
+                                     1 << (12 + NVME_CAP_MPSMIN(cap)));
+    trace_nvme_controller_capability("Memory Page Size Maximum",
+                                     1 << (12 + NVME_CAP_MPSMAX(cap)));
     if (!NVME_CAP_CSS(cap)) {
         error_setg(errp, "Device doesn't support NVMe command set");
         ret = -EINVAL;
diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t
 qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
 
 # nvme.c
+nvme_controller_capability_raw(uint64_t value) "0x%08"PRIx64
+nvme_controller_capability(const char *desc, uint64_t value) "%s: %"PRIu64
 nvme_kick(void *s, int queue) "s %p queue %d"
 nvme_dma_flush_queue_wait(void *s) "s %p"
 nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

As we want to enable multiple queues, report the event
in each nvme_poll_queue() call, rather than once in
the callback calling nvme_poll_queues().

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-6-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c       | 2 +-
 block/trace-events | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queue(NVMeQueuePair *q)
     const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
     NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
 
+    trace_nvme_poll_queue(q->s, q->index);
     /*
      * Do an early check for completions. q->lock isn't needed because
      * nvme_process_completion() only runs in the event loop thread and
@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_cb(void *opaque)
     BDRVNVMeState *s = container_of(e, BDRVNVMeState,
                                     irq_notifier[MSIX_SHARED_IRQ_IDX]);
 
-    trace_nvme_poll_cb(s);
     return nvme_poll_queues(s);
 }
 
diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d"
 nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
 nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
 nvme_handle_event(void *s) "s %p"
-nvme_poll_cb(void *s) "s %p"
+nvme_poll_queue(void *s, unsigned q_index) "s %p q #%u"
 nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" flags %d niov %d"
 nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset 0x%"PRIx64" bytes %"PRId64" flags %d"
 nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

What we want to trace is the block driver state and the queue index.

Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-7-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c       | 2 +-
 block/trace-events | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
 
     while (q->free_req_head == -1) {
         if (qemu_in_coroutine()) {
-            trace_nvme_free_req_queue_wait(q);
+            trace_nvme_free_req_queue_wait(q->s, q->index);
             qemu_co_queue_wait(&q->free_req_queue, &q->lock);
         } else {
             qemu_mutex_unlock(&q->lock);
diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s
 nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset 0x%"PRIx64" bytes %"PRId64""
 nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d"
 nvme_dma_map_flush(void *s) "s %p"
-nvme_free_req_queue_wait(void *q) "q %p"
+nvme_free_req_queue_wait(void *s, unsigned q_index) "s %p q #%u"
 nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
 nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64
 nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d"
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-8-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c       | 3 +++
 block/trace-events | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
 
 static void nvme_free_queue_pair(NVMeQueuePair *q)
 {
+    trace_nvme_free_queue_pair(q->index, q);
     if (q->completion_bh) {
         qemu_bh_delete(q->completion_bh);
     }
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
     if (!q) {
         return NULL;
     }
+    trace_nvme_create_queue_pair(idx, q, size, aio_context,
+                                 event_notifier_get_fd(s->irq_notifier));
     q->prp_list_pages = qemu_try_memalign(s->page_size,
                                           s->page_size * NVME_NUM_REQS);
     if (!q->prp_list_pages) {
diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset 0x%"PRIx64" byte
 nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d"
 nvme_dma_map_flush(void *s) "s %p"
 nvme_free_req_queue_wait(void *s, unsigned q_index) "s %p q #%u"
+nvme_create_queue_pair(unsigned q_index, void *q, unsigned size, void *aio_context, int fd) "index %u q %p size %u aioctx %p fd %d"
+nvme_free_queue_pair(unsigned q_index, void *q) "index %u q %p"
 nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
 nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64
 nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d"
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

To be able to use some definitions in structure declarations,
move them earlier. No logical change.

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-9-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@
 
 typedef struct BDRVNVMeState BDRVNVMeState;
 
+/* Same index is used for queues and IRQs */
+#define INDEX_ADMIN     0
+#define INDEX_IO(n)     (1 + n)
+
+/* This driver shares a single MSIX IRQ for the admin and I/O queues */
+enum {
+    MSIX_SHARED_IRQ_IDX = 0,
+    MSIX_IRQ_COUNT = 1
+};
+
 typedef struct {
     int32_t  head, tail;
     uint8_t  *queue;
@@ -XXX,XX +XXX,XX @@ typedef struct {
     QEMUBH      *completion_bh;
 } NVMeQueuePair;
 
-#define INDEX_ADMIN     0
-#define INDEX_IO(n)     (1 + n)
-
-/* This driver shares a single MSIX IRQ for the admin and I/O queues */
-enum {
-    MSIX_SHARED_IRQ_IDX = 0,
-    MSIX_IRQ_COUNT = 1
-};
-
 struct BDRVNVMeState {
     AioContext *aio_context;
     QEMUVFIOState *vfio;
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

We cannot have a negative queue count/size/index, so use unsigned types.
Rename 'nr_queues' to 'queue_count' to match the spec naming.

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-10-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c       | 38 ++++++++++++++++++--------------------
 block/trace-events | 10 +++++-----
 2 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ struct BDRVNVMeState {
      * [1..]: io queues.
      */
     NVMeQueuePair **queues;
-    int nr_queues;
+    unsigned queue_count;
     size_t page_size;
     /* How many uint32_t elements does each doorbell entry take. */
     size_t doorbell_scale;
@@ -XXX,XX +XXX,XX @@ static QemuOptsList runtime_opts = {
 };
 
 static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
-                            int nentries, int entry_bytes, Error **errp)
+                            unsigned nentries, size_t entry_bytes, Error **errp)
 {
     size_t bytes;
     int r;
@@ -XXX,XX +XXX,XX @@ static void nvme_free_req_queue_cb(void *opaque)
 
 static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
                                              AioContext *aio_context,
-                                             int idx, int size,
+                                             unsigned idx, size_t size,
                                              Error **errp)
 {
     int i, r;
@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queues(BDRVNVMeState *s)
     bool progress = false;
     int i;
 
-    for (i = 0; i < s->nr_queues; i++) {
+    for (i = 0; i < s->queue_count; i++) {
         if (nvme_poll_queue(s->queues[i])) {
             progress = true;
         }
@@ -XXX,XX +XXX,XX @@ static void nvme_handle_event(EventNotifier *n)
 static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
 {
     BDRVNVMeState *s = bs->opaque;
-    int n = s->nr_queues;
+    unsigned n = s->queue_count;
     NVMeQueuePair *q;
     NvmeCmd cmd;
-    int queue_size = NVME_QUEUE_SIZE;
+    unsigned queue_size = NVME_QUEUE_SIZE;
 
     q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
                                n, queue_size, errp);
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
         .cdw11 = cpu_to_le32(0x3),
     };
     if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
-        error_setg(errp, "Failed to create CQ io queue [%d]", n);
+        error_setg(errp, "Failed to create CQ io queue [%u]", n);
         goto out_error;
     }
     cmd = (NvmeCmd) {
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
         .cdw11 = cpu_to_le32(0x1 | (n << 16)),
     };
     if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
-        error_setg(errp, "Failed to create SQ io queue [%d]", n);
+        error_setg(errp, "Failed to create SQ io queue [%u]", n);
         goto out_error;
     }
     s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
     s->queues[n] = q;
-    s->nr_queues++;
+    s->queue_count++;
     return true;
 out_error:
     nvme_free_queue_pair(q);
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
         ret = -EINVAL;
         goto out;
     }
-    s->nr_queues = 1;
+    s->queue_count = 1;
     QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
     regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << AQA_ACQS_SHIFT) |
                             (NVME_QUEUE_SIZE << AQA_ASQS_SHIFT));
@@ -XXX,XX +XXX,XX @@ static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
 
 static void nvme_close(BlockDriverState *bs)
 {
-    int i;
     BDRVNVMeState *s = bs->opaque;
 
-    for (i = 0; i < s->nr_queues; ++i) {
+    for (unsigned i = 0; i < s->queue_count; ++i) {
         nvme_free_queue_pair(s->queues[i]);
     }
     g_free(s->queues);
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
     };
 
     trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
-    assert(s->nr_queues > 1);
+    assert(s->queue_count > 1);
     req = nvme_get_free_req(ioq);
     assert(req);
 
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
         .ret = -EINPROGRESS,
     };
 
-    assert(s->nr_queues > 1);
+    assert(s->queue_count > 1);
     req = nvme_get_free_req(ioq);
     assert(req);
     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
     cmd.cdw12 = cpu_to_le32(cdw12);
 
     trace_nvme_write_zeroes(s, offset, bytes, flags);
-    assert(s->nr_queues > 1);
+    assert(s->queue_count > 1);
     req = nvme_get_free_req(ioq);
     assert(req);
 
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
         return -ENOTSUP;
     }
 
-    assert(s->nr_queues > 1);
+    assert(s->queue_count > 1);
 
     buf = qemu_try_memalign(s->page_size, s->page_size);
     if (!buf) {
@@ -XXX,XX +XXX,XX @@ static void nvme_detach_aio_context(BlockDriverState *bs)
 {
     BDRVNVMeState *s = bs->opaque;
 
-    for (int i = 0; i < s->nr_queues; i++) {
+    for (unsigned i = 0; i < s->queue_count; i++) {
         NVMeQueuePair *q = s->queues[i];
 
         qemu_bh_delete(q->completion_bh);
@@ -XXX,XX +XXX,XX @@ static void nvme_attach_aio_context(BlockDriverState *bs,
     aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
                            false, nvme_handle_event, nvme_poll_cb);
 
-    for (int i = 0; i < s->nr_queues; i++) {
+    for (unsigned i = 0; i < s->queue_count; i++) {
         NVMeQueuePair *q = s->queues[i];
 
         q->completion_bh =
@@ -XXX,XX +XXX,XX @@ static void nvme_aio_plug(BlockDriverState *bs)
 
 static void nvme_aio_unplug(BlockDriverState *bs)
 {
-    int i;
     BDRVNVMeState *s = bs->opaque;
     assert(s->plugged);
     s->plugged = false;
-    for (i = INDEX_IO(0); i < s->nr_queues; i++) {
+    for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
         NVMeQueuePair *q = s->queues[i];
         qemu_mutex_lock(&q->lock);
         nvme_kick(q);
diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s
 # nvme.c
 nvme_controller_capability_raw(uint64_t value) "0x%08"PRIx64
 nvme_controller_capability(const char *desc, uint64_t value) "%s: %"PRIu64
-nvme_kick(void *s, int queue) "s %p queue %d"
+nvme_kick(void *s, unsigned q_index) "s %p q #%u"
 nvme_dma_flush_queue_wait(void *s) "s %p"
 nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
-nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d"
-nvme_process_completion_queue_plugged(void *s, int index) "s %p queue %d"
-nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d"
-nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
+nvme_process_completion(void *s, unsigned q_index, int inflight) "s %p q #%u inflight %d"
+nvme_process_completion_queue_plugged(void *s, unsigned q_index) "s %p q #%u"
+nvme_complete_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
+nvme_submit_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
 nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
 nvme_handle_event(void *s) "s %p"
 nvme_poll_queue(void *s, unsigned q_index) "s %p q #%u"
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Just for consistency, following the example documented since
commit e3fe3988d7 ("error: Document Error API usage rules"),
return a boolean value indicating whether an error is set or not.
Pass errp directly, as local_err is not required in our
case.

Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20201029093306.1063879-11-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
     return ret;
 }
 
-static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
+/* Returns true on success, false on failure. */
+static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
 {
     BDRVNVMeState *s = bs->opaque;
+    bool ret = false;
     union {
         NvmeIdCtrl ctrl;
         NvmeIdNs ns;
@@ -XXX,XX +XXX,XX @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
         goto out;
     }
 
+    ret = true;
     s->blkshift = lbaf->ds;
 out:
     qemu_vfio_dma_unmap(s->vfio, id);
     qemu_vfree(id);
+
+    return ret;
 }
 
 static bool nvme_poll_queue(NVMeQueuePair *q)
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
     uint64_t cap;
     uint64_t timeout_ms;
     uint64_t deadline, now;
-    Error *local_err = NULL;
     volatile NvmeBar *regs = NULL;
 
     qemu_co_mutex_init(&s->dma_map_lock);
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
                            false, nvme_handle_event, nvme_poll_cb);
 
-    nvme_identify(bs, namespace, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
+    if (!nvme_identify(bs, namespace, errp)) {
         ret = -EIO;
         goto out;
     }
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-12-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static QemuOptsList runtime_opts = {
     },
 };
 
-static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
+/* Returns true on success, false on failure. */
+static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
                             unsigned nentries, size_t entry_bytes, Error **errp)
 {
     size_t bytes;
@@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
     q->queue = qemu_try_memalign(s->page_size, bytes);
     if (!q->queue) {
         error_setg(errp, "Cannot allocate queue");
-        return;
+        return false;
     }
     memset(q->queue, 0, bytes);
     r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova);
     if (r) {
         error_setg(errp, "Cannot map queue");
+        return false;
     }
+    return true;
 }
 
 static void nvme_free_queue_pair(NVMeQueuePair *q)
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
                                              Error **errp)
 {
     int i, r;
-    Error *local_err = NULL;
     NVMeQueuePair *q;
     uint64_t prp_list_iova;
 
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
         req->prp_list_iova = prp_list_iova + i * s->page_size;
     }
 
-    nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
+    if (!nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, errp)) {
         goto fail;
     }
     q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail;
 
-    nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
+    if (!nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, errp)) {
         goto fail;
     }
     q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head;
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Rename Submission Queue flags with 'Sq' to differentiate
submission queue flags from completion queue flags, and introduce
Completion Queue flag definitions.

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20201029093306.1063879-13-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 include/block/nvme.h | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -XXX,XX +XXX,XX @@ typedef struct QEMU_PACKED NvmeCreateCq {
 #define NVME_CQ_FLAGS_PC(cq_flags)  (cq_flags & 0x1)
 #define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1)
 
+enum NvmeFlagsCq {
+    NVME_CQ_PC          = 1,
+    NVME_CQ_IEN         = 2,
+};
+
 typedef struct QEMU_PACKED NvmeCreateSq {
     uint8_t     opcode;
     uint8_t     flags;
@@ -XXX,XX +XXX,XX @@ typedef struct QEMU_PACKED NvmeCreateSq {
 #define NVME_SQ_FLAGS_PC(sq_flags)      (sq_flags & 0x1)
 #define NVME_SQ_FLAGS_QPRIO(sq_flags)   ((sq_flags >> 1) & 0x3)
 
-enum NvmeQueueFlags {
-    NVME_Q_PC           = 1,
-    NVME_Q_PRIO_URGENT  = 0,
-    NVME_Q_PRIO_HIGH    = 1,
-    NVME_Q_PRIO_NORMAL  = 2,
-    NVME_Q_PRIO_LOW     = 3,
+enum NvmeFlagsSq {
+    NVME_SQ_PC          = 1,
+
+    NVME_SQ_PRIO_URGENT = 0,
+    NVME_SQ_PRIO_HIGH   = 1,
+    NVME_SQ_PRIO_NORMAL = 2,
+    NVME_SQ_PRIO_LOW    = 3,
 };
 
 typedef struct QEMU_PACKED NvmeIdentify {
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Replace magic values with definitions, and simplify since the
number of queues will never reach 64K.

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-14-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
     NvmeCmd cmd;
     unsigned queue_size = NVME_QUEUE_SIZE;
 
+    assert(n <= UINT16_MAX);
     q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
                                n, queue_size, errp);
     if (!q) {
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
     cmd = (NvmeCmd) {
         .opcode = NVME_ADM_CMD_CREATE_CQ,
         .dptr.prp1 = cpu_to_le64(q->cq.iova),
-        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
-        .cdw11 = cpu_to_le32(0x3),
+        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
+        .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC),
     };
     if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
         error_setg(errp, "Failed to create CQ io queue [%u]", n);
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
     cmd = (NvmeCmd) {
         .opcode = NVME_ADM_CMD_CREATE_SQ,
         .dptr.prp1 = cpu_to_le64(q->sq.iova),
-        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
-        .cdw11 = cpu_to_le32(0x1 | (n << 16)),
+        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
+        .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)),
     };
     if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
         error_setg(errp, "Failed to create SQ io queue [%u]", n);
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

According to specification chapter 3.1.8 "AQA - Admin Queue Attributes",
the Admin Submission Queue Size field is a 0’s based value:

Admin Submission Queue Size (ASQS):

Defines the size of the Admin Submission Queue in entries.
    Enabling a controller while this field is cleared to 00h
    produces undefined results. The minimum size of the Admin
    Submission Queue is two entries. The maximum size of the
    Admin Submission Queue is 4096 entries.
    This is a 0’s based value.

This bug has never been hit because the device initialization
uses a single command synchronously :)

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-15-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
         goto out;
     }
     s->queue_count = 1;
-    QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
-    regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << AQA_ACQS_SHIFT) |
-                            (NVME_QUEUE_SIZE << AQA_ASQS_SHIFT));
+    QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000);
+    regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) |
+                            ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT));
     regs->asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova);
     regs->acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova);
 
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

We don't need to dereference from BDRVNVMeState each time.
Use an NVMeQueuePair pointer to the admin queue.
nvme_init() becomes easier to review, matching the style
of nvme_add_io_queue().

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-16-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
                      Error **errp)
 {
     BDRVNVMeState *s = bs->opaque;
+    NVMeQueuePair *q;
     AioContext *aio_context = bdrv_get_aio_context(bs);
     int ret;
     uint64_t cap;
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
 
     /* Set up admin queue. */
     s->queues = g_new(NVMeQueuePair *, 1);
-    s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context, 0,
-                                                          NVME_QUEUE_SIZE,
-                                                          errp);
-    if (!s->queues[INDEX_ADMIN]) {
+    q = nvme_create_queue_pair(s, aio_context, 0, NVME_QUEUE_SIZE, errp);
+    if (!q) {
         ret = -EINVAL;
         goto out;
     }
+    s->queues[INDEX_ADMIN] = q;
     s->queue_count = 1;
     QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000);
     regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) |
                             ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT));
-    regs->asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova);
-    regs->acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova);
+    regs->asq = cpu_to_le64(q->sq.iova);
+    regs->acq = cpu_to_le64(q->cq.iova);
 
     /* After setting up all control registers we can enable device now. */
     regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) |
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

As all commands use the ADMIN queue, it is pointless to pass
it as an argument each time. Remove the argument, and rename the
function to nvme_admin_cmd_sync() to make this new behavior
clearer.

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20201029093306.1063879-17-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
     qemu_mutex_unlock(&q->lock);
 }
 
-static void nvme_cmd_sync_cb(void *opaque, int ret)
+static void nvme_admin_cmd_sync_cb(void *opaque, int ret)
 {
     int *pret = opaque;
     *pret = ret;
     aio_wait_kick();
 }
 
-static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
-                         NvmeCmd *cmd)
+static int nvme_admin_cmd_sync(BlockDriverState *bs, NvmeCmd *cmd)
 {
+    BDRVNVMeState *s = bs->opaque;
+    NVMeQueuePair *q = s->queues[INDEX_ADMIN];
     AioContext *aio_context = bdrv_get_aio_context(bs);
     NVMeRequest *req;
     int ret = -EINPROGRESS;
@@ -XXX,XX +XXX,XX @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
     if (!req) {
         return -EBUSY;
     }
-    nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret);
+    nvme_submit_command(q, req, cmd, nvme_admin_cmd_sync_cb, &ret);
 
     AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS);
     return ret;
@@ -XXX,XX +XXX,XX @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
 
     memset(id, 0, sizeof(*id));
     cmd.dptr.prp1 = cpu_to_le64(iova);
-    if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
+    if (nvme_admin_cmd_sync(bs, &cmd)) {
         error_setg(errp, "Failed to identify controller");
         goto out;
     }
@@ -XXX,XX +XXX,XX @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
     memset(id, 0, sizeof(*id));
     cmd.cdw10 = 0;
     cmd.nsid = cpu_to_le32(namespace);
-    if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
+    if (nvme_admin_cmd_sync(bs, &cmd)) {
         error_setg(errp, "Failed to identify namespace");
         goto out;
     }
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
         .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC),
     };
-    if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
+    if (nvme_admin_cmd_sync(bs, &cmd)) {
         error_setg(errp, "Failed to create CQ io queue [%u]", n);
         goto out_error;
     }
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
         .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)),
     };
-    if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
+    if (nvme_admin_cmd_sync(bs, &cmd)) {
         error_setg(errp, "Failed to create SQ io queue [%u]", n);
         goto out_error;
     }
@@ -XXX,XX +XXX,XX @@ static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
         .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
     };
 
-    ret = nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd);
+    ret = nvme_admin_cmd_sync(bs, &cmd);
     if (ret) {
         error_setg(errp, "Failed to configure NVMe write cache");
     }
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Commit bdd6a90a9e5 ("block: Add VFIO based NVMe driver")
sets the request_alignment in nvme_refresh_limits().
For consistency, also set it during initialization.

Reported-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-18-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
     s->page_size = MAX(4096, 1 << NVME_CAP_MPSMIN(cap));
     s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t);
     bs->bl.opt_mem_alignment = s->page_size;
+    bs->bl.request_alignment = s->page_size;
     timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000);
 
     /* Reset device to get a clean state. */
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

While trying to simplify the code using a macro, we forgot
the 12-bit shift... Correct that.

Fixes: fad1eb68862 ("block/nvme: Use register definitions from 'block/nvme.h'")
Reported-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-19-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
         goto out;
     }
 
-    s->page_size = MAX(4096, 1 << NVME_CAP_MPSMIN(cap));
+    s->page_size = 1u << (12 + NVME_CAP_MPSMIN(cap));
     s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t);
     bs->bl.opt_mem_alignment = s->page_size;
     bs->bl.request_alignment = s->page_size;
-- 
2.28.0

From: Eric Auger <eric.auger@redhat.com>

In preparation for 64kB host page support, let's change the size
and alignment of the IDENTIFY command response buffer so that
the VFIO DMA MAP succeeds. We align on the host page size.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-20-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
         .opcode = NVME_ADM_CMD_IDENTIFY,
         .cdw10 = cpu_to_le32(0x1),
     };
+    size_t id_size = QEMU_ALIGN_UP(sizeof(*id), qemu_real_host_page_size);
 
-    id = qemu_try_memalign(s->page_size, sizeof(*id));
+    id = qemu_try_memalign(qemu_real_host_page_size, id_size);
     if (!id) {
         error_setg(errp, "Cannot allocate buffer for identify response");
         goto out;
     }
-    r = qemu_vfio_dma_map(s->vfio, id, sizeof(*id), true, &iova);
+    r = qemu_vfio_dma_map(s->vfio, id, id_size, true, &iova);
     if (r) {
         error_setg(errp, "Cannot map buffer for DMA");
         goto out;
     }
 
-    memset(id, 0, sizeof(*id));
+    memset(id, 0, id_size);
     cmd.dptr.prp1 = cpu_to_le64(iova);
     if (nvme_admin_cmd_sync(bs, &cmd)) {
         error_setg(errp, "Failed to identify controller");
@@ -XXX,XX +XXX,XX @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
     s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES);
     s->supports_discard = !!(oncs & NVME_ONCS_DSM);
 
-    memset(id, 0, sizeof(*id));
+    memset(id, 0, id_size);
     cmd.cdw10 = 0;
     cmd.nsid = cpu_to_le32(namespace);
     if (nvme_admin_cmd_sync(bs, &cmd)) {
-- 
2.28.0

From: Eric Auger <eric.auger@redhat.com>

In preparation for 64kB host page support, let's change the size
and alignment of the queue so that the VFIO DMA MAP succeeds.
We align on the host page size.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-21-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
     size_t bytes;
     int r;
 
-    bytes = ROUND_UP(nentries * entry_bytes, s->page_size);
+    bytes = ROUND_UP(nentries * entry_bytes, qemu_real_host_page_size);
     q->head = q->tail = 0;
-    q->queue = qemu_try_memalign(s->page_size, bytes);
+    q->queue = qemu_try_memalign(qemu_real_host_page_size, bytes);
     if (!q->queue) {
         error_setg(errp, "Cannot allocate queue");
         return false;
-- 
2.28.0

From: Eric Auger <eric.auger@redhat.com>

In preparation for 64kB host page support, let's change the size
and alignment of the prp_list_pages so that the VFIO DMA MAP succeeds
with 64kB host page size. We align on the host page size.

Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-22-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
     int i, r;
     NVMeQueuePair *q;
     uint64_t prp_list_iova;
+    size_t bytes;
 
     q = g_try_new0(NVMeQueuePair, 1);
     if (!q) {
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
     }
     trace_nvme_create_queue_pair(idx, q, size, aio_context,
                                  event_notifier_get_fd(s->irq_notifier));
-    q->prp_list_pages = qemu_try_memalign(s->page_size,
-                                          s->page_size * NVME_NUM_REQS);
+    bytes = QEMU_ALIGN_UP(s->page_size * NVME_NUM_REQS,
+                          qemu_real_host_page_size);
+    q->prp_list_pages = qemu_try_memalign(qemu_real_host_page_size, bytes);
     if (!q->prp_list_pages) {
         goto fail;
     }
-    memset(q->prp_list_pages, 0, s->page_size * NVME_NUM_REQS);
+    memset(q->prp_list_pages, 0, bytes);
     qemu_mutex_init(&q->lock);
     q->s = s;
     q->index = idx;
     qemu_co_queue_init(&q->free_req_queue);
     q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q);
-    r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
-                          s->page_size * NVME_NUM_REQS,
+    r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, bytes,
                           false, &prp_list_iova);
     if (r) {
         goto fail;
-- 
2.28.0

From: Eric Auger <eric.auger@redhat.com>

Make sure iov's va and size are properly aligned on the
host page size.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-23-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
     for (i = 0; i < qiov->niov; ++i) {
         bool retry = true;
         uint64_t iova;
+        size_t len = QEMU_ALIGN_UP(qiov->iov[i].iov_len,
+                                   qemu_real_host_page_size);
 try_map:
         r = qemu_vfio_dma_map(s->vfio,
                               qiov->iov[i].iov_base,
-                              qiov->iov[i].iov_len,
-                              true, &iova);
+                              len, true, &iova);
         if (r == -ENOMEM && retry) {
             retry = false;
             trace_nvme_dma_flush_queue_wait(s);
@@ -XXX,XX +XXX,XX @@ static inline bool nvme_qiov_aligned(BlockDriverState *bs,
     BDRVNVMeState *s = bs->opaque;
 
     for (i = 0; i < qiov->niov; ++i) {
-        if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, s->page_size) ||
-            !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, s->page_size)) {
+        if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base,
+                                 qemu_real_host_page_size) ||
+            !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, qemu_real_host_page_size)) {
             trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
                                       qiov->iov[i].iov_len, s->page_size);
             return false;
@@ -XXX,XX +XXX,XX @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
     int r;
     uint8_t *buf = NULL;
     QEMUIOVector local_qiov;
-
+    size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size);
     assert(QEMU_IS_ALIGNED(offset, s->page_size));
     assert(QEMU_IS_ALIGNED(bytes, s->page_size));
     assert(bytes <= s->max_transfer);
@@ -XXX,XX +XXX,XX @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
     }
     s->stats.unaligned_accesses++;
     trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
-    buf = qemu_try_memalign(s->page_size, bytes);
+    buf = qemu_try_memalign(qemu_real_host_page_size, len);
 
     if (!buf) {
         return -ENOMEM;
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

qemu_vfio_pci_map_bar() calls mmap(), and mmap(2) states:

'offset' must be a multiple of the page size as returned
   by sysconf(_SC_PAGE_SIZE).

In commit f68453237b9 we started to use an offset of 4K which
broke this contract on Aarch64 arch.

Fix by mapping at offset 0, and accessing doorbells at offset=4K.

Fixes: f68453237b9 ("block/nvme: Map doorbells pages write-only")
Reported-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201029093306.1063879-24-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ typedef struct {
 struct BDRVNVMeState {
     AioContext *aio_context;
     QEMUVFIOState *vfio;
+    void *bar0_wo_map;
     /* Memory mapped registers */
     volatile struct {
         uint32_t sq_tail;
@@ -XXX,XX +XXX,XX @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
         }
     }
 
-    s->doorbells = qemu_vfio_pci_map_bar(s->vfio, 0, sizeof(NvmeBar),
-                                         NVME_DOORBELL_SIZE, PROT_WRITE, errp);
+    s->bar0_wo_map = qemu_vfio_pci_map_bar(s->vfio, 0, 0,
+                                           sizeof(NvmeBar) + NVME_DOORBELL_SIZE,
+                                           PROT_WRITE, errp);
+    s->doorbells = (void *)((uintptr_t)s->bar0_wo_map + sizeof(NvmeBar));
     if (!s->doorbells) {
         ret = -EINVAL;
         goto out;
@@ -XXX,XX +XXX,XX @@ static void nvme_close(BlockDriverState *bs)
                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
                            false, NULL, NULL);
     event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
-    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->doorbells,
-                            sizeof(NvmeBar), NVME_DOORBELL_SIZE);
+    qemu_vfio_pci_unmap_bar(s->vfio, 0, s->bar0_wo_map,
+                            0, sizeof(NvmeBar) + NVME_DOORBELL_SIZE);
     qemu_vfio_close(s->vfio);
 
     g_free(s->device);
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

The Completion Queue Command Identifier is a 16-bit value,
so nvme_submit_command() is unlikely to work on big-endian
hosts, as the relevant bits are truncated.
Fix by using the correct byte-swap function.

Fixes: bdd6a90a9e5 ("block: Add VFIO based NVMe driver")
Reported-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20201029093306.1063879-25-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
     assert(!req->cb);
     req->cb = cb;
     req->opaque = opaque;
-    cmd->cid = cpu_to_le32(req->cid);
+    cmd->cid = cpu_to_le16(req->cid);
 
     trace_nvme_submit_command(q->s, q->index, req->cid);
     nvme_trace_command(cmd);
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Replace the confusing "VFIO IOMMU check failed" error message with
the explicit "VFIO IOMMU Type1 is not supported" one.

Example on POWER:

$ qemu-system-ppc64 -drive if=none,id=nvme0,file=nvme://0001:01:00.0/1,format=raw
 qemu-system-ppc64: -drive if=none,id=nvme0,file=nvme://0001:01:00.0/1,format=raw: VFIO IOMMU Type1 is not supported

Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Fam Zheng <fam@euphon.net>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201103020733.2303148-2-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 util/vfio-helpers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index XXXXXXX..XXXXXXX 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
     }
 
     if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
-        error_setg_errno(errp, errno, "VFIO IOMMU check failed");
+        error_setg_errno(errp, errno, "VFIO IOMMU Type1 is not supported");
         ret = -EINVAL;
         goto fail_container;
     }
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

We sometimes get a kernel panic with some devices on Aarch64
hosts. Alex Williamson suggests it might be a broken PCIe
root complex. Add trace events to record the latest I/O
access before crashing. Just in case, assert our accesses are
aligned.

Reviewed-by: Fam Zheng <fam@euphon.net>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201103020733.2303148-3-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 util/vfio-helpers.c | 8 ++++++++
 util/trace-events   | 2 ++
 2 files changed, 10 insertions(+)

diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index XXXXXXX..XXXXXXX 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
 {
     int ret;
 
+    trace_qemu_vfio_pci_read_config(buf, ofs, size,
+                                    s->config_region_info.offset,
+                                    s->config_region_info.size);
+    assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
     do {
         ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
     } while (ret == -1 && errno == EINTR);
@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
 {
     int ret;
 
+    trace_qemu_vfio_pci_write_config(buf, ofs, size,
+                                     s->config_region_info.offset,
+                                     s->config_region_info.size);
+    assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
     do {
         ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
     } while (ret == -1 && errno == EINTR);
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova
 qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %p host %p size 0x%zx iova 0x%"PRIx64
 qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d iova %p"
 qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
+qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
+qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

For debugging purposes, trace BAR region info.

Reviewed-by: Fam Zheng <fam@euphon.net>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201103020733.2303148-4-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 util/vfio-helpers.c | 8 ++++++++
 util/trace-events   | 1 +
 2 files changed, 9 insertions(+)

diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index XXXXXXX..XXXXXXX 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -XXX,XX +XXX,XX @@ static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
 
 static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
 {
+    g_autofree char *barname = NULL;
     assert_bar_index_valid(s, index);
     s->bar_region_info[index] = (struct vfio_region_info) {
         .index = VFIO_PCI_BAR0_REGION_INDEX + index,
@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
         error_setg_errno(errp, errno, "Failed to get BAR region info");
         return -errno;
     }
+    barname = g_strdup_printf("bar[%d]", index);
+    trace_qemu_vfio_region_info(barname, s->bar_region_info[index].offset,
+                                s->bar_region_info[index].size,
+                                s->bar_region_info[index].cap_offset);
 
     return 0;
 }
@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
         ret = -errno;
         goto fail;
     }
+    trace_qemu_vfio_region_info("config", s->config_region_info.offset,
+                                s->config_region_info.size,
+                                s->config_region_info.cap_offset);
 
     for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
         ret = qemu_vfio_pci_init_bar(s, i, errp);
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *io
 qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
 qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
 qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
+qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

For debugging purposes, trace where a BAR is mapped.

Reviewed-by: Fam Zheng <fam@euphon.net>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201103020733.2303148-5-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 util/vfio-helpers.c | 2 ++
 util/trace-events   | 1 +
 2 files changed, 3 insertions(+)

diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index XXXXXXX..XXXXXXX 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -XXX,XX +XXX,XX @@ void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
     p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
              prot, MAP_SHARED,
              s->device, s->bar_region_info[index].offset + offset);
+    trace_qemu_vfio_pci_map_bar(index, s->bar_region_info[index].offset ,
+                                size, offset, p);
     if (p == MAP_FAILED) {
         error_setg_errno(errp, errno, "Failed to map BAR region");
         p = NULL;
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
 qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
 qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
 qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32
+qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

For debugging purposes, trace where DMA regions are mapped.

Reviewed-by: Fam Zheng <fam@euphon.net>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201103020733.2303148-6-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 util/vfio-helpers.c | 3 ++-
 util/trace-events   | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index XXXXXXX..XXXXXXX 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -XXX,XX +XXX,XX @@ static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
         .vaddr = (uintptr_t)host,
         .size = size,
     };
-    trace_qemu_vfio_do_mapping(s, host, size, iova);
+    trace_qemu_vfio_do_mapping(s, host, iova, size);
 
     if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
         error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
@@ -XXX,XX +XXX,XX @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
             }
         }
     }
+    trace_qemu_vfio_dma_mapped(s, host, iova0, size);
     if (iova) {
         *iova = iova0;
     }
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%z
 qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
 qemu_vfio_find_mapping(void *s, void *p) "s %p host %p"
 qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size 0x%zx index %d iova 0x%"PRIx64
-qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %p host %p size 0x%zx iova 0x%"PRIx64
-qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d iova %p"
+qemu_vfio_do_mapping(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64 " size 0x%zx"
+qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d &iova %p"
+qemu_vfio_dma_mapped(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64" size 0x%zx"
 qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
 qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
 qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
-- 
2.28.0

From: Philippe Mathieu-Daudé <philmd@redhat.com>

The QEMU_VFIO_DEBUG definition is only modifiable at build-time.
Trace events can be enabled at run-time. As we prefer the latter,
convert qemu_vfio_dump_mappings() to use trace events instead
of fprintf().

Reviewed-by: Fam Zheng <fam@euphon.net>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20201103020733.2303148-7-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
---
 util/vfio-helpers.c | 19 ++++---------------
 util/trace-events   |  1 +
 2 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index XXXXXXX..XXXXXXX 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -XXX,XX +XXX,XX @@ QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
     return s;
 }
 
-static void qemu_vfio_dump_mapping(IOVAMapping *m)
-{
-    if (QEMU_VFIO_DEBUG) {
-        printf("  vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host,
-               (uint64_t)m->size, (uint64_t)m->iova);
-    }
-}
-
 static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
 {
-    int i;
-
-    if (QEMU_VFIO_DEBUG) {
-        printf("vfio mappings\n");
-        for (i = 0; i < s->nr_mappings; ++i) {
-            qemu_vfio_dump_mapping(&s->mappings[i]);
-        }
+    for (int i = 0; i < s->nr_mappings; ++i) {
+        trace_qemu_vfio_dump_mapping(s->mappings[i].host,
+                                     s->mappings[i].iova,
+                                     s->mappings[i].size);
     }
 }
 
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ qemu_mutex_unlock(void *mutex, const char *file, const int line) "released mutex
 qemu_vfio_dma_reset_temporary(void *s) "s %p"
 qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
 qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
+qemu_vfio_dump_mapping(void *host, uint64_t iova, size_t size) "vfio mapping %p to iova 0x%08" PRIx64 " size 0x%zx"
 qemu_vfio_find_mapping(void *s, void *p) "s %p host %p"
 qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size 0x%zx index %d iova 0x%"PRIx64
 qemu_vfio_do_mapping(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64 " size 0x%zx"
-- 
2.28.0

The following changes since commit 9cf289af47bcfae5c75de37d8e5d6fd23705322c:

Merge tag 'qga-pull-request' of gitlab.com:marcandre.lureau/qemu into staging (2022-05-04 03:42:49 -0700)

are available in the Git repository at:

https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to bef2e050d6a7feb865854c65570c496ac5a8cf53:

util/event-loop-base: Introduce options to set the thread pool size (2022-05-04 17:02:19 +0100)

----------------------------------------------------------------
Pull request

Add new thread-pool-min/thread-pool-max parameters to control the thread pool
used for async I/O.

----------------------------------------------------------------

Nicolas Saenz Julienne (3):
  Introduce event-loop-base abstract class
  util/main-loop: Introduce the main loop into QOM
  util/event-loop-base: Introduce options to set the thread pool size

-- 
2.35.1

From: Nicolas Saenz Julienne <nsaenzju@redhat.com>

Introduce the 'event-loop-base' abstract class. It'll hold the
properties common to all event loops and provide the necessary hooks for
their creation and maintenance. Then have iothread inherit from it.

EventLoopBaseClass is defined as user creatable and provides a hook for
its children to attach themselves to the user creatable class 'complete'
function. It also provides an update_params() callback to propagate
property changes onto its children.

The new 'event-loop-base' class will live in the root directory. It is
built on its own using the 'link_whole' option (there are no direct
function dependencies between the class and its children, it all happens
through 'constructor' magic). It also imposes new compilation
dependencies:

qom <- event-loop-base <- blockdev (iothread.c)

And in subsequent patches:

qom <- event-loop-base <- qemuutil (util/main-loop.c)

All this forced some amount of reordering in meson.build:

- Moved qom build definition before qemuutil. Doing it the other way
   around (i.e. moving qemuutil after qom) isn't possible as a lot of
   core libraries that live in between the two depend on it.

- Process the 'hw' subdir earlier, as it introduces files into the
   'qom' source set.

No functional changes intended.

Signed-off-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Acked-by: Markus Armbruster <armbru@redhat.com>
Message-id: 20220425075723.20019-2-nsaenzju@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 qapi/qom.json                    |  22 +++++--
 meson.build                      |  23 ++++---
 include/sysemu/event-loop-base.h |  36 +++++++++++
 include/sysemu/iothread.h        |   6 +-
 event-loop-base.c                | 104 +++++++++++++++++++++++++++++++
 iothread.c                       |  65 ++++++-------------
 6 files changed, 192 insertions(+), 64 deletions(-)
 create mode 100644 include/sysemu/event-loop-base.h
 create mode 100644 event-loop-base.c

diff --git a/qapi/qom.json b/qapi/qom.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -XXX,XX +XXX,XX @@
             '*repeat': 'bool',
             '*grab-toggle': 'GrabToggleKeys' } }
 
+##
+# @EventLoopBaseProperties:
+#
+# Common properties for event loops
+#
+# @aio-max-batch: maximum number of requests in a batch for the AIO engine,
+#                 0 means that the engine will use its default.
+#                 (default: 0)
+#
+# Since: 7.1
+##
+{ 'struct': 'EventLoopBaseProperties',
+  'data': { '*aio-max-batch': 'int' } }
+
 ##
 # @IothreadProperties:
 #
@@ -XXX,XX +XXX,XX @@
 #               algorithm detects it is spending too long polling without
 #               encountering events. 0 selects a default behaviour (default: 0)
 #
-# @aio-max-batch: maximum number of requests in a batch for the AIO engine,
-#                 0 means that the engine will use its default
-#                 (default:0, since 6.1)
+# The @aio-max-batch option is available since 6.1.
 #
 # Since: 2.0
 ##
 { 'struct': 'IothreadProperties',
+  'base': 'EventLoopBaseProperties',
   'data': { '*poll-max-ns': 'int',
             '*poll-grow': 'int',
-            '*poll-shrink': 'int',
-            '*aio-max-batch': 'int' } }
+            '*poll-shrink': 'int' } }
 
 ##
 # @MemoryBackendProperties:
diff --git a/meson.build b/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/meson.build
+++ b/meson.build
@@ -XXX,XX +XXX,XX @@ subdir('qom')
 subdir('authz')
 subdir('crypto')
 subdir('ui')
+subdir('hw')
 
 
 if enable_modules
@@ -XXX,XX +XXX,XX @@ if enable_modules
   modulecommon = declare_dependency(link_whole: libmodulecommon, compile_args: '-DBUILD_DSO')
 endif
 
+qom_ss = qom_ss.apply(config_host, strict: false)
+libqom = static_library('qom', qom_ss.sources() + genh,
+                        dependencies: [qom_ss.dependencies()],
+                        name_suffix: 'fa')
+qom = declare_dependency(link_whole: libqom)
+
+event_loop_base = files('event-loop-base.c')
+event_loop_base = static_library('event-loop-base', sources: event_loop_base + genh,
+                                 build_by_default: true)
+event_loop_base = declare_dependency(link_whole: event_loop_base,
+                                     dependencies: [qom])
+
 stub_ss = stub_ss.apply(config_all, strict: false)
 
 util_ss.add_all(trace_ss)
@@ -XXX,XX +XXX,XX @@ subdir('monitor')
 subdir('net')
 subdir('replay')
 subdir('semihosting')
-subdir('hw')
 subdir('tcg')
 subdir('fpu')
 subdir('accel')
@@ -XXX,XX +XXX,XX @@ qemu_syms = custom_target('qemu.syms', output: 'qemu.syms',
                              capture: true,
                              command: [undefsym, nm, '@INPUT@'])
 
-qom_ss = qom_ss.apply(config_host, strict: false)
-libqom = static_library('qom', qom_ss.sources() + genh,
-                        dependencies: [qom_ss.dependencies()],
-                        name_suffix: 'fa')
-
-qom = declare_dependency(link_whole: libqom)
-
 authz_ss = authz_ss.apply(config_host, strict: false)
 libauthz = static_library('authz', authz_ss.sources() + genh,
                           dependencies: [authz_ss.dependencies()],
@@ -XXX,XX +XXX,XX @@ libblockdev = static_library('blockdev', blockdev_ss.sources() + genh,
                              build_by_default: false)
 
 blockdev = declare_dependency(link_whole: [libblockdev],
-                              dependencies: [block])
+                              dependencies: [block, event_loop_base])
 
 qmp_ss = qmp_ss.apply(config_host, strict: false)
 libqmp = static_library('qmp', qmp_ss.sources() + genh,
diff --git a/include/sysemu/event-loop-base.h b/include/sysemu/event-loop-base.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/sysemu/event-loop-base.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * QEMU event-loop backend
+ *
+ * Copyright (C) 2022 Red Hat Inc
+ *
+ * Authors:
+ *  Nicolas Saenz Julienne <nsaenzju@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#ifndef QEMU_EVENT_LOOP_BASE_H
+#define QEMU_EVENT_LOOP_BASE_H
+
+#include "qom/object.h"
+#include "block/aio.h"
+#include "qemu/typedefs.h"
+
+#define TYPE_EVENT_LOOP_BASE         "event-loop-base"
+OBJECT_DECLARE_TYPE(EventLoopBase, EventLoopBaseClass,
+                    EVENT_LOOP_BASE)
+
+struct EventLoopBaseClass {
+    ObjectClass parent_class;
+
+    void (*init)(EventLoopBase *base, Error **errp);
+    void (*update_params)(EventLoopBase *base, Error **errp);
+};
+
+struct EventLoopBase {
+    Object parent;
+
+    /* AioContext AIO engine parameters */
+    int64_t aio_max_batch;
+};
+#endif
diff --git a/include/sysemu/iothread.h b/include/sysemu/iothread.h
index XXXXXXX..XXXXXXX 100644
--- a/include/sysemu/iothread.h
+++ b/include/sysemu/iothread.h
@@ -XXX,XX +XXX,XX @@
 #include "block/aio.h"
 #include "qemu/thread.h"
 #include "qom/object.h"
+#include "sysemu/event-loop-base.h"
 
 #define TYPE_IOTHREAD "iothread"
 
 struct IOThread {
-    Object parent_obj;
+    EventLoopBase parent_obj;
 
     QemuThread thread;
     AioContext *ctx;
@@ -XXX,XX +XXX,XX @@ struct IOThread {
     int64_t poll_max_ns;
     int64_t poll_grow;
     int64_t poll_shrink;
-
-    /* AioContext AIO engine parameters */
-    int64_t aio_max_batch;
 };
 typedef struct IOThread IOThread;
 
diff --git a/event-loop-base.c b/event-loop-base.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/event-loop-base.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * QEMU event-loop base
+ *
+ * Copyright (C) 2022 Red Hat Inc
+ *
+ * Authors:
+ *  Stefan Hajnoczi <stefanha@redhat.com>
+ *  Nicolas Saenz Julienne <nsaenzju@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qom/object_interfaces.h"
+#include "qapi/error.h"
+#include "sysemu/event-loop-base.h"
+
+typedef struct {
+    const char *name;
+    ptrdiff_t offset; /* field's byte offset in EventLoopBase struct */
+} EventLoopBaseParamInfo;
+
+static EventLoopBaseParamInfo aio_max_batch_info = {
+    "aio-max-batch", offsetof(EventLoopBase, aio_max_batch),
+};
+
+static void event_loop_base_get_param(Object *obj, Visitor *v,
+        const char *name, void *opaque, Error **errp)
+{
+    EventLoopBase *event_loop_base = EVENT_LOOP_BASE(obj);
+    EventLoopBaseParamInfo *info = opaque;
+    int64_t *field = (void *)event_loop_base + info->offset;
+
+    visit_type_int64(v, name, field, errp);
+}
+
+static void event_loop_base_set_param(Object *obj, Visitor *v,
+        const char *name, void *opaque, Error **errp)
+{
+    EventLoopBaseClass *bc = EVENT_LOOP_BASE_GET_CLASS(obj);
+    EventLoopBase *base = EVENT_LOOP_BASE(obj);
+    EventLoopBaseParamInfo *info = opaque;
+    int64_t *field = (void *)base + info->offset;
+    int64_t value;
+
+    if (!visit_type_int64(v, name, &value, errp)) {
+        return;
+    }
+
+    if (value < 0) {
+        error_setg(errp, "%s value must be in range [0, %" PRId64 "]",
+                   info->name, INT64_MAX);
+        return;
+    }
+
+    *field = value;
+
+    if (bc->update_params) {
+        bc->update_params(base, errp);
+    }
+
+    return;
+}
+
+static void event_loop_base_complete(UserCreatable *uc, Error **errp)
+{
+    EventLoopBaseClass *bc = EVENT_LOOP_BASE_GET_CLASS(uc);
+    EventLoopBase *base = EVENT_LOOP_BASE(uc);
+
+    if (bc->init) {
+        bc->init(base, errp);
+    }
+}
+
+static void event_loop_base_class_init(ObjectClass *klass, void *class_data)
+{
+    UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass);
+    ucc->complete = event_loop_base_complete;
+
+    object_class_property_add(klass, "aio-max-batch", "int",
+                              event_loop_base_get_param,
+                              event_loop_base_set_param,
+                              NULL, &aio_max_batch_info);
+}
+
+static const TypeInfo event_loop_base_info = {
+    .name = TYPE_EVENT_LOOP_BASE,
+    .parent = TYPE_OBJECT,
+    .instance_size = sizeof(EventLoopBase),
+    .class_size = sizeof(EventLoopBaseClass),
+    .class_init = event_loop_base_class_init,
+    .abstract = true,
+    .interfaces = (InterfaceInfo[]) {
+        { TYPE_USER_CREATABLE },
+        { }
+    }
+};
+
+static void register_types(void)
+{
+    type_register_static(&event_loop_base_info);
+}
+type_init(register_types);
diff --git a/iothread.c b/iothread.c
index XXXXXXX..XXXXXXX 100644
--- a/iothread.c
+++ b/iothread.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/module.h"
 #include "block/aio.h"
 #include "block/block.h"
+#include "sysemu/event-loop-base.h"
 #include "sysemu/iothread.h"
 #include "qapi/error.h"
 #include "qapi/qapi-commands-misc.h"
@@ -XXX,XX +XXX,XX @@ static void iothread_init_gcontext(IOThread *iothread)
     iothread->main_loop = g_main_loop_new(iothread->worker_context, TRUE);
 }
 
-static void iothread_set_aio_context_params(IOThread *iothread, Error **errp)
+static void iothread_set_aio_context_params(EventLoopBase *base, Error **errp)
 {
+    IOThread *iothread = IOTHREAD(base);
     ERRP_GUARD();
 
+    if (!iothread->ctx) {
+        return;
+    }
+
     aio_context_set_poll_params(iothread->ctx,
                                 iothread->poll_max_ns,
                                 iothread->poll_grow,
@@ -XXX,XX +XXX,XX @@ static void iothread_set_aio_context_params(IOThread *iothread, Error **errp)
     }
 
     aio_context_set_aio_params(iothread->ctx,
-                               iothread->aio_max_batch,
+                               iothread->parent_obj.aio_max_batch,
                                errp);
 }
 
-static void iothread_complete(UserCreatable *obj, Error **errp)
+
+static void iothread_init(EventLoopBase *base, Error **errp)
 {
     Error *local_error = NULL;
-    IOThread *iothread = IOTHREAD(obj);
+    IOThread *iothread = IOTHREAD(base);
     char *thread_name;
 
     iothread->stopping = false;
@@ -XXX,XX +XXX,XX @@ static void iothread_complete(UserCreatable *obj, Error **errp)
      */
     iothread_init_gcontext(iothread);
 
-    iothread_set_aio_context_params(iothread, &local_error);
+    iothread_set_aio_context_params(base, &local_error);
     if (local_error) {
         error_propagate(errp, local_error);
         aio_context_unref(iothread->ctx);
@@ -XXX,XX +XXX,XX @@ static void iothread_complete(UserCreatable *obj, Error **errp)
      * to inherit.
      */
     thread_name = g_strdup_printf("IO %s",
-                        object_get_canonical_path_component(OBJECT(obj)));
+                        object_get_canonical_path_component(OBJECT(base)));
     qemu_thread_create(&iothread->thread, thread_name, iothread_run,
                        iothread, QEMU_THREAD_JOINABLE);
     g_free(thread_name);
@@ -XXX,XX +XXX,XX @@ static IOThreadParamInfo poll_grow_info = {
 static IOThreadParamInfo poll_shrink_info = {
     "poll-shrink", offsetof(IOThread, poll_shrink),
 };
-static IOThreadParamInfo aio_max_batch_info = {
-    "aio-max-batch", offsetof(IOThread, aio_max_batch),
-};
 
 static void iothread_get_param(Object *obj, Visitor *v,
         const char *name, IOThreadParamInfo *info, Error **errp)
@@ -XXX,XX +XXX,XX @@ static void iothread_set_poll_param(Object *obj, Visitor *v,
     }
 }
 
-static void iothread_get_aio_param(Object *obj, Visitor *v,
-        const char *name, void *opaque, Error **errp)
-{
-    IOThreadParamInfo *info = opaque;
-
-    iothread_get_param(obj, v, name, info, errp);
-}
-
-static void iothread_set_aio_param(Object *obj, Visitor *v,
-        const char *name, void *opaque, Error **errp)
-{
-    IOThread *iothread = IOTHREAD(obj);
-    IOThreadParamInfo *info = opaque;
-
-    if (!iothread_set_param(obj, v, name, info, errp)) {
-        return;
-    }
-
-    if (iothread->ctx) {
-        aio_context_set_aio_params(iothread->ctx,
-                                   iothread->aio_max_batch,
-                                   errp);
-    }
-}
-
 static void iothread_class_init(ObjectClass *klass, void *class_data)
 {
-    UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass);
-    ucc->complete = iothread_complete;
+    EventLoopBaseClass *bc = EVENT_LOOP_BASE_CLASS(klass);
+
+    bc->init = iothread_init;
+    bc->update_params = iothread_set_aio_context_params;
 
     object_class_property_add(klass, "poll-max-ns", "int",
                               iothread_get_poll_param,
@@ -XXX,XX +XXX,XX @@ static void iothread_class_init(ObjectClass *klass, void *class_data)
                               iothread_get_poll_param,
                               iothread_set_poll_param,
                               NULL, &poll_shrink_info);
-    object_class_property_add(klass, "aio-max-batch", "int",
-                              iothread_get_aio_param,
-                              iothread_set_aio_param,
-                              NULL, &aio_max_batch_info);
 }
 
 static const TypeInfo iothread_info = {
     .name = TYPE_IOTHREAD,
-    .parent = TYPE_OBJECT,
+    .parent = TYPE_EVENT_LOOP_BASE,
     .class_init = iothread_class_init,
     .instance_size = sizeof(IOThread),
     .instance_init = iothread_instance_init,
     .instance_finalize = iothread_instance_finalize,
-    .interfaces = (InterfaceInfo[]) {
-        {TYPE_USER_CREATABLE},
-        {}
-    },
 };
 
 static void iothread_register_types(void)
@@ -XXX,XX +XXX,XX @@ static int query_one_iothread(Object *object, void *opaque)
     info->poll_max_ns = iothread->poll_max_ns;
     info->poll_grow = iothread->poll_grow;
     info->poll_shrink = iothread->poll_shrink;
-    info->aio_max_batch = iothread->aio_max_batch;
+    info->aio_max_batch = iothread->parent_obj.aio_max_batch;
 
     QAPI_LIST_APPEND(*tail, info);
     return 0;
-- 
2.35.1

From: Nicolas Saenz Julienne <nsaenzju@redhat.com>

'event-loop-base' provides basic property handling for all 'AioContext'
based event loops. So let's define a new 'MainLoopClass' that inherits
from it. This will permit tweaking the main loop's properties through
qapi as well as through the command line using the '-object' keyword[1].
Only one instance of 'MainLoopClass' may be created at any time.

'EventLoopBaseClass' learns a new callback, 'can_be_deleted()' so as to
mark 'MainLoop' as non-deletable.

[1] For example:
      -object main-loop,id=main-loop,aio-max-batch=<value>

Signed-off-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Acked-by: Markus Armbruster <armbru@redhat.com>
Message-id: 20220425075723.20019-3-nsaenzju@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 qapi/qom.json                    | 13 ++++++++
 meson.build                      |  3 +-
 include/qemu/main-loop.h         | 10 ++++++
 include/sysemu/event-loop-base.h |  1 +
 event-loop-base.c                | 13 ++++++++
 util/main-loop.c                 | 56 ++++++++++++++++++++++++++++++++
 6 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/qapi/qom.json b/qapi/qom.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -XXX,XX +XXX,XX @@
             '*poll-grow': 'int',
             '*poll-shrink': 'int' } }
 
+##
+# @MainLoopProperties:
+#
+# Properties for the main-loop object.
+#
+# Since: 7.1
+##
+{ 'struct': 'MainLoopProperties',
+  'base': 'EventLoopBaseProperties',
+  'data': {} }
+
 ##
 # @MemoryBackendProperties:
 #
@@ -XXX,XX +XXX,XX @@
     { 'name': 'input-linux',
       'if': 'CONFIG_LINUX' },
     'iothread',
+    'main-loop',
     { 'name': 'memory-backend-epc',
       'if': 'CONFIG_LINUX' },
     'memory-backend-file',
@@ -XXX,XX +XXX,XX @@
       'input-linux':                { 'type': 'InputLinuxProperties',
                                       'if': 'CONFIG_LINUX' },
       'iothread':                   'IothreadProperties',
+      'main-loop':                  'MainLoopProperties',
       'memory-backend-epc':         { 'type': 'MemoryBackendEpcProperties',
                                       'if': 'CONFIG_LINUX' },
       'memory-backend-file':        'MemoryBackendFileProperties',
diff --git a/meson.build b/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/meson.build
+++ b/meson.build
@@ -XXX,XX +XXX,XX @@ libqemuutil = static_library('qemuutil',
                              sources: util_ss.sources() + stub_ss.sources() + genh,
                              dependencies: [util_ss.dependencies(), libm, threads, glib, socket, malloc, pixman])
 qemuutil = declare_dependency(link_with: libqemuutil,
-                              sources: genh + version_res)
+                              sources: genh + version_res,
+                              dependencies: [event_loop_base])
 
 if have_system or have_user
   decodetree = generator(find_program('scripts/decodetree.py'),
diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/main-loop.h
+++ b/include/qemu/main-loop.h
@@ -XXX,XX +XXX,XX @@
 #define QEMU_MAIN_LOOP_H
 
 #include "block/aio.h"
+#include "qom/object.h"
+#include "sysemu/event-loop-base.h"
 
 #define SIG_IPI SIGUSR1
 
+#define TYPE_MAIN_LOOP  "main-loop"
+OBJECT_DECLARE_TYPE(MainLoop, MainLoopClass, MAIN_LOOP)
+
+struct MainLoop {
+    EventLoopBase parent_obj;
+};
+typedef struct MainLoop MainLoop;
+
 /**
  * qemu_init_main_loop: Set up the process so that it can run the main loop.
  *
diff --git a/include/sysemu/event-loop-base.h b/include/sysemu/event-loop-base.h
index XXXXXXX..XXXXXXX 100644
--- a/include/sysemu/event-loop-base.h
+++ b/include/sysemu/event-loop-base.h
@@ -XXX,XX +XXX,XX @@ struct EventLoopBaseClass {
 
     void (*init)(EventLoopBase *base, Error **errp);
     void (*update_params)(EventLoopBase *base, Error **errp);
+    bool (*can_be_deleted)(EventLoopBase *base);
 };
 
 struct EventLoopBase {
diff --git a/event-loop-base.c b/event-loop-base.c
index XXXXXXX..XXXXXXX 100644
--- a/event-loop-base.c
+++ b/event-loop-base.c
@@ -XXX,XX +XXX,XX @@ static void event_loop_base_complete(UserCreatable *uc, Error **errp)
     }
 }
 
+static bool event_loop_base_can_be_deleted(UserCreatable *uc)
+{
+    EventLoopBaseClass *bc = EVENT_LOOP_BASE_GET_CLASS(uc);
+    EventLoopBase *backend = EVENT_LOOP_BASE(uc);
+
+    if (bc->can_be_deleted) {
+        return bc->can_be_deleted(backend);
+    }
+
+    return true;
+}
+
 static void event_loop_base_class_init(ObjectClass *klass, void *class_data)
 {
     UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass);
     ucc->complete = event_loop_base_complete;
+    ucc->can_be_deleted = event_loop_base_can_be_deleted;
 
     object_class_property_add(klass, "aio-max-batch", "int",
                               event_loop_base_get_param,
diff --git a/util/main-loop.c b/util/main-loop.c
index XXXXXXX..XXXXXXX 100644
--- a/util/main-loop.c
+++ b/util/main-loop.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/error-report.h"
 #include "qemu/queue.h"
 #include "qemu/compiler.h"
+#include "qom/object.h"
 
 #ifndef _WIN32
 #include <sys/wait.h>
@@ -XXX,XX +XXX,XX @@ int qemu_init_main_loop(Error **errp)
     return 0;
 }
 
+static void main_loop_update_params(EventLoopBase *base, Error **errp)
+{
+    if (!qemu_aio_context) {
+        error_setg(errp, "qemu aio context not ready");
+        return;
+    }
+
+    aio_context_set_aio_params(qemu_aio_context, base->aio_max_batch, errp);
+}
+
+MainLoop *mloop;
+
+static void main_loop_init(EventLoopBase *base, Error **errp)
+{
+    MainLoop *m = MAIN_LOOP(base);
+
+    if (mloop) {
+        error_setg(errp, "only one main-loop instance allowed");
+        return;
+    }
+
+    main_loop_update_params(base, errp);
+
+    mloop = m;
+    return;
+}
+
+static bool main_loop_can_be_deleted(EventLoopBase *base)
+{
+    return false;
+}
+
+static void main_loop_class_init(ObjectClass *oc, void *class_data)
+{
+    EventLoopBaseClass *bc = EVENT_LOOP_BASE_CLASS(oc);
+
+    bc->init = main_loop_init;
+    bc->update_params = main_loop_update_params;
+    bc->can_be_deleted = main_loop_can_be_deleted;
+}
+
+static const TypeInfo main_loop_info = {
+    .name = TYPE_MAIN_LOOP,
+    .parent = TYPE_EVENT_LOOP_BASE,
+    .class_init = main_loop_class_init,
+    .instance_size = sizeof(MainLoop),
+};
+
+static void main_loop_register_types(void)
+{
+    type_register_static(&main_loop_info);
+}
+
+type_init(main_loop_register_types)
+
 static int max_priority;
 
 #ifndef _WIN32
-- 
2.35.1

From: Nicolas Saenz Julienne <nsaenzju@redhat.com>

The thread pool regulates itself: when idle, it kills threads until
empty, when in demand, it creates new threads until full. This behaviour
doesn't play well with latency sensitive workloads where the price of
creating a new thread is too high. For example, when paired with qemu's
'-mlock', or using safety features like SafeStack, creating a new thread
has been measured to take multiple milliseconds.

In order to mitigate this let's introduce a new 'EventLoopBase'
property to set the thread pool size. The threads will be created during
the pool's initialization or upon updating the property's value, remain
available during its lifetime regardless of demand, and destroyed upon
freeing it. A properly characterized workload will then be able to
configure the pool to avoid any latency spikes.

Signed-off-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Acked-by: Markus Armbruster <armbru@redhat.com>
Message-id: 20220425075723.20019-4-nsaenzju@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 qapi/qom.json                    | 10 +++++-
 include/block/aio.h              | 10 ++++++
 include/block/thread-pool.h      |  3 ++
 include/sysemu/event-loop-base.h |  4 +++
 event-loop-base.c                | 23 +++++++++++++
 iothread.c                       |  3 ++
 util/aio-posix.c                 |  1 +
 util/async.c                     | 20 ++++++++++++
 util/main-loop.c                 |  9 ++++++
 util/thread-pool.c               | 55 +++++++++++++++++++++++++++++---
 10 files changed, 133 insertions(+), 5 deletions(-)

diff --git a/qapi/qom.json b/qapi/qom.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -XXX,XX +XXX,XX @@
 #                 0 means that the engine will use its default.
 #                 (default: 0)
 #
+# @thread-pool-min: minimum number of threads reserved in the thread pool
+#                   (default:0)
+#
+# @thread-pool-max: maximum number of threads the thread pool can contain
+#                   (default:64)
+#
 # Since: 7.1
 ##
 { 'struct': 'EventLoopBaseProperties',
-  'data': { '*aio-max-batch': 'int' } }
+  'data': { '*aio-max-batch': 'int',
+            '*thread-pool-min': 'int',
+            '*thread-pool-max': 'int' } }
 
 ##
 # @IothreadProperties:
diff --git a/include/block/aio.h b/include/block/aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ struct AioContext {
     QSLIST_HEAD(, Coroutine) scheduled_coroutines;
     QEMUBH *co_schedule_bh;
 
+    int thread_pool_min;
+    int thread_pool_max;
     /* Thread pool for performing work and receiving completion callbacks.
      * Has its own locking.
      */
@@ -XXX,XX +XXX,XX @@ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
 void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch,
                                 Error **errp);
 
+/**
+ * aio_context_set_thread_pool_params:
+ * @ctx: the aio context
+ * @min: min number of threads to have readily available in the thread pool
+ * @max: max number of threads the thread pool can contain
+ */
+void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min,
+                                        int64_t max, Error **errp);
 #endif
diff --git a/include/block/thread-pool.h b/include/block/thread-pool.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/thread-pool.h
+++ b/include/block/thread-pool.h
@@ -XXX,XX +XXX,XX @@
 
 #include "block/block.h"
 
+#define THREAD_POOL_MAX_THREADS_DEFAULT         64
+
 typedef int ThreadPoolFunc(void *opaque);
 
 typedef struct ThreadPool ThreadPool;
@@ -XXX,XX +XXX,XX @@ BlockAIOCB *thread_pool_submit_aio(ThreadPool *pool,
 int coroutine_fn thread_pool_submit_co(ThreadPool *pool,
         ThreadPoolFunc *func, void *arg);
 void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg);
+void thread_pool_update_params(ThreadPool *pool, struct AioContext *ctx);
 
 #endif
diff --git a/include/sysemu/event-loop-base.h b/include/sysemu/event-loop-base.h
index XXXXXXX..XXXXXXX 100644
--- a/include/sysemu/event-loop-base.h
+++ b/include/sysemu/event-loop-base.h
@@ -XXX,XX +XXX,XX @@ struct EventLoopBase {
 
     /* AioContext AIO engine parameters */
     int64_t aio_max_batch;
+
+    /* AioContext thread pool parameters */
+    int64_t thread_pool_min;
+    int64_t thread_pool_max;
 };
 #endif
diff --git a/event-loop-base.c b/event-loop-base.c
index XXXXXXX..XXXXXXX 100644
--- a/event-loop-base.c
+++ b/event-loop-base.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "qom/object_interfaces.h"
 #include "qapi/error.h"
+#include "block/thread-pool.h"
 #include "sysemu/event-loop-base.h"
 
 typedef struct {
@@ -XXX,XX +XXX,XX @@ typedef struct {
     ptrdiff_t offset; /* field's byte offset in EventLoopBase struct */
 } EventLoopBaseParamInfo;
 
+static void event_loop_base_instance_init(Object *obj)
+{
+    EventLoopBase *base = EVENT_LOOP_BASE(obj);
+
+    base->thread_pool_max = THREAD_POOL_MAX_THREADS_DEFAULT;
+}
+
 static EventLoopBaseParamInfo aio_max_batch_info = {
     "aio-max-batch", offsetof(EventLoopBase, aio_max_batch),
 };
+static EventLoopBaseParamInfo thread_pool_min_info = {
+    "thread-pool-min", offsetof(EventLoopBase, thread_pool_min),
+};
+static EventLoopBaseParamInfo thread_pool_max_info = {
+    "thread-pool-max", offsetof(EventLoopBase, thread_pool_max),
+};
 
 static void event_loop_base_get_param(Object *obj, Visitor *v,
         const char *name, void *opaque, Error **errp)
@@ -XXX,XX +XXX,XX @@ static void event_loop_base_class_init(ObjectClass *klass, void *class_data)
                               event_loop_base_get_param,
                               event_loop_base_set_param,
                               NULL, &aio_max_batch_info);
+    object_class_property_add(klass, "thread-pool-min", "int",
+                              event_loop_base_get_param,
+                              event_loop_base_set_param,
+                              NULL, &thread_pool_min_info);
+    object_class_property_add(klass, "thread-pool-max", "int",
+                              event_loop_base_get_param,
+                              event_loop_base_set_param,
+                              NULL, &thread_pool_max_info);
 }
 
 static const TypeInfo event_loop_base_info = {
     .name = TYPE_EVENT_LOOP_BASE,
     .parent = TYPE_OBJECT,
     .instance_size = sizeof(EventLoopBase),
+    .instance_init = event_loop_base_instance_init,
     .class_size = sizeof(EventLoopBaseClass),
     .class_init = event_loop_base_class_init,
     .abstract = true,
diff --git a/iothread.c b/iothread.c
index XXXXXXX..XXXXXXX 100644
--- a/iothread.c
+++ b/iothread.c
@@ -XXX,XX +XXX,XX @@ static void iothread_set_aio_context_params(EventLoopBase *base, Error **errp)
     aio_context_set_aio_params(iothread->ctx,
                                iothread->parent_obj.aio_max_batch,
                                errp);
+
+    aio_context_set_thread_pool_params(iothread->ctx, base->thread_pool_min,
+                                       base->thread_pool_max, errp);
 }
 
 
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu/osdep.h"
 #include "block/block.h"
+#include "block/thread-pool.h"
 #include "qemu/main-loop.h"
 #include "qemu/rcu.h"
 #include "qemu/rcu_queue.h"
diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ AioContext *aio_context_new(Error **errp)
 
     ctx->aio_max_batch = 0;
 
+    ctx->thread_pool_min = 0;
+    ctx->thread_pool_max = THREAD_POOL_MAX_THREADS_DEFAULT;
+
     return ctx;
 fail:
     g_source_destroy(&ctx->source);
@@ -XXX,XX +XXX,XX @@ void qemu_set_current_aio_context(AioContext *ctx)
     assert(!get_my_aiocontext());
     set_my_aiocontext(ctx);
 }
+
+/*
+ * Set the bounds of the thread pool used by @ctx.
+ *
+ * @min: minimum number of threads the pool keeps available; must be >= 0.
+ * @max: upper bound on the number of pool threads; must be >= 1 and >= @min.
+ * Both values must fit in an int.
+ *
+ * On invalid input @errp is set and @ctx is left untouched.  Otherwise the
+ * values are stored in @ctx and, if the context already has a thread pool,
+ * that pool is updated through thread_pool_update_params().
+ */
+void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min,
+                                        int64_t max, Error **errp)
+{
+    /*
+     * min < 0 has to be rejected explicitly: a negative pair such as
+     * (-2, -1) passes every other test below.  Once min is known to be
+     * non-negative, min <= max and max != 0 together imply max >= 1.
+     */
+    if (min < 0 || min > max || !max || min > INT_MAX || max > INT_MAX) {
+        error_setg(errp, "bad thread-pool-min/thread-pool-max values");
+        return;
+    }
+
+    ctx->thread_pool_min = min;
+    ctx->thread_pool_max = max;
+
+    if (ctx->thread_pool) {
+        thread_pool_update_params(ctx->thread_pool, ctx);
+    }
+}
diff --git a/util/main-loop.c b/util/main-loop.c
index XXXXXXX..XXXXXXX 100644
--- a/util/main-loop.c
+++ b/util/main-loop.c
@@ -XXX,XX +XXX,XX @@
 #include "sysemu/replay.h"
 #include "qemu/main-loop.h"
 #include "block/aio.h"
+#include "block/thread-pool.h"
 #include "qemu/error-report.h"
 #include "qemu/queue.h"
 #include "qemu/compiler.h"
@@ -XXX,XX +XXX,XX @@ int qemu_init_main_loop(Error **errp)
 
 static void main_loop_update_params(EventLoopBase *base, Error **errp)
 {
+    ERRP_GUARD(); /* so that *errp can be inspected after the first setter */
+
     if (!qemu_aio_context) {
         error_setg(errp, "qemu aio context not ready");
         return;
     }
 
     aio_context_set_aio_params(qemu_aio_context, base->aio_max_batch, errp);
+    if (*errp) {
+        return;
+    }
+    /*
+     * Only reached when the call above reported no error: forward the
+     * thread pool bounds stored in @base to qemu_aio_context as well.
+     */
+    aio_context_set_thread_pool_params(qemu_aio_context, base->thread_pool_min,
+                                       base->thread_pool_max, errp);
 }
 
 MainLoop *mloop;
diff --git a/util/thread-pool.c b/util/thread-pool.c
index XXXXXXX..XXXXXXX 100644
--- a/util/thread-pool.c
+++ b/util/thread-pool.c
@@ -XXX,XX +XXX,XX @@ struct ThreadPool {
     QemuMutex lock;
     QemuCond worker_stopped;
     QemuSemaphore sem;
-    int max_threads;
     QEMUBH *new_thread_bh;
 
     /* The following variables are only accessed from one AioContext. */
@@ -XXX,XX +XXX,XX @@ struct ThreadPool {
     int new_threads;     /* backlog of threads we need to create */
     int pending_threads; /* threads created but not running yet */
     bool stopping;
+    int min_threads;
+    int max_threads;
 };
 
+/*
+ * Decide whether a worker whose semaphore wait returned @ret should wait
+ * again instead of leaving its wait loop.
+ */
+static inline bool back_to_sleep(ThreadPool *pool, int ret)
+{
+    /* Anything other than a timeout never sends the worker back to wait. */
+    if (ret != -1) {
+        return false;
+    }
+
+    /* Timed out, but there is work to do: we raced with the signal. */
+    if (!QTAILQ_EMPTY(&pool->request_list)) {
+        return true;
+    }
+
+    /* Timed out, but max_threads just changed: we raced with the signal. */
+    if (pool->cur_threads > pool->max_threads) {
+        return true;
+    }
+
+    /* Timed out, but the pool must keep min_threads readily available. */
+    return pool->cur_threads <= pool->min_threads;
+}
+
 static void *worker_thread(void *opaque)
 {
     ThreadPool *pool = opaque;
@@ -XXX,XX +XXX,XX @@ static void *worker_thread(void *opaque)
             ret = qemu_sem_timedwait(&pool->sem, 10000);
             qemu_mutex_lock(&pool->lock);
             pool->idle_threads--;
-        } while (ret == -1 && !QTAILQ_EMPTY(&pool->request_list));
-        if (ret == -1 || pool->stopping) {
+        } while (back_to_sleep(pool, ret));
+        if (ret == -1 || pool->stopping ||
+            pool->cur_threads > pool->max_threads) {
             break;
         }
 
@@ -XXX,XX +XXX,XX @@ void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg)
     thread_pool_submit_aio(pool, func, arg, NULL, NULL);
 }
 
+/*
+ * Copy the thread count limits from @ctx into @pool and nudge the pool
+ * towards them.  Everything here runs with pool->lock held.
+ */
+void thread_pool_update_params(ThreadPool *pool, AioContext *ctx)
+{
+    int count;
+
+    qemu_mutex_lock(&pool->lock);
+
+    pool->min_threads = ctx->thread_pool_min;
+    pool->max_threads = ctx->thread_pool_max;
+
+    /*
+     * Three cases from here:
+     *  - Fewer threads than min_threads: spawn the missing ones.
+     *  - More threads than max_threads: post the semaphore once per
+     *    excess thread.
+     *  - Anything in between: nothing to do, the pool manages itself.
+     */
+    count = pool->cur_threads;
+    while (count < pool->min_threads) {
+        spawn_thread(pool);
+        count++;
+    }
+
+    count = pool->cur_threads;
+    while (count > pool->max_threads) {
+        qemu_sem_post(&pool->sem);
+        count--;
+    }
+
+    qemu_mutex_unlock(&pool->lock);
+}
+
 static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx)
 {
     if (!ctx) {
@@ -XXX,XX +XXX,XX @@ static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx)
     qemu_mutex_init(&pool->lock);
     qemu_cond_init(&pool->worker_stopped);
     qemu_sem_init(&pool->sem, 0);
-    pool->max_threads = 64;
     pool->new_thread_bh = aio_bh_new(ctx, spawn_thread_bh_fn, pool);
 
     QLIST_INIT(&pool->head);
     QTAILQ_INIT(&pool->request_list);
+
+    thread_pool_update_params(pool, ctx);
 }
 
 ThreadPool *thread_pool_new(AioContext *ctx)
-- 
2.35.1