1 | The following changes since commit 171199f56f5f9bdf1e5d670d09ef1351d8f01bae: | 1 | The following changes since commit 9cf289af47bcfae5c75de37d8e5d6fd23705322c: |
---|---|---|---|
2 | 2 | ||
3 | Merge remote-tracking branch 'remotes/alistair/tags/pull-riscv-to-apply-20200619-3' into staging (2020-06-22 14:45:25 +0100) | 3 | Merge tag 'qga-pull-request' of gitlab.com:marcandre.lureau/qemu into staging (2022-05-04 03:42:49 -0700) |
4 | 4 | ||
5 | are available in the Git repository at: | 5 | are available in the Git repository at: |
6 | 6 | ||
7 | https://github.com/stefanha/qemu.git tags/block-pull-request | 7 | https://gitlab.com/stefanha/qemu.git tags/block-pull-request |
8 | 8 | ||
9 | for you to fetch changes up to 7838c67f22a81fcf669785cd6c0876438422071a: | 9 | for you to fetch changes up to bef2e050d6a7feb865854c65570c496ac5a8cf53: |
10 | 10 | ||
11 | block/nvme: support nested aio_poll() (2020-06-23 15:46:08 +0100) | 11 | util/event-loop-base: Introduce options to set the thread pool size (2022-05-04 17:02:19 +0100) |
12 | 12 | ||
13 | ---------------------------------------------------------------- | 13 | ---------------------------------------------------------------- |
14 | Pull request | 14 | Pull request |
15 | 15 | ||
16 | Add new thread-pool-min/thread-pool-max parameters to control the thread pool | ||
17 | used for async I/O. | ||
18 | |||
16 | ---------------------------------------------------------------- | 19 | ---------------------------------------------------------------- |
17 | 20 | ||
18 | Daniele Buono (4): | 21 | Nicolas Saenz Julienne (3): |
19 | coroutine: support SafeStack in ucontext backend | 22 | Introduce event-loop-base abstract class |
20 | coroutine: add check for SafeStack in sigaltstack | 23 | util/main-loop: Introduce the main loop into QOM |
21 | configure: add flags to support SafeStack | 24 | util/event-loop-base: Introduce options to set the thread pool size |
22 | check-block: enable iotests with SafeStack | ||
23 | 25 | ||
24 | Stefan Hajnoczi (8): | 26 | qapi/qom.json | 43 ++++++++-- |
25 | minikconf: explicitly set encoding to UTF-8 | 27 | meson.build | 26 +++--- |
26 | block/nvme: poll queues without q->lock | 28 | include/block/aio.h | 10 +++ |
27 | block/nvme: drop tautologous assertion | 29 | include/block/thread-pool.h | 3 + |
28 | block/nvme: don't access CQE after moving cq.head | 30 | include/qemu/main-loop.h | 10 +++ |
29 | block/nvme: switch to a NVMeRequest freelist | 31 | include/sysemu/event-loop-base.h | 41 +++++++++ |
30 | block/nvme: clarify that free_req_queue is protected by q->lock | 32 | include/sysemu/iothread.h | 6 +- |
31 | block/nvme: keep BDRVNVMeState pointer in NVMeQueuePair | 33 | event-loop-base.c | 140 +++++++++++++++++++++++++++++++ |
32 | block/nvme: support nested aio_poll() | 34 | iothread.c | 68 +++++---------- |
33 | 35 | util/aio-posix.c | 1 + | |
34 | configure | 73 ++++++++++++ | 36 | util/async.c | 20 +++++ |
35 | include/qemu/coroutine_int.h | 5 + | 37 | util/main-loop.c | 65 ++++++++++++++ |
36 | block/nvme.c | 220 +++++++++++++++++++++++++---------- | 38 | util/thread-pool.c | 55 +++++++++++- |
37 | util/coroutine-sigaltstack.c | 4 + | 39 | 13 files changed, 419 insertions(+), 69 deletions(-) |
38 | util/coroutine-ucontext.c | 28 +++++ | 40 | create mode 100644 include/sysemu/event-loop-base.h |
39 | block/trace-events | 2 +- | 41 | create mode 100644 event-loop-base.c |
40 | scripts/minikconf.py | 6 +- | ||
41 | tests/check-block.sh | 12 +- | ||
42 | 8 files changed, 284 insertions(+), 66 deletions(-) | ||
43 | 42 | ||
44 | -- | 43 | -- |
45 | 2.26.2 | 44 | 2.35.1 |
46 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | QEMU currently only has ASCII Kconfig files but Linux actually uses | ||
2 | UTF-8. Explicitly specify the encoding and that we're doing text file | ||
3 | I/O. | ||
4 | 1 | ||
5 | It's unclear whether or not QEMU will ever need Unicode in its Kconfig | ||
6 | files. If we start using the help text then it will become an issue | ||
7 | sooner or later. Make this change now for consistency with Linux | ||
8 | Kconfig. | ||
9 | |||
10 | Reported-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
11 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
12 | Reviewed-by: Richard Henderson <richard.henderson@linaro.org> | ||
13 | Message-id: 20200521153616.307100-1-stefanha@redhat.com | ||
14 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
15 | --- | ||
16 | scripts/minikconf.py | 6 +++--- | ||
17 | 1 file changed, 3 insertions(+), 3 deletions(-) | ||
18 | |||
19 | diff --git a/scripts/minikconf.py b/scripts/minikconf.py | ||
20 | index XXXXXXX..XXXXXXX 100755 | ||
21 | --- a/scripts/minikconf.py | ||
22 | +++ b/scripts/minikconf.py | ||
23 | @@ -XXX,XX +XXX,XX @@ class KconfigParser: | ||
24 | if incl_abs_fname in self.data.previously_included: | ||
25 | return | ||
26 | try: | ||
27 | - fp = open(incl_abs_fname, 'r') | ||
28 | + fp = open(incl_abs_fname, 'rt', encoding='utf-8') | ||
29 | except IOError as e: | ||
30 | raise KconfigParserError(self, | ||
31 | '%s: %s' % (e.strerror, include)) | ||
32 | @@ -XXX,XX +XXX,XX @@ if __name__ == '__main__': | ||
33 | parser.do_assignment(name, value == 'y') | ||
34 | external_vars.add(name[7:]) | ||
35 | else: | ||
36 | - fp = open(arg, 'r') | ||
37 | + fp = open(arg, 'rt', encoding='utf-8') | ||
38 | parser.parse_file(fp) | ||
39 | fp.close() | ||
40 | |||
41 | @@ -XXX,XX +XXX,XX @@ if __name__ == '__main__': | ||
42 | if key not in external_vars and config[key]: | ||
43 | print ('CONFIG_%s=y' % key) | ||
44 | |||
45 | - deps = open(argv[2], 'w') | ||
46 | + deps = open(argv[2], 'wt', encoding='utf-8') | ||
47 | for fname in data.previously_included: | ||
48 | print ('%s: %s' % (argv[1], fname), file=deps) | ||
49 | deps.close() | ||
50 | -- | ||
51 | 2.26.2 | ||
52 | diff view generated by jsdifflib |
1 | From: Daniele Buono <dbuono@linux.vnet.ibm.com> | 1 | From: Nicolas Saenz Julienne <nsaenzju@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | LLVM's SafeStack instrumentation does not yet support programs that make | 3 | Introduce the 'event-loop-base' abstract class, it'll hold the |
4 | use of the APIs in ucontext.h | 4 | properties common to all event loops and provide the necessary hooks for |
5 | With the current implementation of coroutine-ucontext, the resulting | 5 | their creation and maintenance. Then have iothread inherit from it. |
6 | binary is incorrect, with different coroutines sharing the same unsafe | 6 | |
7 | stack and producing undefined behavior at runtime. | 7 | EventLoopBaseClass is defined as user creatable and provides a hook for |
8 | This fix allocates an additional unsafe stack area for each coroutine, | 8 | its children to attach themselves to the user creatable class 'complete' |
9 | and sets the new unsafe stack pointer before calling swapcontext() in | 9 | function. It also provides an update_params() callback to propagate |
10 | qemu_coroutine_new. | 10 | property changes onto its children. |
11 | This is the only place where the pointer needs to be manually updated, | 11 | |
12 | since sigsetjmp/siglongjmp are already instrumented by LLVM to properly | 12 | The new 'event-loop-base' class will live in the root directory. It is |
13 | support SafeStack. | 13 | built on its own using the 'link_whole' option (there are no direct |
14 | The additional stack is then freed in qemu_coroutine_delete. | 14 | function dependencies between the class and its children, it all happens |
15 | 15 | trough 'constructor' magic). And also imposes new compilation | |
16 | Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com> | 16 | dependencies: |
17 | Message-id: 20200529205122.714-2-dbuono@linux.vnet.ibm.com | 17 | |
18 | qom <- event-loop-base <- blockdev (iothread.c) | ||
19 | |||
20 | And in subsequent patches: | ||
21 | |||
22 | qom <- event-loop-base <- qemuutil (util/main-loop.c) | ||
23 | |||
24 | All this forced some amount of reordering in meson.build: | ||
25 | |||
26 | - Moved qom build definition before qemuutil. Doing it the other way | ||
27 | around (i.e. moving qemuutil after qom) isn't possible as a lot of | ||
28 | core libraries that live in between the two depend on it. | ||
29 | |||
30 | - Process the 'hw' subdir earlier, as it introduces files into the | ||
31 | 'qom' source set. | ||
32 | |||
33 | No functional changes intended. | ||
34 | |||
35 | Signed-off-by: Nicolas Saenz Julienne <nsaenzju@redhat.com> | ||
36 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
37 | Acked-by: Markus Armbruster <armbru@redhat.com> | ||
38 | Message-id: 20220425075723.20019-2-nsaenzju@redhat.com | ||
18 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 39 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
19 | --- | 40 | --- |
20 | include/qemu/coroutine_int.h | 5 +++++ | 41 | qapi/qom.json | 22 +++++-- |
21 | util/coroutine-ucontext.c | 28 ++++++++++++++++++++++++++++ | 42 | meson.build | 23 ++++--- |
22 | 2 files changed, 33 insertions(+) | 43 | include/sysemu/event-loop-base.h | 36 +++++++++++ |
23 | 44 | include/sysemu/iothread.h | 6 +- | |
24 | diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h | 45 | event-loop-base.c | 104 +++++++++++++++++++++++++++++++ |
46 | iothread.c | 65 ++++++------------- | ||
47 | 6 files changed, 192 insertions(+), 64 deletions(-) | ||
48 | create mode 100644 include/sysemu/event-loop-base.h | ||
49 | create mode 100644 event-loop-base.c | ||
50 | |||
51 | diff --git a/qapi/qom.json b/qapi/qom.json | ||
25 | index XXXXXXX..XXXXXXX 100644 | 52 | index XXXXXXX..XXXXXXX 100644 |
26 | --- a/include/qemu/coroutine_int.h | 53 | --- a/qapi/qom.json |
27 | +++ b/include/qemu/coroutine_int.h | 54 | +++ b/qapi/qom.json |
28 | @@ -XXX,XX +XXX,XX @@ | 55 | @@ -XXX,XX +XXX,XX @@ |
29 | #include "qemu/queue.h" | 56 | '*repeat': 'bool', |
30 | #include "qemu/coroutine.h" | 57 | '*grab-toggle': 'GrabToggleKeys' } } |
31 | 58 | ||
32 | +#ifdef CONFIG_SAFESTACK | 59 | +## |
33 | +/* Pointer to the unsafe stack, defined by the compiler */ | 60 | +# @EventLoopBaseProperties: |
34 | +extern __thread void *__safestack_unsafe_stack_ptr; | 61 | +# |
62 | +# Common properties for event loops | ||
63 | +# | ||
64 | +# @aio-max-batch: maximum number of requests in a batch for the AIO engine, | ||
65 | +# 0 means that the engine will use its default. | ||
66 | +# (default: 0) | ||
67 | +# | ||
68 | +# Since: 7.1 | ||
69 | +## | ||
70 | +{ 'struct': 'EventLoopBaseProperties', | ||
71 | + 'data': { '*aio-max-batch': 'int' } } | ||
72 | + | ||
73 | ## | ||
74 | # @IothreadProperties: | ||
75 | # | ||
76 | @@ -XXX,XX +XXX,XX @@ | ||
77 | # algorithm detects it is spending too long polling without | ||
78 | # encountering events. 0 selects a default behaviour (default: 0) | ||
79 | # | ||
80 | -# @aio-max-batch: maximum number of requests in a batch for the AIO engine, | ||
81 | -# 0 means that the engine will use its default | ||
82 | -# (default:0, since 6.1) | ||
83 | +# The @aio-max-batch option is available since 6.1. | ||
84 | # | ||
85 | # Since: 2.0 | ||
86 | ## | ||
87 | { 'struct': 'IothreadProperties', | ||
88 | + 'base': 'EventLoopBaseProperties', | ||
89 | 'data': { '*poll-max-ns': 'int', | ||
90 | '*poll-grow': 'int', | ||
91 | - '*poll-shrink': 'int', | ||
92 | - '*aio-max-batch': 'int' } } | ||
93 | + '*poll-shrink': 'int' } } | ||
94 | |||
95 | ## | ||
96 | # @MemoryBackendProperties: | ||
97 | diff --git a/meson.build b/meson.build | ||
98 | index XXXXXXX..XXXXXXX 100644 | ||
99 | --- a/meson.build | ||
100 | +++ b/meson.build | ||
101 | @@ -XXX,XX +XXX,XX @@ subdir('qom') | ||
102 | subdir('authz') | ||
103 | subdir('crypto') | ||
104 | subdir('ui') | ||
105 | +subdir('hw') | ||
106 | |||
107 | |||
108 | if enable_modules | ||
109 | @@ -XXX,XX +XXX,XX @@ if enable_modules | ||
110 | modulecommon = declare_dependency(link_whole: libmodulecommon, compile_args: '-DBUILD_DSO') | ||
111 | endif | ||
112 | |||
113 | +qom_ss = qom_ss.apply(config_host, strict: false) | ||
114 | +libqom = static_library('qom', qom_ss.sources() + genh, | ||
115 | + dependencies: [qom_ss.dependencies()], | ||
116 | + name_suffix: 'fa') | ||
117 | +qom = declare_dependency(link_whole: libqom) | ||
118 | + | ||
119 | +event_loop_base = files('event-loop-base.c') | ||
120 | +event_loop_base = static_library('event-loop-base', sources: event_loop_base + genh, | ||
121 | + build_by_default: true) | ||
122 | +event_loop_base = declare_dependency(link_whole: event_loop_base, | ||
123 | + dependencies: [qom]) | ||
124 | + | ||
125 | stub_ss = stub_ss.apply(config_all, strict: false) | ||
126 | |||
127 | util_ss.add_all(trace_ss) | ||
128 | @@ -XXX,XX +XXX,XX @@ subdir('monitor') | ||
129 | subdir('net') | ||
130 | subdir('replay') | ||
131 | subdir('semihosting') | ||
132 | -subdir('hw') | ||
133 | subdir('tcg') | ||
134 | subdir('fpu') | ||
135 | subdir('accel') | ||
136 | @@ -XXX,XX +XXX,XX @@ qemu_syms = custom_target('qemu.syms', output: 'qemu.syms', | ||
137 | capture: true, | ||
138 | command: [undefsym, nm, '@INPUT@']) | ||
139 | |||
140 | -qom_ss = qom_ss.apply(config_host, strict: false) | ||
141 | -libqom = static_library('qom', qom_ss.sources() + genh, | ||
142 | - dependencies: [qom_ss.dependencies()], | ||
143 | - name_suffix: 'fa') | ||
144 | - | ||
145 | -qom = declare_dependency(link_whole: libqom) | ||
146 | - | ||
147 | authz_ss = authz_ss.apply(config_host, strict: false) | ||
148 | libauthz = static_library('authz', authz_ss.sources() + genh, | ||
149 | dependencies: [authz_ss.dependencies()], | ||
150 | @@ -XXX,XX +XXX,XX @@ libblockdev = static_library('blockdev', blockdev_ss.sources() + genh, | ||
151 | build_by_default: false) | ||
152 | |||
153 | blockdev = declare_dependency(link_whole: [libblockdev], | ||
154 | - dependencies: [block]) | ||
155 | + dependencies: [block, event_loop_base]) | ||
156 | |||
157 | qmp_ss = qmp_ss.apply(config_host, strict: false) | ||
158 | libqmp = static_library('qmp', qmp_ss.sources() + genh, | ||
159 | diff --git a/include/sysemu/event-loop-base.h b/include/sysemu/event-loop-base.h | ||
160 | new file mode 100644 | ||
161 | index XXXXXXX..XXXXXXX | ||
162 | --- /dev/null | ||
163 | +++ b/include/sysemu/event-loop-base.h | ||
164 | @@ -XXX,XX +XXX,XX @@ | ||
165 | +/* | ||
166 | + * QEMU event-loop backend | ||
167 | + * | ||
168 | + * Copyright (C) 2022 Red Hat Inc | ||
169 | + * | ||
170 | + * Authors: | ||
171 | + * Nicolas Saenz Julienne <nsaenzju@redhat.com> | ||
172 | + * | ||
173 | + * This work is licensed under the terms of the GNU GPL, version 2 or later. | ||
174 | + * See the COPYING file in the top-level directory. | ||
175 | + */ | ||
176 | +#ifndef QEMU_EVENT_LOOP_BASE_H | ||
177 | +#define QEMU_EVENT_LOOP_BASE_H | ||
178 | + | ||
179 | +#include "qom/object.h" | ||
180 | +#include "block/aio.h" | ||
181 | +#include "qemu/typedefs.h" | ||
182 | + | ||
183 | +#define TYPE_EVENT_LOOP_BASE "event-loop-base" | ||
184 | +OBJECT_DECLARE_TYPE(EventLoopBase, EventLoopBaseClass, | ||
185 | + EVENT_LOOP_BASE) | ||
186 | + | ||
187 | +struct EventLoopBaseClass { | ||
188 | + ObjectClass parent_class; | ||
189 | + | ||
190 | + void (*init)(EventLoopBase *base, Error **errp); | ||
191 | + void (*update_params)(EventLoopBase *base, Error **errp); | ||
192 | +}; | ||
193 | + | ||
194 | +struct EventLoopBase { | ||
195 | + Object parent; | ||
196 | + | ||
197 | + /* AioContext AIO engine parameters */ | ||
198 | + int64_t aio_max_batch; | ||
199 | +}; | ||
35 | +#endif | 200 | +#endif |
36 | + | 201 | diff --git a/include/sysemu/iothread.h b/include/sysemu/iothread.h |
37 | #define COROUTINE_STACK_SIZE (1 << 20) | ||
38 | |||
39 | typedef enum { | ||
40 | diff --git a/util/coroutine-ucontext.c b/util/coroutine-ucontext.c | ||
41 | index XXXXXXX..XXXXXXX 100644 | 202 | index XXXXXXX..XXXXXXX 100644 |
42 | --- a/util/coroutine-ucontext.c | 203 | --- a/include/sysemu/iothread.h |
43 | +++ b/util/coroutine-ucontext.c | 204 | +++ b/include/sysemu/iothread.h |
44 | @@ -XXX,XX +XXX,XX @@ typedef struct { | 205 | @@ -XXX,XX +XXX,XX @@ |
45 | Coroutine base; | 206 | #include "block/aio.h" |
46 | void *stack; | 207 | #include "qemu/thread.h" |
47 | size_t stack_size; | 208 | #include "qom/object.h" |
48 | +#ifdef CONFIG_SAFESTACK | 209 | +#include "sysemu/event-loop-base.h" |
49 | + /* Need an unsafe stack for each coroutine */ | 210 | |
50 | + void *unsafe_stack; | 211 | #define TYPE_IOTHREAD "iothread" |
51 | + size_t unsafe_stack_size; | 212 | |
52 | +#endif | 213 | struct IOThread { |
53 | sigjmp_buf env; | 214 | - Object parent_obj; |
54 | 215 | + EventLoopBase parent_obj; | |
55 | void *tsan_co_fiber; | 216 | |
56 | @@ -XXX,XX +XXX,XX @@ Coroutine *qemu_coroutine_new(void) | 217 | QemuThread thread; |
57 | co = g_malloc0(sizeof(*co)); | 218 | AioContext *ctx; |
58 | co->stack_size = COROUTINE_STACK_SIZE; | 219 | @@ -XXX,XX +XXX,XX @@ struct IOThread { |
59 | co->stack = qemu_alloc_stack(&co->stack_size); | 220 | int64_t poll_max_ns; |
60 | +#ifdef CONFIG_SAFESTACK | 221 | int64_t poll_grow; |
61 | + co->unsafe_stack_size = COROUTINE_STACK_SIZE; | 222 | int64_t poll_shrink; |
62 | + co->unsafe_stack = qemu_alloc_stack(&co->unsafe_stack_size); | 223 | - |
63 | +#endif | 224 | - /* AioContext AIO engine parameters */ |
64 | co->base.entry_arg = &old_env; /* stash away our jmp_buf */ | 225 | - int64_t aio_max_batch; |
65 | 226 | }; | |
66 | uc.uc_link = &old_uc; | 227 | typedef struct IOThread IOThread; |
67 | @@ -XXX,XX +XXX,XX @@ Coroutine *qemu_coroutine_new(void) | 228 | |
68 | COROUTINE_YIELD, | 229 | diff --git a/event-loop-base.c b/event-loop-base.c |
69 | &fake_stack_save, | 230 | new file mode 100644 |
70 | co->stack, co->stack_size, co->tsan_co_fiber); | 231 | index XXXXXXX..XXXXXXX |
71 | + | 232 | --- /dev/null |
72 | +#ifdef CONFIG_SAFESTACK | 233 | +++ b/event-loop-base.c |
73 | + /* | 234 | @@ -XXX,XX +XXX,XX @@ |
74 | + * Before we swap the context, set the new unsafe stack | 235 | +/* |
75 | + * The unsafe stack grows just like the normal stack, so start from | 236 | + * QEMU event-loop base |
76 | + * the last usable location of the memory area. | 237 | + * |
77 | + * NOTE: we don't have to re-set the usp afterwards because we are | 238 | + * Copyright (C) 2022 Red Hat Inc |
78 | + * coming back to this context through a siglongjmp. | 239 | + * |
79 | + * The compiler already wrapped the corresponding sigsetjmp call with | 240 | + * Authors: |
80 | + * code that saves the usp on the (safe) stack before the call, and | 241 | + * Stefan Hajnoczi <stefanha@redhat.com> |
81 | + * restores it right after (which is where we return with siglongjmp). | 242 | + * Nicolas Saenz Julienne <nsaenzju@redhat.com> |
82 | + */ | 243 | + * |
83 | + void *usp = co->unsafe_stack + co->unsafe_stack_size; | 244 | + * This work is licensed under the terms of the GNU GPL, version 2 or later. |
84 | + __safestack_unsafe_stack_ptr = usp; | 245 | + * See the COPYING file in the top-level directory. |
85 | +#endif | 246 | + */ |
86 | + | 247 | + |
87 | swapcontext(&old_uc, &uc); | 248 | +#include "qemu/osdep.h" |
249 | +#include "qom/object_interfaces.h" | ||
250 | +#include "qapi/error.h" | ||
251 | +#include "sysemu/event-loop-base.h" | ||
252 | + | ||
253 | +typedef struct { | ||
254 | + const char *name; | ||
255 | + ptrdiff_t offset; /* field's byte offset in EventLoopBase struct */ | ||
256 | +} EventLoopBaseParamInfo; | ||
257 | + | ||
258 | +static EventLoopBaseParamInfo aio_max_batch_info = { | ||
259 | + "aio-max-batch", offsetof(EventLoopBase, aio_max_batch), | ||
260 | +}; | ||
261 | + | ||
262 | +static void event_loop_base_get_param(Object *obj, Visitor *v, | ||
263 | + const char *name, void *opaque, Error **errp) | ||
264 | +{ | ||
265 | + EventLoopBase *event_loop_base = EVENT_LOOP_BASE(obj); | ||
266 | + EventLoopBaseParamInfo *info = opaque; | ||
267 | + int64_t *field = (void *)event_loop_base + info->offset; | ||
268 | + | ||
269 | + visit_type_int64(v, name, field, errp); | ||
270 | +} | ||
271 | + | ||
272 | +static void event_loop_base_set_param(Object *obj, Visitor *v, | ||
273 | + const char *name, void *opaque, Error **errp) | ||
274 | +{ | ||
275 | + EventLoopBaseClass *bc = EVENT_LOOP_BASE_GET_CLASS(obj); | ||
276 | + EventLoopBase *base = EVENT_LOOP_BASE(obj); | ||
277 | + EventLoopBaseParamInfo *info = opaque; | ||
278 | + int64_t *field = (void *)base + info->offset; | ||
279 | + int64_t value; | ||
280 | + | ||
281 | + if (!visit_type_int64(v, name, &value, errp)) { | ||
282 | + return; | ||
283 | + } | ||
284 | + | ||
285 | + if (value < 0) { | ||
286 | + error_setg(errp, "%s value must be in range [0, %" PRId64 "]", | ||
287 | + info->name, INT64_MAX); | ||
288 | + return; | ||
289 | + } | ||
290 | + | ||
291 | + *field = value; | ||
292 | + | ||
293 | + if (bc->update_params) { | ||
294 | + bc->update_params(base, errp); | ||
295 | + } | ||
296 | + | ||
297 | + return; | ||
298 | +} | ||
299 | + | ||
300 | +static void event_loop_base_complete(UserCreatable *uc, Error **errp) | ||
301 | +{ | ||
302 | + EventLoopBaseClass *bc = EVENT_LOOP_BASE_GET_CLASS(uc); | ||
303 | + EventLoopBase *base = EVENT_LOOP_BASE(uc); | ||
304 | + | ||
305 | + if (bc->init) { | ||
306 | + bc->init(base, errp); | ||
307 | + } | ||
308 | +} | ||
309 | + | ||
310 | +static void event_loop_base_class_init(ObjectClass *klass, void *class_data) | ||
311 | +{ | ||
312 | + UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass); | ||
313 | + ucc->complete = event_loop_base_complete; | ||
314 | + | ||
315 | + object_class_property_add(klass, "aio-max-batch", "int", | ||
316 | + event_loop_base_get_param, | ||
317 | + event_loop_base_set_param, | ||
318 | + NULL, &aio_max_batch_info); | ||
319 | +} | ||
320 | + | ||
321 | +static const TypeInfo event_loop_base_info = { | ||
322 | + .name = TYPE_EVENT_LOOP_BASE, | ||
323 | + .parent = TYPE_OBJECT, | ||
324 | + .instance_size = sizeof(EventLoopBase), | ||
325 | + .class_size = sizeof(EventLoopBaseClass), | ||
326 | + .class_init = event_loop_base_class_init, | ||
327 | + .abstract = true, | ||
328 | + .interfaces = (InterfaceInfo[]) { | ||
329 | + { TYPE_USER_CREATABLE }, | ||
330 | + { } | ||
331 | + } | ||
332 | +}; | ||
333 | + | ||
334 | +static void register_types(void) | ||
335 | +{ | ||
336 | + type_register_static(&event_loop_base_info); | ||
337 | +} | ||
338 | +type_init(register_types); | ||
339 | diff --git a/iothread.c b/iothread.c | ||
340 | index XXXXXXX..XXXXXXX 100644 | ||
341 | --- a/iothread.c | ||
342 | +++ b/iothread.c | ||
343 | @@ -XXX,XX +XXX,XX @@ | ||
344 | #include "qemu/module.h" | ||
345 | #include "block/aio.h" | ||
346 | #include "block/block.h" | ||
347 | +#include "sysemu/event-loop-base.h" | ||
348 | #include "sysemu/iothread.h" | ||
349 | #include "qapi/error.h" | ||
350 | #include "qapi/qapi-commands-misc.h" | ||
351 | @@ -XXX,XX +XXX,XX @@ static void iothread_init_gcontext(IOThread *iothread) | ||
352 | iothread->main_loop = g_main_loop_new(iothread->worker_context, TRUE); | ||
353 | } | ||
354 | |||
355 | -static void iothread_set_aio_context_params(IOThread *iothread, Error **errp) | ||
356 | +static void iothread_set_aio_context_params(EventLoopBase *base, Error **errp) | ||
357 | { | ||
358 | + IOThread *iothread = IOTHREAD(base); | ||
359 | ERRP_GUARD(); | ||
360 | |||
361 | + if (!iothread->ctx) { | ||
362 | + return; | ||
363 | + } | ||
364 | + | ||
365 | aio_context_set_poll_params(iothread->ctx, | ||
366 | iothread->poll_max_ns, | ||
367 | iothread->poll_grow, | ||
368 | @@ -XXX,XX +XXX,XX @@ static void iothread_set_aio_context_params(IOThread *iothread, Error **errp) | ||
88 | } | 369 | } |
89 | 370 | ||
90 | @@ -XXX,XX +XXX,XX @@ void qemu_coroutine_delete(Coroutine *co_) | 371 | aio_context_set_aio_params(iothread->ctx, |
91 | #endif | 372 | - iothread->aio_max_batch, |
92 | 373 | + iothread->parent_obj.aio_max_batch, | |
93 | qemu_free_stack(co->stack, co->stack_size); | 374 | errp); |
94 | +#ifdef CONFIG_SAFESTACK | ||
95 | + qemu_free_stack(co->unsafe_stack, co->unsafe_stack_size); | ||
96 | +#endif | ||
97 | g_free(co); | ||
98 | } | 375 | } |
99 | 376 | ||
377 | -static void iothread_complete(UserCreatable *obj, Error **errp) | ||
378 | + | ||
379 | +static void iothread_init(EventLoopBase *base, Error **errp) | ||
380 | { | ||
381 | Error *local_error = NULL; | ||
382 | - IOThread *iothread = IOTHREAD(obj); | ||
383 | + IOThread *iothread = IOTHREAD(base); | ||
384 | char *thread_name; | ||
385 | |||
386 | iothread->stopping = false; | ||
387 | @@ -XXX,XX +XXX,XX @@ static void iothread_complete(UserCreatable *obj, Error **errp) | ||
388 | */ | ||
389 | iothread_init_gcontext(iothread); | ||
390 | |||
391 | - iothread_set_aio_context_params(iothread, &local_error); | ||
392 | + iothread_set_aio_context_params(base, &local_error); | ||
393 | if (local_error) { | ||
394 | error_propagate(errp, local_error); | ||
395 | aio_context_unref(iothread->ctx); | ||
396 | @@ -XXX,XX +XXX,XX @@ static void iothread_complete(UserCreatable *obj, Error **errp) | ||
397 | * to inherit. | ||
398 | */ | ||
399 | thread_name = g_strdup_printf("IO %s", | ||
400 | - object_get_canonical_path_component(OBJECT(obj))); | ||
401 | + object_get_canonical_path_component(OBJECT(base))); | ||
402 | qemu_thread_create(&iothread->thread, thread_name, iothread_run, | ||
403 | iothread, QEMU_THREAD_JOINABLE); | ||
404 | g_free(thread_name); | ||
405 | @@ -XXX,XX +XXX,XX @@ static IOThreadParamInfo poll_grow_info = { | ||
406 | static IOThreadParamInfo poll_shrink_info = { | ||
407 | "poll-shrink", offsetof(IOThread, poll_shrink), | ||
408 | }; | ||
409 | -static IOThreadParamInfo aio_max_batch_info = { | ||
410 | - "aio-max-batch", offsetof(IOThread, aio_max_batch), | ||
411 | -}; | ||
412 | |||
413 | static void iothread_get_param(Object *obj, Visitor *v, | ||
414 | const char *name, IOThreadParamInfo *info, Error **errp) | ||
415 | @@ -XXX,XX +XXX,XX @@ static void iothread_set_poll_param(Object *obj, Visitor *v, | ||
416 | } | ||
417 | } | ||
418 | |||
419 | -static void iothread_get_aio_param(Object *obj, Visitor *v, | ||
420 | - const char *name, void *opaque, Error **errp) | ||
421 | -{ | ||
422 | - IOThreadParamInfo *info = opaque; | ||
423 | - | ||
424 | - iothread_get_param(obj, v, name, info, errp); | ||
425 | -} | ||
426 | - | ||
427 | -static void iothread_set_aio_param(Object *obj, Visitor *v, | ||
428 | - const char *name, void *opaque, Error **errp) | ||
429 | -{ | ||
430 | - IOThread *iothread = IOTHREAD(obj); | ||
431 | - IOThreadParamInfo *info = opaque; | ||
432 | - | ||
433 | - if (!iothread_set_param(obj, v, name, info, errp)) { | ||
434 | - return; | ||
435 | - } | ||
436 | - | ||
437 | - if (iothread->ctx) { | ||
438 | - aio_context_set_aio_params(iothread->ctx, | ||
439 | - iothread->aio_max_batch, | ||
440 | - errp); | ||
441 | - } | ||
442 | -} | ||
443 | - | ||
444 | static void iothread_class_init(ObjectClass *klass, void *class_data) | ||
445 | { | ||
446 | - UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass); | ||
447 | - ucc->complete = iothread_complete; | ||
448 | + EventLoopBaseClass *bc = EVENT_LOOP_BASE_CLASS(klass); | ||
449 | + | ||
450 | + bc->init = iothread_init; | ||
451 | + bc->update_params = iothread_set_aio_context_params; | ||
452 | |||
453 | object_class_property_add(klass, "poll-max-ns", "int", | ||
454 | iothread_get_poll_param, | ||
455 | @@ -XXX,XX +XXX,XX @@ static void iothread_class_init(ObjectClass *klass, void *class_data) | ||
456 | iothread_get_poll_param, | ||
457 | iothread_set_poll_param, | ||
458 | NULL, &poll_shrink_info); | ||
459 | - object_class_property_add(klass, "aio-max-batch", "int", | ||
460 | - iothread_get_aio_param, | ||
461 | - iothread_set_aio_param, | ||
462 | - NULL, &aio_max_batch_info); | ||
463 | } | ||
464 | |||
465 | static const TypeInfo iothread_info = { | ||
466 | .name = TYPE_IOTHREAD, | ||
467 | - .parent = TYPE_OBJECT, | ||
468 | + .parent = TYPE_EVENT_LOOP_BASE, | ||
469 | .class_init = iothread_class_init, | ||
470 | .instance_size = sizeof(IOThread), | ||
471 | .instance_init = iothread_instance_init, | ||
472 | .instance_finalize = iothread_instance_finalize, | ||
473 | - .interfaces = (InterfaceInfo[]) { | ||
474 | - {TYPE_USER_CREATABLE}, | ||
475 | - {} | ||
476 | - }, | ||
477 | }; | ||
478 | |||
479 | static void iothread_register_types(void) | ||
480 | @@ -XXX,XX +XXX,XX @@ static int query_one_iothread(Object *object, void *opaque) | ||
481 | info->poll_max_ns = iothread->poll_max_ns; | ||
482 | info->poll_grow = iothread->poll_grow; | ||
483 | info->poll_shrink = iothread->poll_shrink; | ||
484 | - info->aio_max_batch = iothread->aio_max_batch; | ||
485 | + info->aio_max_batch = iothread->parent_obj.aio_max_batch; | ||
486 | |||
487 | QAPI_LIST_APPEND(*tail, info); | ||
488 | return 0; | ||
100 | -- | 489 | -- |
101 | 2.26.2 | 490 | 2.35.1 |
102 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Daniele Buono <dbuono@linux.vnet.ibm.com> | ||
2 | 1 | ||
3 | Current implementation of LLVM's SafeStack is not compatible with | ||
4 | code that uses an alternate stack created with sigaltstack(). | ||
5 | Since coroutine-sigaltstack relies on sigaltstack(), it is not | ||
6 | compatible with SafeStack. The resulting binary is incorrect, with | ||
7 | different coroutines sharing the same unsafe stack and producing | ||
8 | undefined behavior at runtime. | ||
9 | |||
10 | In the future LLVM may provide a SafeStack implementation compatible with | ||
11 | sigaltstack(). In the meantime, if SafeStack is desired, the coroutine | ||
12 | implementation from coroutine-ucontext should be used. | ||
13 | As a safety check, add a control in coroutine-sigaltstack to throw a | ||
14 | preprocessor #error if SafeStack is enabled and we are trying to | ||
15 | use coroutine-sigaltstack to implement coroutines. | ||
16 | |||
17 | Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com> | ||
18 | Message-id: 20200529205122.714-3-dbuono@linux.vnet.ibm.com | ||
19 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
20 | --- | ||
21 | util/coroutine-sigaltstack.c | 4 ++++ | ||
22 | 1 file changed, 4 insertions(+) | ||
23 | |||
24 | diff --git a/util/coroutine-sigaltstack.c b/util/coroutine-sigaltstack.c | ||
25 | index XXXXXXX..XXXXXXX 100644 | ||
26 | --- a/util/coroutine-sigaltstack.c | ||
27 | +++ b/util/coroutine-sigaltstack.c | ||
28 | @@ -XXX,XX +XXX,XX @@ | ||
29 | #include "qemu-common.h" | ||
30 | #include "qemu/coroutine_int.h" | ||
31 | |||
32 | +#ifdef CONFIG_SAFESTACK | ||
33 | +#error "SafeStack is not compatible with code run in alternate signal stacks" | ||
34 | +#endif | ||
35 | + | ||
36 | typedef struct { | ||
37 | Coroutine base; | ||
38 | void *stack; | ||
39 | -- | ||
40 | 2.26.2 | ||
41 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Daniele Buono <dbuono@linux.vnet.ibm.com> | ||
2 | 1 | ||
3 | This patch adds a flag to enable/disable the SafeStack instrumentation | ||
4 | provided by LLVM. | ||
5 | |||
6 | On enable, make sure that the compiler supports the flags, and that we | ||
7 | are using the proper coroutine implementation (coroutine-ucontext). | ||
8 | On disable, explicitly disable the option if it was enabled by default. | ||
9 | |||
10 | While SafeStack is supported only on Linux, NetBSD, FreeBSD and macOS, | ||
11 | we are not checking for the O.S. since this is already done by LLVM. | ||
12 | |||
13 | Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com> | ||
14 | Message-id: 20200529205122.714-4-dbuono@linux.vnet.ibm.com | ||
15 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
16 | --- | ||
17 | configure | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ | ||
18 | 1 file changed, 73 insertions(+) | ||
19 | |||
20 | diff --git a/configure b/configure | ||
21 | index XXXXXXX..XXXXXXX 100755 | ||
22 | --- a/configure | ||
23 | +++ b/configure | ||
24 | @@ -XXX,XX +XXX,XX @@ audio_win_int="" | ||
25 | libs_qga="" | ||
26 | debug_info="yes" | ||
27 | stack_protector="" | ||
28 | +safe_stack="" | ||
29 | use_containers="yes" | ||
30 | gdb_bin=$(command -v "gdb-multiarch" || command -v "gdb") | ||
31 | |||
32 | @@ -XXX,XX +XXX,XX @@ for opt do | ||
33 | ;; | ||
34 | --disable-stack-protector) stack_protector="no" | ||
35 | ;; | ||
36 | + --enable-safe-stack) safe_stack="yes" | ||
37 | + ;; | ||
38 | + --disable-safe-stack) safe_stack="no" | ||
39 | + ;; | ||
40 | --disable-curses) curses="no" | ||
41 | ;; | ||
42 | --enable-curses) curses="yes" | ||
43 | @@ -XXX,XX +XXX,XX @@ disabled with --disable-FEATURE, default is enabled if available: | ||
44 | debug-tcg TCG debugging (default is disabled) | ||
45 | debug-info debugging information | ||
46 | sparse sparse checker | ||
47 | + safe-stack SafeStack Stack Smash Protection. Depends on | ||
48 | + clang/llvm >= 3.7 and requires coroutine backend ucontext. | ||
49 | |||
50 | gnutls GNUTLS cryptography support | ||
51 | nettle nettle cryptography support | ||
52 | @@ -XXX,XX +XXX,XX @@ if test "$debug_stack_usage" = "yes"; then | ||
53 | fi | ||
54 | fi | ||
55 | |||
56 | +################################################## | ||
57 | +# SafeStack | ||
58 | + | ||
59 | + | ||
60 | +if test "$safe_stack" = "yes"; then | ||
61 | +cat > $TMPC << EOF | ||
62 | +int main(int argc, char *argv[]) | ||
63 | +{ | ||
64 | +#if ! __has_feature(safe_stack) | ||
65 | +#error SafeStack Disabled | ||
66 | +#endif | ||
67 | + return 0; | ||
68 | +} | ||
69 | +EOF | ||
70 | + flag="-fsanitize=safe-stack" | ||
71 | + # Check that safe-stack is supported and enabled. | ||
72 | + if compile_prog "-Werror $flag" "$flag"; then | ||
73 | + # Flag needed both at compilation and at linking | ||
74 | + QEMU_CFLAGS="$QEMU_CFLAGS $flag" | ||
75 | + QEMU_LDFLAGS="$QEMU_LDFLAGS $flag" | ||
76 | + else | ||
77 | + error_exit "SafeStack not supported by your compiler" | ||
78 | + fi | ||
79 | + if test "$coroutine" != "ucontext"; then | ||
80 | + error_exit "SafeStack is only supported by the coroutine backend ucontext" | ||
81 | + fi | ||
82 | +else | ||
83 | +cat > $TMPC << EOF | ||
84 | +int main(int argc, char *argv[]) | ||
85 | +{ | ||
86 | +#if defined(__has_feature) | ||
87 | +#if __has_feature(safe_stack) | ||
88 | +#error SafeStack Enabled | ||
89 | +#endif | ||
90 | +#endif | ||
91 | + return 0; | ||
92 | +} | ||
93 | +EOF | ||
94 | +if test "$safe_stack" = "no"; then | ||
95 | + # Make sure that safe-stack is disabled | ||
96 | + if ! compile_prog "-Werror" ""; then | ||
97 | + # SafeStack was already enabled, try to explicitly remove the feature | ||
98 | + flag="-fno-sanitize=safe-stack" | ||
99 | + if ! compile_prog "-Werror $flag" "$flag"; then | ||
100 | + error_exit "Configure cannot disable SafeStack" | ||
101 | + fi | ||
102 | + QEMU_CFLAGS="$QEMU_CFLAGS $flag" | ||
103 | + QEMU_LDFLAGS="$QEMU_LDFLAGS $flag" | ||
104 | + fi | ||
105 | +else # "$safe_stack" = "" | ||
106 | + # Set safe_stack to yes or no based on pre-existing flags | ||
107 | + if compile_prog "-Werror" ""; then | ||
108 | + safe_stack="no" | ||
109 | + else | ||
110 | + safe_stack="yes" | ||
111 | + if test "$coroutine" != "ucontext"; then | ||
112 | + error_exit "SafeStack is only supported by the coroutine backend ucontext" | ||
113 | + fi | ||
114 | + fi | ||
115 | +fi | ||
116 | +fi | ||
117 | |||
118 | ########################################## | ||
119 | # check if we have open_by_handle_at | ||
120 | @@ -XXX,XX +XXX,XX @@ echo "sparse enabled $sparse" | ||
121 | echo "strip binaries $strip_opt" | ||
122 | echo "profiler $profiler" | ||
123 | echo "static build $static" | ||
124 | +echo "safe stack $safe_stack" | ||
125 | if test "$darwin" = "yes" ; then | ||
126 | echo "Cocoa support $cocoa" | ||
127 | fi | ||
128 | @@ -XXX,XX +XXX,XX @@ if test "$ccache_cpp2" = "yes"; then | ||
129 | echo "export CCACHE_CPP2=y" >> $config_host_mak | ||
130 | fi | ||
131 | |||
132 | +if test "$safe_stack" = "yes"; then | ||
133 | + echo "CONFIG_SAFESTACK=y" >> $config_host_mak | ||
134 | +fi | ||
135 | + | ||
136 | # If we're using a separate build tree, set it up now. | ||
137 | # DIRS are directories which we simply mkdir in the build tree; | ||
138 | # LINKS are things to symlink back into the source tree | ||
139 | -- | ||
140 | 2.26.2 | ||
141 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Daniele Buono <dbuono@linux.vnet.ibm.com> | ||
2 | 1 | ||
3 | SafeStack is a stack protection technique implemented in llvm. It is | ||
4 | enabled with a -fsanitize flag. | ||
5 | iotests are currently disabled when any -fsanitize option is used, | ||
6 | because such options tend to produce additional warnings and false | ||
7 | positives. | ||
8 | |||
9 | While common -fsanitize options are used to verify the code and not | ||
10 | added in production, SafeStack's main use is in production environments | ||
11 | to protect against stack smashing. | ||
12 | |||
13 | Since SafeStack does not print any warning or false positive, enable | ||
14 | iotests when SafeStack is the only -fsanitize option used. | ||
15 | This is likely going to be a production binary and we want to make sure | ||
16 | it works correctly. | ||
17 | |||
18 | Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com> | ||
19 | Message-id: 20200529205122.714-5-dbuono@linux.vnet.ibm.com | ||
20 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
21 | --- | ||
22 | tests/check-block.sh | 12 +++++++++++- | ||
23 | 1 file changed, 11 insertions(+), 1 deletion(-) | ||
24 | |||
25 | diff --git a/tests/check-block.sh b/tests/check-block.sh | ||
26 | index XXXXXXX..XXXXXXX 100755 | ||
27 | --- a/tests/check-block.sh | ||
28 | +++ b/tests/check-block.sh | ||
29 | @@ -XXX,XX +XXX,XX @@ if grep -q "CONFIG_GPROF=y" config-host.mak 2>/dev/null ; then | ||
30 | exit 0 | ||
31 | fi | ||
32 | |||
33 | -if grep -q "CFLAGS.*-fsanitize" config-host.mak 2>/dev/null ; then | ||
34 | +# Disable tests with any sanitizer except for SafeStack | ||
35 | +CFLAGS=$( grep "CFLAGS.*-fsanitize" config-host.mak 2>/dev/null ) | ||
36 | +SANITIZE_FLAGS="" | ||
37 | +#Remove all occurrencies of -fsanitize=safe-stack | ||
38 | +for i in ${CFLAGS}; do | ||
39 | + if [ "${i}" != "-fsanitize=safe-stack" ]; then | ||
40 | + SANITIZE_FLAGS="${SANITIZE_FLAGS} ${i}" | ||
41 | + fi | ||
42 | +done | ||
43 | +if echo ${SANITIZE_FLAGS} | grep -q "\-fsanitize" 2>/dev/null; then | ||
44 | + # Have a sanitize flag that is not allowed, stop | ||
45 | echo "Sanitizers are enabled ==> Not running the qemu-iotests." | ||
46 | exit 0 | ||
47 | fi | ||
48 | -- | ||
49 | 2.26.2 | ||
50 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | A lot of CPU time is spent simply locking/unlocking q->lock during | ||
2 | polling. Check for completion outside the lock to make q->lock disappear | ||
3 | from the profile. | ||
4 | 1 | ||
5 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
6 | Reviewed-by: Sergio Lopez <slp@redhat.com> | ||
7 | Message-id: 20200617132201.1832152-2-stefanha@redhat.com | ||
8 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
9 | --- | ||
10 | block/nvme.c | 12 ++++++++++++ | ||
11 | 1 file changed, 12 insertions(+) | ||
12 | |||
13 | diff --git a/block/nvme.c b/block/nvme.c | ||
14 | index XXXXXXX..XXXXXXX 100644 | ||
15 | --- a/block/nvme.c | ||
16 | +++ b/block/nvme.c | ||
17 | @@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queues(BDRVNVMeState *s) | ||
18 | |||
19 | for (i = 0; i < s->nr_queues; i++) { | ||
20 | NVMeQueuePair *q = s->queues[i]; | ||
21 | + const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES; | ||
22 | + NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset]; | ||
23 | + | ||
24 | + /* | ||
25 | + * Do an early check for completions. q->lock isn't needed because | ||
26 | + * nvme_process_completion() only runs in the event loop thread and | ||
27 | + * cannot race with itself. | ||
28 | + */ | ||
29 | + if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) { | ||
30 | + continue; | ||
31 | + } | ||
32 | + | ||
33 | qemu_mutex_lock(&q->lock); | ||
34 | while (nvme_process_completion(s, q)) { | ||
35 | /* Keep polling */ | ||
36 | -- | ||
37 | 2.26.2 | ||
38 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | nvme_process_completion() explicitly checks cid so the assertion that | ||
2 | follows is always true: | ||
3 | 1 | ||
4 | if (cid == 0 || cid > NVME_QUEUE_SIZE) { | ||
5 | ... | ||
6 | continue; | ||
7 | } | ||
8 | assert(cid <= NVME_QUEUE_SIZE); | ||
9 | |||
10 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
11 | Reviewed-by: Sergio Lopez <slp@redhat.com> | ||
12 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
13 | Message-id: 20200617132201.1832152-3-stefanha@redhat.com | ||
14 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
15 | --- | ||
16 | block/nvme.c | 1 - | ||
17 | 1 file changed, 1 deletion(-) | ||
18 | |||
19 | diff --git a/block/nvme.c b/block/nvme.c | ||
20 | index XXXXXXX..XXXXXXX 100644 | ||
21 | --- a/block/nvme.c | ||
22 | +++ b/block/nvme.c | ||
23 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | ||
24 | cid); | ||
25 | continue; | ||
26 | } | ||
27 | - assert(cid <= NVME_QUEUE_SIZE); | ||
28 | trace_nvme_complete_command(s, q->index, cid); | ||
29 | preq = &q->reqs[cid - 1]; | ||
30 | req = *preq; | ||
31 | -- | ||
32 | 2.26.2 | ||
33 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | Do not access a CQE after incrementing q->cq.head and releasing q->lock. | ||
2 | It is unlikely that this causes problems in practice but it's a latent | ||
3 | bug. | ||
4 | 1 | ||
5 | The reason why it should be safe at the moment is that completion | ||
6 | processing is not re-entrant and the CQ doorbell isn't written until the | ||
7 | end of nvme_process_completion(). | ||
8 | |||
9 | Make this change now because QEMU expects completion processing to be | ||
10 | re-entrant and later patches will do that. | ||
11 | |||
12 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
13 | Reviewed-by: Sergio Lopez <slp@redhat.com> | ||
14 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
15 | Message-id: 20200617132201.1832152-4-stefanha@redhat.com | ||
16 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
17 | --- | ||
18 | block/nvme.c | 5 ++++- | ||
19 | 1 file changed, 4 insertions(+), 1 deletion(-) | ||
20 | |||
21 | diff --git a/block/nvme.c b/block/nvme.c | ||
22 | index XXXXXXX..XXXXXXX 100644 | ||
23 | --- a/block/nvme.c | ||
24 | +++ b/block/nvme.c | ||
25 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | ||
26 | q->busy = true; | ||
27 | assert(q->inflight >= 0); | ||
28 | while (q->inflight) { | ||
29 | + int ret; | ||
30 | int16_t cid; | ||
31 | + | ||
32 | c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES]; | ||
33 | if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) { | ||
34 | break; | ||
35 | } | ||
36 | + ret = nvme_translate_error(c); | ||
37 | q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE; | ||
38 | if (!q->cq.head) { | ||
39 | q->cq_phase = !q->cq_phase; | ||
40 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | ||
41 | preq->busy = false; | ||
42 | preq->cb = preq->opaque = NULL; | ||
43 | qemu_mutex_unlock(&q->lock); | ||
44 | - req.cb(req.opaque, nvme_translate_error(c)); | ||
45 | + req.cb(req.opaque, ret); | ||
46 | qemu_mutex_lock(&q->lock); | ||
47 | q->inflight--; | ||
48 | progress = true; | ||
49 | -- | ||
50 | 2.26.2 | ||
51 | diff view generated by jsdifflib |
1 | Passing around both BDRVNVMeState and NVMeQueuePair is unwieldy. Reduce | 1 | From: Nicolas Saenz Julienne <nsaenzju@redhat.com> |
---|---|---|---|
2 | the number of function arguments by keeping the BDRVNVMeState pointer in | 2 | |
3 | NVMeQueuePair. This will come in handly when a BH is introduced in a | 3 | 'event-loop-base' provides basic property handling for all 'AioContext' |
4 | later patch and only one argument can be passed to it. | 4 | based event loops. So let's define a new 'MainLoopClass' that inherits |
5 | 5 | from it. This will permit tweaking the main loop's properties through | |
6 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 6 | qapi as well as through the command line using the '-object' keyword[1]. |
7 | Reviewed-by: Sergio Lopez <slp@redhat.com> | 7 | Only one instance of 'MainLoopClass' might be created at any time. |
8 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | 8 | |
9 | Message-id: 20200617132201.1832152-7-stefanha@redhat.com | 9 | 'EventLoopBaseClass' learns a new callback, 'can_be_deleted()' so as to |
10 | mark 'MainLoop' as non-deletable. | ||
11 | |||
12 | [1] For example: | ||
13 | -object main-loop,id=main-loop,aio-max-batch=<value> | ||
14 | |||
15 | Signed-off-by: Nicolas Saenz Julienne <nsaenzju@redhat.com> | ||
16 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
17 | Acked-by: Markus Armbruster <armbru@redhat.com> | ||
18 | Message-id: 20220425075723.20019-3-nsaenzju@redhat.com | ||
10 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 19 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
11 | --- | 20 | --- |
12 | block/nvme.c | 70 ++++++++++++++++++++++++++++------------------------ | 21 | qapi/qom.json | 13 ++++++++ |
13 | 1 file changed, 38 insertions(+), 32 deletions(-) | 22 | meson.build | 3 +- |
14 | 23 | include/qemu/main-loop.h | 10 ++++++ | |
15 | diff --git a/block/nvme.c b/block/nvme.c | 24 | include/sysemu/event-loop-base.h | 1 + |
16 | index XXXXXXX..XXXXXXX 100644 | 25 | event-loop-base.c | 13 ++++++++ |
17 | --- a/block/nvme.c | 26 | util/main-loop.c | 56 ++++++++++++++++++++++++++++++++ |
18 | +++ b/block/nvme.c | 27 | 6 files changed, 95 insertions(+), 1 deletion(-) |
19 | @@ -XXX,XX +XXX,XX @@ | 28 | |
20 | */ | 29 | diff --git a/qapi/qom.json b/qapi/qom.json |
21 | #define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1) | 30 | index XXXXXXX..XXXXXXX 100644 |
22 | 31 | --- a/qapi/qom.json | |
23 | +typedef struct BDRVNVMeState BDRVNVMeState; | 32 | +++ b/qapi/qom.json |
24 | + | 33 | @@ -XXX,XX +XXX,XX @@ |
25 | typedef struct { | 34 | '*poll-grow': 'int', |
26 | int32_t head, tail; | 35 | '*poll-shrink': 'int' } } |
27 | uint8_t *queue; | 36 | |
28 | @@ -XXX,XX +XXX,XX @@ typedef struct { | 37 | +## |
29 | typedef struct { | 38 | +# @MainLoopProperties: |
30 | QemuMutex lock; | 39 | +# |
31 | 40 | +# Properties for the main-loop object. | |
32 | + /* Read from I/O code path, initialized under BQL */ | 41 | +# |
33 | + BDRVNVMeState *s; | 42 | +# Since: 7.1 |
34 | + int index; | 43 | +## |
35 | + | 44 | +{ 'struct': 'MainLoopProperties', |
36 | /* Fields protected by BQL */ | 45 | + 'base': 'EventLoopBaseProperties', |
37 | - int index; | 46 | + 'data': {} } |
38 | uint8_t *prp_list_pages; | 47 | + |
39 | 48 | ## | |
40 | /* Fields protected by @lock */ | 49 | # @MemoryBackendProperties: |
41 | @@ -XXX,XX +XXX,XX @@ typedef volatile struct { | 50 | # |
42 | 51 | @@ -XXX,XX +XXX,XX @@ | |
43 | QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000); | 52 | { 'name': 'input-linux', |
44 | 53 | 'if': 'CONFIG_LINUX' }, | |
45 | -typedef struct { | 54 | 'iothread', |
46 | +struct BDRVNVMeState { | 55 | + 'main-loop', |
47 | AioContext *aio_context; | 56 | { 'name': 'memory-backend-epc', |
48 | QEMUVFIOState *vfio; | 57 | 'if': 'CONFIG_LINUX' }, |
49 | NVMeRegs *regs; | 58 | 'memory-backend-file', |
50 | @@ -XXX,XX +XXX,XX @@ typedef struct { | 59 | @@ -XXX,XX +XXX,XX @@ |
51 | 60 | 'input-linux': { 'type': 'InputLinuxProperties', | |
52 | /* PCI address (required for nvme_refresh_filename()) */ | 61 | 'if': 'CONFIG_LINUX' }, |
53 | char *device; | 62 | 'iothread': 'IothreadProperties', |
54 | -} BDRVNVMeState; | 63 | + 'main-loop': 'MainLoopProperties', |
64 | 'memory-backend-epc': { 'type': 'MemoryBackendEpcProperties', | ||
65 | 'if': 'CONFIG_LINUX' }, | ||
66 | 'memory-backend-file': 'MemoryBackendFileProperties', | ||
67 | diff --git a/meson.build b/meson.build | ||
68 | index XXXXXXX..XXXXXXX 100644 | ||
69 | --- a/meson.build | ||
70 | +++ b/meson.build | ||
71 | @@ -XXX,XX +XXX,XX @@ libqemuutil = static_library('qemuutil', | ||
72 | sources: util_ss.sources() + stub_ss.sources() + genh, | ||
73 | dependencies: [util_ss.dependencies(), libm, threads, glib, socket, malloc, pixman]) | ||
74 | qemuutil = declare_dependency(link_with: libqemuutil, | ||
75 | - sources: genh + version_res) | ||
76 | + sources: genh + version_res, | ||
77 | + dependencies: [event_loop_base]) | ||
78 | |||
79 | if have_system or have_user | ||
80 | decodetree = generator(find_program('scripts/decodetree.py'), | ||
81 | diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h | ||
82 | index XXXXXXX..XXXXXXX 100644 | ||
83 | --- a/include/qemu/main-loop.h | ||
84 | +++ b/include/qemu/main-loop.h | ||
85 | @@ -XXX,XX +XXX,XX @@ | ||
86 | #define QEMU_MAIN_LOOP_H | ||
87 | |||
88 | #include "block/aio.h" | ||
89 | +#include "qom/object.h" | ||
90 | +#include "sysemu/event-loop-base.h" | ||
91 | |||
92 | #define SIG_IPI SIGUSR1 | ||
93 | |||
94 | +#define TYPE_MAIN_LOOP "main-loop" | ||
95 | +OBJECT_DECLARE_TYPE(MainLoop, MainLoopClass, MAIN_LOOP) | ||
96 | + | ||
97 | +struct MainLoop { | ||
98 | + EventLoopBase parent_obj; | ||
55 | +}; | 99 | +}; |
56 | 100 | +typedef struct MainLoop MainLoop; | |
57 | #define NVME_BLOCK_OPT_DEVICE "device" | 101 | + |
58 | #define NVME_BLOCK_OPT_NAMESPACE "namespace" | 102 | /** |
59 | @@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q, | 103 | * qemu_init_main_loop: Set up the process so that it can run the main loop. |
104 | * | ||
105 | diff --git a/include/sysemu/event-loop-base.h b/include/sysemu/event-loop-base.h | ||
106 | index XXXXXXX..XXXXXXX 100644 | ||
107 | --- a/include/sysemu/event-loop-base.h | ||
108 | +++ b/include/sysemu/event-loop-base.h | ||
109 | @@ -XXX,XX +XXX,XX @@ struct EventLoopBaseClass { | ||
110 | |||
111 | void (*init)(EventLoopBase *base, Error **errp); | ||
112 | void (*update_params)(EventLoopBase *base, Error **errp); | ||
113 | + bool (*can_be_deleted)(EventLoopBase *base); | ||
114 | }; | ||
115 | |||
116 | struct EventLoopBase { | ||
117 | diff --git a/event-loop-base.c b/event-loop-base.c | ||
118 | index XXXXXXX..XXXXXXX 100644 | ||
119 | --- a/event-loop-base.c | ||
120 | +++ b/event-loop-base.c | ||
121 | @@ -XXX,XX +XXX,XX @@ static void event_loop_base_complete(UserCreatable *uc, Error **errp) | ||
60 | } | 122 | } |
61 | } | 123 | } |
62 | 124 | ||
63 | -static void nvme_free_queue_pair(BlockDriverState *bs, NVMeQueuePair *q) | 125 | +static bool event_loop_base_can_be_deleted(UserCreatable *uc) |
64 | +static void nvme_free_queue_pair(NVMeQueuePair *q) | 126 | +{ |
127 | + EventLoopBaseClass *bc = EVENT_LOOP_BASE_GET_CLASS(uc); | ||
128 | + EventLoopBase *backend = EVENT_LOOP_BASE(uc); | ||
129 | + | ||
130 | + if (bc->can_be_deleted) { | ||
131 | + return bc->can_be_deleted(backend); | ||
132 | + } | ||
133 | + | ||
134 | + return true; | ||
135 | +} | ||
136 | + | ||
137 | static void event_loop_base_class_init(ObjectClass *klass, void *class_data) | ||
65 | { | 138 | { |
66 | qemu_vfree(q->prp_list_pages); | 139 | UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass); |
67 | qemu_vfree(q->sq.queue); | 140 | ucc->complete = event_loop_base_complete; |
68 | @@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs, | 141 | + ucc->can_be_deleted = event_loop_base_can_be_deleted; |
69 | uint64_t prp_list_iova; | 142 | |
70 | 143 | object_class_property_add(klass, "aio-max-batch", "int", | |
71 | qemu_mutex_init(&q->lock); | 144 | event_loop_base_get_param, |
72 | + q->s = s; | 145 | diff --git a/util/main-loop.c b/util/main-loop.c |
73 | q->index = idx; | 146 | index XXXXXXX..XXXXXXX 100644 |
74 | qemu_co_queue_init(&q->free_req_queue); | 147 | --- a/util/main-loop.c |
75 | q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS); | 148 | +++ b/util/main-loop.c |
76 | @@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs, | 149 | @@ -XXX,XX +XXX,XX @@ |
77 | 150 | #include "qemu/error-report.h" | |
78 | return q; | 151 | #include "qemu/queue.h" |
79 | fail: | 152 | #include "qemu/compiler.h" |
80 | - nvme_free_queue_pair(bs, q); | 153 | +#include "qom/object.h" |
81 | + nvme_free_queue_pair(q); | 154 | |
82 | return NULL; | 155 | #ifndef _WIN32 |
156 | #include <sys/wait.h> | ||
157 | @@ -XXX,XX +XXX,XX @@ int qemu_init_main_loop(Error **errp) | ||
158 | return 0; | ||
83 | } | 159 | } |
84 | 160 | ||
85 | /* With q->lock */ | 161 | +static void main_loop_update_params(EventLoopBase *base, Error **errp) |
86 | -static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q) | 162 | +{ |
87 | +static void nvme_kick(NVMeQueuePair *q) | 163 | + if (!qemu_aio_context) { |
88 | { | 164 | + error_setg(errp, "qemu aio context not ready"); |
89 | + BDRVNVMeState *s = q->s; | 165 | + return; |
90 | + | 166 | + } |
91 | if (s->plugged || !q->need_kick) { | 167 | + |
92 | return; | 168 | + aio_context_set_aio_params(qemu_aio_context, base->aio_max_batch, errp); |
93 | } | 169 | +} |
94 | @@ -XXX,XX +XXX,XX @@ static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req) | 170 | + |
95 | } | 171 | +MainLoop *mloop; |
96 | 172 | + | |
97 | /* With q->lock */ | 173 | +static void main_loop_init(EventLoopBase *base, Error **errp) |
98 | -static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q) | 174 | +{ |
99 | +static void nvme_wake_free_req_locked(NVMeQueuePair *q) | 175 | + MainLoop *m = MAIN_LOOP(base); |
100 | { | 176 | + |
101 | if (!qemu_co_queue_empty(&q->free_req_queue)) { | 177 | + if (mloop) { |
102 | - replay_bh_schedule_oneshot_event(s->aio_context, | 178 | + error_setg(errp, "only one main-loop instance allowed"); |
103 | + replay_bh_schedule_oneshot_event(q->s->aio_context, | 179 | + return; |
104 | nvme_free_req_queue_cb, q); | 180 | + } |
105 | } | 181 | + |
106 | } | 182 | + main_loop_update_params(base, errp); |
107 | 183 | + | |
108 | /* Insert a request in the freelist and wake waiters */ | 184 | + mloop = m; |
109 | -static void nvme_put_free_req_and_wake(BDRVNVMeState *s, NVMeQueuePair *q, | 185 | + return; |
110 | - NVMeRequest *req) | 186 | +} |
111 | +static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req) | 187 | + |
112 | { | 188 | +static bool main_loop_can_be_deleted(EventLoopBase *base) |
113 | qemu_mutex_lock(&q->lock); | 189 | +{ |
114 | nvme_put_free_req_locked(q, req); | 190 | + return false; |
115 | - nvme_wake_free_req_locked(s, q); | 191 | +} |
116 | + nvme_wake_free_req_locked(q); | 192 | + |
117 | qemu_mutex_unlock(&q->lock); | 193 | +static void main_loop_class_init(ObjectClass *oc, void *class_data) |
118 | } | 194 | +{ |
119 | 195 | + EventLoopBaseClass *bc = EVENT_LOOP_BASE_CLASS(oc); | |
120 | @@ -XXX,XX +XXX,XX @@ static inline int nvme_translate_error(const NvmeCqe *c) | 196 | + |
121 | } | 197 | + bc->init = main_loop_init; |
122 | 198 | + bc->update_params = main_loop_update_params; | |
123 | /* With q->lock */ | 199 | + bc->can_be_deleted = main_loop_can_be_deleted; |
124 | -static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | 200 | +} |
125 | +static bool nvme_process_completion(NVMeQueuePair *q) | 201 | + |
126 | { | 202 | +static const TypeInfo main_loop_info = { |
127 | + BDRVNVMeState *s = q->s; | 203 | + .name = TYPE_MAIN_LOOP, |
128 | bool progress = false; | 204 | + .parent = TYPE_EVENT_LOOP_BASE, |
129 | NVMeRequest *preq; | 205 | + .class_init = main_loop_class_init, |
130 | NVMeRequest req; | 206 | + .instance_size = sizeof(MainLoop), |
131 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | 207 | +}; |
132 | /* Notify the device so it can post more completions. */ | 208 | + |
133 | smp_mb_release(); | 209 | +static void main_loop_register_types(void) |
134 | *q->cq.doorbell = cpu_to_le32(q->cq.head); | 210 | +{ |
135 | - nvme_wake_free_req_locked(s, q); | 211 | + type_register_static(&main_loop_info); |
136 | + nvme_wake_free_req_locked(q); | 212 | +} |
137 | } | 213 | + |
138 | q->busy = false; | 214 | +type_init(main_loop_register_types) |
139 | return progress; | 215 | + |
140 | @@ -XXX,XX +XXX,XX @@ static void nvme_trace_command(const NvmeCmd *cmd) | 216 | static int max_priority; |
141 | } | 217 | |
142 | } | 218 | #ifndef _WIN32 |
143 | |||
144 | -static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q, | ||
145 | - NVMeRequest *req, | ||
146 | +static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req, | ||
147 | NvmeCmd *cmd, BlockCompletionFunc cb, | ||
148 | void *opaque) | ||
149 | { | ||
150 | @@ -XXX,XX +XXX,XX @@ static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q, | ||
151 | req->opaque = opaque; | ||
152 | cmd->cid = cpu_to_le32(req->cid); | ||
153 | |||
154 | - trace_nvme_submit_command(s, q->index, req->cid); | ||
155 | + trace_nvme_submit_command(q->s, q->index, req->cid); | ||
156 | nvme_trace_command(cmd); | ||
157 | qemu_mutex_lock(&q->lock); | ||
158 | memcpy((uint8_t *)q->sq.queue + | ||
159 | q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd)); | ||
160 | q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE; | ||
161 | q->need_kick++; | ||
162 | - nvme_kick(s, q); | ||
163 | - nvme_process_completion(s, q); | ||
164 | + nvme_kick(q); | ||
165 | + nvme_process_completion(q); | ||
166 | qemu_mutex_unlock(&q->lock); | ||
167 | } | ||
168 | |||
169 | @@ -XXX,XX +XXX,XX @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q, | ||
170 | NvmeCmd *cmd) | ||
171 | { | ||
172 | NVMeRequest *req; | ||
173 | - BDRVNVMeState *s = bs->opaque; | ||
174 | int ret = -EINPROGRESS; | ||
175 | req = nvme_get_free_req(q); | ||
176 | if (!req) { | ||
177 | return -EBUSY; | ||
178 | } | ||
179 | - nvme_submit_command(s, q, req, cmd, nvme_cmd_sync_cb, &ret); | ||
180 | + nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret); | ||
181 | |||
182 | BDRV_POLL_WHILE(bs, ret == -EINPROGRESS); | ||
183 | return ret; | ||
184 | @@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queues(BDRVNVMeState *s) | ||
185 | } | ||
186 | |||
187 | qemu_mutex_lock(&q->lock); | ||
188 | - while (nvme_process_completion(s, q)) { | ||
189 | + while (nvme_process_completion(q)) { | ||
190 | /* Keep polling */ | ||
191 | progress = true; | ||
192 | } | ||
193 | @@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) | ||
194 | }; | ||
195 | if (nvme_cmd_sync(bs, s->queues[0], &cmd)) { | ||
196 | error_setg(errp, "Failed to create io queue [%d]", n); | ||
197 | - nvme_free_queue_pair(bs, q); | ||
198 | + nvme_free_queue_pair(q); | ||
199 | return false; | ||
200 | } | ||
201 | cmd = (NvmeCmd) { | ||
202 | @@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) | ||
203 | }; | ||
204 | if (nvme_cmd_sync(bs, s->queues[0], &cmd)) { | ||
205 | error_setg(errp, "Failed to create io queue [%d]", n); | ||
206 | - nvme_free_queue_pair(bs, q); | ||
207 | + nvme_free_queue_pair(q); | ||
208 | return false; | ||
209 | } | ||
210 | s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1); | ||
211 | @@ -XXX,XX +XXX,XX @@ static void nvme_close(BlockDriverState *bs) | ||
212 | BDRVNVMeState *s = bs->opaque; | ||
213 | |||
214 | for (i = 0; i < s->nr_queues; ++i) { | ||
215 | - nvme_free_queue_pair(bs, s->queues[i]); | ||
216 | + nvme_free_queue_pair(s->queues[i]); | ||
217 | } | ||
218 | g_free(s->queues); | ||
219 | aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, | ||
220 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs, | ||
221 | r = nvme_cmd_map_qiov(bs, &cmd, req, qiov); | ||
222 | qemu_co_mutex_unlock(&s->dma_map_lock); | ||
223 | if (r) { | ||
224 | - nvme_put_free_req_and_wake(s, ioq, req); | ||
225 | + nvme_put_free_req_and_wake(ioq, req); | ||
226 | return r; | ||
227 | } | ||
228 | - nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data); | ||
229 | + nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); | ||
230 | |||
231 | data.co = qemu_coroutine_self(); | ||
232 | while (data.ret == -EINPROGRESS) { | ||
233 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs) | ||
234 | assert(s->nr_queues > 1); | ||
235 | req = nvme_get_free_req(ioq); | ||
236 | assert(req); | ||
237 | - nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data); | ||
238 | + nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); | ||
239 | |||
240 | data.co = qemu_coroutine_self(); | ||
241 | if (data.ret == -EINPROGRESS) { | ||
242 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs, | ||
243 | req = nvme_get_free_req(ioq); | ||
244 | assert(req); | ||
245 | |||
246 | - nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data); | ||
247 | + nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); | ||
248 | |||
249 | data.co = qemu_coroutine_self(); | ||
250 | while (data.ret == -EINPROGRESS) { | ||
251 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, | ||
252 | qemu_co_mutex_unlock(&s->dma_map_lock); | ||
253 | |||
254 | if (ret) { | ||
255 | - nvme_put_free_req_and_wake(s, ioq, req); | ||
256 | + nvme_put_free_req_and_wake(ioq, req); | ||
257 | goto out; | ||
258 | } | ||
259 | |||
260 | trace_nvme_dsm(s, offset, bytes); | ||
261 | |||
262 | - nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data); | ||
263 | + nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); | ||
264 | |||
265 | data.co = qemu_coroutine_self(); | ||
266 | while (data.ret == -EINPROGRESS) { | ||
267 | @@ -XXX,XX +XXX,XX @@ static void nvme_aio_unplug(BlockDriverState *bs) | ||
268 | for (i = 1; i < s->nr_queues; i++) { | ||
269 | NVMeQueuePair *q = s->queues[i]; | ||
270 | qemu_mutex_lock(&q->lock); | ||
271 | - nvme_kick(s, q); | ||
272 | - nvme_process_completion(s, q); | ||
273 | + nvme_kick(q); | ||
274 | + nvme_process_completion(q); | ||
275 | qemu_mutex_unlock(&q->lock); | ||
276 | } | ||
277 | } | ||
278 | -- | 219 | -- |
279 | 2.26.2 | 220 | 2.35.1 |
280 | diff view generated by jsdifflib |
1 | There are three issues with the current NVMeRequest->busy field: | 1 | From: Nicolas Saenz Julienne <nsaenzju@redhat.com> |
---|---|---|---|
2 | 1. The busy field is accidentally accessed outside q->lock when request | ||
3 | submission fails. | ||
4 | 2. Waiters on free_req_queue are not woken when a request is returned | ||
5 | early due to submission failure. | ||
6 | 2. Finding a free request involves scanning all requests. This makes | ||
7 | request submission O(n^2). | ||
8 | 2 | ||
9 | Switch to an O(1) freelist that is always accessed under the lock. | 3 | The thread pool regulates itself: when idle, it kills threads until |
4 | empty, when in demand, it creates new threads until full. This behaviour | ||
5 | doesn't play well with latency sensitive workloads where the price of | ||
6 | creating a new thread is too high. For example, when paired with qemu's | ||
7 | '-mlock', or using safety features like SafeStack, creating a new thread | ||
8 | has been measured take multiple milliseconds. | ||
10 | 9 | ||
11 | Also differentiate between NVME_QUEUE_SIZE, the actual SQ/CQ size, and | 10 | In order to mitigate this let's introduce a new 'EventLoopBase' |
12 | NVME_NUM_REQS, the number of usable requests. This makes the code | 11 | property to set the thread pool size. The threads will be created during |
13 | simpler than using NVME_QUEUE_SIZE everywhere and having to keep in mind | 12 | the pool's initialization or upon updating the property's value, remain |
14 | that one slot is reserved. | 13 | available during its lifetime regardless of demand, and destroyed upon |
14 | freeing it. A properly characterized workload will then be able to | ||
15 | configure the pool to avoid any latency spikes. | ||
15 | 16 | ||
16 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 17 | Signed-off-by: Nicolas Saenz Julienne <nsaenzju@redhat.com> |
17 | Reviewed-by: Sergio Lopez <slp@redhat.com> | 18 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
18 | Message-id: 20200617132201.1832152-5-stefanha@redhat.com | 19 | Acked-by: Markus Armbruster <armbru@redhat.com> |
20 | Message-id: 20220425075723.20019-4-nsaenzju@redhat.com | ||
19 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 21 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
20 | --- | 22 | --- |
21 | block/nvme.c | 81 ++++++++++++++++++++++++++++++++++------------------ | 23 | qapi/qom.json | 10 +++++- |
22 | 1 file changed, 54 insertions(+), 27 deletions(-) | 24 | include/block/aio.h | 10 ++++++ |
25 | include/block/thread-pool.h | 3 ++ | ||
26 | include/sysemu/event-loop-base.h | 4 +++ | ||
27 | event-loop-base.c | 23 +++++++++++++ | ||
28 | iothread.c | 3 ++ | ||
29 | util/aio-posix.c | 1 + | ||
30 | util/async.c | 20 ++++++++++++ | ||
31 | util/main-loop.c | 9 ++++++ | ||
32 | util/thread-pool.c | 55 +++++++++++++++++++++++++++++--- | ||
33 | 10 files changed, 133 insertions(+), 5 deletions(-) | ||
23 | 34 | ||
24 | diff --git a/block/nvme.c b/block/nvme.c | 35 | diff --git a/qapi/qom.json b/qapi/qom.json |
25 | index XXXXXXX..XXXXXXX 100644 | 36 | index XXXXXXX..XXXXXXX 100644 |
26 | --- a/block/nvme.c | 37 | --- a/qapi/qom.json |
27 | +++ b/block/nvme.c | 38 | +++ b/qapi/qom.json |
28 | @@ -XXX,XX +XXX,XX @@ | 39 | @@ -XXX,XX +XXX,XX @@ |
29 | #define NVME_QUEUE_SIZE 128 | 40 | # 0 means that the engine will use its default. |
30 | #define NVME_BAR_SIZE 8192 | 41 | # (default: 0) |
31 | 42 | # | |
32 | +/* | 43 | +# @thread-pool-min: minimum number of threads reserved in the thread pool |
33 | + * We have to leave one slot empty as that is the full queue case where | 44 | +# (default: 0) |
34 | + * head == tail + 1. | 45 | +# |
46 | +# @thread-pool-max: maximum number of threads the thread pool can contain | ||
47 | +# (default: 64) | ||
48 | +# | ||
49 | # Since: 7.1 | ||
50 | ## | ||
51 | { 'struct': 'EventLoopBaseProperties', | ||
52 | - 'data': { '*aio-max-batch': 'int' } } | ||
53 | + 'data': { '*aio-max-batch': 'int', | ||
54 | + '*thread-pool-min': 'int', | ||
55 | + '*thread-pool-max': 'int' } } | ||
56 | |||
57 | ## | ||
58 | # @IothreadProperties: | ||
59 | diff --git a/include/block/aio.h b/include/block/aio.h | ||
60 | index XXXXXXX..XXXXXXX 100644 | ||
61 | --- a/include/block/aio.h | ||
62 | +++ b/include/block/aio.h | ||
63 | @@ -XXX,XX +XXX,XX @@ struct AioContext { | ||
64 | QSLIST_HEAD(, Coroutine) scheduled_coroutines; | ||
65 | QEMUBH *co_schedule_bh; | ||
66 | |||
67 | + int thread_pool_min; | ||
68 | + int thread_pool_max; | ||
69 | /* Thread pool for performing work and receiving completion callbacks. | ||
70 | * Has its own locking. | ||
71 | */ | ||
72 | @@ -XXX,XX +XXX,XX @@ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns, | ||
73 | void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch, | ||
74 | Error **errp); | ||
75 | |||
76 | +/** | ||
77 | + * aio_context_set_thread_pool_params: | ||
78 | + * @ctx: the aio context | ||
79 | + * @min: min number of threads to have readily available in the thread pool | ||
80 | + * @max: max number of threads the thread pool can contain | ||
35 | + */ | 81 | + */ |
36 | +#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1) | 82 | +void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min, |
37 | + | 83 | + int64_t max, Error **errp); |
38 | typedef struct { | 84 | #endif |
39 | int32_t head, tail; | 85 | diff --git a/include/block/thread-pool.h b/include/block/thread-pool.h |
40 | uint8_t *queue; | 86 | index XXXXXXX..XXXXXXX 100644 |
41 | @@ -XXX,XX +XXX,XX @@ typedef struct { | 87 | --- a/include/block/thread-pool.h |
42 | int cid; | 88 | +++ b/include/block/thread-pool.h |
43 | void *prp_list_page; | 89 | @@ -XXX,XX +XXX,XX @@ |
44 | uint64_t prp_list_iova; | 90 | |
45 | - bool busy; | 91 | #include "block/block.h" |
46 | + int free_req_next; /* q->reqs[] index of next free req */ | 92 | |
47 | } NVMeRequest; | 93 | +#define THREAD_POOL_MAX_THREADS_DEFAULT 64 |
94 | + | ||
95 | typedef int ThreadPoolFunc(void *opaque); | ||
96 | |||
97 | typedef struct ThreadPool ThreadPool; | ||
98 | @@ -XXX,XX +XXX,XX @@ BlockAIOCB *thread_pool_submit_aio(ThreadPool *pool, | ||
99 | int coroutine_fn thread_pool_submit_co(ThreadPool *pool, | ||
100 | ThreadPoolFunc *func, void *arg); | ||
101 | void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg); | ||
102 | +void thread_pool_update_params(ThreadPool *pool, struct AioContext *ctx); | ||
103 | |||
104 | #endif | ||
105 | diff --git a/include/sysemu/event-loop-base.h b/include/sysemu/event-loop-base.h | ||
106 | index XXXXXXX..XXXXXXX 100644 | ||
107 | --- a/include/sysemu/event-loop-base.h | ||
108 | +++ b/include/sysemu/event-loop-base.h | ||
109 | @@ -XXX,XX +XXX,XX @@ struct EventLoopBase { | ||
110 | |||
111 | /* AioContext AIO engine parameters */ | ||
112 | int64_t aio_max_batch; | ||
113 | + | ||
114 | + /* AioContext thread pool parameters */ | ||
115 | + int64_t thread_pool_min; | ||
116 | + int64_t thread_pool_max; | ||
117 | }; | ||
118 | #endif | ||
119 | diff --git a/event-loop-base.c b/event-loop-base.c | ||
120 | index XXXXXXX..XXXXXXX 100644 | ||
121 | --- a/event-loop-base.c | ||
122 | +++ b/event-loop-base.c | ||
123 | @@ -XXX,XX +XXX,XX @@ | ||
124 | #include "qemu/osdep.h" | ||
125 | #include "qom/object_interfaces.h" | ||
126 | #include "qapi/error.h" | ||
127 | +#include "block/thread-pool.h" | ||
128 | #include "sysemu/event-loop-base.h" | ||
48 | 129 | ||
49 | typedef struct { | 130 | typedef struct { |
50 | @@ -XXX,XX +XXX,XX @@ typedef struct { | 131 | @@ -XXX,XX +XXX,XX @@ typedef struct { |
51 | /* Fields protected by @lock */ | 132 | ptrdiff_t offset; /* field's byte offset in EventLoopBase struct */ |
52 | NVMeQueue sq, cq; | 133 | } EventLoopBaseParamInfo; |
53 | int cq_phase; | 134 | |
54 | - NVMeRequest reqs[NVME_QUEUE_SIZE]; | 135 | +static void event_loop_base_instance_init(Object *obj) |
55 | + int free_req_head; | 136 | +{ |
56 | + NVMeRequest reqs[NVME_NUM_REQS]; | 137 | + EventLoopBase *base = EVENT_LOOP_BASE(obj); |
57 | bool busy; | 138 | + |
58 | int need_kick; | 139 | + base->thread_pool_max = THREAD_POOL_MAX_THREADS_DEFAULT; |
59 | int inflight; | 140 | +} |
60 | @@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs, | 141 | + |
61 | qemu_mutex_init(&q->lock); | 142 | static EventLoopBaseParamInfo aio_max_batch_info = { |
62 | q->index = idx; | 143 | "aio-max-batch", offsetof(EventLoopBase, aio_max_batch), |
63 | qemu_co_queue_init(&q->free_req_queue); | 144 | }; |
64 | - q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE); | 145 | +static EventLoopBaseParamInfo thread_pool_min_info = { |
65 | + q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS); | 146 | + "thread-pool-min", offsetof(EventLoopBase, thread_pool_min), |
66 | r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, | 147 | +}; |
67 | - s->page_size * NVME_QUEUE_SIZE, | 148 | +static EventLoopBaseParamInfo thread_pool_max_info = { |
68 | + s->page_size * NVME_NUM_REQS, | 149 | + "thread-pool-max", offsetof(EventLoopBase, thread_pool_max), |
69 | false, &prp_list_iova); | 150 | +}; |
70 | if (r) { | 151 | |
71 | goto fail; | 152 | static void event_loop_base_get_param(Object *obj, Visitor *v, |
153 | const char *name, void *opaque, Error **errp) | ||
154 | @@ -XXX,XX +XXX,XX @@ static void event_loop_base_class_init(ObjectClass *klass, void *class_data) | ||
155 | event_loop_base_get_param, | ||
156 | event_loop_base_set_param, | ||
157 | NULL, &aio_max_batch_info); | ||
158 | + object_class_property_add(klass, "thread-pool-min", "int", | ||
159 | + event_loop_base_get_param, | ||
160 | + event_loop_base_set_param, | ||
161 | + NULL, &thread_pool_min_info); | ||
162 | + object_class_property_add(klass, "thread-pool-max", "int", | ||
163 | + event_loop_base_get_param, | ||
164 | + event_loop_base_set_param, | ||
165 | + NULL, &thread_pool_max_info); | ||
166 | } | ||
167 | |||
168 | static const TypeInfo event_loop_base_info = { | ||
169 | .name = TYPE_EVENT_LOOP_BASE, | ||
170 | .parent = TYPE_OBJECT, | ||
171 | .instance_size = sizeof(EventLoopBase), | ||
172 | + .instance_init = event_loop_base_instance_init, | ||
173 | .class_size = sizeof(EventLoopBaseClass), | ||
174 | .class_init = event_loop_base_class_init, | ||
175 | .abstract = true, | ||
176 | diff --git a/iothread.c b/iothread.c | ||
177 | index XXXXXXX..XXXXXXX 100644 | ||
178 | --- a/iothread.c | ||
179 | +++ b/iothread.c | ||
180 | @@ -XXX,XX +XXX,XX @@ static void iothread_set_aio_context_params(EventLoopBase *base, Error **errp) | ||
181 | aio_context_set_aio_params(iothread->ctx, | ||
182 | iothread->parent_obj.aio_max_batch, | ||
183 | errp); | ||
184 | + | ||
185 | + aio_context_set_thread_pool_params(iothread->ctx, base->thread_pool_min, | ||
186 | + base->thread_pool_max, errp); | ||
187 | } | ||
188 | |||
189 | |||
190 | diff --git a/util/aio-posix.c b/util/aio-posix.c | ||
191 | index XXXXXXX..XXXXXXX 100644 | ||
192 | --- a/util/aio-posix.c | ||
193 | +++ b/util/aio-posix.c | ||
194 | @@ -XXX,XX +XXX,XX @@ | ||
195 | |||
196 | #include "qemu/osdep.h" | ||
197 | #include "block/block.h" | ||
198 | +#include "block/thread-pool.h" | ||
199 | #include "qemu/main-loop.h" | ||
200 | #include "qemu/rcu.h" | ||
201 | #include "qemu/rcu_queue.h" | ||
202 | diff --git a/util/async.c b/util/async.c | ||
203 | index XXXXXXX..XXXXXXX 100644 | ||
204 | --- a/util/async.c | ||
205 | +++ b/util/async.c | ||
206 | @@ -XXX,XX +XXX,XX @@ AioContext *aio_context_new(Error **errp) | ||
207 | |||
208 | ctx->aio_max_batch = 0; | ||
209 | |||
210 | + ctx->thread_pool_min = 0; | ||
211 | + ctx->thread_pool_max = THREAD_POOL_MAX_THREADS_DEFAULT; | ||
212 | + | ||
213 | return ctx; | ||
214 | fail: | ||
215 | g_source_destroy(&ctx->source); | ||
216 | @@ -XXX,XX +XXX,XX @@ void qemu_set_current_aio_context(AioContext *ctx) | ||
217 | assert(!get_my_aiocontext()); | ||
218 | set_my_aiocontext(ctx); | ||
219 | } | ||
220 | + | ||
221 | +void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min, | ||
222 | + int64_t max, Error **errp) | ||
223 | +{ | ||
224 | + | ||
225 | + if (min > max || !max || min > INT_MAX || max > INT_MAX) { | ||
226 | + error_setg(errp, "bad thread-pool-min/thread-pool-max values"); | ||
227 | + return; | ||
228 | + } | ||
229 | + | ||
230 | + ctx->thread_pool_min = min; | ||
231 | + ctx->thread_pool_max = max; | ||
232 | + | ||
233 | + if (ctx->thread_pool) { | ||
234 | + thread_pool_update_params(ctx->thread_pool, ctx); | ||
235 | + } | ||
236 | +} | ||
237 | diff --git a/util/main-loop.c b/util/main-loop.c | ||
238 | index XXXXXXX..XXXXXXX 100644 | ||
239 | --- a/util/main-loop.c | ||
240 | +++ b/util/main-loop.c | ||
241 | @@ -XXX,XX +XXX,XX @@ | ||
242 | #include "sysemu/replay.h" | ||
243 | #include "qemu/main-loop.h" | ||
244 | #include "block/aio.h" | ||
245 | +#include "block/thread-pool.h" | ||
246 | #include "qemu/error-report.h" | ||
247 | #include "qemu/queue.h" | ||
248 | #include "qemu/compiler.h" | ||
249 | @@ -XXX,XX +XXX,XX @@ int qemu_init_main_loop(Error **errp) | ||
250 | |||
251 | static void main_loop_update_params(EventLoopBase *base, Error **errp) | ||
252 | { | ||
253 | + ERRP_GUARD(); | ||
254 | + | ||
255 | if (!qemu_aio_context) { | ||
256 | error_setg(errp, "qemu aio context not ready"); | ||
257 | return; | ||
72 | } | 258 | } |
73 | - for (i = 0; i < NVME_QUEUE_SIZE; i++) { | 259 | |
74 | + q->free_req_head = -1; | 260 | aio_context_set_aio_params(qemu_aio_context, base->aio_max_batch, errp); |
75 | + for (i = 0; i < NVME_NUM_REQS; i++) { | 261 | + if (*errp) { |
76 | NVMeRequest *req = &q->reqs[i]; | 262 | + return; |
77 | req->cid = i + 1; | 263 | + } |
78 | + req->free_req_next = q->free_req_head; | 264 | + |
79 | + q->free_req_head = i; | 265 | + aio_context_set_thread_pool_params(qemu_aio_context, base->thread_pool_min, |
80 | req->prp_list_page = q->prp_list_pages + i * s->page_size; | 266 | + base->thread_pool_max, errp); |
81 | req->prp_list_iova = prp_list_iova + i * s->page_size; | 267 | } |
82 | } | 268 | |
83 | + | 269 | MainLoop *mloop; |
84 | nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err); | 270 | diff --git a/util/thread-pool.c b/util/thread-pool.c |
85 | if (local_err) { | 271 | index XXXXXXX..XXXXXXX 100644 |
86 | error_propagate(errp, local_err); | 272 | --- a/util/thread-pool.c |
87 | @@ -XXX,XX +XXX,XX @@ static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q) | 273 | +++ b/util/thread-pool.c |
88 | */ | 274 | @@ -XXX,XX +XXX,XX @@ struct ThreadPool { |
89 | static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q) | 275 | QemuMutex lock; |
276 | QemuCond worker_stopped; | ||
277 | QemuSemaphore sem; | ||
278 | - int max_threads; | ||
279 | QEMUBH *new_thread_bh; | ||
280 | |||
281 | /* The following variables are only accessed from one AioContext. */ | ||
282 | @@ -XXX,XX +XXX,XX @@ struct ThreadPool { | ||
283 | int new_threads; /* backlog of threads we need to create */ | ||
284 | int pending_threads; /* threads created but not running yet */ | ||
285 | bool stopping; | ||
286 | + int min_threads; | ||
287 | + int max_threads; | ||
288 | }; | ||
289 | |||
290 | +static inline bool back_to_sleep(ThreadPool *pool, int ret) | ||
291 | +{ | ||
292 | + /* | ||
293 | + * The semaphore timed out, we should exit the loop except when: | ||
294 | + * - There is work to do, we raced with the signal. | ||
295 | + * - The max threads threshold just changed, we raced with the signal. | ||
296 | + * - The thread pool forces a minimum number of readily available threads. | ||
297 | + */ | ||
298 | + if (ret == -1 && (!QTAILQ_EMPTY(&pool->request_list) || | ||
299 | + pool->cur_threads > pool->max_threads || | ||
300 | + pool->cur_threads <= pool->min_threads)) { | ||
301 | + return true; | ||
302 | + } | ||
303 | + | ||
304 | + return false; | ||
305 | +} | ||
306 | + | ||
307 | static void *worker_thread(void *opaque) | ||
90 | { | 308 | { |
91 | - int i; | 309 | ThreadPool *pool = opaque; |
92 | - NVMeRequest *req = NULL; | 310 | @@ -XXX,XX +XXX,XX @@ static void *worker_thread(void *opaque) |
93 | + NVMeRequest *req; | 311 | ret = qemu_sem_timedwait(&pool->sem, 10000); |
94 | 312 | qemu_mutex_lock(&pool->lock); | |
95 | qemu_mutex_lock(&q->lock); | 313 | pool->idle_threads--; |
96 | - while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) { | 314 | - } while (ret == -1 && !QTAILQ_EMPTY(&pool->request_list)); |
97 | - /* We have to leave one slot empty as that is the full queue case (head | 315 | - if (ret == -1 || pool->stopping) { |
98 | - * == tail + 1). */ | 316 | + } while (back_to_sleep(pool, ret)); |
99 | + | 317 | + if (ret == -1 || pool->stopping || |
100 | + while (q->free_req_head == -1) { | 318 | + pool->cur_threads > pool->max_threads) { |
101 | if (qemu_in_coroutine()) { | 319 | break; |
102 | trace_nvme_free_req_queue_wait(q); | ||
103 | qemu_co_queue_wait(&q->free_req_queue, &q->lock); | ||
104 | @@ -XXX,XX +XXX,XX @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q) | ||
105 | return NULL; | ||
106 | } | 320 | } |
107 | } | 321 | |
108 | - for (i = 0; i < NVME_QUEUE_SIZE; i++) { | 322 | @@ -XXX,XX +XXX,XX @@ void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg) |
109 | - if (!q->reqs[i].busy) { | 323 | thread_pool_submit_aio(pool, func, arg, NULL, NULL); |
110 | - q->reqs[i].busy = true; | 324 | } |
111 | - req = &q->reqs[i]; | 325 | |
112 | - break; | 326 | +void thread_pool_update_params(ThreadPool *pool, AioContext *ctx) |
113 | - } | ||
114 | - } | ||
115 | - /* We have checked inflight and need_kick while holding q->lock, so one | ||
116 | - * free req must be available. */ | ||
117 | - assert(req); | ||
118 | + | ||
119 | + req = &q->reqs[q->free_req_head]; | ||
120 | + q->free_req_head = req->free_req_next; | ||
121 | + req->free_req_next = -1; | ||
122 | + | ||
123 | qemu_mutex_unlock(&q->lock); | ||
124 | return req; | ||
125 | } | ||
126 | |||
127 | +/* With q->lock */ | ||
128 | +static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req) | ||
129 | +{ | 327 | +{ |
130 | + req->free_req_next = q->free_req_head; | 328 | + qemu_mutex_lock(&pool->lock); |
131 | + q->free_req_head = req - q->reqs; | 329 | + |
330 | + pool->min_threads = ctx->thread_pool_min; | ||
331 | + pool->max_threads = ctx->thread_pool_max; | ||
332 | + | ||
333 | + /* | ||
334 | + * We either have to: | ||
335 | + * - Increase the number of available threads until over the min_threads | ||
336 | + * threshold. | ||
337 | + * - Decrease the number of available threads until under the max_threads | ||
338 | + * threshold. | ||
339 | + * - Do nothing. The current number of threads falls in between the min and | ||
340 | + * max thresholds. We'll let the pool manage itself. | ||
341 | + */ | ||
342 | + for (int i = pool->cur_threads; i < pool->min_threads; i++) { | ||
343 | + spawn_thread(pool); | ||
344 | + } | ||
345 | + | ||
346 | + for (int i = pool->cur_threads; i > pool->max_threads; i--) { | ||
347 | + qemu_sem_post(&pool->sem); | ||
348 | + } | ||
349 | + | ||
350 | + qemu_mutex_unlock(&pool->lock); | ||
132 | +} | 351 | +} |
133 | + | 352 | + |
134 | +/* With q->lock */ | 353 | static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) |
135 | +static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q) | ||
136 | +{ | ||
137 | + if (!qemu_co_queue_empty(&q->free_req_queue)) { | ||
138 | + replay_bh_schedule_oneshot_event(s->aio_context, | ||
139 | + nvme_free_req_queue_cb, q); | ||
140 | + } | ||
141 | +} | ||
142 | + | ||
143 | +/* Insert a request in the freelist and wake waiters */ | ||
144 | +static void nvme_put_free_req_and_wake(BDRVNVMeState *s, NVMeQueuePair *q, | ||
145 | + NVMeRequest *req) | ||
146 | +{ | ||
147 | + qemu_mutex_lock(&q->lock); | ||
148 | + nvme_put_free_req_locked(q, req); | ||
149 | + nvme_wake_free_req_locked(s, q); | ||
150 | + qemu_mutex_unlock(&q->lock); | ||
151 | +} | ||
152 | + | ||
153 | static inline int nvme_translate_error(const NvmeCqe *c) | ||
154 | { | 354 | { |
155 | uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF; | 355 | if (!ctx) { |
156 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | 356 | @@ -XXX,XX +XXX,XX @@ static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) |
157 | req = *preq; | 357 | qemu_mutex_init(&pool->lock); |
158 | assert(req.cid == cid); | 358 | qemu_cond_init(&pool->worker_stopped); |
159 | assert(req.cb); | 359 | qemu_sem_init(&pool->sem, 0); |
160 | - preq->busy = false; | 360 | - pool->max_threads = 64; |
161 | + nvme_put_free_req_locked(q, preq); | 361 | pool->new_thread_bh = aio_bh_new(ctx, spawn_thread_bh_fn, pool); |
162 | preq->cb = preq->opaque = NULL; | 362 | |
163 | qemu_mutex_unlock(&q->lock); | 363 | QLIST_INIT(&pool->head); |
164 | req.cb(req.opaque, ret); | 364 | QTAILQ_INIT(&pool->request_list); |
165 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | 365 | + |
166 | /* Notify the device so it can post more completions. */ | 366 | + thread_pool_update_params(pool, ctx); |
167 | smp_mb_release(); | 367 | } |
168 | *q->cq.doorbell = cpu_to_le32(q->cq.head); | 368 | |
169 | - if (!qemu_co_queue_empty(&q->free_req_queue)) { | 369 | ThreadPool *thread_pool_new(AioContext *ctx) |
170 | - replay_bh_schedule_oneshot_event(s->aio_context, | ||
171 | - nvme_free_req_queue_cb, q); | ||
172 | - } | ||
173 | + nvme_wake_free_req_locked(s, q); | ||
174 | } | ||
175 | q->busy = false; | ||
176 | return progress; | ||
177 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs, | ||
178 | r = nvme_cmd_map_qiov(bs, &cmd, req, qiov); | ||
179 | qemu_co_mutex_unlock(&s->dma_map_lock); | ||
180 | if (r) { | ||
181 | - req->busy = false; | ||
182 | + nvme_put_free_req_and_wake(s, ioq, req); | ||
183 | return r; | ||
184 | } | ||
185 | nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data); | ||
186 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, | ||
187 | qemu_co_mutex_unlock(&s->dma_map_lock); | ||
188 | |||
189 | if (ret) { | ||
190 | - req->busy = false; | ||
191 | + nvme_put_free_req_and_wake(s, ioq, req); | ||
192 | goto out; | ||
193 | } | ||
194 | |||
195 | -- | 370 | -- |
196 | 2.26.2 | 371 | 2.35.1 |
197 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | Existing users access free_req_queue under q->lock. Document this. | ||
2 | 1 | ||
3 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
4 | Reviewed-by: Sergio Lopez <slp@redhat.com> | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
6 | Message-id: 20200617132201.1832152-6-stefanha@redhat.com | ||
7 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
8 | --- | ||
9 | block/nvme.c | 2 +- | ||
10 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
11 | |||
12 | diff --git a/block/nvme.c b/block/nvme.c | ||
13 | index XXXXXXX..XXXXXXX 100644 | ||
14 | --- a/block/nvme.c | ||
15 | +++ b/block/nvme.c | ||
16 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
17 | } NVMeRequest; | ||
18 | |||
19 | typedef struct { | ||
20 | - CoQueue free_req_queue; | ||
21 | QemuMutex lock; | ||
22 | |||
23 | /* Fields protected by BQL */ | ||
24 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
25 | uint8_t *prp_list_pages; | ||
26 | |||
27 | /* Fields protected by @lock */ | ||
28 | + CoQueue free_req_queue; | ||
29 | NVMeQueue sq, cq; | ||
30 | int cq_phase; | ||
31 | int free_req_head; | ||
32 | -- | ||
33 | 2.26.2 | ||
34 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | QEMU block drivers are supposed to support aio_poll() from I/O | ||
2 | completion callback functions. This means completion processing must be | ||
3 | re-entrant. | ||
4 | 1 | ||
5 | The standard approach is to schedule a BH during completion processing | ||
6 | and cancel it at the end of processing. If aio_poll() is invoked by a | ||
7 | callback function then the BH will run. The BH continues the suspended | ||
8 | completion processing. | ||
9 | |||
10 | All of this means that request A's cb() can synchronously wait for | ||
11 | request B to complete. Previously the nvme block driver would hang | ||
12 | because it didn't process completions from nested aio_poll(). | ||
13 | |||
14 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
15 | Reviewed-by: Sergio Lopez <slp@redhat.com> | ||
16 | Message-id: 20200617132201.1832152-8-stefanha@redhat.com | ||
17 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
18 | --- | ||
19 | block/nvme.c | 67 ++++++++++++++++++++++++++++++++++++++++------ | ||
20 | block/trace-events | 2 +- | ||
21 | 2 files changed, 60 insertions(+), 9 deletions(-) | ||
22 | |||
23 | diff --git a/block/nvme.c b/block/nvme.c | ||
24 | index XXXXXXX..XXXXXXX 100644 | ||
25 | --- a/block/nvme.c | ||
26 | +++ b/block/nvme.c | ||
27 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
28 | int cq_phase; | ||
29 | int free_req_head; | ||
30 | NVMeRequest reqs[NVME_NUM_REQS]; | ||
31 | - bool busy; | ||
32 | int need_kick; | ||
33 | int inflight; | ||
34 | + | ||
35 | + /* Thread-safe, no lock necessary */ | ||
36 | + QEMUBH *completion_bh; | ||
37 | } NVMeQueuePair; | ||
38 | |||
39 | /* Memory mapped registers */ | ||
40 | @@ -XXX,XX +XXX,XX @@ struct BDRVNVMeState { | ||
41 | #define NVME_BLOCK_OPT_DEVICE "device" | ||
42 | #define NVME_BLOCK_OPT_NAMESPACE "namespace" | ||
43 | |||
44 | +static void nvme_process_completion_bh(void *opaque); | ||
45 | + | ||
46 | static QemuOptsList runtime_opts = { | ||
47 | .name = "nvme", | ||
48 | .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), | ||
49 | @@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q, | ||
50 | |||
51 | static void nvme_free_queue_pair(NVMeQueuePair *q) | ||
52 | { | ||
53 | + if (q->completion_bh) { | ||
54 | + qemu_bh_delete(q->completion_bh); | ||
55 | + } | ||
56 | qemu_vfree(q->prp_list_pages); | ||
57 | qemu_vfree(q->sq.queue); | ||
58 | qemu_vfree(q->cq.queue); | ||
59 | @@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs, | ||
60 | q->index = idx; | ||
61 | qemu_co_queue_init(&q->free_req_queue); | ||
62 | q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS); | ||
63 | + q->completion_bh = aio_bh_new(bdrv_get_aio_context(bs), | ||
64 | + nvme_process_completion_bh, q); | ||
65 | r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, | ||
66 | s->page_size * NVME_NUM_REQS, | ||
67 | false, &prp_list_iova); | ||
68 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q) | ||
69 | NvmeCqe *c; | ||
70 | |||
71 | trace_nvme_process_completion(s, q->index, q->inflight); | ||
72 | - if (q->busy || s->plugged) { | ||
73 | - trace_nvme_process_completion_queue_busy(s, q->index); | ||
74 | + if (s->plugged) { | ||
75 | + trace_nvme_process_completion_queue_plugged(s, q->index); | ||
76 | return false; | ||
77 | } | ||
78 | - q->busy = true; | ||
79 | + | ||
80 | + /* | ||
81 | + * Support re-entrancy when a request cb() function invokes aio_poll(). | ||
82 | + * Pending completions must be visible to aio_poll() so that a cb() | ||
83 | + * function can wait for the completion of another request. | ||
84 | + * | ||
85 | + * The aio_poll() loop will execute our BH and we'll resume completion | ||
86 | + * processing there. | ||
87 | + */ | ||
88 | + qemu_bh_schedule(q->completion_bh); | ||
89 | + | ||
90 | assert(q->inflight >= 0); | ||
91 | while (q->inflight) { | ||
92 | int ret; | ||
93 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q) | ||
94 | assert(req.cb); | ||
95 | nvme_put_free_req_locked(q, preq); | ||
96 | preq->cb = preq->opaque = NULL; | ||
97 | - qemu_mutex_unlock(&q->lock); | ||
98 | - req.cb(req.opaque, ret); | ||
99 | - qemu_mutex_lock(&q->lock); | ||
100 | q->inflight--; | ||
101 | + qemu_mutex_unlock(&q->lock); | ||
102 | + req.cb(req.opaque, ret); | ||
103 | + qemu_mutex_lock(&q->lock); | ||
104 | progress = true; | ||
105 | } | ||
106 | if (progress) { | ||
107 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q) | ||
108 | *q->cq.doorbell = cpu_to_le32(q->cq.head); | ||
109 | nvme_wake_free_req_locked(q); | ||
110 | } | ||
111 | - q->busy = false; | ||
112 | + | ||
113 | + qemu_bh_cancel(q->completion_bh); | ||
114 | + | ||
115 | return progress; | ||
116 | } | ||
117 | |||
118 | +static void nvme_process_completion_bh(void *opaque) | ||
119 | +{ | ||
120 | + NVMeQueuePair *q = opaque; | ||
121 | + | ||
122 | + /* | ||
123 | + * We're being invoked because a nvme_process_completion() cb() function | ||
124 | + * called aio_poll(). The callback may be waiting for further completions | ||
125 | + * so notify the device that it has space to fill in more completions now. | ||
126 | + */ | ||
127 | + smp_mb_release(); | ||
128 | + *q->cq.doorbell = cpu_to_le32(q->cq.head); | ||
129 | + nvme_wake_free_req_locked(q); | ||
130 | + | ||
131 | + nvme_process_completion(q); | ||
132 | +} | ||
133 | + | ||
134 | static void nvme_trace_command(const NvmeCmd *cmd) | ||
135 | { | ||
136 | int i; | ||
137 | @@ -XXX,XX +XXX,XX @@ static void nvme_detach_aio_context(BlockDriverState *bs) | ||
138 | { | ||
139 | BDRVNVMeState *s = bs->opaque; | ||
140 | |||
141 | + for (int i = 0; i < s->nr_queues; i++) { | ||
142 | + NVMeQueuePair *q = s->queues[i]; | ||
143 | + | ||
144 | + qemu_bh_delete(q->completion_bh); | ||
145 | + q->completion_bh = NULL; | ||
146 | + } | ||
147 | + | ||
148 | aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, | ||
149 | false, NULL, NULL); | ||
150 | } | ||
151 | @@ -XXX,XX +XXX,XX @@ static void nvme_attach_aio_context(BlockDriverState *bs, | ||
152 | s->aio_context = new_context; | ||
153 | aio_set_event_notifier(new_context, &s->irq_notifier, | ||
154 | false, nvme_handle_event, nvme_poll_cb); | ||
155 | + | ||
156 | + for (int i = 0; i < s->nr_queues; i++) { | ||
157 | + NVMeQueuePair *q = s->queues[i]; | ||
158 | + | ||
159 | + q->completion_bh = | ||
160 | + aio_bh_new(new_context, nvme_process_completion_bh, q); | ||
161 | + } | ||
162 | } | ||
163 | |||
164 | static void nvme_aio_plug(BlockDriverState *bs) | ||
165 | diff --git a/block/trace-events b/block/trace-events | ||
166 | index XXXXXXX..XXXXXXX 100644 | ||
167 | --- a/block/trace-events | ||
168 | +++ b/block/trace-events | ||
169 | @@ -XXX,XX +XXX,XX @@ nvme_kick(void *s, int queue) "s %p queue %d" | ||
170 | nvme_dma_flush_queue_wait(void *s) "s %p" | ||
171 | nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x" | ||
172 | nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d" | ||
173 | -nvme_process_completion_queue_busy(void *s, int index) "s %p queue %d" | ||
174 | +nvme_process_completion_queue_plugged(void *s, int index) "s %p queue %d" | ||
175 | nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d" | ||
176 | nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d" | ||
177 | nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x" | ||
178 | -- | ||
179 | 2.26.2 | ||
180 | diff view generated by jsdifflib |