[PATCH] accel/kvm: event-driven wakeup for dirty ring reaper thread

Bin Guo posted 1 patch 5 days, 2 hours ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20260603031612.6173-1-guobin@linux.alibaba.com
Maintainers: Paolo Bonzini <pbonzini@redhat.com>, Hyman Huang <infra.ai.cloud@bitdeer.com>
There is a newer version of this series
accel/kvm/kvm-all.c      | 62 +++++++++++++++++++++++++++++++++++++---
accel/stubs/kvm-stub.c   |  4 +++
include/system/kvm.h     |  9 ++++++
include/system/kvm_int.h |  3 ++
system/dirtylimit.c      |  6 ++++
5 files changed, 80 insertions(+), 4 deletions(-)
[PATCH] accel/kvm: event-driven wakeup for dirty ring reaper thread
Posted by Bin Guo 5 days, 2 hours ago
The dirty ring reaper thread polls with a fixed sleep(1), which the
code itself flags with a TODO for a smarter timeout.  Events that
would benefit from a prompt reap must wait up to one full sleep tick
before the reaper notices them.

Replace the polling loop with an EventNotifier-based wait via
qemu_poll_ns(), kicked by paths that have first-hand evidence that
a reap is desirable.  A 1s fallback timeout is retained as a
defensive backstop.

Two kick sites are wired up:

  * dirtylimit_change(false) -- when dirty-limit is cancelled, kick
    the reaper so it resumes real reaping immediately instead of
    waiting for the next sleep tick.

    Measured with strace on the reaper TID (20 set/cancel cycles
    against a stress-ng dirty-page workload), latency from the
    cancel-vcpu-dirty-limit QMP ack to the reaper actually waking:

        before: median 530ms, max 937ms
        after:  median <1ms,  max <1ms

  * kvm_cpu_exec() KVM_EXIT_DIRTY_RING_FULL handler -- the handler
    already reaps the exiting vCPU synchronously; the kick hints the
    reaper to also check other vCPUs whose rings are likely filling
    concurrently.

kvm_dirty_ring_reaper_kick() is exposed as a public API; multiple
kicks collapse into a single wake via the eventfd counter.
kvm_dirty_ring_reaper_init() now returns int again (it was made void
in commit 43a5e377f4) to propagate event_notifier_init() failure.
A stub is added for non-KVM builds.

Signed-off-by: Bin Guo <guobin@linux.alibaba.com>
---
 accel/kvm/kvm-all.c      | 62 +++++++++++++++++++++++++++++++++++++---
 accel/stubs/kvm-stub.c   |  4 +++
 include/system/kvm.h     |  9 ++++++
 include/system/kvm_int.h |  3 ++
 system/dirtylimit.c      |  6 ++++
 5 files changed, 80 insertions(+), 4 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 96f90ebb24..f964102f09 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1754,6 +1754,9 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
     } while (size);
 }
 
+/* Fallback liveness timeout for the dirty ring reaper, in nanoseconds. */
+#define KVM_DIRTY_RING_REAPER_FALLBACK_NS  (1 * NANOSECONDS_PER_SECOND)
+
 static void *kvm_dirty_ring_reaper_thread(void *data)
 {
     KVMState *s = data;
@@ -1764,12 +1767,30 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
     trace_kvm_dirty_ring_reaper("init");
 
     while (true) {
+        GPollFD pfd = {
+            .fd = event_notifier_get_fd(&r->reaper_notifier),
+            .events = G_IO_IN,
+        };
+
         r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
         trace_kvm_dirty_ring_reaper("wait");
+
         /*
-         * TODO: provide a smarter timeout rather than a constant?
+         * Event-driven wait: sleep until something kicks the reaper
+         * (vCPU ring-full exit, dirtylimit disabled, ...) or until the
+         * fallback timeout fires.  The fallback preserves the original
+         * sleep(1) worst-case behaviour as a liveness backstop in case
+         * a kick site is ever missed.
          */
-        sleep(1);
+        qemu_poll_ns(&pfd, 1, KVM_DIRTY_RING_REAPER_FALLBACK_NS);
+
+        /*
+         * Drain the notifier whether or not the wakeup came from it --
+         * any pending kick is satisfied by the reap we are about to
+         * perform, so we must not leave a stale event behind that would
+         * cause the next iteration to spin without sleeping.
+         */
+        event_notifier_test_and_clear(&r->reaper_notifier);
 
         /* keep sleeping so that dirtylimit not be interfered by reaper */
         if (dirtylimit_in_service()) {
@@ -1789,13 +1810,39 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
     g_assert_not_reached();
 }
 
-static void kvm_dirty_ring_reaper_init(KVMState *s)
+static int kvm_dirty_ring_reaper_init(KVMState *s, Error **errp)
 {
     struct KVMDirtyRingReaper *r = &s->reaper;
+    int ret;
+
+    ret = event_notifier_init(&r->reaper_notifier, 0);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret,
+                         "Failed to initialize dirty ring reaper notifier");
+        return ret;
+    }
 
     qemu_thread_create(&r->reaper_thr, "kvm-reaper",
                        kvm_dirty_ring_reaper_thread,
                        s, QEMU_THREAD_JOINABLE);
+    return 0;
+}
+
+/*
+ * Wake the dirty ring reaper thread so it performs a reap as soon as
+ * possible.  Safe to call from any thread; safe to call even when the
+ * dirty ring is not enabled (no-op in that case).  Coalescing is
+ * provided by the eventfd counter -- multiple kicks before the reaper
+ * runs collapse into a single wakeup.
+ */
+void kvm_dirty_ring_reaper_kick(void)
+{
+    KVMState *s = kvm_state;
+
+    if (!s || !s->kvm_dirty_ring_size) {
+        return;
+    }
+    event_notifier_set(&s->reaper.reaper_notifier);
 }
 
 static int kvm_dirty_ring_init(KVMState *s)
@@ -3097,7 +3144,9 @@ static int kvm_init(AccelState *as, MachineState *ms)
     }
 
     if (s->kvm_dirty_ring_size) {
-        kvm_dirty_ring_reaper_init(s);
+        if (kvm_dirty_ring_reaper_init(s, errp) < 0) {
+            goto err;
+        }
     }
 
     if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) {
@@ -3571,6 +3620,11 @@ int kvm_cpu_exec(CPUState *cpu)
                 kvm_dirty_ring_reap(kvm_state, NULL);
             }
             bql_unlock();
+            /*
+             * Ring-full pressure is the strongest signal that background
+             * reaping should stay hot; wake the reaper unconditionally.
+             */
+            kvm_dirty_ring_reaper_kick();
             dirtylimit_vcpu_execute(cpu);
             ret = 0;
             break;
diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
index c4617caac6..b878598552 100644
--- a/accel/stubs/kvm-stub.c
+++ b/accel/stubs/kvm-stub.c
@@ -134,6 +134,10 @@ uint32_t kvm_dirty_ring_size(void)
     return 0;
 }
 
+void kvm_dirty_ring_reaper_kick(void)
+{
+}
+
 bool kvm_hwpoisoned_mem(void)
 {
     return false;
diff --git a/include/system/kvm.h b/include/system/kvm.h
index 5fa33eddda..c42c8e0b74 100644
--- a/include/system/kvm.h
+++ b/include/system/kvm.h
@@ -553,6 +553,15 @@ bool kvm_dirty_ring_enabled(void);
 
 uint32_t kvm_dirty_ring_size(void);
 
+/**
+ * kvm_dirty_ring_reaper_kick - wake the background dirty ring reaper.
+ *
+ * Hint that an immediate reap is desirable (e.g. ring-full pressure
+ * detected, dirtylimit disabled).  Coalesced via eventfd.  No-op when
+ * the dirty ring feature is not in use.
+ */
+void kvm_dirty_ring_reaper_kick(void);
+
 void kvm_mark_guest_state_protected(void);
 
 /**
diff --git a/include/system/kvm_int.h b/include/system/kvm_int.h
index 0876aac938..08e28d075a 100644
--- a/include/system/kvm_int.h
+++ b/include/system/kvm_int.h
@@ -12,6 +12,7 @@
 #include "system/memory.h"
 #include "qapi/qapi-types-common.h"
 #include "qemu/accel.h"
+#include "qemu/event_notifier.h"
 #include "qemu/queue.h"
 #include "system/kvm.h"
 #include "accel/accel-ops.h"
@@ -100,6 +101,8 @@ struct KVMDirtyRingReaper {
     QemuThread reaper_thr;
     volatile uint64_t reaper_iteration; /* iteration number of reaper thr */
     volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */
+    /* Wakeup channel: kicked by ring-full vCPU exits, dirtylimit toggle, ... */
+    EventNotifier reaper_notifier;
 };
 struct KVMState
 {
diff --git a/system/dirtylimit.c b/system/dirtylimit.c
index c934ceb0de..3ee9c58479 100644
--- a/system/dirtylimit.c
+++ b/system/dirtylimit.c
@@ -393,6 +393,12 @@ void dirtylimit_change(bool start)
         qatomic_set(&dirtylimit_quit, 0);
     } else {
         qatomic_set(&dirtylimit_quit, 1);
+        /*
+         * The reaper has been short-circuiting via the
+         * dirtylimit_in_service() branch.  Kick it so it picks up the
+         * policy change immediately instead of after the next 1s tick.
+         */
+        kvm_dirty_ring_reaper_kick();
     }
 }
 
-- 
2.50.1 (Apple Git-155)
Re: [PATCH] accel/kvm: event-driven wakeup for dirty ring reaper thread
Posted by Hy Man 4 days, 2 hours ago
The patch looks good to me overall, but I encountered a compilation error
(see details below). I suspect it might have been tested on a vendor-specific
version; could you please re-verify it against the latest mainline version?
I’ll be happy to merge it once the build issue is resolved.

On Wed, Jun 3, 2026 at 11:16 AM Bin Guo <guobin@linux.alibaba.com> wrote:
>
> The dirty ring reaper thread polls with a fixed sleep(1), which the
> code itself flags with a TODO for a smarter timeout.  Events that
> would benefit from a prompt reap must wait up to one full sleep tick
> before the reaper notices them.
>
> Replace the polling loop with an EventNotifier-based wait via
> qemu_poll_ns(), kicked by paths that have first-hand evidence that
> a reap is desirable.  A 1s fallback timeout is retained as a
> defensive backstop.
>
> Two kick sites are wired up:
>
>   * dirtylimit_change(false) -- when dirty-limit is cancelled, kick
>     the reaper so it resumes real reaping immediately instead of
>     waiting for the next sleep tick.
>
>     Measured with strace on the reaper TID (20 set/cancel cycles
>     against a stress-ng dirty-page workload), latency from the
>     cancel-vcpu-dirty-limit QMP ack to the reaper actually waking:
>
>         before: median 530ms, max 937ms
>         after:  median <1ms,  max <1ms
>
>   * kvm_cpu_exec() KVM_EXIT_DIRTY_RING_FULL handler -- the handler
>     already reaps the exiting vCPU synchronously; the kick hints the
>     reaper to also check other vCPUs whose rings are likely filling
>     concurrently.
>
> kvm_dirty_ring_reaper_kick() is exposed as a public API; multiple
> kicks collapse into a single wake via the eventfd counter.
> kvm_dirty_ring_reaper_init() now returns int again (it was made void
> in commit 43a5e377f4) to propagate event_notifier_init() failure.
> A stub is added for non-KVM builds.
>
> Signed-off-by: Bin Guo <guobin@linux.alibaba.com>
> ---
>  accel/kvm/kvm-all.c      | 62 +++++++++++++++++++++++++++++++++++++---
>  accel/stubs/kvm-stub.c   |  4 +++
>  include/system/kvm.h     |  9 ++++++
>  include/system/kvm_int.h |  3 ++
>  system/dirtylimit.c      |  6 ++++
>  5 files changed, 80 insertions(+), 4 deletions(-)
>
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 96f90ebb24..f964102f09 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -1754,6 +1754,9 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
>      } while (size);
>  }
>
> +/* Fallback liveness timeout for the dirty ring reaper, in nanoseconds. */
> +#define KVM_DIRTY_RING_REAPER_FALLBACK_NS  (1 * NANOSECONDS_PER_SECOND)
> +
>  static void *kvm_dirty_ring_reaper_thread(void *data)
>  {
>      KVMState *s = data;
> @@ -1764,12 +1767,30 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
>      trace_kvm_dirty_ring_reaper("init");
>
>      while (true) {
> +        GPollFD pfd = {
> +            .fd = event_notifier_get_fd(&r->reaper_notifier),
> +            .events = G_IO_IN,
> +        };
> +
>          r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
>          trace_kvm_dirty_ring_reaper("wait");
> +
>          /*
> -         * TODO: provide a smarter timeout rather than a constant?
> +         * Event-driven wait: sleep until something kicks the reaper
> +         * (vCPU ring-full exit, dirtylimit disabled, ...) or until the
> +         * fallback timeout fires.  The fallback preserves the original
> +         * sleep(1) worst-case behaviour as a liveness backstop in case
> +         * a kick site is ever missed.
>           */
> -        sleep(1);
> +        qemu_poll_ns(&pfd, 1, KVM_DIRTY_RING_REAPER_FALLBACK_NS);
> +
> +        /*
> +         * Drain the notifier whether or not the wakeup came from it --
> +         * any pending kick is satisfied by the reap we are about to
> +         * perform, so we must not leave a stale event behind that would
> +         * cause the next iteration to spin without sleeping.
> +         */
> +        event_notifier_test_and_clear(&r->reaper_notifier);
>
>          /* keep sleeping so that dirtylimit not be interfered by reaper */
>          if (dirtylimit_in_service()) {
> @@ -1789,13 +1810,39 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
>      g_assert_not_reached();
>  }
>
> -static void kvm_dirty_ring_reaper_init(KVMState *s)
> +static int kvm_dirty_ring_reaper_init(KVMState *s, Error **errp)
>  {
>      struct KVMDirtyRingReaper *r = &s->reaper;
> +    int ret;
> +
> +    ret = event_notifier_init(&r->reaper_notifier, 0);
> +    if (ret < 0) {
> +        error_setg_errno(errp, -ret,
> +                         "Failed to initialize dirty ring reaper notifier");
> +        return ret;
> +    }
>
>      qemu_thread_create(&r->reaper_thr, "kvm-reaper",
>                         kvm_dirty_ring_reaper_thread,
>                         s, QEMU_THREAD_JOINABLE);
> +    return 0;
> +}
> +
> +/*
> + * Wake the dirty ring reaper thread so it performs a reap as soon as
> + * possible.  Safe to call from any thread; safe to call even when the
> + * dirty ring is not enabled (no-op in that case).  Coalescing is
> + * provided by the eventfd counter -- multiple kicks before the reaper
> + * runs collapse into a single wakeup.
> + */
> +void kvm_dirty_ring_reaper_kick(void)
> +{
> +    KVMState *s = kvm_state;
> +
> +    if (!s || !s->kvm_dirty_ring_size) {
> +        return;
> +    }
> +    event_notifier_set(&s->reaper.reaper_notifier);
>  }
>
>  static int kvm_dirty_ring_init(KVMState *s)
> @@ -3097,7 +3144,9 @@ static int kvm_init(AccelState *as, MachineState *ms)
>      }
>
>      if (s->kvm_dirty_ring_size) {
> -        kvm_dirty_ring_reaper_init(s);

Compilation error: 'errp' is undefined in this context.

> +        if (kvm_dirty_ring_reaper_init(s, errp) < 0) {
> +            goto err;
> +        }
>      }
>
>      if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) {
> @@ -3571,6 +3620,11 @@ int kvm_cpu_exec(CPUState *cpu)
>                  kvm_dirty_ring_reap(kvm_state, NULL);
>              }
>              bql_unlock();
> +            /*
> +             * Ring-full pressure is the strongest signal that background
> +             * reaping should stay hot; wake the reaper unconditionally.
> +             */
> +            kvm_dirty_ring_reaper_kick();
>              dirtylimit_vcpu_execute(cpu);
>              ret = 0;
>              break;
> diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
> index c4617caac6..b878598552 100644
> --- a/accel/stubs/kvm-stub.c
> +++ b/accel/stubs/kvm-stub.c
> @@ -134,6 +134,10 @@ uint32_t kvm_dirty_ring_size(void)
>      return 0;
>  }
>
> +void kvm_dirty_ring_reaper_kick(void)
> +{
> +}
> +
>  bool kvm_hwpoisoned_mem(void)
>  {
>      return false;
> diff --git a/include/system/kvm.h b/include/system/kvm.h
> index 5fa33eddda..c42c8e0b74 100644
> --- a/include/system/kvm.h
> +++ b/include/system/kvm.h
> @@ -553,6 +553,15 @@ bool kvm_dirty_ring_enabled(void);
>
>  uint32_t kvm_dirty_ring_size(void);
>
> +/**
> + * kvm_dirty_ring_reaper_kick - wake the background dirty ring reaper.
> + *
> + * Hint that an immediate reap is desirable (e.g. ring-full pressure
> + * detected, dirtylimit disabled).  Coalesced via eventfd.  No-op when
> + * the dirty ring feature is not in use.
> + */
> +void kvm_dirty_ring_reaper_kick(void);
> +
>  void kvm_mark_guest_state_protected(void);
>
>  /**
> diff --git a/include/system/kvm_int.h b/include/system/kvm_int.h
> index 0876aac938..08e28d075a 100644
> --- a/include/system/kvm_int.h
> +++ b/include/system/kvm_int.h
> @@ -12,6 +12,7 @@
>  #include "system/memory.h"
>  #include "qapi/qapi-types-common.h"
>  #include "qemu/accel.h"
> +#include "qemu/event_notifier.h"
>  #include "qemu/queue.h"
>  #include "system/kvm.h"
>  #include "accel/accel-ops.h"
> @@ -100,6 +101,8 @@ struct KVMDirtyRingReaper {
>      QemuThread reaper_thr;
>      volatile uint64_t reaper_iteration; /* iteration number of reaper thr */
>      volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */
> +    /* Wakeup channel: kicked by ring-full vCPU exits, dirtylimit toggle, ... */
> +    EventNotifier reaper_notifier;
>  };
>  struct KVMState
>  {
> diff --git a/system/dirtylimit.c b/system/dirtylimit.c
> index c934ceb0de..3ee9c58479 100644
> --- a/system/dirtylimit.c
> +++ b/system/dirtylimit.c
> @@ -393,6 +393,12 @@ void dirtylimit_change(bool start)
>          qatomic_set(&dirtylimit_quit, 0);
>      } else {
>          qatomic_set(&dirtylimit_quit, 1);
> +        /*
> +         * The reaper has been short-circuiting via the
> +         * dirtylimit_in_service() branch.  Kick it so it picks up the
> +         * policy change immediately instead of after the next 1s tick.
> +         */
> +        kvm_dirty_ring_reaper_kick();
>      }
>  }
>
> --
> 2.50.1 (Apple Git-155)
>
Re: [PATCH] accel/kvm: event-driven wakeup for dirty ring reaper thread
Posted by Hy Man 4 days, 3 hours ago
The patch looks good to me overall, but I encountered a compilation error
(see details below). I suspect it might have been tested on a
vendor-specific
version; could you please re-verify it against the latest mainline version?
I’ll be happy to merge it once the build issue is resolved.

On Wed, Jun 3, 2026 at 11:16 AM Bin Guo <guobin@linux.alibaba.com> wrote:

> The dirty ring reaper thread polls with a fixed sleep(1), which the
> code itself flags with a TODO for a smarter timeout.  Events that
> would benefit from a prompt reap must wait up to one full sleep tick
> before the reaper notices them.
>
> Replace the polling loop with an EventNotifier-based wait via
> qemu_poll_ns(), kicked by paths that have first-hand evidence that
> a reap is desirable.  A 1s fallback timeout is retained as a
> defensive backstop.
>
> Two kick sites are wired up:
>
>   * dirtylimit_change(false) -- when dirty-limit is cancelled, kick
>     the reaper so it resumes real reaping immediately instead of
>     waiting for the next sleep tick.
>
>     Measured with strace on the reaper TID (20 set/cancel cycles
>     against a stress-ng dirty-page workload), latency from the
>     cancel-vcpu-dirty-limit QMP ack to the reaper actually waking:
>
>         before: median 530ms, max 937ms
>         after:  median <1ms,  max <1ms
>
>   * kvm_cpu_exec() KVM_EXIT_DIRTY_RING_FULL handler -- the handler
>     already reaps the exiting vCPU synchronously; the kick hints the
>     reaper to also check other vCPUs whose rings are likely filling
>     concurrently.
>
> kvm_dirty_ring_reaper_kick() is exposed as a public API; multiple
> kicks collapse into a single wake via the eventfd counter.
> kvm_dirty_ring_reaper_init() now returns int again (it was made void
> in commit 43a5e377f4) to propagate event_notifier_init() failure.
> A stub is added for non-KVM builds.
>
> Signed-off-by: Bin Guo <guobin@linux.alibaba.com>
> ---
>  accel/kvm/kvm-all.c      | 62 +++++++++++++++++++++++++++++++++++++---
>  accel/stubs/kvm-stub.c   |  4 +++
>  include/system/kvm.h     |  9 ++++++
>  include/system/kvm_int.h |  3 ++
>  system/dirtylimit.c      |  6 ++++
>  5 files changed, 80 insertions(+), 4 deletions(-)
>
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 96f90ebb24..f964102f09 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -1754,6 +1754,9 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
>      } while (size);
>  }
>
> +/* Fallback liveness timeout for the dirty ring reaper, in nanoseconds. */
> +#define KVM_DIRTY_RING_REAPER_FALLBACK_NS  (1 * NANOSECONDS_PER_SECOND)
> +
>  static void *kvm_dirty_ring_reaper_thread(void *data)
>  {
>      KVMState *s = data;
> @@ -1764,12 +1767,30 @@ static void *kvm_dirty_ring_reaper_thread(void
> *data)
>      trace_kvm_dirty_ring_reaper("init");
>
>      while (true) {
> +        GPollFD pfd = {
> +            .fd = event_notifier_get_fd(&r->reaper_notifier),
> +            .events = G_IO_IN,
> +        };
> +
>          r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
>          trace_kvm_dirty_ring_reaper("wait");
> +
>          /*
> -         * TODO: provide a smarter timeout rather than a constant?
> +         * Event-driven wait: sleep until something kicks the reaper
> +         * (vCPU ring-full exit, dirtylimit disabled, ...) or until the
> +         * fallback timeout fires.  The fallback preserves the original
> +         * sleep(1) worst-case behaviour as a liveness backstop in case
> +         * a kick site is ever missed.
>           */
> -        sleep(1);
> +        qemu_poll_ns(&pfd, 1, KVM_DIRTY_RING_REAPER_FALLBACK_NS);
> +
> +        /*
> +         * Drain the notifier whether or not the wakeup came from it --
> +         * any pending kick is satisfied by the reap we are about to
> +         * perform, so we must not leave a stale event behind that would
> +         * cause the next iteration to spin without sleeping.
> +         */
> +        event_notifier_test_and_clear(&r->reaper_notifier);
>
>          /* keep sleeping so that dirtylimit not be interfered by reaper */
>          if (dirtylimit_in_service()) {
> @@ -1789,13 +1810,39 @@ static void *kvm_dirty_ring_reaper_thread(void
> *data)
>      g_assert_not_reached();
>  }
>
> -static void kvm_dirty_ring_reaper_init(KVMState *s)
> +static int kvm_dirty_ring_reaper_init(KVMState *s, Error **errp)
>  {
>      struct KVMDirtyRingReaper *r = &s->reaper;
> +    int ret;
> +
> +    ret = event_notifier_init(&r->reaper_notifier, 0);
> +    if (ret < 0) {
> +        error_setg_errno(errp, -ret,
> +                         "Failed to initialize dirty ring reaper
> notifier");
> +        return ret;
> +    }
>
>      qemu_thread_create(&r->reaper_thr, "kvm-reaper",
>                         kvm_dirty_ring_reaper_thread,
>                         s, QEMU_THREAD_JOINABLE);
> +    return 0;
> +}
> +
> +/*
> + * Wake the dirty ring reaper thread so it performs a reap as soon as
> + * possible.  Safe to call from any thread; safe to call even when the
> + * dirty ring is not enabled (no-op in that case).  Coalescing is
> + * provided by the eventfd counter -- multiple kicks before the reaper
> + * runs collapse into a single wakeup.
> + */
> +void kvm_dirty_ring_reaper_kick(void)
> +{
> +    KVMState *s = kvm_state;
> +
> +    if (!s || !s->kvm_dirty_ring_size) {
> +        return;
> +    }
> +    event_notifier_set(&s->reaper.reaper_notifier);
>  }
>
>  static int kvm_dirty_ring_init(KVMState *s)
> @@ -3097,7 +3144,9 @@ static int kvm_init(AccelState *as, MachineState *ms)
>      }
>
>      if (s->kvm_dirty_ring_size) {
> -        kvm_dirty_ring_reaper_init(s);
>

Compilation error: 'errp' is undefined in this context.

+        if (kvm_dirty_ring_reaper_init(s, errp) < 0) {
> +            goto err;
> +        }
>      }
>
>      if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) {
> @@ -3571,6 +3620,11 @@ int kvm_cpu_exec(CPUState *cpu)
>                  kvm_dirty_ring_reap(kvm_state, NULL);
>              }
>              bql_unlock();
> +            /*
> +             * Ring-full pressure is the strongest signal that background
> +             * reaping should stay hot; wake the reaper unconditionally.
> +             */
> +            kvm_dirty_ring_reaper_kick();
>              dirtylimit_vcpu_execute(cpu);
>              ret = 0;
>              break;
> diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
> index c4617caac6..b878598552 100644
> --- a/accel/stubs/kvm-stub.c
> +++ b/accel/stubs/kvm-stub.c
> @@ -134,6 +134,10 @@ uint32_t kvm_dirty_ring_size(void)
>      return 0;
>  }
>
> +void kvm_dirty_ring_reaper_kick(void)
> +{
> +}
> +
>  bool kvm_hwpoisoned_mem(void)
>  {
>      return false;
> diff --git a/include/system/kvm.h b/include/system/kvm.h
> index 5fa33eddda..c42c8e0b74 100644
> --- a/include/system/kvm.h
> +++ b/include/system/kvm.h
> @@ -553,6 +553,15 @@ bool kvm_dirty_ring_enabled(void);
>
>  uint32_t kvm_dirty_ring_size(void);
>
> +/**
> + * kvm_dirty_ring_reaper_kick - wake the background dirty ring reaper.
> + *
> + * Hint that an immediate reap is desirable (e.g. ring-full pressure
> + * detected, dirtylimit disabled).  Coalesced via eventfd.  No-op when
> + * the dirty ring feature is not in use.
> + */
> +void kvm_dirty_ring_reaper_kick(void);
> +
>  void kvm_mark_guest_state_protected(void);
>
>  /**
> diff --git a/include/system/kvm_int.h b/include/system/kvm_int.h
> index 0876aac938..08e28d075a 100644
> --- a/include/system/kvm_int.h
> +++ b/include/system/kvm_int.h
> @@ -12,6 +12,7 @@
>  #include "system/memory.h"
>  #include "qapi/qapi-types-common.h"
>  #include "qemu/accel.h"
> +#include "qemu/event_notifier.h"
>  #include "qemu/queue.h"
>  #include "system/kvm.h"
>  #include "accel/accel-ops.h"
> @@ -100,6 +101,8 @@ struct KVMDirtyRingReaper {
>      QemuThread reaper_thr;
>      volatile uint64_t reaper_iteration; /* iteration number of reaper thr
> */
>      volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state
> */
> +    /* Wakeup channel: kicked by ring-full vCPU exits, dirtylimit toggle,
> ... */
> +    EventNotifier reaper_notifier;
>  };
>  struct KVMState
>  {
> diff --git a/system/dirtylimit.c b/system/dirtylimit.c
> index c934ceb0de..3ee9c58479 100644
> --- a/system/dirtylimit.c
> +++ b/system/dirtylimit.c
> @@ -393,6 +393,12 @@ void dirtylimit_change(bool start)
>          qatomic_set(&dirtylimit_quit, 0);
>      } else {
>          qatomic_set(&dirtylimit_quit, 1);
> +        /*
> +         * The reaper has been short-circuiting via the
> +         * dirtylimit_in_service() branch.  Kick it so it picks up the
> +         * policy change immediately instead of after the next 1s tick.
> +         */
> +        kvm_dirty_ring_reaper_kick();
>      }
>  }
>
> --
> 2.50.1 (Apple Git-155)
>
>