migration/vfio: Fix a few issues on API misuse or statistic reports

[PATCH RFC 09/12] migration: Make iteration counter out of RAM

Posted by Peter Xu 2 days, 15 hours ago

It used to hide in RAM dirty sync path.  Now with more modules being able
to slow sync on dirty information, keeping it there may not be good anymore
because it's not RAM's own concept for iterations: all modules should
follow.

More importantly, mgmt may try to query dirty info (to make policy
decisions like adjusting downtime) by listening to iteration count changes
via QMP events.  So we must make sure the boost of iterations only happens
_after_ the dirty sync operations with whatever form (RAM's dirty bitmap
sync, or VFIO's different ioctls to fetch latest dirty info from kernel).

Move this to core migration path to manage, together with the event
generation, so that it can be well ordered with the sync operations for all
modules.

This brings a good side effect that we should have an old issue regarding
to cpu_throttle_dirty_sync_timer_tick() which can randomly boost iteration
counts (because it invokes sync ops).  Now it won't, which is actually the
right behavior.

Said that, we have code (not only QEMU, but likely mgmt too) assuming the
1st iteration will always shows dirty count to 1.  Make it initialized with
1 this time, because we'll miss the dirty sync for setup() on boosting this
counter now.

Cc: Yong Huang <yong.huang@smartx.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 migration/migration-stats.h |  3 ++-
 migration/migration.c       | 29 ++++++++++++++++++++++++++---
 migration/ram.c             |  6 ------
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/migration/migration-stats.h b/migration/migration-stats.h
index 1153520f7a..326ddb0088 100644
--- a/migration/migration-stats.h
+++ b/migration/migration-stats.h
@@ -43,7 +43,8 @@ typedef struct {
      */
     uint64_t dirty_pages_rate;
     /*
-     * Number of times we have synchronized guest bitmaps.
+     * Number of times we have synchronized guest bitmaps.  This always
+     * starts from 1 for the 1st iteration.
      */
     uint64_t dirty_sync_count;
     /*
diff --git a/migration/migration.c b/migration/migration.c
index 42facb16d1..ad8a824585 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1654,10 +1654,15 @@ int migrate_init(MigrationState *s, Error **errp)
     s->threshold_size = 0;
     s->switchover_acked = false;
     s->rdma_migration = false;
+
     /*
-     * set mig_stats memory to zero for a new migration
+     * set mig_stats memory to zero for a new migration.. except the
+     * iteration counter, which we want to make sure it returns 1 for the
+     * first iteration.
      */
     memset(&mig_stats, 0, sizeof(mig_stats));
+    mig_stats.dirty_sync_count = 1;
+
     migration_reset_vfio_bytes_transferred();
 
     s->postcopy_package_loaded = false;
@@ -3230,10 +3235,28 @@ static bool migration_iteration_next_ready(MigrationState *s,
 static void migration_iteration_go_next(MigPendingData *pending)
 {
     /*
-     * Do a slow sync will achieve this.  TODO: move RAM iteration code
-     * into the core layer.
+     * Do a slow sync first before boosting the iteration count.
      */
     qemu_savevm_query_pending(pending, false);
+
+    /*
+     * Boost dirty sync count to reflect we finished one iteration.
+     *
+     * NOTE: we need to make sure when this happens (together with the
+     * event sent below) all modules have slow-synced the pending data
+     * above.  That means a write mem barrier, but qatomic_add() should be
+     * enough.
+     *
+     * It's because a mgmt could wait on the iteration event to query again
+     * on pending data for policy changes (e.g. downtime adjustments).  The
+     * ordering will make sure the query will fetch the latest results from
+     * all the modules.
+     */
+    qatomic_add(&mig_stats.dirty_sync_count, 1);
+
+    if (migrate_events()) {
+        qapi_event_send_migration_pass(mig_stats.dirty_sync_count);
+    }
 }
 
 /*
diff --git a/migration/ram.c b/migration/ram.c
index 89f761a471..29e9608715 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1136,8 +1136,6 @@ static void migration_bitmap_sync(RAMState *rs, bool last_stage)
     RAMBlock *block;
     int64_t end_time;
 
-    qatomic_add(&mig_stats.dirty_sync_count, 1);
-
     if (!rs->time_last_bitmap_sync) {
         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
     }
@@ -1172,10 +1170,6 @@ static void migration_bitmap_sync(RAMState *rs, bool last_stage)
         rs->num_dirty_pages_period = 0;
         rs->bytes_xfer_prev = migration_transferred_bytes();
     }
-    if (migrate_events()) {
-        uint64_t generation = qatomic_read(&mig_stats.dirty_sync_count);
-        qapi_event_send_migration_pass(generation);
-    }
 }
 
 void migration_bitmap_sync_precopy(bool last_stage)
-- 
2.50.1

Re: [PATCH RFC 09/12] migration: Make iteration counter out of RAM

Posted by Prasad Pandit 2 days, 4 hours ago

On Fri, 20 Mar 2026 at 04:44, Peter Xu <peterx@redhat.com> wrote:
> It used to hide in RAM dirty sync path.  Now with more modules being able
> to slow sync on dirty information, keeping it there may not be good anymore
> because it's not RAM's own concept for iterations: all modules should
> follow.
>
> More importantly, mgmt may try to query dirty info (to make policy
> decisions like adjusting downtime) by listening to iteration count changes
> via QMP events.  So we must make sure the boost of iterations only happens
> _after_ the dirty sync operations with whatever form (RAM's dirty bitmap
> sync, or VFIO's different ioctls to fetch latest dirty info from kernel).
>
> Move this to core migration path to manage, together with the event
> generation, so that it can be well ordered with the sync operations for all
> modules.
>
> This brings a good side effect that we should have an old issue regarding
> to cpu_throttle_dirty_sync_timer_tick() which can randomly boost iteration
> counts (because it invokes sync ops).  Now it won't, which is actually the
> right behavior.
>
> Said that, we have code (not only QEMU, but likely mgmt too) assuming the
> 1st iteration will always shows dirty count to 1.

* Where do we make this assumption? I mostly see 'dirty_sync_count'
read/used as is, only cpu_throttle_dirty_sync_timer_tick() seems to
skip one *_bitmap_sync_precopy() call when sync_cnt <= 1. This'd works
for zero(0) as well.

> Cc: Yong Huang <yong.huang@smartx.com>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  migration/migration-stats.h |  3 ++-
>  migration/migration.c       | 29 ++++++++++++++++++++++++++---
>  migration/ram.c             |  6 ------
>  3 files changed, 28 insertions(+), 10 deletions(-)
>
> diff --git a/migration/migration-stats.h b/migration/migration-stats.h
> index 1153520f7a..326ddb0088 100644
> --- a/migration/migration-stats.h
> +++ b/migration/migration-stats.h
> @@ -43,7 +43,8 @@ typedef struct {
>       */
>      uint64_t dirty_pages_rate;
>      /*
> -     * Number of times we have synchronized guest bitmaps.
> +     * Number of times we have synchronized guest bitmaps.  This always
> +     * starts from 1 for the 1st iteration.
>       */
>      uint64_t dirty_sync_count;
>      /*
> diff --git a/migration/migration.c b/migration/migration.c
> index 42facb16d1..ad8a824585 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -1654,10 +1654,15 @@ int migrate_init(MigrationState *s, Error **errp)
>      s->threshold_size = 0;
>      s->switchover_acked = false;
>      s->rdma_migration = false;
> +
>      /*
> -     * set mig_stats memory to zero for a new migration
> +     * set mig_stats memory to zero for a new migration.. except the
> +     * iteration counter, which we want to make sure it returns 1 for the
> +     * first iteration.
>       */
>      memset(&mig_stats, 0, sizeof(mig_stats));
> +    mig_stats.dirty_sync_count = 1;
> +
>      migration_reset_vfio_bytes_transferred();
>
>      s->postcopy_package_loaded = false;
> @@ -3230,10 +3235,28 @@ static bool migration_iteration_next_ready(MigrationState *s,
>  static void migration_iteration_go_next(MigPendingData *pending)
>  {
>      /*
> -     * Do a slow sync will achieve this.  TODO: move RAM iteration code
> -     * into the core layer.
> +     * Do a slow sync first before boosting the iteration count.
>       */
>      qemu_savevm_query_pending(pending, false);
> +
> +    /*
> +     * Boost dirty sync count to reflect we finished one iteration.
> +     *
> +     * NOTE: we need to make sure when this happens (together with the
> +     * event sent below) all modules have slow-synced the pending data
> +     * above.  That means a write mem barrier, but qatomic_add() should be
> +     * enough.
> +     *
> +     * It's because a mgmt could wait on the iteration event to query again
> +     * on pending data for policy changes (e.g. downtime adjustments).  The
> +     * ordering will make sure the query will fetch the latest results from
> +     * all the modules.
> +     */
> +    qatomic_add(&mig_stats.dirty_sync_count, 1);
> +
> +    if (migrate_events()) {
> +        qapi_event_send_migration_pass(mig_stats.dirty_sync_count);
> +    }
>  }
>
>  /*
> diff --git a/migration/ram.c b/migration/ram.c
> index 89f761a471..29e9608715 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -1136,8 +1136,6 @@ static void migration_bitmap_sync(RAMState *rs, bool last_stage)
>      RAMBlock *block;
>      int64_t end_time;
>
> -    qatomic_add(&mig_stats.dirty_sync_count, 1);
> -
>      if (!rs->time_last_bitmap_sync) {
>          rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
>      }
> @@ -1172,10 +1170,6 @@ static void migration_bitmap_sync(RAMState *rs, bool last_stage)
>          rs->num_dirty_pages_period = 0;
>          rs->bytes_xfer_prev = migration_transferred_bytes();
>      }
> -    if (migrate_events()) {
> -        uint64_t generation = qatomic_read(&mig_stats.dirty_sync_count);
> -        qapi_event_send_migration_pass(generation);
> -    }
>  }
>
>  void migration_bitmap_sync_precopy(bool last_stage)
> --

* Change looks okay. Setting dirty_sync_count = 1 seems like special
casing, we need not do it if it's not required.

Reviewed-by: Prasad Pandit <pjp@fedoraproject.org>

Thank you.
---
  - Prasad

Re: [PATCH RFC 09/12] migration: Make iteration counter out of RAM

Posted by Yong Huang 2 days, 8 hours ago

Thanks,

Reviewed-by: Hyman Huang <yong.huang@smartx.com>

On Fri, Mar 20, 2026 at 7:13 AM Peter Xu <peterx@redhat.com> wrote:

> It used to hide in RAM dirty sync path.  Now with more modules being able
> to slow sync on dirty information, keeping it there may not be good anymore
> because it's not RAM's own concept for iterations: all modules should
> follow.
>
> More importantly, mgmt may try to query dirty info (to make policy
> decisions like adjusting downtime) by listening to iteration count changes
> via QMP events.  So we must make sure the boost of iterations only happens
> _after_ the dirty sync operations with whatever form (RAM's dirty bitmap
> sync, or VFIO's different ioctls to fetch latest dirty info from kernel).
>
> Move this to core migration path to manage, together with the event
> generation, so that it can be well ordered with the sync operations for all
> modules.
>
> This brings a good side effect that we should have an old issue regarding
> to cpu_throttle_dirty_sync_timer_tick() which can randomly boost iteration
> counts (because it invokes sync ops).  Now it won't, which is actually the
> right behavior.
>
> Said that, we have code (not only QEMU, but likely mgmt too) assuming the
> 1st iteration will always shows dirty count to 1.  Make it initialized with
> 1 this time, because we'll miss the dirty sync for setup() on boosting this
> counter now.
>
> Cc: Yong Huang <yong.huang@smartx.com>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  migration/migration-stats.h |  3 ++-
>  migration/migration.c       | 29 ++++++++++++++++++++++++++---
>  migration/ram.c             |  6 ------
>  3 files changed, 28 insertions(+), 10 deletions(-)
>
> diff --git a/migration/migration-stats.h b/migration/migration-stats.h
> index 1153520f7a..326ddb0088 100644
> --- a/migration/migration-stats.h
> +++ b/migration/migration-stats.h
> @@ -43,7 +43,8 @@ typedef struct {
>       */
>      uint64_t dirty_pages_rate;
>      /*
> -     * Number of times we have synchronized guest bitmaps.
> +     * Number of times we have synchronized guest bitmaps.  This always
> +     * starts from 1 for the 1st iteration.
>       */
>      uint64_t dirty_sync_count;
>      /*
> diff --git a/migration/migration.c b/migration/migration.c
> index 42facb16d1..ad8a824585 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -1654,10 +1654,15 @@ int migrate_init(MigrationState *s, Error **errp)
>      s->threshold_size = 0;
>      s->switchover_acked = false;
>      s->rdma_migration = false;
> +
>      /*
> -     * set mig_stats memory to zero for a new migration
> +     * set mig_stats memory to zero for a new migration.. except the
> +     * iteration counter, which we want to make sure it returns 1 for the
> +     * first iteration.
>       */
>      memset(&mig_stats, 0, sizeof(mig_stats));
> +    mig_stats.dirty_sync_count = 1;
> +
>      migration_reset_vfio_bytes_transferred();
>
>      s->postcopy_package_loaded = false;
> @@ -3230,10 +3235,28 @@ static bool
> migration_iteration_next_ready(MigrationState *s,
>  static void migration_iteration_go_next(MigPendingData *pending)
>  {
>      /*
> -     * Do a slow sync will achieve this.  TODO: move RAM iteration code
> -     * into the core layer.
> +     * Do a slow sync first before boosting the iteration count.
>       */
>      qemu_savevm_query_pending(pending, false);
> +
> +    /*
> +     * Boost dirty sync count to reflect we finished one iteration.
> +     *
> +     * NOTE: we need to make sure when this happens (together with the
> +     * event sent below) all modules have slow-synced the pending data
> +     * above.  That means a write mem barrier, but qatomic_add() should be
> +     * enough.
> +     *
> +     * It's because a mgmt could wait on the iteration event to query
> again
> +     * on pending data for policy changes (e.g. downtime adjustments).
> The
> +     * ordering will make sure the query will fetch the latest results
> from
> +     * all the modules.
> +     */
> +    qatomic_add(&mig_stats.dirty_sync_count, 1);
> +
> +    if (migrate_events()) {
> +        qapi_event_send_migration_pass(mig_stats.dirty_sync_count);
> +    }
>  }
>
>  /*
> diff --git a/migration/ram.c b/migration/ram.c
> index 89f761a471..29e9608715 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -1136,8 +1136,6 @@ static void migration_bitmap_sync(RAMState *rs, bool
> last_stage)
>      RAMBlock *block;
>      int64_t end_time;
>
> -    qatomic_add(&mig_stats.dirty_sync_count, 1);
> -
>      if (!rs->time_last_bitmap_sync) {
>          rs->time_last_bitmap_sync =
> qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
>      }
> @@ -1172,10 +1170,6 @@ static void migration_bitmap_sync(RAMState *rs,
> bool last_stage)
>          rs->num_dirty_pages_period = 0;
>          rs->bytes_xfer_prev = migration_transferred_bytes();
>      }
> -    if (migrate_events()) {
> -        uint64_t generation = qatomic_read(&mig_stats.dirty_sync_count);
> -        qapi_event_send_migration_pass(generation);
> -    }
>  }
>
>  void migration_bitmap_sync_precopy(bool last_stage)
> --
> 2.50.1
>
>

-- 
Best regards