[RFC PATCH] record-replay: support SMP target machine

Nicholas Piggin posted 1 patch 9 months ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20230805063238.241780-1-npiggin@gmail.com
Maintainers: Richard Henderson <richard.henderson@linaro.org>, Paolo Bonzini <pbonzini@redhat.com>, Pavel Dovgalyuk <pavel.dovgaluk@ispras.ru>
There is a newer version of this series
accel/tcg/tcg-accel-ops-icount.c |  9 +++-
accel/tcg/tcg-accel-ops-rr.c     | 73 ++++++++++++++++++++++++--------
hw/ppc/spapr.c                   |  1 +
include/exec/replay-core.h       |  3 ++
replay/replay-internal.h         |  1 +
replay/replay.c                  | 34 ++++++++++++++-
scripts/replay-dump.py           |  5 +++
softmmu/vl.c                     |  2 +-
8 files changed, 107 insertions(+), 21 deletions(-)
[RFC PATCH] record-replay: support SMP target machine
Posted by Nicholas Piggin 9 months ago
Hi, I have a use for record/replay (RR) on an SMP machine. It seems
conceptually simple to add: the round-robin vCPU scheduler is driven in
a deterministic way, like everything else. In practice, I'm not entirely
sure I have all the locking and corner cases covered. The code is also a
bit ugly at the moment in terms of CPU switching and recording; possibly
an initial tidy-up patch that wraps up the CPU scheduling in a nice way
would make that better.

In any case, this series works well enough on a 2-CPU SMP ppc guest,
including step and reverse-step of both threads when booting Linux (with
some other ppc record/replay fix patches that have already been posted).

Any thoughts on the approach, or details?

Thanks
Nick

---
Round-robin (RR) CPU switching is driven by timers and events, so it is
deterministic like everything else. Record a CPU switch event and use it
to drive the CPU switch on replay.

Some of the RR CPU scheduling variables have to be reset to get an
exact replay.

---
 accel/tcg/tcg-accel-ops-icount.c |  9 +++-
 accel/tcg/tcg-accel-ops-rr.c     | 73 ++++++++++++++++++++++++--------
 hw/ppc/spapr.c                   |  1 +
 include/exec/replay-core.h       |  3 ++
 replay/replay-internal.h         |  1 +
 replay/replay.c                  | 34 ++++++++++++++-
 scripts/replay-dump.py           |  5 +++
 softmmu/vl.c                     |  2 +-
 8 files changed, 107 insertions(+), 21 deletions(-)

diff --git a/accel/tcg/tcg-accel-ops-icount.c b/accel/tcg/tcg-accel-ops-icount.c
index 3d2cfbbc97..c26782a56a 100644
--- a/accel/tcg/tcg-accel-ops-icount.c
+++ b/accel/tcg/tcg-accel-ops-icount.c
@@ -93,10 +93,15 @@ void icount_handle_deadline(void)
 int64_t icount_percpu_budget(int cpu_count)
 {
     int64_t limit = icount_get_limit();
-    int64_t timeslice = limit / cpu_count;
+    int64_t timeslice;
 
-    if (timeslice == 0) {
+    if (replay_mode == REPLAY_MODE_PLAY) {
         timeslice = limit;
+    } else {
+        timeslice = limit / cpu_count;
+        if (timeslice == 0) {
+            timeslice = limit;
+        }
     }
 
     return timeslice;
diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c
index 2d523289a8..672432db87 100644
--- a/accel/tcg/tcg-accel-ops-rr.c
+++ b/accel/tcg/tcg-accel-ops-rr.c
@@ -27,6 +27,7 @@
 #include "qemu/lockable.h"
 #include "sysemu/tcg.h"
 #include "sysemu/replay.h"
+#include "sysemu/reset.h"
 #include "sysemu/cpu-timers.h"
 #include "qemu/main-loop.h"
 #include "qemu/notify.h"
@@ -61,6 +67,19 @@ void rr_kick_vcpu_thread(CPUState *unused)
 
 static QEMUTimer *rr_kick_vcpu_timer;
 static CPUState *rr_current_cpu;
+static CPUState *rr_next_cpu;
+static CPUState *rr_last_cpu;
+
+static void record_replay_reset(void *param)
+{
+    if (rr_kick_vcpu_timer) {
+        timer_del(rr_kick_vcpu_timer);
+    }
+    rr_current_cpu = NULL;
+    rr_next_cpu = NULL;
+    rr_last_cpu = NULL;
+    current_cpu = NULL;
+}
 
 static inline int64_t rr_next_kick_time(void)
 {
@@ -114,8 +133,6 @@ static void rr_wait_io_event(void)
         qemu_cond_wait_iothread(first_cpu->halt_cond);
     }
 
-    rr_start_kick_timer();
-
     CPU_FOREACH(cpu) {
         qemu_wait_io_event_common(cpu);
     }
@@ -182,6 +199,8 @@ static void *rr_cpu_thread_fn(void *arg)
     Notifier force_rcu;
     CPUState *cpu = arg;
 
+    qemu_register_reset(record_replay_reset, NULL);
+
     assert(tcg_enabled());
     rcu_register_thread();
     force_rcu.notify = rr_force_rcu;
@@ -207,8 +226,6 @@ static void *rr_cpu_thread_fn(void *arg)
         }
     }
 
-    rr_start_kick_timer();
-
     cpu = first_cpu;
 
     /* process any pending work */
@@ -222,9 +239,19 @@ static void *rr_cpu_thread_fn(void *arg)
         replay_mutex_lock();
         qemu_mutex_lock_iothread();
 
-        if (icount_enabled()) {
-            int cpu_count = rr_cpu_count();
+        rr_start_kick_timer();
 
+        if (!rr_next_cpu) {
+            qatomic_set_mb(&rr_next_cpu, first_cpu);
+        }
+        cpu = rr_next_cpu;
+
+        if (cpu != rr_last_cpu) {
+            replay_switch_cpu();
+            qatomic_set_mb(&rr_last_cpu, cpu);
+        }
+
+        if (icount_enabled()) {
             /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
             icount_account_warp_timer();
             /*
@@ -233,15 +260,11 @@ static void *rr_cpu_thread_fn(void *arg)
              */
             icount_handle_deadline();
 
-            cpu_budget = icount_percpu_budget(cpu_count);
+            cpu_budget = icount_percpu_budget(rr_cpu_count());
         }
 
         replay_mutex_unlock();
 
-        if (!cpu) {
-            cpu = first_cpu;
-        }
-
         while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) {
             /* Store rr_current_cpu before evaluating cpu_can_run().  */
             qatomic_set_mb(&rr_current_cpu, cpu);
@@ -280,7 +303,21 @@ static void *rr_cpu_thread_fn(void *arg)
                 break;
             }
 
-            cpu = CPU_NEXT(cpu);
+            if (replay_mode == REPLAY_MODE_NONE) {
+                cpu = CPU_NEXT(cpu);
+	    } else if (replay_mode == REPLAY_MODE_RECORD) {
+                cpu = CPU_NEXT(cpu);
+                break;
+            } else if (replay_mode == REPLAY_MODE_PLAY) {
+                qemu_mutex_unlock_iothread();
+                replay_mutex_lock();
+                qemu_mutex_lock_iothread();
+                if (replay_has_switch_cpu()) {
+                    cpu = CPU_NEXT(cpu);
+                }
+                replay_mutex_unlock();
+                break;
+            }
         } /* while (cpu && !cpu->exit_request).. */
 
         /* Does not need a memory barrier because a spurious wakeup is okay.  */
@@ -290,6 +327,8 @@ static void *rr_cpu_thread_fn(void *arg)
             qatomic_set_mb(&cpu->exit_request, 0);
         }
 
+        qatomic_set(&rr_next_cpu, cpu);
+
         if (icount_enabled() && all_cpu_threads_idle()) {
             /*
              * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
diff --git a/include/exec/replay-core.h b/include/exec/replay-core.h
index 244c77acce..543c129a1d 100644
--- a/include/exec/replay-core.h
+++ b/include/exec/replay-core.h
@@ -52,6 +52,9 @@ void replay_gdb_attached(void);
 
 /* Interrupts and exceptions */
 
+bool replay_switch_cpu(void);
+bool replay_has_switch_cpu(void);
+
 /* Called by exception handler to write or read exception processing events */
 bool replay_exception(void);
 /*
diff --git a/replay/replay-internal.h b/replay/replay-internal.h
index b6836354ac..95849e7461 100644
--- a/replay/replay-internal.h
+++ b/replay/replay-internal.h
@@ -58,6 +58,7 @@ enum ReplayEvents {
     /* some of greater codes are reserved for checkpoints */
     EVENT_CHECKPOINT,
     EVENT_CHECKPOINT_LAST = EVENT_CHECKPOINT + CHECKPOINT_COUNT - 1,
+    EVENT_SWITCH_CPU,
     /* end of log event */
     EVENT_END,
     EVENT_COUNT
diff --git a/replay/replay.c b/replay/replay.c
index 0f7d766efe..ed0f020903 100644
--- a/replay/replay.c
+++ b/replay/replay.c
@@ -98,9 +98,41 @@ void replay_account_executed_instructions(void)
     }
 }
 
-bool replay_exception(void)
+bool replay_switch_cpu(void)
+{
+    if (replay_mode == REPLAY_MODE_RECORD) {
+        g_assert(replay_mutex_locked());
+        replay_save_instructions();
+        replay_put_event(EVENT_SWITCH_CPU);
+        return true;
+    } else if (replay_mode == REPLAY_MODE_PLAY) {
+        bool res = replay_has_switch_cpu();
+        if (res) {
+            replay_finish_event();
+        } else {
+            assert(0);
+        }
+        return res;
+    }
+
+    return true;
+}
+
+bool replay_has_switch_cpu(void)
 {
+    bool res = false;
+    if (replay_mode == REPLAY_MODE_PLAY) {
+        g_assert(replay_mutex_locked());
+        replay_account_executed_instructions();
+        res = replay_next_event_is(EVENT_SWITCH_CPU);
+    }
+
+    return res;
+}
 
+
+bool replay_exception(void)
+{
     if (replay_mode == REPLAY_MODE_RECORD) {
         g_assert(replay_mutex_locked());
         replay_save_instructions();
diff --git a/softmmu/vl.c b/softmmu/vl.c
index 0f39996640..fe57170ed7 100644
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@@ -1883,7 +1883,7 @@ static void qemu_apply_machine_options(QDict *qdict)
         semihosting_arg_fallback(current_machine->kernel_filename, current_machine->kernel_cmdline);
     }
 
-    if (current_machine->smp.cpus > 1) {
+    if (0 && current_machine->smp.cpus > 1) {
         replay_add_blocker("smp");
     }
 }
-- 
2.40.1