[PATCH-for-4.17 v2] xen/sched: migrate timers to correct cpus after suspend

Juergen Gross posted 1 patch 1 year, 6 months ago
Failed in applying to current master (apply log)
There is a newer version of this series
xen/common/sched/core.c    | 26 +++++++++++++++++
xen/common/sched/cpupool.c |  2 ++
xen/common/sched/credit.c  | 13 +++++++++
xen/common/sched/private.h | 10 +++++++
xen/common/sched/rt.c      | 58 ++++++++++++++++++++++++++------------
5 files changed, 91 insertions(+), 18 deletions(-)
[PATCH-for-4.17 v2] xen/sched: migrate timers to correct cpus after suspend
Posted by Juergen Gross 1 year, 6 months ago
Today all timers are migrated to cpu 0 when the system is being
suspended. They are not migrated back after resuming the system again.

This results (at least) in problems with the credit scheduler, as the
timer isn't handled on the cpu it was expected to occur on.

Add migrating the scheduling related timers of a specific cpu from cpu
0 back to its original cpu when that cpu comes up again while resuming
the system.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
V2:
- fix smt=0 case (Marek Marczykowski-Górecki)
---
 xen/common/sched/core.c    | 26 +++++++++++++++++
 xen/common/sched/cpupool.c |  2 ++
 xen/common/sched/credit.c  | 13 +++++++++
 xen/common/sched/private.h | 10 +++++++
 xen/common/sched/rt.c      | 58 ++++++++++++++++++++++++++------------
 5 files changed, 91 insertions(+), 18 deletions(-)

diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
index 23fa6845a8..0f577e472b 100644
--- a/xen/common/sched/core.c
+++ b/xen/common/sched/core.c
@@ -1284,6 +1284,32 @@ static int cpu_disable_scheduler_check(unsigned int cpu)
     return 0;
 }
 
+/*
+ * Called after a cpu has come up again in a suspend/resume cycle.
+ * Note that on a system with smt=0 this will be called for the sibling cpus,
+ * too, so the case for no scheduling resource being available must be
+ * considered.
+ * Migrate all timers for this cpu (they have been migrated to cpu 0 when the
+ * cpu was going down).
+ * Note that only timers related to a physical cpu are migrated, not the ones
+ * related to a vcpu or domain.
+ */
+void sched_migrate_timers(unsigned int cpu)
+{
+    struct sched_resource *sr;
+
+    rcu_read_lock(&sched_res_rculock);
+
+    sr = get_sched_res(cpu);
+    if ( sr && sr->master_cpu == cpu )
+    {
+        migrate_timer(&sr->s_timer, cpu);
+        sched_move_timers(sr->scheduler, sr);
+    }
+
+    rcu_read_unlock(&sched_res_rculock);
+}
+
 /*
  * In general, this must be called with the scheduler lock held, because the
  * adjust_affinity hook may want to modify the vCPU state. However, when the
diff --git a/xen/common/sched/cpupool.c b/xen/common/sched/cpupool.c
index b2c6f520c3..bdf6030ab0 100644
--- a/xen/common/sched/cpupool.c
+++ b/xen/common/sched/cpupool.c
@@ -1035,6 +1035,8 @@ static int cf_check cpu_callback(
     case CPU_ONLINE:
         if ( system_state <= SYS_STATE_active )
             rc = cpupool_cpu_add(cpu);
+        else
+            sched_migrate_timers(cpu);
         break;
     case CPU_DOWN_PREPARE:
         /* Suspend/Resume don't change assignments of cpus to cpupools. */
diff --git a/xen/common/sched/credit.c b/xen/common/sched/credit.c
index 47945c2834..f2cd3d9da3 100644
--- a/xen/common/sched/credit.c
+++ b/xen/common/sched/credit.c
@@ -614,6 +614,18 @@ init_pdata(struct csched_private *prv, struct csched_pcpu *spc, int cpu)
     spc->nr_runnable = 0;
 }
 
+static void cf_check
+csched_move_timers(const struct scheduler *ops, struct sched_resource *sr)
+{
+    struct csched_private *prv = CSCHED_PRIV(ops);
+    struct csched_pcpu *spc = sr->sched_priv;
+
+    if ( sr->master_cpu == prv->master )
+        migrate_timer(&prv->master_ticker, prv->master);
+
+    migrate_timer(&spc->ticker, sr->master_cpu);
+}
+
 /* Change the scheduler of cpu to us (Credit). */
 static spinlock_t *cf_check
 csched_switch_sched(struct scheduler *new_ops, unsigned int cpu,
@@ -2264,6 +2276,7 @@ static const struct scheduler sched_credit_def = {
     .switch_sched   = csched_switch_sched,
     .alloc_domdata  = csched_alloc_domdata,
     .free_domdata   = csched_free_domdata,
+    .move_timers    = csched_move_timers,
 };
 
 REGISTER_SCHEDULER(sched_credit_def);
diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h
index 0126a4bb9e..0527a8c70d 100644
--- a/xen/common/sched/private.h
+++ b/xen/common/sched/private.h
@@ -331,6 +331,8 @@ struct scheduler {
                                     struct xen_sysctl_scheduler_op *);
     void         (*dump_settings)  (const struct scheduler *);
     void         (*dump_cpu_state) (const struct scheduler *, int);
+    void         (*move_timers)    (const struct scheduler *,
+                                    struct sched_resource *);
 };
 
 static inline int sched_init(struct scheduler *s)
@@ -485,6 +487,13 @@ static inline int sched_adjust_cpupool(const struct scheduler *s,
     return s->adjust_global ? s->adjust_global(s, op) : 0;
 }
 
+static inline void sched_move_timers(const struct scheduler *s,
+                                     struct sched_resource *sr)
+{
+    if ( s->move_timers )
+        s->move_timers(s, sr);
+}
+
 static inline void sched_unit_pause_nosync(const struct sched_unit *unit)
 {
     struct vcpu *v;
@@ -622,6 +631,7 @@ struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu, bool aff_alloc);
 void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu);
 int schedule_cpu_rm(unsigned int cpu, struct cpu_rm_data *mem);
 int sched_move_domain(struct domain *d, struct cpupool *c);
+void sched_migrate_timers(unsigned int cpu);
 struct cpupool *cpupool_get_by_id(unsigned int poolid);
 void cpupool_put(struct cpupool *pool);
 int cpupool_add_domain(struct domain *d, unsigned int poolid);
diff --git a/xen/common/sched/rt.c b/xen/common/sched/rt.c
index 1f8d074884..d443cd5831 100644
--- a/xen/common/sched/rt.c
+++ b/xen/common/sched/rt.c
@@ -750,6 +750,27 @@ rt_switch_sched(struct scheduler *new_ops, unsigned int cpu,
     return &prv->lock;
 }
 
+static void move_repl_timer(struct rt_private *prv, unsigned int old_cpu)
+{
+    cpumask_t *online = get_sched_res(old_cpu)->cpupool->res_valid;
+    unsigned int new_cpu = cpumask_cycle(old_cpu, online);
+
+    /*
+     * Make sure the timer runs on one of the cpus that are still available
+     * to this scheduler. If there aren't any left, it means it's time
+     * to just kill it.
+     */
+    if ( new_cpu >= nr_cpu_ids )
+    {
+        kill_timer(&prv->repl_timer);
+        dprintk(XENLOG_DEBUG, "RTDS: timer killed on cpu %d\n", old_cpu);
+    }
+    else
+    {
+        migrate_timer(&prv->repl_timer, new_cpu);
+    }
+}
+
 static void cf_check
 rt_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
 {
@@ -759,25 +780,25 @@ rt_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
     spin_lock_irqsave(&prv->lock, flags);
 
     if ( prv->repl_timer.cpu == cpu )
-    {
-        cpumask_t *online = get_sched_res(cpu)->cpupool->res_valid;
-        unsigned int new_cpu = cpumask_cycle(cpu, online);
+        move_repl_timer(prv, cpu);
 
-        /*
-         * Make sure the timer run on one of the cpus that are still available
-         * to this scheduler. If there aren't any left, it means it's the time
-         * to just kill it.
-         */
-        if ( new_cpu >= nr_cpu_ids )
-        {
-            kill_timer(&prv->repl_timer);
-            dprintk(XENLOG_DEBUG, "RTDS: timer killed on cpu %d\n", cpu);
-        }
-        else
-        {
-            migrate_timer(&prv->repl_timer, new_cpu);
-        }
-    }
+    spin_unlock_irqrestore(&prv->lock, flags);
+}
+
+static void cf_check
+rt_move_timers(const struct scheduler *ops, struct sched_resource *sr)
+{
+    unsigned long flags;
+    struct rt_private *prv = rt_priv(ops);
+    unsigned int old_cpu;
+
+    spin_lock_irqsave(&prv->lock, flags);
+
+    old_cpu = prv->repl_timer.cpu;
+    if ( prv->repl_timer.status != TIMER_STATUS_invalid &&
+         prv->repl_timer.status != TIMER_STATUS_killed &&
+         !cpumask_test_cpu(old_cpu, sr->cpupool->res_valid) )
+        move_repl_timer(prv, old_cpu);
 
     spin_unlock_irqrestore(&prv->lock, flags);
 }
@@ -1561,6 +1582,7 @@ static const struct scheduler sched_rtds_def = {
     .sleep          = rt_unit_sleep,
     .wake           = rt_unit_wake,
     .context_saved  = rt_context_saved,
+    .move_timers    = rt_move_timers,
 };
 
 REGISTER_SCHEDULER(sched_rtds_def);
-- 
2.35.3


Re: [PATCH-for-4.17 v2] xen/sched: migrate timers to correct cpus after suspend
Posted by Jan Beulich 1 year, 6 months ago
On 28.10.2022 13:12, Juergen Gross wrote:
> Today all timers are migrated to cpu 0 when the system is being
> suspended. They are not migrated back after resuming the system again.
> 
> This results (at least) to problems with the credit scheduler, as the
> timer isn't handled on the cpu it was expected to occur.

While you say "at least", this doesn't really make clear in how far all
four timers for which you change their handling are actually problematic,
or whether for some you make the adjustment "just in case". Looking at
core.c's s_timer I'm inclined to say that with s_timer_fn() raising the
schedule softirq things can't go well when this doesn't happen on the
correct CPU. Just that it won't be an obvious problem like the crash in
credit1 which had prompted the creation of this patch.

> Add migrating the scheduling related timers of a specific cpu from cpu
> 0 back to its original cpu when that cpu has gone up when resuming the
> system.
> 
> Signed-off-by: Juergen Gross <jgross@suse.com>

Is there any Fixes: tag possible? If not, could you add a respective
note in the post-commit-message area, to increase the chance of
recalling that this will want queuing for backport? (Or maybe the lack
of a reasonable Fixes: tag could actually justify the use of the
Backport: one.)

> --- a/xen/common/sched/core.c
> +++ b/xen/common/sched/core.c
> @@ -1284,6 +1284,32 @@ static int cpu_disable_scheduler_check(unsigned int cpu)
>      return 0;
>  }
>  
> +/*
> + * Called after a cpu has come up again in a suspend/resume cycle.
> + * Note that on a system with smt=0 this will be called for the sibling cpus,
> + * too, so the case for no scheduling resource being available must be
> + * considered.

I think this part of the comment would better live ...

> + * Migrate all timers for this cpu (they have been migrated to cpu 0 when the
> + * cpu was going down).
> + * Note that only timers related to a physical cpu are migrated, not the ones
> + * related to a vcpu or domain.
> + */
> +void sched_migrate_timers(unsigned int cpu)
> +{
> +    struct sched_resource *sr;
> +
> +    rcu_read_lock(&sched_res_rculock);
> +
> +    sr = get_sched_res(cpu);

... inbetween here, increasing the chance that there won't be someone
trying to remove the extra check ...

> +    if ( sr && sr->master_cpu == cpu )

... from here.

I further think that explicitly mentioning "smt=0" isn't very helpful.
Aiui a system with some CPUs otherwise soft-offlined would suffer the
same problem. And I further assume no problem would occur even with
"smt=0" on AMD hardware or Arm (where we don't park CPUs). At the very
least I'd therefore like to ask for "e.g." to be inserted; better
would be a more generic statement like "with some CPUs parked".

Jan
Re: [PATCH-for-4.17 v2] xen/sched: migrate timers to correct cpus after suspend
Posted by Juergen Gross 1 year, 6 months ago
On 02.11.22 15:07, Jan Beulich wrote:
> On 28.10.2022 13:12, Juergen Gross wrote:
>> Today all timers are migrated to cpu 0 when the system is being
>> suspended. They are not migrated back after resuming the system again.
>>
>> This results (at least) to problems with the credit scheduler, as the
>> timer isn't handled on the cpu it was expected to occur.
> 
> While you say "at least", this doesn't really make clear in how far all
> four timers for which you change their handling are actually problematic,
> or whether for some you make the adjustment "just in case". Looking at
> core.c's s_timer I'm inclined to say that with s_timer_fn() raising the
> schedule softirq things can't go well when this doesn't happen on the
> correct CPU. Just that it won't be an obvious problem like the crash in
> credit1 which had prompted the creation of this patch.
> 
>> Add migrating the scheduling related timers of a specific cpu from cpu
>> 0 back to its original cpu when that cpu has gone up when resuming the
>> system.
>>
>> Signed-off-by: Juergen Gross <jgross@suse.com>
> 
> Is there any Fixes: tag possible? If not, could you add a respective
> note in the post-commit-message area, to increase the chance of
> recalling that this will want queuing for backport? (Or maybe the lack
> of a reasonable Fixes: tag could actually justify the use of the
> Backport: one.)

I'm not sure whether it is really correct, but I assume the most probable
candidate for a Fixes: tag would be commit 0763cd268789.

> 
>> --- a/xen/common/sched/core.c
>> +++ b/xen/common/sched/core.c
>> @@ -1284,6 +1284,32 @@ static int cpu_disable_scheduler_check(unsigned int cpu)
>>       return 0;
>>   }
>>   
>> +/*
>> + * Called after a cpu has come up again in a suspend/resume cycle.
>> + * Note that on a system with smt=0 this will be called for the sibling cpus,
>> + * too, so the case for no scheduling resource being available must be
>> + * considered.
> 
> I think this part of the comment would better live ...
> 
>> + * Migrate all timers for this cpu (they have been migrated to cpu 0 when the
>> + * cpu was going down).
>> + * Note that only timers related to a physical cpu are migrated, not the ones
>> + * related to a vcpu or domain.
>> + */
>> +void sched_migrate_timers(unsigned int cpu)
>> +{
>> +    struct sched_resource *sr;
>> +
>> +    rcu_read_lock(&sched_res_rculock);
>> +
>> +    sr = get_sched_res(cpu);
> 
> ... inbetween here, increasing the chance that there won't be someone
> trying to remove the extra check ...

Okay.

> 
>> +    if ( sr && sr->master_cpu == cpu )
> 
> ... from here.
> 
> I further think that explicitly mentioning "smt=0" isn't very helpful.
> Aiui a system with some CPUs otherwise soft-offlined would suffer the
> same problem. And I further assume no problem would occur even with
> "smt=0" on AMD hardware or Arm (where we don't park CPUs). At the very
> least I'd therefore like to ask for "e.g." to be inserted; better
> would be a more generic statement like "with some CPUs parked".

I'll go with the latter.


Juergen
Re: [PATCH-for-4.17 v2] xen/sched: migrate timers to correct cpus after suspend
Posted by Marek Marczykowski-Górecki 1 year, 6 months ago
On Fri, Oct 28, 2022 at 01:12:31PM +0200, Juergen Gross wrote:
> Today all timers are migrated to cpu 0 when the system is being
> suspended. They are not migrated back after resuming the system again.
> 
> This results (at least) to problems with the credit scheduler, as the
> timer isn't handled on the cpu it was expected to occur.
> 
> Add migrating the scheduling related timers of a specific cpu from cpu
> 0 back to its original cpu when that cpu has gone up when resuming the
> system.
> 
> Signed-off-by: Juergen Gross <jgross@suse.com>

This time it works, thanks!

Tested-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>

-- 
Best Regards,
Marek Marczykowski-Górecki
Invisible Things Lab