Dietmar reported that commit 3840cbe24cf0 ("sched: psi: fix bogus
pressure spikes from aggregation race") caused a regression for him on
a high context switch rate benchmark (schbench) due to the now
repeating cpu_clock() calls.
In particular the problem is that get_recent_times() will extrapolate
the current state to 'now'. But if an update uses a timestamp from
before the start of the update, it is possible to get two reads
with inconsistent results. It is effectively back-dating an update.
(note that this all hard-relies on the clock being synchronized across
CPUs -- if this is not the case, all bets are off).
Because the seqcounts are per-group-per-cpu, the commit in question had
to push the clock read into the group iteration, causing one
cpu_clock() call per level of the cgroup tree. On architectures where
cpu_clock() has appreciable overhead, this hurts.
Instead move to a per-cpu seqcount, which allows us to have a single
clock read for all group updates, increasing internal consistency and
lowering update overhead. This comes at the cost of a longer update
side (proportional to the tree depth) which can cause the read side to
retry more often.
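
Roughly, the update side goes from the former per-group pattern to a
single per-cpu section (simplified sketch only; rq locking, the task
state accounting and rtpoll scheduling are omitted):

	/* before: per-group-per-cpu seqcount, one clock read per group */
	for (group = task_psi_group(task); group; group = group->parent) {
		struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);

		write_seqcount_begin(&groupc->seq);
		now = cpu_clock(cpu);		/* tree-depth clock reads */
		/* ... record times, update groupc ... */
		write_seqcount_end(&groupc->seq);
	}

	/* after: one per-cpu seqcount, a single clock read for the walk */
	write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
	now = cpu_clock(cpu);			/* read once */
	for_each_group(group, task_psi_group(task))
		psi_group_change(group, cpu, clear, set, now, true);
	write_seqcount_end(per_cpu_ptr(&psi_seq, cpu));
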
Fixes: 3840cbe24cf0 ("sched: psi: fix bogus pressure spikes from aggregation race")
Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lkml.kernel.org/20250522084844.GC31726@noisy.programming.kicks-ass.net
---
include/linux/psi_types.h | 6 --
kernel/sched/psi.c | 121 +++++++++++++++++++++++++---------------------
2 files changed, 68 insertions(+), 59 deletions(-)
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -84,11 +84,9 @@ enum psi_aggregators {
struct psi_group_cpu {
/* 1st cacheline updated by the scheduler */
- /* Aggregator needs to know of concurrent changes */
- seqcount_t seq ____cacheline_aligned_in_smp;
-
/* States of the tasks belonging to this group */
- unsigned int tasks[NR_PSI_TASK_COUNTS];
+ unsigned int tasks[NR_PSI_TASK_COUNTS]
+ ____cacheline_aligned_in_smp;
/* Aggregate pressure state derived from the tasks */
u32 state_mask;
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -176,6 +176,28 @@ struct psi_group psi_system = {
.pcpu = &system_group_pcpu,
};
+static DEFINE_PER_CPU(seqcount_t, psi_seq);
+
+static inline void psi_write_begin(int cpu)
+{
+ write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline void psi_write_end(int cpu)
+{
+ write_seqcount_end(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline u32 psi_read_begin(int cpu)
+{
+ return read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline bool psi_read_retry(int cpu, u32 seq)
+{
+ return read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq);
+}
+
static void psi_avgs_work(struct work_struct *work);
static void poll_timer_fn(struct timer_list *t);
@@ -186,7 +208,7 @@ static void group_init(struct psi_group
group->enabled = true;
for_each_possible_cpu(cpu)
- seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
+ seqcount_init(per_cpu_ptr(&psi_seq, cpu));
group->avg_last_update = sched_clock();
group->avg_next_update = group->avg_last_update + psi_period;
mutex_init(&group->avgs_lock);
@@ -266,14 +288,14 @@ static void get_recent_times(struct psi_
/* Snapshot a coherent view of the CPU state */
do {
- seq = read_seqcount_begin(&groupc->seq);
+ seq = psi_read_begin(cpu);
now = cpu_clock(cpu);
memcpy(times, groupc->times, sizeof(groupc->times));
state_mask = groupc->state_mask;
state_start = groupc->state_start;
if (cpu == current_cpu)
memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
- } while (read_seqcount_retry(&groupc->seq, seq));
+ } while (psi_read_retry(cpu, seq));
/* Calculate state time deltas against the previous snapshot */
for (s = 0; s < NR_PSI_STATES; s++) {
@@ -772,31 +794,21 @@ static void record_times(struct psi_grou
groupc->times[PSI_NONIDLE] += delta;
}
+#define for_each_group(iter, group) \
+ for (typeof(group) iter = group; iter; iter = iter->parent)
+
static void psi_group_change(struct psi_group *group, int cpu,
unsigned int clear, unsigned int set,
- bool wake_clock)
+ u64 now, bool wake_clock)
{
struct psi_group_cpu *groupc;
unsigned int t, m;
u32 state_mask;
- u64 now;
lockdep_assert_rq_held(cpu_rq(cpu));
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
- * First we update the task counts according to the state
- * change requested through the @clear and @set bits.
- *
- * Then if the cgroup PSI stats accounting enabled, we
- * assess the aggregate resource states this CPU's tasks
- * have been in since the last change, and account any
- * SOME and FULL time these may have resulted in.
- */
- write_seqcount_begin(&groupc->seq);
- now = cpu_clock(cpu);
-
- /*
* Start with TSK_ONCPU, which doesn't have a corresponding
* task count - it's just a boolean flag directly encoded in
* the state mask. Clear, set, or carry the current state if
@@ -847,7 +859,6 @@ static void psi_group_change(struct psi_
groupc->state_mask = state_mask;
- write_seqcount_end(&groupc->seq);
return;
}
@@ -868,8 +879,6 @@ static void psi_group_change(struct psi_
groupc->state_mask = state_mask;
- write_seqcount_end(&groupc->seq);
-
if (state_mask & group->rtpoll_states)
psi_schedule_rtpoll_work(group, 1, false);
@@ -904,24 +913,29 @@ static void psi_flags_change(struct task
void psi_task_change(struct task_struct *task, int clear, int set)
{
int cpu = task_cpu(task);
- struct psi_group *group;
+ u64 now;
if (!task->pid)
return;
psi_flags_change(task, clear, set);
- group = task_psi_group(task);
- do {
- psi_group_change(group, cpu, clear, set, true);
- } while ((group = group->parent));
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
+ for_each_group(group, task_psi_group(task))
+ psi_group_change(group, cpu, clear, set, now, true);
+ psi_write_end(cpu);
}
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep)
{
- struct psi_group *group, *common = NULL;
+ struct psi_group *common = NULL;
int cpu = task_cpu(prev);
+ u64 now;
+
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
if (next->pid) {
psi_flags_change(next, 0, TSK_ONCPU);
@@ -930,16 +944,15 @@ void psi_task_switch(struct task_struct
* ancestors with @prev, those will already have @prev's
* TSK_ONCPU bit set, and we can stop the iteration there.
*/
- group = task_psi_group(next);
- do {
- if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
- PSI_ONCPU) {
+ for_each_group(group, task_psi_group(next)) {
+ struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+
+ if (groupc->state_mask & PSI_ONCPU) {
common = group;
break;
}
-
- psi_group_change(group, cpu, 0, TSK_ONCPU, true);
- } while ((group = group->parent));
+ psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
+ }
}
if (prev->pid) {
@@ -972,12 +985,11 @@ void psi_task_switch(struct task_struct
psi_flags_change(prev, clear, set);
- group = task_psi_group(prev);
- do {
+ for_each_group(group, task_psi_group(prev)) {
if (group == common)
break;
- psi_group_change(group, cpu, clear, set, wake_clock);
- } while ((group = group->parent));
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
+ }
/*
* TSK_ONCPU is handled up to the common ancestor. If there are
@@ -987,20 +999,21 @@ void psi_task_switch(struct task_struct
*/
if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
clear &= ~TSK_ONCPU;
- for (; group; group = group->parent)
- psi_group_change(group, cpu, clear, set, wake_clock);
+ for_each_group(group, common)
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
}
}
+ psi_write_end(cpu);
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
{
int cpu = task_cpu(curr);
- struct psi_group *group;
struct psi_group_cpu *groupc;
s64 delta;
u64 irq;
+ u64 now;
if (static_branch_likely(&psi_disabled) || !irqtime_enabled())
return;
@@ -1009,8 +1022,7 @@ void psi_account_irqtime(struct rq *rq,
return;
lockdep_assert_rq_held(rq);
- group = task_psi_group(curr);
- if (prev && task_psi_group(prev) == group)
+ if (prev && task_psi_group(prev) == task_psi_group(curr))
return;
irq = irq_time_read(cpu);
@@ -1019,25 +1031,22 @@ void psi_account_irqtime(struct rq *rq,
return;
rq->psi_irq_time = irq;
- do {
- u64 now;
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
+ for_each_group(group, task_psi_group(curr)) {
if (!group->enabled)
continue;
groupc = per_cpu_ptr(group->pcpu, cpu);
- write_seqcount_begin(&groupc->seq);
- now = cpu_clock(cpu);
-
record_times(groupc, now);
groupc->times[PSI_IRQ_FULL] += delta;
- write_seqcount_end(&groupc->seq);
-
if (group->rtpoll_states & (1 << PSI_IRQ_FULL))
psi_schedule_rtpoll_work(group, 1, false);
- } while ((group = group->parent));
+ }
+ psi_write_end(cpu);
}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
@@ -1225,12 +1234,14 @@ void psi_cgroup_restart(struct psi_group
return;
for_each_possible_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
+ u64 now;
- rq_lock_irq(rq, &rf);
- psi_group_change(group, cpu, 0, 0, true);
- rq_unlock_irq(rq, &rf);
+ guard(rq_lock_irq)(cpu_rq(cpu));
+
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
+ psi_group_change(group, cpu, 0, 0, now, true);
+ psi_write_end(cpu);
}
}
#endif /* CONFIG_CGROUPS */
On 7/2/25 7:49 AM, Peter Zijlstra wrote:
> Dietmar reported that commit 3840cbe24cf0 ("sched: psi: fix bogus
> pressure spikes from aggregation race") caused a regression for him on
> a high context switch rate benchmark (schbench) due to the now
> repeating cpu_clock() calls.

[ ... ]

> +static DEFINE_PER_CPU(seqcount_t, psi_seq);

[ ... ]

> @@ -186,7 +208,7 @@ static void group_init(struct psi_group
>
>  	group->enabled = true;
>  	for_each_possible_cpu(cpu)
> -		seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
> +		seqcount_init(per_cpu_ptr(&psi_seq, cpu));
>  	group->avg_last_update = sched_clock();
>  	group->avg_next_update = group->avg_last_update + psi_period;
>  	mutex_init(&group->avgs_lock);

I'm not sure if someone mentioned this already, but testing the series
I got a bunch of softlockups in get_recent_times() that randomly jumped
from CPU to CPU.

This fixed it for me, but reading it now I'm wondering if we want to
seqcount_init() unconditionally even when PSI is off.

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 2024c1d36402d..979a447bc281f 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -207,8 +207,6 @@ static void group_init(struct psi_group *group)
 	int cpu;
 
 	group->enabled = true;
-	for_each_possible_cpu(cpu)
-		seqcount_init(per_cpu_ptr(&psi_seq, cpu));
 	group->avg_last_update = sched_clock();
 	group->avg_next_update = group->avg_last_update + psi_period;
 	mutex_init(&group->avgs_lock);
@@ -231,6 +229,7 @@ static void group_init(struct psi_group *group)
 
 void __init psi_init(void)
 {
+	int cpu;
 	if (!psi_enable) {
 		static_branch_enable(&psi_disabled);
 		static_branch_disable(&psi_cgroups_enabled);
@@ -241,6 +240,8 @@ void __init psi_init(void)
 	static_branch_disable(&psi_cgroups_enabled);
 
 	psi_period = jiffies_to_nsecs(PSI_FREQ);
+	for_each_possible_cpu(cpu)
+		seqcount_init(per_cpu_ptr(&psi_seq, cpu));
 	group_init(&psi_system);
 }
On Tue, Jul 15, 2025 at 03:11:14PM -0400, Chris Mason wrote:
> On 7/2/25 7:49 AM, Peter Zijlstra wrote:

[ ... ]

> > @@ -186,7 +208,7 @@ static void group_init(struct psi_group
> >
> >  	group->enabled = true;
> >  	for_each_possible_cpu(cpu)
> > -		seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
> > +		seqcount_init(per_cpu_ptr(&psi_seq, cpu));
> >  	group->avg_last_update = sched_clock();
> >  	group->avg_next_update = group->avg_last_update + psi_period;
> >  	mutex_init(&group->avgs_lock);
>
> I'm not sure if someone mentioned this already, but testing the
> series I got a bunch of softlockups in get_recent_times()
> that randomly jumped from CPU to CPU.

... was beaten to it. I can observe the same.

>
> This fixed it for me, but reading it now I'm wondering
> if we want to seqcount_init() unconditionally even when PSI
> is off.
>
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 2024c1d36402d..979a447bc281f 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c

[ ... ]

> @@ -241,6 +240,8 @@ void __init psi_init(void)
>  	static_branch_disable(&psi_cgroups_enabled);
>
>  	psi_period = jiffies_to_nsecs(PSI_FREQ);
> +	for_each_possible_cpu(cpu)
> +		seqcount_init(per_cpu_ptr(&psi_seq, cpu));
>  	group_init(&psi_system);
>  }
>

Wouldn't it be enough to use SEQCNT_ZERO? Those are static per-cpu ones.

---
BR
Beata
On Wed, Jul 16, 2025 at 08:53:01AM +0200, Beata Michalska wrote:
> Wouldn't it be enough to use SEQCNT_ZERO? Those are static per-cpu ones.

Yeah, I suppose that should work. The below builds, but I've not yet
observed the issue myself.

---
Subject: sched/psi: Fix psi_seq initialization
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 15 Jul 2025 15:11:14 -0400

With the seqcount moved out of the group into a global psi_seq,
re-initializing the seqcount on group creation is causing seqcount
corruption.

Fixes: 570c8efd5eb7 ("sched/psi: Optimize psi_group_change() cpu_clock() usage")
Reported-by: Chris Mason <clm@meta.com>
Suggested-by: Beata Michalska <beata.michalska@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/psi.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -176,7 +176,7 @@ struct psi_group psi_system = {
 	.pcpu = &system_group_pcpu,
 };
 
-static DEFINE_PER_CPU(seqcount_t, psi_seq);
+static DEFINE_PER_CPU(seqcount_t, psi_seq) = SEQCNT_ZERO(psi_seq);
 
 static inline void psi_write_begin(int cpu)
 {
@@ -204,11 +204,7 @@ static void poll_timer_fn(struct timer_l
 
 static void group_init(struct psi_group *group)
 {
-	int cpu;
-
 	group->enabled = true;
-	for_each_possible_cpu(cpu)
-		seqcount_init(per_cpu_ptr(&psi_seq, cpu));
 	group->avg_last_update = sched_clock();
 	group->avg_next_update = group->avg_last_update + psi_period;
 	mutex_init(&group->avgs_lock);
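
For reference, one plausible interleaving behind such corruption (a
sketch inferred from the change above, not a traced failure):

	CPU0 (scheduler update)		CPU1 (new cgroup being created)
	psi_write_begin(cpu)
	  psi_seq[cpu]: 0 -> 1 (odd)
					group_init()
					  seqcount_init(per_cpu_ptr(&psi_seq, cpu))
					    psi_seq[cpu] reset to 0
	psi_write_end(cpu)
	  psi_seq[cpu]: 0 -> 1 (odd, no writer active)

The count is left odd with no write in progress, so a reader in
get_recent_times() spins in read_seqcount_begin() waiting for an even
value -- consistent with the softlockups reported in this thread. A
static SEQCNT_ZERO() initializer (or a one-time init in psi_init())
never re-initializes a seqcount that may already be in use.
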
Hello Ingo, Peter,

On 7/16/2025 4:10 PM, Peter Zijlstra wrote:
> On Wed, Jul 16, 2025 at 08:53:01AM +0200, Beata Michalska wrote:
>> Wouldn't it be enough to use SEQCNT_ZERO? Those are static per-cpu ones.
>
> Yeah, I suppose that should work. The below builds, but I've not yet
> observed the issue myself.
>
> ---
> Subject: sched/psi: Fix psi_seq initialization
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Tue, 15 Jul 2025 15:11:14 -0400
>
> With the seqcount moved out of the group into a global psi_seq,
> re-initializing the seqcount on group creation is causing seqcount
> corruption.
>
> Fixes: 570c8efd5eb7 ("sched/psi: Optimize psi_group_change() cpu_clock() usage")
> Reported-by: Chris Mason <clm@meta.com>
> Suggested-by: Beata Michalska <beata.michalska@arm.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

I've been running with this fix for a bunch of my testing and when I
forget about it (as was the case when testing John's Proxy Exec branch),
I usually run into the softlockup in psi_avgs_work().

Is it too late to include this in tip:sched/core for v6.17?

Also feel free to include:

Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>

--
Thanks and Regards,
Prateek
On 7/16/2025 4:10 PM, Peter Zijlstra wrote:
> On Wed, Jul 16, 2025 at 08:53:01AM +0200, Beata Michalska wrote:
>> Wouldn't it be enough to use SEQCNT_ZERO? Those are static per-cpu ones.
>
> Yeah, I suppose that should work. The below builds, but I've not yet
> observed the issue myself.
>
> ---
> Subject: sched/psi: Fix psi_seq initialization
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Tue, 15 Jul 2025 15:11:14 -0400
>
> With the seqcount moved out of the group into a global psi_seq,
> re-initializing the seqcount on group creation is causing seqcount
> corruption.
>
> Fixes: 570c8efd5eb7 ("sched/psi: Optimize psi_group_change() cpu_clock() usage")
> Reported-by: Chris Mason <clm@meta.com>
> Suggested-by: Beata Michalska <beata.michalska@arm.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

[ ... ]

I've tested the above patch, and it resolves the issue. Could we include
this patch in the linux-next builds? We have been encountering the
reported issue regularly in our daily CI.

Tested-by: Srikanth Aithal <sraithal@amd.com>
On 7/16/25 6:40 AM, Peter Zijlstra wrote:
> On Wed, Jul 16, 2025 at 08:53:01AM +0200, Beata Michalska wrote:
>> Wouldn't it be enough to use SEQCNT_ZERO? Those are static per-cpu ones.
>
> Yeah, I suppose that should work. The below builds, but I've not yet
> observed the issue myself.

[ ... ]

> -static DEFINE_PER_CPU(seqcount_t, psi_seq);
> +static DEFINE_PER_CPU(seqcount_t, psi_seq) = SEQCNT_ZERO(psi_seq);

As expected, this also works. Thanks everyone.

-chris
On Wed, Jul 16, 2025 at 12:40:50PM +0200, Peter Zijlstra wrote:
> On Wed, Jul 16, 2025 at 08:53:01AM +0200, Beata Michalska wrote:
> > Wouldn't it be enough to use SEQCNT_ZERO? Those are static per-cpu ones.
>
> Yeah, I suppose that should work. The below builds, but I've not yet
> observed the issue myself.
>
> ---
> Subject: sched/psi: Fix psi_seq initialization
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Tue, 15 Jul 2025 15:11:14 -0400
>
> With the seqcount moved out of the group into a global psi_seq,
> re-initializing the seqcount on group creation is causing seqcount
> corruption.
>
> Fixes: 570c8efd5eb7 ("sched/psi: Optimize psi_group_change() cpu_clock() usage")
> Reported-by: Chris Mason <clm@meta.com>
> Suggested-by: Beata Michalska <beata.michalska@arm.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Argh, missed that during the review as well.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
(+ Ayush, Srikanth)

Hello Chris,

On 7/16/2025 12:41 AM, Chris Mason wrote:
>> @@ -186,7 +208,7 @@ static void group_init(struct psi_group
>>
>>  	group->enabled = true;
>>  	for_each_possible_cpu(cpu)
>> -		seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
>> +		seqcount_init(per_cpu_ptr(&psi_seq, cpu));
>>  	group->avg_last_update = sched_clock();
>>  	group->avg_next_update = group->avg_last_update + psi_period;
>>  	mutex_init(&group->avgs_lock);
>
> I'm not sure if someone mentioned this already, but testing the
> series I got a bunch of softlockups in get_recent_times()
> that randomly jumped from CPU to CPU.

Ayush, Srikanth, and I ran into this yesterday when testing different
trees (next, queue:sched/core) with similar signatures:

watchdog: BUG: soft lockup - CPU#0 stuck for 21s! [kworker/0:3:2140]
Modules linked in: ...
CPU: 0 UID: 0 PID: 2140 Comm: kworker/0:3 Not tainted 6.16.0-rc1-test+ #20 PREEMPT(voluntary)
Hardware name: Dell Inc. PowerEdge R6525/024PW1, BIOS 2.7.3 03/30/2022
Workqueue: events psi_avgs_work
RIP: 0010:collect_percpu_times+0x3a0/0x670
Code: 65 48 2b 05 4a 79 d2 02 0f 85 dc 02 00 00 48 8d 65 d8 5b 41 5c 41 5d 41 5e 41 5f 5d e9 34 ba d2 ff f3 90 49 81 ff ff 1f 00 00 <0f> 86 73 fd ff ff 4c 89 fe 48 c7 c7 80 9d 29 bb e8 cb 92 73 00 e9
RSP: 0018:ffffcda753383d10 EFLAGS: 00000297
RAX: ffff8be86fadcd40 RBX: ffffeda7308d4580 RCX: 000000000000006b
RDX: 000000000000002b RSI: 0000000000000100 RDI: ffffffffbab3f400
RBP: ffffcda753383e30 R08: 000000000000006b R09: 0000000000000000
R10: 0000008cca6be372 R11: 0000000000000006 R12: 000000000000006b
R13: ffffeda7308d4594 R14: 00000000000037e5 R15: 000000000000006b
FS:  0000000000000000(0000) GS:ffff8ba8c1118000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000055b3cf990c3c CR3: 000000807dc40006 CR4: 0000000000f70ef0
PKRU: 55555554
Call Trace:
 <TASK>
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? psi_group_change+0x1ff/0x460
 ? add_timer_on+0x10a/0x160
 psi_avgs_work+0x4c/0xd0
 ? queue_delayed_work_on+0x6d/0x80
 process_one_work+0x193/0x3c0
 worker_thread+0x29d/0x3c0
 ? __pfx_worker_thread+0x10/0x10
 kthread+0xff/0x210
 ? __pfx_kthread+0x10/0x10
 ? __pfx_kthread+0x10/0x10
 ret_from_fork+0x171/0x1a0
 ? __pfx_kthread+0x10/0x10
 ret_from_fork_asm+0x1a/0x30
 </TASK>

I was able to reproduce this reliably running 100 copies of an infinite
loop doing - cgroup create, move a task, move task back to root, remove
cgroup - alongside hackbench running in a separate cgroup, and I hit
this in ~5-10min.

I have been running the same test with your fix and haven't run into
this for over 30min now. Feel free to add:

Reviewed-and-tested-by: K Prateek Nayak <kprateek.nayak@amd.com>

> This fixed it for me, but reading it now I'm wondering
> if we want to seqcount_init() unconditionally even when PSI
> is off.

Looking at "psi_enable", it can only be toggled via the kernel parameter
"psi=" and I don't see anything that does a
"static_branch_disable(&psi_disabled)" at runtime, so I think your fix
should be good.

> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 2024c1d36402d..979a447bc281f 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -207,8 +207,6 @@ static void group_init(struct psi_group *group)
>  	int cpu;

"cpu" variable can be removed too from group_init() now.

>
>  	group->enabled = true;
> -	for_each_possible_cpu(cpu)
> -		seqcount_init(per_cpu_ptr(&psi_seq, cpu));
>  	group->avg_last_update = sched_clock();
>  	group->avg_next_update = group->avg_last_update + psi_period;
>  	mutex_init(&group->avgs_lock);
> @@ -231,6 +229,7 @@ static void group_init(struct psi_group *group)
>
>  void __init psi_init(void)
>  {
> +	int cpu;

nit. newline after declaration.

>  	if (!psi_enable) {
>  		static_branch_enable(&psi_disabled);
>  		static_branch_disable(&psi_cgroups_enabled);
> @@ -241,6 +240,8 @@ void __init psi_init(void)
>  	static_branch_disable(&psi_cgroups_enabled);
>
>  	psi_period = jiffies_to_nsecs(PSI_FREQ);
> +	for_each_possible_cpu(cpu)
> +		seqcount_init(per_cpu_ptr(&psi_seq, cpu));
>  	group_init(&psi_system);
>  }
>

--
Thanks and Regards,
Prateek