[PATCH v2 01/12] sched/psi: Optimize psi_group_change() cpu_clock() usage
Posted by Peter Zijlstra 3 months, 1 week ago
Dietmar reported that commit 3840cbe24cf0 ("sched: psi: fix bogus
pressure spikes from aggregation race") caused a regression for him on
a high context switch rate benchmark (schbench) due to the now
repeating cpu_clock() calls.

In particular the problem is that get_recent_times() will extrapolate
the current state to 'now'. But if an update uses a timestamp from
before the start of the update, it is possible to get two reads
with inconsistent results. It is effectively back-dating an update.
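
For example (an illustrative interleaving, with made-up timestamps):

	updater: now = cpu_clock();		/* t=99, pre-update	*/
	reader:  seq = read_seqcount_begin();
	reader:  now = cpu_clock();		/* t=100		*/
	reader:  extrapolate current state up to t=100
	reader:  read_seqcount_retry();		/* no retry, sample ok	*/
	updater: write_seqcount_begin();
	updater: record_times(groupc, now);	/* state closed at t=99	*/
	updater: write_seqcount_end();

	A second sample can now report less accumulated time than the
	first one -- the update landed before a read that had already
	accounted past it.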

(note that this all hard-relies on the clock being synchronized across
CPUs -- if this is not the case, all bets are off).

Combine this problem with the fact that there are per-group-per-cpu
seqcounts: the commit in question pushed the clock read into the group
iteration, causing tree-depth cpu_clock() calls. On architectures
where cpu_clock() has appreciable overhead, this hurts.

Instead move to a per-cpu seqcount, which allows us to have a single
clock read for all group updates, increasing internal consistency and
lowering update overhead. This comes at the cost of a longer update
side (proportional to the tree depth) which can cause the read side to
retry more often.
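
Schematically, the update side goes from a clock read and seqcount
section per group to a single clock read under one per-CPU write
section (a distilled sketch of psi_task_change(), not the exact code):

	/* before: every group update reads the clock itself */
	do {
		psi_group_change(group, cpu, clear, set, true);
		/* ... which internally does:
		 *	write_seqcount_begin(&groupc->seq);
		 *	now = cpu_clock(cpu);	// tree-depth clock reads
		 *	...
		 *	write_seqcount_end(&groupc->seq);
		 */
	} while ((group = group->parent));

	/* after: one clock read, one write section for the whole walk */
	psi_write_begin(cpu);
	now = cpu_clock(cpu);
	for_each_group(group, task_psi_group(task))
		psi_group_change(group, cpu, clear, set, now, true);
	psi_write_end(cpu);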

Fixes: 3840cbe24cf0 ("sched: psi: fix bogus pressure spikes from aggregation race")
Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lkml.kernel.org/20250522084844.GC31726@noisy.programming.kicks-ass.net
---
 include/linux/psi_types.h |    6 --
 kernel/sched/psi.c        |  121 +++++++++++++++++++++++++---------------------
 2 files changed, 68 insertions(+), 59 deletions(-)

--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -84,11 +84,9 @@ enum psi_aggregators {
 struct psi_group_cpu {
 	/* 1st cacheline updated by the scheduler */
 
-	/* Aggregator needs to know of concurrent changes */
-	seqcount_t seq ____cacheline_aligned_in_smp;
-
 	/* States of the tasks belonging to this group */
-	unsigned int tasks[NR_PSI_TASK_COUNTS];
+	unsigned int tasks[NR_PSI_TASK_COUNTS]
+			____cacheline_aligned_in_smp;
 
 	/* Aggregate pressure state derived from the tasks */
 	u32 state_mask;
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -176,6 +176,28 @@ struct psi_group psi_system = {
 	.pcpu = &system_group_pcpu,
 };
 
+static DEFINE_PER_CPU(seqcount_t, psi_seq);
+
+static inline void psi_write_begin(int cpu)
+{
+	write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline void psi_write_end(int cpu)
+{
+	write_seqcount_end(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline u32 psi_read_begin(int cpu)
+{
+	return read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline bool psi_read_retry(int cpu, u32 seq)
+{
+	return read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq);
+}
+
 static void psi_avgs_work(struct work_struct *work);
 
 static void poll_timer_fn(struct timer_list *t);
@@ -186,7 +208,7 @@ static void group_init(struct psi_group
 
 	group->enabled = true;
 	for_each_possible_cpu(cpu)
-		seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
+		seqcount_init(per_cpu_ptr(&psi_seq, cpu));
 	group->avg_last_update = sched_clock();
 	group->avg_next_update = group->avg_last_update + psi_period;
 	mutex_init(&group->avgs_lock);
@@ -266,14 +288,14 @@ static void get_recent_times(struct psi_
 
 	/* Snapshot a coherent view of the CPU state */
 	do {
-		seq = read_seqcount_begin(&groupc->seq);
+		seq = psi_read_begin(cpu);
 		now = cpu_clock(cpu);
 		memcpy(times, groupc->times, sizeof(groupc->times));
 		state_mask = groupc->state_mask;
 		state_start = groupc->state_start;
 		if (cpu == current_cpu)
 			memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
-	} while (read_seqcount_retry(&groupc->seq, seq));
+	} while (psi_read_retry(cpu, seq));
 
 	/* Calculate state time deltas against the previous snapshot */
 	for (s = 0; s < NR_PSI_STATES; s++) {
@@ -772,31 +794,21 @@ static void record_times(struct psi_grou
 		groupc->times[PSI_NONIDLE] += delta;
 }
 
+#define for_each_group(iter, group) \
+	for (typeof(group) iter = group; iter; iter = iter->parent)
+
 static void psi_group_change(struct psi_group *group, int cpu,
 			     unsigned int clear, unsigned int set,
-			     bool wake_clock)
+			     u64 now, bool wake_clock)
 {
 	struct psi_group_cpu *groupc;
 	unsigned int t, m;
 	u32 state_mask;
-	u64 now;
 
 	lockdep_assert_rq_held(cpu_rq(cpu));
 	groupc = per_cpu_ptr(group->pcpu, cpu);
 
 	/*
-	 * First we update the task counts according to the state
-	 * change requested through the @clear and @set bits.
-	 *
-	 * Then if the cgroup PSI stats accounting enabled, we
-	 * assess the aggregate resource states this CPU's tasks
-	 * have been in since the last change, and account any
-	 * SOME and FULL time these may have resulted in.
-	 */
-	write_seqcount_begin(&groupc->seq);
-	now = cpu_clock(cpu);
-
-	/*
 	 * Start with TSK_ONCPU, which doesn't have a corresponding
 	 * task count - it's just a boolean flag directly encoded in
 	 * the state mask. Clear, set, or carry the current state if
@@ -847,7 +859,6 @@ static void psi_group_change(struct psi_
 
 		groupc->state_mask = state_mask;
 
-		write_seqcount_end(&groupc->seq);
 		return;
 	}
 
@@ -868,8 +879,6 @@ static void psi_group_change(struct psi_
 
 	groupc->state_mask = state_mask;
 
-	write_seqcount_end(&groupc->seq);
-
 	if (state_mask & group->rtpoll_states)
 		psi_schedule_rtpoll_work(group, 1, false);
 
@@ -904,24 +913,29 @@ static void psi_flags_change(struct task
 void psi_task_change(struct task_struct *task, int clear, int set)
 {
 	int cpu = task_cpu(task);
-	struct psi_group *group;
+	u64 now;
 
 	if (!task->pid)
 		return;
 
 	psi_flags_change(task, clear, set);
 
-	group = task_psi_group(task);
-	do {
-		psi_group_change(group, cpu, clear, set, true);
-	} while ((group = group->parent));
+	psi_write_begin(cpu);
+	now = cpu_clock(cpu);
+	for_each_group(group, task_psi_group(task))
+		psi_group_change(group, cpu, clear, set, now, true);
+	psi_write_end(cpu);
 }
 
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		     bool sleep)
 {
-	struct psi_group *group, *common = NULL;
+	struct psi_group *common = NULL;
 	int cpu = task_cpu(prev);
+	u64 now;
+
+	psi_write_begin(cpu);
+	now = cpu_clock(cpu);
 
 	if (next->pid) {
 		psi_flags_change(next, 0, TSK_ONCPU);
@@ -930,16 +944,15 @@ void psi_task_switch(struct task_struct
 		 * ancestors with @prev, those will already have @prev's
 		 * TSK_ONCPU bit set, and we can stop the iteration there.
 		 */
-		group = task_psi_group(next);
-		do {
-			if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
-			    PSI_ONCPU) {
+		for_each_group(group, task_psi_group(next)) {
+			struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+
+			if (groupc->state_mask & PSI_ONCPU) {
 				common = group;
 				break;
 			}
-
-			psi_group_change(group, cpu, 0, TSK_ONCPU, true);
-		} while ((group = group->parent));
+			psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
+		}
 	}
 
 	if (prev->pid) {
@@ -972,12 +985,11 @@ void psi_task_switch(struct task_struct
 
 		psi_flags_change(prev, clear, set);
 
-		group = task_psi_group(prev);
-		do {
+		for_each_group(group, task_psi_group(prev)) {
 			if (group == common)
 				break;
-			psi_group_change(group, cpu, clear, set, wake_clock);
-		} while ((group = group->parent));
+			psi_group_change(group, cpu, clear, set, now, wake_clock);
+		}
 
 		/*
 		 * TSK_ONCPU is handled up to the common ancestor. If there are
@@ -987,20 +999,21 @@ void psi_task_switch(struct task_struct
 		 */
 		if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
 			clear &= ~TSK_ONCPU;
-			for (; group; group = group->parent)
-				psi_group_change(group, cpu, clear, set, wake_clock);
+			for_each_group(group, common)
+				psi_group_change(group, cpu, clear, set, now, wake_clock);
 		}
 	}
+	psi_write_end(cpu);
 }
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
 {
 	int cpu = task_cpu(curr);
-	struct psi_group *group;
 	struct psi_group_cpu *groupc;
 	s64 delta;
 	u64 irq;
+	u64 now;
 
 	if (static_branch_likely(&psi_disabled) || !irqtime_enabled())
 		return;
@@ -1009,8 +1022,7 @@ void psi_account_irqtime(struct rq *rq,
 		return;
 
 	lockdep_assert_rq_held(rq);
-	group = task_psi_group(curr);
-	if (prev && task_psi_group(prev) == group)
+	if (prev && task_psi_group(prev) == task_psi_group(curr))
 		return;
 
 	irq = irq_time_read(cpu);
@@ -1019,25 +1031,22 @@ void psi_account_irqtime(struct rq *rq,
 		return;
 	rq->psi_irq_time = irq;
 
-	do {
-		u64 now;
+	psi_write_begin(cpu);
+	now = cpu_clock(cpu);
 
+	for_each_group(group, task_psi_group(curr)) {
 		if (!group->enabled)
 			continue;
 
 		groupc = per_cpu_ptr(group->pcpu, cpu);
 
-		write_seqcount_begin(&groupc->seq);
-		now = cpu_clock(cpu);
-
 		record_times(groupc, now);
 		groupc->times[PSI_IRQ_FULL] += delta;
 
-		write_seqcount_end(&groupc->seq);
-
 		if (group->rtpoll_states & (1 << PSI_IRQ_FULL))
 			psi_schedule_rtpoll_work(group, 1, false);
-	} while ((group = group->parent));
+	}
+	psi_write_end(cpu);
 }
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
@@ -1225,12 +1234,14 @@ void psi_cgroup_restart(struct psi_group
 		return;
 
 	for_each_possible_cpu(cpu) {
-		struct rq *rq = cpu_rq(cpu);
-		struct rq_flags rf;
+		u64 now;
 
-		rq_lock_irq(rq, &rf);
-		psi_group_change(group, cpu, 0, 0, true);
-		rq_unlock_irq(rq, &rf);
+		guard(rq_lock_irq)(cpu_rq(cpu));
+
+		psi_write_begin(cpu);
+		now = cpu_clock(cpu);
+		psi_group_change(group, cpu, 0, 0, now, true);
+		psi_write_end(cpu);
 	}
 }
 #endif /* CONFIG_CGROUPS */
Re: [PATCH v2 01/12] sched/psi: Optimize psi_group_change() cpu_clock() usage
Posted by Chris Mason 2 months, 3 weeks ago
On 7/2/25 7:49 AM, Peter Zijlstra wrote:
> Dietmar reported that commit 3840cbe24cf0 ("sched: psi: fix bogus
> pressure spikes from aggregation race") caused a regression for him on
> a high context switch rate benchmark (schbench) due to the now
> repeating cpu_clock() calls.
> 
> In particular the problem is that get_recent_times() will extrapolate
> the current state to 'now'. But if an update uses a timestamp from
> before the start of the update, it is possible to get two reads
> with inconsistent results. It is effectively back-dating an update.
> 
> (note that this all hard-relies on the clock being synchronized across
> CPUs -- if this is not the case, all bets are off).
> 
> Combine this problem with the fact that there are per-group-per-cpu
> seqcounts: the commit in question pushed the clock read into the group
> iteration, causing tree-depth cpu_clock() calls. On architectures
> where cpu_clock() has appreciable overhead, this hurts.
> 
> Instead move to a per-cpu seqcount, which allows us to have a single
> clock read for all group updates, increasing internal consistency and
> lowering update overhead. This comes at the cost of a longer update
> side (proportional to the tree depth) which can cause the read side to
> retry more often.
> 
> Fixes: 3840cbe24cf0 ("sched: psi: fix bogus pressure spikes from aggregation race")
> Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
> Link: https://lkml.kernel.org/20250522084844.GC31726@noisy.programming.kicks-ass.net 
> ---
>  include/linux/psi_types.h |    6 --
>  kernel/sched/psi.c        |  121 +++++++++++++++++++++++++---------------------
>  2 files changed, 68 insertions(+), 59 deletions(-)
> 
> --- a/include/linux/psi_types.h
> +++ b/include/linux/psi_types.h
> @@ -84,11 +84,9 @@ enum psi_aggregators {
>  struct psi_group_cpu {
>  	/* 1st cacheline updated by the scheduler */
>  
> -	/* Aggregator needs to know of concurrent changes */
> -	seqcount_t seq ____cacheline_aligned_in_smp;
> -
>  	/* States of the tasks belonging to this group */
> -	unsigned int tasks[NR_PSI_TASK_COUNTS];
> +	unsigned int tasks[NR_PSI_TASK_COUNTS]
> +			____cacheline_aligned_in_smp;
>  
>  	/* Aggregate pressure state derived from the tasks */
>  	u32 state_mask;
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -176,6 +176,28 @@ struct psi_group psi_system = {
>  	.pcpu = &system_group_pcpu,
>  };
>  
> +static DEFINE_PER_CPU(seqcount_t, psi_seq);

[ ... ]

> @@ -186,7 +208,7 @@ static void group_init(struct psi_group
>  
>  	group->enabled = true;
>  	for_each_possible_cpu(cpu)
> -		seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
> +		seqcount_init(per_cpu_ptr(&psi_seq, cpu));
>  	group->avg_last_update = sched_clock();
>  	group->avg_next_update = group->avg_last_update + psi_period;
>  	mutex_init(&group->avgs_lock);

I'm not sure if someone mentioned this already, but testing the
series I got a bunch of softlockups in get_recent_times()
that randomly jumped from CPU to CPU.

This fixed it for me, but reading it now I'm wondering
if we want to seqcount_init() unconditionally even when PSI
is off.  

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 2024c1d36402d..979a447bc281f 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -207,8 +207,6 @@ static void group_init(struct psi_group *group)
        int cpu;

        group->enabled = true;
-       for_each_possible_cpu(cpu)
-               seqcount_init(per_cpu_ptr(&psi_seq, cpu));
        group->avg_last_update = sched_clock();
        group->avg_next_update = group->avg_last_update + psi_period;
        mutex_init(&group->avgs_lock);
@@ -231,6 +229,7 @@ static void group_init(struct psi_group *group)

 void __init psi_init(void)
 {
+       int cpu;
        if (!psi_enable) {
                static_branch_enable(&psi_disabled);
                static_branch_disable(&psi_cgroups_enabled);
@@ -241,6 +240,8 @@ void __init psi_init(void)
                static_branch_disable(&psi_cgroups_enabled);

        psi_period = jiffies_to_nsecs(PSI_FREQ);
+       for_each_possible_cpu(cpu)
+               seqcount_init(per_cpu_ptr(&psi_seq, cpu));
        group_init(&psi_system);
 }
Re: [PATCH v2 01/12] sched/psi: Optimize psi_group_change() cpu_clock() usage
Posted by Beata Michalska 2 months, 3 weeks ago
On Tue, Jul 15, 2025 at 03:11:14PM -0400, Chris Mason wrote:
> On 7/2/25 7:49 AM, Peter Zijlstra wrote:
> > [ ... ]
> > 
> > @@ -186,7 +208,7 @@ static void group_init(struct psi_group
> >  
> >  	group->enabled = true;
> >  	for_each_possible_cpu(cpu)
> > -		seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
> > +		seqcount_init(per_cpu_ptr(&psi_seq, cpu));
> >  	group->avg_last_update = sched_clock();
> >  	group->avg_next_update = group->avg_last_update + psi_period;
> >  	mutex_init(&group->avgs_lock);
> 
> I'm not sure if someone mentioned this already, but testing the
> series I got a bunch of softlockups in get_recent_times()
> that randomly jumped from CPU to CPU.

... was beaten to it. I can observe the same.
> 
> This fixed it for me, but reading it now I'm wondering
> if we want to seqcount_init() unconditionally even when PSI
> is off.  
> 
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 2024c1d36402d..979a447bc281f 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -207,8 +207,6 @@ static void group_init(struct psi_group *group)
>         int cpu;
> 
>         group->enabled = true;
> -       for_each_possible_cpu(cpu)
> -               seqcount_init(per_cpu_ptr(&psi_seq, cpu));
>         group->avg_last_update = sched_clock();
>         group->avg_next_update = group->avg_last_update + psi_period;
>         mutex_init(&group->avgs_lock);
> @@ -231,6 +229,7 @@ static void group_init(struct psi_group *group)
> 
>  void __init psi_init(void)
>  {
> +       int cpu;
>         if (!psi_enable) {
>                 static_branch_enable(&psi_disabled);
>                 static_branch_disable(&psi_cgroups_enabled);
> @@ -241,6 +240,8 @@ void __init psi_init(void)
>                 static_branch_disable(&psi_cgroups_enabled);
> 
>         psi_period = jiffies_to_nsecs(PSI_FREQ);
> +       for_each_possible_cpu(cpu)
> +               seqcount_init(per_cpu_ptr(&psi_seq, cpu));
>         group_init(&psi_system);
>  }
> 
> 
Wouldn't it be enough to use SEQCNT_ZERO? Those are static per-cpu ones.

---
BR
Beata
Re: [PATCH v2 01/12] sched/psi: Optimize psi_group_change() cpu_clock() usage
Posted by Peter Zijlstra 2 months, 3 weeks ago
On Wed, Jul 16, 2025 at 08:53:01AM +0200, Beata Michalska wrote:
> Wouldn't it be enough to use SEQCNT_ZERO? Those are static per-cpu ones.

Yeah, I suppose that should work. The below builds, but I've not yet
observed the issue myself.
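
What I assume is happening (illustrative): a group_init() re-running
seqcount_init() can land inside another CPU's write section and invert
the odd/even parity:

	/*
	 * CPU1: write_seqcount_begin()	-> count == 1 (odd: writer active)
	 * CPU0: seqcount_init()	-> count == 0
	 * CPU1: write_seqcount_end()	-> count == 1 (odd, but no writer)
	 *
	 * Readers then spin in read_seqcount_retry() until the next
	 * update on that CPU flips the count again -- a softlockup on
	 * an otherwise quiet CPU.
	 */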

---
Subject: sched/psi: Fix psi_seq initialization
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 15 Jul 2025 15:11:14 -0400

With the seqcount moved out of the group into a global psi_seq,
re-initializing the seqcount on group creation is causing seqcount
corruption.

Fixes: 570c8efd5eb7 ("sched/psi: Optimize psi_group_change() cpu_clock() usage")
Reported-by: Chris Mason <clm@meta.com>
Suggested-by: Beata Michalska <beata.michalska@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/psi.c |    6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -176,7 +176,7 @@ struct psi_group psi_system = {
 	.pcpu = &system_group_pcpu,
 };
 
-static DEFINE_PER_CPU(seqcount_t, psi_seq);
+static DEFINE_PER_CPU(seqcount_t, psi_seq) = SEQCNT_ZERO(psi_seq);
 
 static inline void psi_write_begin(int cpu)
 {
@@ -204,11 +204,7 @@ static void poll_timer_fn(struct timer_l
 
 static void group_init(struct psi_group *group)
 {
-	int cpu;
-
 	group->enabled = true;
-	for_each_possible_cpu(cpu)
-		seqcount_init(per_cpu_ptr(&psi_seq, cpu));
 	group->avg_last_update = sched_clock();
 	group->avg_next_update = group->avg_last_update + psi_period;
 	mutex_init(&group->avgs_lock);
Re: [PATCH v2 01/12] sched/psi: Optimize psi_group_change() cpu_clock() usage
Posted by K Prateek Nayak 2 months, 2 weeks ago
Hello Ingo, Peter,

On 7/16/2025 4:10 PM, Peter Zijlstra wrote:
> On Wed, Jul 16, 2025 at 08:53:01AM +0200, Beata Michalska wrote:
>> Wouldn't it be enough to use SEQCNT_ZERO? Those are static per-cpu ones.
> 
> Yeah, I suppose that should work. The below builds, but I've not yet
> observed the issue myself.
> 
> ---
> Subject: sched/psi: Fix psi_seq initialization
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Tue, 15 Jul 2025 15:11:14 -0400
> 
> With the seqcount moved out of the group into a global psi_seq,
> re-initializing the seqcount on group creation is causing seqcount
> corruption.
> 
> Fixes: 570c8efd5eb7 ("sched/psi: Optimize psi_group_change() cpu_clock() usage")
> Reported-by: Chris Mason <clm@meta.com>
> Suggested-by: Beata Michalska <beata.michalska@arm.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

I've been running with this fix for a bunch of my testing, and when I
forget about it (as was the case when testing John's Proxy Exec branch),
I usually run into the softlockup in psi_avgs_work().

Is it too late to include this in tip:sched/core for v6.17?

Also feel free to include:

Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>

-- 
Thanks and Regards,
Prateek
Re: [PATCH v2 01/12] sched/psi: Optimize psi_group_change() cpu_clock() usage
Posted by Aithal, Srikanth 2 months, 2 weeks ago
On 7/16/2025 4:10 PM, Peter Zijlstra wrote:
> On Wed, Jul 16, 2025 at 08:53:01AM +0200, Beata Michalska wrote:
>> Wouldn't it be enough to use SEQCNT_ZERO? Those are static per-cpu ones.
> 
> Yeah, I suppose that should work. The below builds, but I've not yet
> observed the issue myself.
> 
> ---
> Subject: sched/psi: Fix psi_seq initialization
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Tue, 15 Jul 2025 15:11:14 -0400
> 
> With the seqcount moved out of the group into a global psi_seq,
> re-initializing the seqcount on group creation is causing seqcount
> corruption.
> 
> Fixes: 570c8efd5eb7 ("sched/psi: Optimize psi_group_change() cpu_clock() usage")
> Reported-by: Chris Mason <clm@meta.com>
> Suggested-by: Beata Michalska <beata.michalska@arm.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>   kernel/sched/psi.c |    6 +-----
>   1 file changed, 1 insertion(+), 5 deletions(-)
> 
> [ ... ]

I've tested the above patch, and it resolves the issue. Could we include 
this patch in the linux-next builds? We have been encountering the 
reported issue regularly in our daily CI.


Tested-by: Srikanth Aithal <sraithal@amd.com>
Re: [PATCH v2 01/12] sched/psi: Optimize psi_group_change() cpu_clock() usage
Posted by Chris Mason 2 months, 3 weeks ago
On 7/16/25 6:40 AM, Peter Zijlstra wrote:
> On Wed, Jul 16, 2025 at 08:53:01AM +0200, Beata Michalska wrote:
>> Wouldn't it be enough to use SEQCNT_ZERO? Those are static per-cpu ones.
> 
> Yeah, I suppose that should work. The below builds, but I've not yet
> observed the issue myself.

[ ... ]

>  
> -static DEFINE_PER_CPU(seqcount_t, psi_seq);
> +static DEFINE_PER_CPU(seqcount_t, psi_seq) = SEQCNT_ZERO(psi_seq);

As expected, this also works.  Thanks everyone.

-chris
Re: [PATCH v2 01/12] sched/psi: Optimize psi_group_change() cpu_clock() usage
Posted by Johannes Weiner 2 months, 3 weeks ago
On Wed, Jul 16, 2025 at 12:40:50PM +0200, Peter Zijlstra wrote:
> On Wed, Jul 16, 2025 at 08:53:01AM +0200, Beata Michalska wrote:
> > Wouldn't it be enough to use SEQCNT_ZERO? Those are static per-cpu ones.
> 
> Yeah, I suppose that should work. The below builds, but I've not yet
> observed the issue myself.
> 
> ---
> Subject: sched/psi: Fix psi_seq initialization
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Tue, 15 Jul 2025 15:11:14 -0400
> 
> With the seqcount moved out of the group into a global psi_seq,
> re-initializing the seqcount on group creation is causing seqcount
> corruption.
> 
> Fixes: 570c8efd5eb7 ("sched/psi: Optimize psi_group_change() cpu_clock() usage")
> Reported-by: Chris Mason <clm@meta.com>
> Suggested-by: Beata Michalska <beata.michalska@arm.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Argh, missed that during the review as well.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Re: [PATCH v2 01/12] sched/psi: Optimize psi_group_change() cpu_clock() usage
Posted by K Prateek Nayak 2 months, 3 weeks ago
(+ Ayush, Srikanth)

Hello Chris,

On 7/16/2025 12:41 AM, Chris Mason wrote:
>> @@ -186,7 +208,7 @@ static void group_init(struct psi_group
>>  
>>  	group->enabled = true;
>>  	for_each_possible_cpu(cpu)
>> -		seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
>> +		seqcount_init(per_cpu_ptr(&psi_seq, cpu));
>>  	group->avg_last_update = sched_clock();
>>  	group->avg_next_update = group->avg_last_update + psi_period;
>>  	mutex_init(&group->avgs_lock);
> 
> I'm not sure if someone mentioned this already, but testing the
> series I got a bunch of softlockups in get_recent_times()
> that randomly jumped from CPU to CPU.

Ayush, Srikanth, and I ran into this yesterday when testing different
trees (next, queue:sched/core) with similar signatures:

    watchdog: BUG: soft lockup - CPU#0 stuck for 21s! [kworker/0:3:2140]
    Modules linked in: ...
    CPU: 0 UID: 0 PID: 2140 Comm: kworker/0:3 Not tainted 6.16.0-rc1-test+ #20 PREEMPT(voluntary)
    Hardware name: Dell Inc. PowerEdge R6525/024PW1, BIOS 2.7.3 03/30/2022
    Workqueue: events psi_avgs_work
    RIP: 0010:collect_percpu_times+0x3a0/0x670
    Code: 65 48 2b 05 4a 79 d2 02 0f 85 dc 02 00 00 48 8d 65 d8 5b 41 5c 41 5d 41 5e 41 5f 5d e9 34 ba d2 ff f3 90 49 81 ff ff 1f 00 00 <0f> 86 73 fd ff ff 4c 89 fe 48 c7 c7 80 9d 29 bb e8 cb 92 73 00 e9
    RSP: 0018:ffffcda753383d10 EFLAGS: 00000297
    RAX: ffff8be86fadcd40 RBX: ffffeda7308d4580 RCX: 000000000000006b
    RDX: 000000000000002b RSI: 0000000000000100 RDI: ffffffffbab3f400
    RBP: ffffcda753383e30 R08: 000000000000006b R09: 0000000000000000
    R10: 0000008cca6be372 R11: 0000000000000006 R12: 000000000000006b
    R13: ffffeda7308d4594 R14: 00000000000037e5 R15: 000000000000006b
    FS:  0000000000000000(0000) GS:ffff8ba8c1118000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 000055b3cf990c3c CR3: 000000807dc40006 CR4: 0000000000f70ef0
    PKRU: 55555554
    Call Trace:
     <TASK>
     ? srso_alias_return_thunk+0x5/0xfbef5
     ? psi_group_change+0x1ff/0x460
     ? add_timer_on+0x10a/0x160
     psi_avgs_work+0x4c/0xd0
     ? queue_delayed_work_on+0x6d/0x80
     process_one_work+0x193/0x3c0
     worker_thread+0x29d/0x3c0
     ? __pfx_worker_thread+0x10/0x10
     kthread+0xff/0x210
     ? __pfx_kthread+0x10/0x10
     ? __pfx_kthread+0x10/0x10
     ret_from_fork+0x171/0x1a0
     ? __pfx_kthread+0x10/0x10
     ret_from_fork_asm+0x1a/0x30
     </TASK>

I was able to reproduce this reliably running 100 copies of an infinite
loop doing - cgroup create, move a task, move task back to root, remove
cgroup - alongside hackbench running in a separate cgroup, and I hit
this in ~5-10min.
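
Roughly, each copy did something like this (a sketch; the cgroup paths,
names and worker count are illustrative):

#include <stdio.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/types.h>

#define NR_WORKERS	100

/* echo <pid> > <procs>: move a task into the cgroup owning <procs> */
static void move_pid(const char *procs, pid_t pid)
{
	FILE *f = fopen(procs, "w");

	if (!f)
		return;
	fprintf(f, "%d\n", pid);
	fclose(f);
}

static void worker(int id)
{
	char dir[64], procs[96];
	pid_t pid = getpid();

	snprintf(dir, sizeof(dir), "/sys/fs/cgroup/lockup-%d", id);
	snprintf(procs, sizeof(procs), "%s/cgroup.procs", dir);

	for (;;) {
		mkdir(dir, 0755);				/* cgroup create */
		move_pid(procs, pid);				/* move task in */
		move_pid("/sys/fs/cgroup/cgroup.procs", pid);	/* back to root */
		rmdir(dir);					/* remove cgroup */
	}
}

int main(void)
{
	int i;

	for (i = 0; i < NR_WORKERS; i++)
		if (!fork())
			worker(i);

	pause();	/* hackbench ran in a separate cgroup alongside */
	return 0;
}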

I have been running the same test with your fix and haven't run into
this for over 30min now. Feel free to add:

Reviewed-and-tested-by: K Prateek Nayak <kprateek.nayak@amd.com>

> 
> This fixed it for me, but reading it now I'm wondering
> if we want to seqcount_init() unconditionally even when PSI
> is off.

Looking at "psi_enable", it can only be toggled via the kernel
parameter "psi=" and I don't see anything that does a
"static_branch_disable(&psi_disabled)" at runtime so I think
your fix should be good.

> 
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 2024c1d36402d..979a447bc281f 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -207,8 +207,6 @@ static void group_init(struct psi_group *group)
>         int cpu;

"cpu" variable can be removed too from group_init() now.

> 
>         group->enabled = true;
> -       for_each_possible_cpu(cpu)
> -               seqcount_init(per_cpu_ptr(&psi_seq, cpu));
>         group->avg_last_update = sched_clock();
>         group->avg_next_update = group->avg_last_update + psi_period;
>         mutex_init(&group->avgs_lock);
> @@ -231,6 +229,7 @@ static void group_init(struct psi_group *group)
> 
>  void __init psi_init(void)
>  {
> +       int cpu;

nit. newline after declaration.

>         if (!psi_enable) {
>                 static_branch_enable(&psi_disabled);
>                 static_branch_disable(&psi_cgroups_enabled);
> @@ -241,6 +240,8 @@ void __init psi_init(void)
>                 static_branch_disable(&psi_cgroups_enabled);
> 
>         psi_period = jiffies_to_nsecs(PSI_FREQ);
> +       for_each_possible_cpu(cpu)
> +               seqcount_init(per_cpu_ptr(&psi_seq, cpu));
>         group_init(&psi_system);
>  }
> 
> 

-- 
Thanks and Regards,
Prateek