RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock

[PATCH] RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock

Posted by John Stultz 1 year, 8 months ago

I recently got a bug report that
clock_gettime(CLOCK_THREAD_CPUTIME_ID,...) had regressed between
5.10 and 6.1. Its not a huge regression in absolute time
(~30-40ns), but is >10% change.

I narrowed the cause down to the addition of
psi_account_irqtime() in update_rq_clock_task(), in commit
52b1364ba0b1 ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ
pressure")

So that explains the behavior change, but it also seems odd that
we're doing psi irq accounting from a syscall that is just
trying to read the thread's cputime.

Thinking about it more, it seems the re-use of update_rq_clock()
to handle accounting for any in-progress time for the current
task has the potential for side effects and unnecessary work.

So instead rework the logic so we calculate the current cpu
runtime in a read-only fashion.

This has the side benefit of improving
clock_gettime(CLOCK_THREAD_CPUTIME_ID,...) performance by ~12%
over the behavior in 5.10, and ~21% over the 6.1 behavior.

NOTE: I'm not 100% sure this is correct yet. There may be some
edge cases I've overlooked, so I'd greatly appreciate any
review or feedback.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Qais Yousef <qyousef@layalina.io>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: kernel-team@android.com
Signed-off-by: John Stultz <jstultz@google.com>
---
 kernel/sched/core.c | 82 ++++++++++++++++++++++++++-------------------
 1 file changed, 47 insertions(+), 35 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcf2c4cc0522..b29cde5ded84 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -692,16 +692,11 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
  * RQ-clock updating methods:
  */
 
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-/*
- * In theory, the compile should just see 0 here, and optimize out the call
- * to sched_rt_avg_update. But I don't trust it...
- */
-	s64 __maybe_unused steal = 0, irq_delta = 0;
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
-	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+static inline s64 get_irq_delta(struct rq *rq, s64 delta)
+{
+	s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 
 	/*
 	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
@@ -720,7 +715,45 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 	 */
 	if (irq_delta > delta)
 		irq_delta = delta;
+	return irq_delta;
+}
+#else
+static inline s64 get_irq_delta(struct rq *rq, s64 delta)
+{
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+static inline s64 get_steal_time(struct rq *rq, s64 delta)
+{
+	s64 steal;
 
+	if (!static_key_false(&paravirt_steal_rq_enabled))
+		return 0;
+	steal = paravirt_steal_clock(cpu_of(rq));
+	steal -= rq->prev_steal_time_rq;
+	if (unlikely(steal > delta))
+		steal = delta;
+	return steal;
+}
+#else
+static inline s64 get_steal_time(struct rq *rq, s64 delta)
+{
+	return 0;
+}
+#endif
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+	s64 __maybe_unused steal = 0, irq_delta = 0;
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	irq_delta = get_irq_delta(rq, delta);
 	rq->prev_irq_time += irq_delta;
 	delta -= irq_delta;
 	psi_account_irqtime(rq->curr, irq_delta);
@@ -728,12 +761,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
 	if (static_key_false((&paravirt_steal_rq_enabled))) {
-		steal = paravirt_steal_clock(cpu_of(rq));
-		steal -= rq->prev_steal_time_rq;
-
-		if (unlikely(steal > delta))
-			steal = delta;
-
+		steal = get_steal_time(rq, delta);
 		rq->prev_steal_time_rq += steal;
 		delta -= steal;
 	}
@@ -5547,23 +5575,6 @@ DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
 
-/*
- * The function fair_sched_class.update_curr accesses the struct curr
- * and its field curr->exec_start; when called from task_sched_runtime(),
- * we observe a high rate of cache misses in practice.
- * Prefetching this data results in improved performance.
- */
-static inline void prefetch_curr_exec_start(struct task_struct *p)
-{
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	struct sched_entity *curr = (&p->se)->cfs_rq->curr;
-#else
-	struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
-#endif
-	prefetch(curr);
-	prefetch(&curr->exec_start);
-}
-
 /*
  * Return accounted runtime for the task.
  * In case the task is currently running, return the runtime plus current's
@@ -5573,6 +5584,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 {
 	struct rq_flags rf;
 	struct rq *rq;
+	s64 delta_exec = 0;
 	u64 ns;
 
 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
@@ -5598,11 +5610,11 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	 * thread, breaking clock_gettime().
 	 */
 	if (task_current(rq, p) && task_on_rq_queued(p)) {
-		prefetch_curr_exec_start(p);
-		update_rq_clock(rq);
-		p->sched_class->update_curr(rq);
+		delta_exec = sched_clock_cpu(cpu_of(rq)) - p->se.exec_start;
+		delta_exec -= get_irq_delta(rq, delta_exec);
+		delta_exec -= get_steal_time(rq, delta_exec);
 	}
-	ns = p->se.sum_exec_runtime;
+	ns = p->se.sum_exec_runtime + delta_exec;
 	task_rq_unlock(rq, p, &rf);
 
 	return ns;
-- 
2.45.2.505.gda0bf45e8d-goog

Re: [PATCH] RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock

Posted by Peter Zijlstra 1 year, 8 months ago

On Wed, Jun 12, 2024 at 06:58:26PM -0700, John Stultz wrote:
> I recently got a bug report that
> clock_gettime(CLOCK_THREAD_CPUTIME_ID,...) had regressed between
> 5.10 and 6.1. Its not a huge regression in absolute time
> (~30-40ns), but is >10% change.
> 
> I narrowed the cause down to the addition of
> psi_account_irqtime() in update_rq_clock_task(), in commit
> 52b1364ba0b1 ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ
> pressure")
> 
> So that explains the behavior change, 

It doesn't really... that patch just feeds the irq_time we *already*
subtracted prior to it, to PSI, such that PSI can also enjoy the 'view'.

The only explanation I have is that interrupts that end up in the
scheduler (wakeups, tick, etc..) now get to do that PSI cgroup iteration
and that cost adds up to the IRQ time itself, and as such the task time
slows down accordingly.

Are you using silly deep cgroup hierarchies?

> but it also seems odd that
> we're doing psi irq accounting from a syscall that is just
> trying to read the thread's cputime.

In order to avoid doing all the accounting in the IRQ entry/exit paths,
those paths only do the bare minimum of tracking how much IRQ time there
is.

update_rq_clock_task() then looks at the increase of IRQ time and
subtracts this from the task time -- after all, all time spend in the
IRQ wasn't spend on the task itself.

It did that prior to the PSI patch, and it does so after. The only
change is it now feeds this delta into the PSI thing.

> NOTE: I'm not 100% sure this is correct yet. There may be some
> edge cases I've overlooked, so I'd greatly appreciate any
> review or feedback.

Urgh, you're sprinkling the details of what is clock_task over multiple
places.

Does something like so work?

---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0935f9d4bb7b..d4b87539d72a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -724,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 
 	rq->prev_irq_time += irq_delta;
 	delta -= irq_delta;
-	psi_account_irqtime(rq->curr, irq_delta);
 	delayacct_irq(rq->curr, irq_delta);
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
@@ -5459,6 +5458,8 @@ void sched_tick(void)
 
 	sched_clock_tick();
 
+	psi_account_irqtime(curr, &rq->psi_irq_time);
+
 	rq_lock(rq, &rf);
 
 	update_rq_clock(rq);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 146baa91d104..57fdb0b9efbd 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -991,12 +991,13 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 }
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
-void psi_account_irqtime(struct task_struct *task, u32 delta)
+void psi_account_irqtime(struct task_struct *task, u64 *prev)
 {
 	int cpu = task_cpu(task);
 	struct psi_group *group;
 	struct psi_group_cpu *groupc;
-	u64 now;
+	u64 now, irq;
+	s64 delta;
 
 	if (static_branch_likely(&psi_disabled))
 		return;
@@ -1005,6 +1006,11 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
 		return;
 
 	now = cpu_clock(cpu);
+	irq = irq_time_read(cpu);
+	delta = (s64)(irq - *prev);
+	if (delta < 0)
+		return;
+	*prev = irq;
 
 	group = task_psi_group(task);
 	do {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 62fd8bc6fd08..a63eb546bc4a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1133,6 +1133,7 @@ struct rq {
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	u64			prev_irq_time;
+	u64			psi_irq_time;
 #endif
 #ifdef CONFIG_PARAVIRT
 	u64			prev_steal_time;
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index d1445410840a..1111f060264f 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -110,7 +110,7 @@ __schedstats_from_se(struct sched_entity *se)
 void psi_task_change(struct task_struct *task, int clear, int set);
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		     bool sleep);
-void psi_account_irqtime(struct task_struct *task, u32 delta);
+void psi_account_irqtime(struct task_struct *task, u64 *prev);
 
 /*
  * PSI tracks state that persists across sleeps, such as iowaits and
@@ -192,7 +192,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
 static inline void psi_sched_switch(struct task_struct *prev,
 				    struct task_struct *next,
 				    bool sleep) {}
-static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
+static inline void psi_account_irqtime(struct task_struct *task, u64 *prev) {}
 #endif /* CONFIG_PSI */
 
 #ifdef CONFIG_SCHED_INFO

Re: [PATCH] RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock

Posted by John Stultz 1 year, 8 months ago

On Thu, Jun 13, 2024 at 3:04 AM Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, Jun 12, 2024 at 06:58:26PM -0700, John Stultz wrote:
> > I recently got a bug report that
> > clock_gettime(CLOCK_THREAD_CPUTIME_ID,...) had regressed between
> > 5.10 and 6.1. Its not a huge regression in absolute time
> > (~30-40ns), but is >10% change.
> >
> > I narrowed the cause down to the addition of
> > psi_account_irqtime() in update_rq_clock_task(), in commit
> > 52b1364ba0b1 ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ
> > pressure")
> >
> > So that explains the behavior change,
>
> It doesn't really... that patch just feeds the irq_time we *already*
> subtracted prior to it, to PSI, such that PSI can also enjoy the 'view'.
>
> The only explanation I have is that interrupts that end up in the
> scheduler (wakeups, tick, etc..) now get to do that PSI cgroup iteration
> and that cost adds up to the IRQ time itself, and as such the task time
> slows down accordingly.
>
> Are you using silly deep cgroup hierarchies?

Not deeply nested. We have a handful of cgroups, but due to
priority-inversion issues we're not putting any tasks in them right
now.

Again, this is only ~30necs delta, so it's not a huge amount of time
we're talking about. The cpu_clock() call in psi_account_irqtime is
probably a reasonable chunk of that (as just the vdso clock_gettime()
takes a similar order of time).

> > but it also seems odd that
> > we're doing psi irq accounting from a syscall that is just
> > trying to read the thread's cputime.
>
> In order to avoid doing all the accounting in the IRQ entry/exit paths,
> those paths only do the bare minimum of tracking how much IRQ time there
> is.
>
> update_rq_clock_task() then looks at the increase of IRQ time and
> subtracts this from the task time -- after all, all time spend in the
> IRQ wasn't spend on the task itself.
>
> It did that prior to the PSI patch, and it does so after. The only
> change is it now feeds this delta into the PSI thing.

Yeah. I don't see the PSI logic as particularly bad, it's just that I
was surprised we were getting into the accounting paths during a read.

Coming from the timekeeping subsystem, we usually do the time
accumulation/accounting (write side) periodically, and on the read
fast-path we avoid anything extra. So it was just a bit unintuitive to
me to find we were doing the accounting/accumulation to the task clock
on each read.

> > NOTE: I'm not 100% sure this is correct yet. There may be some
> > edge cases I've overlooked, so I'd greatly appreciate any
> > review or feedback.
>
> Urgh, you're sprinkling the details of what is clock_task over multiple
> places.

Yeah. Agreed, the base-irq-steal logic I added should have been
wrapped up better.

> Does something like so work?

Yeah, it takes the psi_account_irqtime out of the hotpath, so with it
we're back to ~5.10 numbers.

Qais' correctness concerns seem reasonable, and I fret the second
psi_irq_time accounting base creates multiple irq_time timelines to
manage (and eventually fumble up). So I still think the idea of
separate read/write paths might be an overall improvement. But
obviously your patch seemingly works properly and mine doesn't, so I
can't object too strongly here. :)  Though I may still take another
pass on my patch to see if we can make the clock_gettime read path
avoid the accounting logic, but it can be a throwaway if you really
don't like it.

> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 0935f9d4bb7b..d4b87539d72a 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -724,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
>
>         rq->prev_irq_time += irq_delta;
>         delta -= irq_delta;
> -       psi_account_irqtime(rq->curr, irq_delta);
>         delayacct_irq(rq->curr, irq_delta);
>  #endif
>  #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> @@ -5459,6 +5458,8 @@ void sched_tick(void)
>
>         sched_clock_tick();
>
> +       psi_account_irqtime(curr, &rq->psi_irq_time);
> +
>         rq_lock(rq, &rf);

Should the psi_account_irqtime (and its use of the rq->psi_irq_time)
go under the rq_lock here?  This is the only user, so there isn't any
practical parallel access, but the locking rules are subtle enough
already.

thanks
-john

Re: [PATCH] RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock

Posted by Qais Yousef 1 year, 8 months ago

On 06/13/24 12:04, Peter Zijlstra wrote:

> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 0935f9d4bb7b..d4b87539d72a 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -724,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
>  
>  	rq->prev_irq_time += irq_delta;
>  	delta -= irq_delta;
> -	psi_account_irqtime(rq->curr, irq_delta);
>  	delayacct_irq(rq->curr, irq_delta);
>  #endif
>  #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> @@ -5459,6 +5458,8 @@ void sched_tick(void)
>  
>  	sched_clock_tick();
>  
> +	psi_account_irqtime(curr, &rq->psi_irq_time);
> +

If wakeup preemption causes a context switch, wouldn't we lose this
information then? I *think* active migration might cause this information to be
lost too.

pick_next_task() might be a better place to do the accounting?

>  	rq_lock(rq, &rf);
>  
>  	update_rq_clock(rq);

Re: [PATCH] RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock

Posted by Peter Zijlstra 1 year, 8 months ago

On Thu, Jun 13, 2024 at 12:51:42PM +0100, Qais Yousef wrote:
> On 06/13/24 12:04, Peter Zijlstra wrote:
> 
> > ---
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 0935f9d4bb7b..d4b87539d72a 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -724,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
> >  
> >  	rq->prev_irq_time += irq_delta;
> >  	delta -= irq_delta;
> > -	psi_account_irqtime(rq->curr, irq_delta);
> >  	delayacct_irq(rq->curr, irq_delta);
> >  #endif
> >  #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> > @@ -5459,6 +5458,8 @@ void sched_tick(void)
> >  
> >  	sched_clock_tick();
> >  
> > +	psi_account_irqtime(curr, &rq->psi_irq_time);
> > +
> 
> If wakeup preemption causes a context switch, wouldn't we lose this
> information then? I *think* active migration might cause this information to be
> lost too.

I'm not sure what would be lost ?! the accounting is per cpu, not per
task afaict. That said,...

> pick_next_task() might be a better place to do the accounting?

Additionally, when there has been an effective cgroup switch. Only on
switch doesn't work for long running tasks, then the PSI information
will be artitrarily long out of date.

Which then gets me something like the (completely untested) below..

Hmm?

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0935f9d4bb7b..36aed99d6a6c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -724,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 
 	rq->prev_irq_time += irq_delta;
 	delta -= irq_delta;
-	psi_account_irqtime(rq->curr, irq_delta);
 	delayacct_irq(rq->curr, irq_delta);
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
@@ -5459,6 +5458,8 @@ void sched_tick(void)
 
 	sched_clock_tick();
 
+	psi_account_irqtime(curr, NULL, &rq->psi_irq_time);
+
 	rq_lock(rq, &rf);
 
 	update_rq_clock(rq);
@@ -6521,6 +6524,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 		++*switch_count;
 
 		migrate_disable_switch(rq, prev);
+		psi_account_irqtime(prev, next, &rq->psi_irq_time);
 		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
 		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 146baa91d104..65bba162408f 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -991,22 +991,31 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 }
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
-void psi_account_irqtime(struct task_struct *task, u32 delta)
+void psi_account_irqtime(struct task_struct *curr, struct task_struct *prev, u64 *time)
 {
-	int cpu = task_cpu(task);
+	int cpu = task_cpu(curr);
 	struct psi_group *group;
 	struct psi_group_cpu *groupc;
-	u64 now;
+	u64 now, irq;
+	s64 delta;
 
 	if (static_branch_likely(&psi_disabled))
 		return;
 
-	if (!task->pid)
+	if (!curr->pid)
+		return;
+
+	group = task_psi_group(curr);
+	if( prev && task_psi_group(prev) == group)
 		return;
 
 	now = cpu_clock(cpu);
+	irq = irq_time_read(cpu);
+	delta = (s64)(irq - *time);
+	if (delta < 0)
+		return;
+	*time = irq;
 
-	group = task_psi_group(task);
 	do {
 		if (!group->enabled)
 			continue;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 62fd8bc6fd08..a63eb546bc4a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1133,6 +1133,7 @@ struct rq {
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	u64			prev_irq_time;
+	u64			psi_irq_time;
 #endif
 #ifdef CONFIG_PARAVIRT
 	u64			prev_steal_time;
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index d1445410840a..1e290054c5db 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -110,7 +110,7 @@ __schedstats_from_se(struct sched_entity *se)
 void psi_task_change(struct task_struct *task, int clear, int set);
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		     bool sleep);
-void psi_account_irqtime(struct task_struct *task, u32 delta);
+void psi_account_irqtime(struct task_struct *curr, struct task_struct *prev, u64 *time);
 
 /*
  * PSI tracks state that persists across sleeps, such as iowaits and
@@ -192,7 +192,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
 static inline void psi_sched_switch(struct task_struct *prev,
 				    struct task_struct *next,
 				    bool sleep) {}
-static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
+static inline void psi_account_irqtime(struct task_struct *curr, struct task_struct *prev, u64 *time) {}
 #endif /* CONFIG_PSI */
 
 #ifdef CONFIG_SCHED_INFO

Re: [PATCH] RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock

Posted by John Stultz 1 year, 7 months ago

On Fri, Jun 14, 2024 at 2:48 AM Peter Zijlstra <peterz@infradead.org> wrote:
> Which then gets me something like the (completely untested) below..
>
> Hmm?
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 0935f9d4bb7b..36aed99d6a6c 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -724,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
>
>         rq->prev_irq_time += irq_delta;
>         delta -= irq_delta;
> -       psi_account_irqtime(rq->curr, irq_delta);
>         delayacct_irq(rq->curr, irq_delta);
>  #endif
>  #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> @@ -5459,6 +5458,8 @@ void sched_tick(void)
>
>         sched_clock_tick();
>
> +       psi_account_irqtime(curr, NULL, &rq->psi_irq_time);
> +
>         rq_lock(rq, &rf);
>
>         update_rq_clock(rq);
> @@ -6521,6 +6524,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
>                 ++*switch_count;
>
>                 migrate_disable_switch(rq, prev);
> +               psi_account_irqtime(prev, next, &rq->psi_irq_time);

FYI: These references to psi_irq_time hit build issues if
CONFIG_IRQ_TIME_ACCOUNTING is disabled.

Also, separately, while I didn't see this earlier testing on physical
devices, when running virtualized, I can pretty easily trip over the
following:

[   65.207340] watchdog: BUG: soft lockup - CPU#0 stuck for 26s!
[kworker/0:3:374]
[   65.211107] irq event stamp: 118664
[   65.212786] hardirqs last  enabled at (118663):
[<ffffffff97a00e46>] asm_sysvec_apic_timer_interrupt+0x16/0x20
[   65.218440] hardirqs last disabled at (118664):
[<ffffffff977fdeca>] sysvec_apic_timer_interrupt+0xa/0xc0
[   65.223074] softirqs last  enabled at (118546):
[<ffffffff9676db78>] __irq_exit_rcu+0x88/0xe0
[   65.227118] softirqs last disabled at (118541):
[<ffffffff9676db78>] __irq_exit_rcu+0x88/0xe0
[   65.231137] CPU: 0 PID: 374 Comm: kworker/0:3 Not tainted
6.10.0-rc4-dirty #4393
[   65.234625] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[   65.239089] Workqueue: events psi_avgs_work
[   65.241122] RIP: 0010:collect_percpu_times+0xff/0x310
[   65.243525] Code: b9 02 00 00 00 48 89 df e8 8e a4 01 00 48 8b b4
24 d0 00 00 00 48 89 df e8 5e 9e 01 00 58 45 8b 34 24 41 f6 c6 01 74
0c f3 0
[   65.252926] RSP: 0018:ffff958501263d50 EFLAGS: 00000202
[   65.255433] RAX: 0000000000017b61 RBX: ffff9585b901d848 RCX: 0000000000000006
[   65.258755] RDX: ffffffff967eb6ac RSI: ffffffff9819e305 RDI: ffffffff98177748
[   65.262113] RBP: ffff958501263db0 R08: 0000000000000001 R09: 0000000000000000
[   65.265475] R10: 0000000000000001 R11: 0000000000000001 R12: ffff9585b901d840
[   65.268785] R13: ffff9585b901d884 R14: 0000000000033d8b R15: 0000000000000000
[   65.272146] FS:  0000000000000000(0000) GS:ffff9585b9000000(0000)
knlGS:0000000000000000
[   65.275908] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   65.278629] CR2: 00005631aee8b000 CR3: 0000000116c0e001 CR4: 0000000000370ef0
[   65.282002] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   65.285386] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   65.288730] Call Trace:
[   65.289958]  <IRQ>
[   65.290965]  ? watchdog_timer_fn+0x275/0x310
[   65.293185]  ? __pfx_watchdog_timer_fn+0x10/0x10
[   65.295379]  ? __hrtimer_run_queues+0x190/0x3b0
[   65.297795]  ? hrtimer_interrupt+0xf9/0x230
[   65.299782]  ? __sysvec_apic_timer_interrupt+0x82/0x210
[   65.302243]  ? sysvec_apic_timer_interrupt+0x98/0xc0
[   65.304590]  </IRQ>
[   65.305658]  <TASK>
[   65.306708]  ? asm_sysvec_apic_timer_interrupt+0x16/0x20
[   65.309206]  ? psi_avgs_work+0x3c/0xb0
[   65.311001]  ? collect_percpu_times+0xff/0x310
[   65.313153]  psi_avgs_work+0x3c/0xb0
[   65.314864]  process_one_work+0x1fe/0x700
[   65.316782]  ? lock_is_held_type+0xcd/0x120
[   65.318782]  worker_thread+0x1c7/0x3b0
[   65.320571]  ? __pfx_worker_thread+0x10/0x10
[   65.322626]  kthread+0xe0/0x110
[   65.324103]  ? __pfx_kthread+0x10/0x10
[   65.325853]  ret_from_fork+0x28/0x40
[   65.327512]  ? __pfx_kthread+0x10/0x10
[   65.329255]  ret_from_fork_asm+0x1a/0x30
[   65.331073]  </TASK>
[   65.332119] Kernel panic - not syncing: softlockup: hung tasks

Where collect_percpu_times+0xff/0x310:
__seqprop_sequence at include/linux/seqlock.h:211 (discriminator 2)
(inlined by) get_recent_times at kernel/sched/psi.c:261 (discriminator 2)
(inlined by) collect_percpu_times at kernel/sched/psi.c:359 (discriminator 2)

Which looks like its getting stuck in the seqlock loop, and the only
way I can see that catching right off, is if we're in some sort of
livelock where the calls to psi_account_irqtime(curr, NULL,
&rq->psi_irq_time) is coming in frequently enough to change the seqno
each iteration through the reader.  But from my initial trace_printk
debugging, it seems like from a kworker we enter the loop in
get_recent_times(), hit an irq and somehow never really come back out
of irq context. Though we continue to get ticks on the task and
continue to call psi_account_irqtime().  I was worried we were somehow
getting stuck in the 'while ((group = group->parent));' loop in
psi_account_irqtime(), but that doesn't seem to be the case.

[  238.297094] kworker/-798       0..... 200647713us :
collect_percpu_times: JDB: get_recent_times at top of loop 0!
[  238.301705] kworker/-798       0..... 200647767us :
collect_percpu_times: JDB: get_recent_times done with read (looped:
1)!
[  238.306689] kworker/-798       0..... 200647768us :
collect_percpu_times: JDB: get_recent_times at top of loop 0!
[  238.311313] kworker/-798       0..... 200647769us :
collect_percpu_times: JDB: get_recent_times done with read (looped:
1)!
[  238.316318] kworker/-798       0d..2. 200647786us :
psi_group_change: JDB: psi_group_change seqwrite
<normal behavior above>
[  238.320460] kworker/-10        0..... 200647790us :
collect_percpu_times: JDB: get_recent_times at top of loop 0!
[  238.325131] kworker/-10        0d.h.. 200648408us :
psi_account_irqtime: JDB: psi_account_irqtime seqwrite (loop count: 0)
[  238.330149] kworker/-10        0d.h.. 200649406us :
psi_account_irqtime: JDB: psi_account_irqtime seqwrite (loop count: 0)
[  238.335140] kworker/-10        0d.h.. 200650405us :
psi_account_irqtime: JDB: psi_account_irqtime seqwrite (loop count: 0)
...
With the psi_account_irqtime just repeating there each ms/tick.

I'm still digging a bit here to understand what's going on. But I
wanted to share.

thanks
-john

Re: [PATCH] RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock

Posted by Peter Zijlstra 1 year, 7 months ago

On Mon, Jun 17, 2024 at 05:42:25PM -0700, John Stultz wrote:
> On Fri, Jun 14, 2024 at 2:48 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > Which then gets me something like the (completely untested) below..
> >
> > Hmm?
> >
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 0935f9d4bb7b..36aed99d6a6c 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -724,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
> >
> >         rq->prev_irq_time += irq_delta;
> >         delta -= irq_delta;
> > -       psi_account_irqtime(rq->curr, irq_delta);
> >         delayacct_irq(rq->curr, irq_delta);
> >  #endif
> >  #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> > @@ -5459,6 +5458,8 @@ void sched_tick(void)
> >
> >         sched_clock_tick();
> >
> > +       psi_account_irqtime(curr, NULL, &rq->psi_irq_time);
> > +
> >         rq_lock(rq, &rf);
> >
> >         update_rq_clock(rq);
> > @@ -6521,6 +6524,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
> >                 ++*switch_count;
> >
> >                 migrate_disable_switch(rq, prev);
> > +               psi_account_irqtime(prev, next, &rq->psi_irq_time);
> 
> FYI: These references to psi_irq_time hit build issues if
> CONFIG_IRQ_TIME_ACCOUNTING is disabled.

Ah, well, I did say it was untested...

> Also, separately, while I didn't see this earlier testing on physical
> devices, when running virtualized, I can pretty easily trip over the
> following:
> 
> [   65.207340] watchdog: BUG: soft lockup - CPU#0 stuck for 26s!
> [kworker/0:3:374]
> [   65.211107] irq event stamp: 118664
> [   65.212786] hardirqs last  enabled at (118663):
> [<ffffffff97a00e46>] asm_sysvec_apic_timer_interrupt+0x16/0x20
> [   65.218440] hardirqs last disabled at (118664):
> [<ffffffff977fdeca>] sysvec_apic_timer_interrupt+0xa/0xc0
> [   65.223074] softirqs last  enabled at (118546):
> [<ffffffff9676db78>] __irq_exit_rcu+0x88/0xe0
> [   65.227118] softirqs last disabled at (118541):
> [<ffffffff9676db78>] __irq_exit_rcu+0x88/0xe0
> [   65.231137] CPU: 0 PID: 374 Comm: kworker/0:3 Not tainted
> 6.10.0-rc4-dirty #4393
> [   65.234625] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS 1.16.3-debian-1.16.3-2 04/01/2014
> [   65.239089] Workqueue: events psi_avgs_work
> [   65.241122] RIP: 0010:collect_percpu_times+0xff/0x310
> [   65.243525] Code: b9 02 00 00 00 48 89 df e8 8e a4 01 00 48 8b b4
> 24 d0 00 00 00 48 89 df e8 5e 9e 01 00 58 45 8b 34 24 41 f6 c6 01 74
> 0c f3 0
> [   65.252926] RSP: 0018:ffff958501263d50 EFLAGS: 00000202
> [   65.255433] RAX: 0000000000017b61 RBX: ffff9585b901d848 RCX: 0000000000000006
> [   65.258755] RDX: ffffffff967eb6ac RSI: ffffffff9819e305 RDI: ffffffff98177748
> [   65.262113] RBP: ffff958501263db0 R08: 0000000000000001 R09: 0000000000000000
> [   65.265475] R10: 0000000000000001 R11: 0000000000000001 R12: ffff9585b901d840
> [   65.268785] R13: ffff9585b901d884 R14: 0000000000033d8b R15: 0000000000000000
> [   65.272146] FS:  0000000000000000(0000) GS:ffff9585b9000000(0000)
> knlGS:0000000000000000
> [   65.275908] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   65.278629] CR2: 00005631aee8b000 CR3: 0000000116c0e001 CR4: 0000000000370ef0
> [   65.282002] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [   65.285386] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [   65.288730] Call Trace:
> [   65.289958]  <IRQ>
> [   65.290965]  ? watchdog_timer_fn+0x275/0x310
> [   65.293185]  ? __pfx_watchdog_timer_fn+0x10/0x10
> [   65.295379]  ? __hrtimer_run_queues+0x190/0x3b0
> [   65.297795]  ? hrtimer_interrupt+0xf9/0x230
> [   65.299782]  ? __sysvec_apic_timer_interrupt+0x82/0x210
> [   65.302243]  ? sysvec_apic_timer_interrupt+0x98/0xc0
> [   65.304590]  </IRQ>
> [   65.305658]  <TASK>
> [   65.306708]  ? asm_sysvec_apic_timer_interrupt+0x16/0x20
> [   65.309206]  ? psi_avgs_work+0x3c/0xb0
> [   65.311001]  ? collect_percpu_times+0xff/0x310
> [   65.313153]  psi_avgs_work+0x3c/0xb0
> [   65.314864]  process_one_work+0x1fe/0x700
> [   65.316782]  ? lock_is_held_type+0xcd/0x120
> [   65.318782]  worker_thread+0x1c7/0x3b0
> [   65.320571]  ? __pfx_worker_thread+0x10/0x10
> [   65.322626]  kthread+0xe0/0x110
> [   65.324103]  ? __pfx_kthread+0x10/0x10
> [   65.325853]  ret_from_fork+0x28/0x40
> [   65.327512]  ? __pfx_kthread+0x10/0x10
> [   65.329255]  ret_from_fork_asm+0x1a/0x30
> [   65.331073]  </TASK>
> [   65.332119] Kernel panic - not syncing: softlockup: hung tasks
> 
> Where collect_percpu_times+0xff/0x310:
> __seqprop_sequence at include/linux/seqlock.h:211 (discriminator 2)
> (inlined by) get_recent_times at kernel/sched/psi.c:261 (discriminator 2)
> (inlined by) collect_percpu_times at kernel/sched/psi.c:359 (discriminator 2)
> 
> Which looks like its getting stuck in the seqlock loop, and the only
> way I can see that catching right off, is if we're in some sort of
> livelock where the calls to psi_account_irqtime(curr, NULL,
> &rq->psi_irq_time) is coming in frequently enough to change the seqno
> each iteration through the reader.  But from my initial trace_printk
> debugging, it seems like from a kworker we enter the loop in
> get_recent_times(), hit an irq and somehow never really come back out
> of irq context. Though we continue to get ticks on the task and
> continue to call psi_account_irqtime().  I was worried we were somehow
> getting stuck in the 'while ((group = group->parent));' loop in
> psi_account_irqtime(), but that doesn't seem to be the case.
> 
> [  238.297094] kworker/-798       0..... 200647713us :
> collect_percpu_times: JDB: get_recent_times at top of loop 0!
> [  238.301705] kworker/-798       0..... 200647767us :
> collect_percpu_times: JDB: get_recent_times done with read (looped:
> 1)!
> [  238.306689] kworker/-798       0..... 200647768us :
> collect_percpu_times: JDB: get_recent_times at top of loop 0!
> [  238.311313] kworker/-798       0..... 200647769us :
> collect_percpu_times: JDB: get_recent_times done with read (looped:
> 1)!
> [  238.316318] kworker/-798       0d..2. 200647786us :
> psi_group_change: JDB: psi_group_change seqwrite
> <normal behavior above>
> [  238.320460] kworker/-10        0..... 200647790us :
> collect_percpu_times: JDB: get_recent_times at top of loop 0!
> [  238.325131] kworker/-10        0d.h.. 200648408us :
> psi_account_irqtime: JDB: psi_account_irqtime seqwrite (loop count: 0)
> [  238.330149] kworker/-10        0d.h.. 200649406us :
> psi_account_irqtime: JDB: psi_account_irqtime seqwrite (loop count: 0)
> [  238.335140] kworker/-10        0d.h.. 200650405us :
> psi_account_irqtime: JDB: psi_account_irqtime seqwrite (loop count: 0)
> ...
> With the psi_account_irqtime just repeating there each ms/tick.
> 
> I'm still digging a bit here to understand what's going on. But I
> wanted to share.

Urgh, that's not making any sense to me either. Johannes, considering:

 https://lkml.kernel.org/20240614094833.GM8774@noisy.programming.kicks-ass.net

Do you have any idea what's going on here?

Re: [PATCH] RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock

Posted by Johannes Weiner 1 year, 7 months ago

On Tue, Jun 18, 2024 at 10:12:36AM +0200, Peter Zijlstra wrote:
> On Mon, Jun 17, 2024 at 05:42:25PM -0700, John Stultz wrote:
> > On Fri, Jun 14, 2024 at 2:48 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > Which then gets me something like the (completely untested) below..
> > >
> > > Hmm?
> > >
> > > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > > index 0935f9d4bb7b..36aed99d6a6c 100644
> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -724,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
> > >
> > >         rq->prev_irq_time += irq_delta;
> > >         delta -= irq_delta;
> > > -       psi_account_irqtime(rq->curr, irq_delta);
> > >         delayacct_irq(rq->curr, irq_delta);
> > >  #endif
> > >  #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> > > @@ -5459,6 +5458,8 @@ void sched_tick(void)
> > >
> > >         sched_clock_tick();
> > >
> > > +       psi_account_irqtime(curr, NULL, &rq->psi_irq_time);
> > > +
> > >         rq_lock(rq, &rf);
> > >
> > >         update_rq_clock(rq);
> > > @@ -6521,6 +6524,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
> > >                 ++*switch_count;
> > >
> > >                 migrate_disable_switch(rq, prev);
> > > +               psi_account_irqtime(prev, next, &rq->psi_irq_time);
> > 
> > FYI: These references to psi_irq_time hit build issues if
> > CONFIG_IRQ_TIME_ACCOUNTING is disabled.
> 
> Ah, well, I did say it was untested...
> 
> > Also, separately, while I didn't see this earlier testing on physical
> > devices, when running virtualized, I can pretty easily trip over the
> > following:
> > 
> > [   65.207340] watchdog: BUG: soft lockup - CPU#0 stuck for 26s!
> > [kworker/0:3:374]
> > [   65.211107] irq event stamp: 118664
> > [   65.212786] hardirqs last  enabled at (118663):
> > [<ffffffff97a00e46>] asm_sysvec_apic_timer_interrupt+0x16/0x20
> > [   65.218440] hardirqs last disabled at (118664):
> > [<ffffffff977fdeca>] sysvec_apic_timer_interrupt+0xa/0xc0
> > [   65.223074] softirqs last  enabled at (118546):
> > [<ffffffff9676db78>] __irq_exit_rcu+0x88/0xe0
> > [   65.227118] softirqs last disabled at (118541):
> > [<ffffffff9676db78>] __irq_exit_rcu+0x88/0xe0
> > [   65.231137] CPU: 0 PID: 374 Comm: kworker/0:3 Not tainted
> > 6.10.0-rc4-dirty #4393
> > [   65.234625] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> > BIOS 1.16.3-debian-1.16.3-2 04/01/2014
> > [   65.239089] Workqueue: events psi_avgs_work
> > [   65.241122] RIP: 0010:collect_percpu_times+0xff/0x310
> > [   65.243525] Code: b9 02 00 00 00 48 89 df e8 8e a4 01 00 48 8b b4
> > 24 d0 00 00 00 48 89 df e8 5e 9e 01 00 58 45 8b 34 24 41 f6 c6 01 74
> > 0c f3 0
> > [   65.252926] RSP: 0018:ffff958501263d50 EFLAGS: 00000202
> > [   65.255433] RAX: 0000000000017b61 RBX: ffff9585b901d848 RCX: 0000000000000006
> > [   65.258755] RDX: ffffffff967eb6ac RSI: ffffffff9819e305 RDI: ffffffff98177748
> > [   65.262113] RBP: ffff958501263db0 R08: 0000000000000001 R09: 0000000000000000
> > [   65.265475] R10: 0000000000000001 R11: 0000000000000001 R12: ffff9585b901d840
> > [   65.268785] R13: ffff9585b901d884 R14: 0000000000033d8b R15: 0000000000000000
> > [   65.272146] FS:  0000000000000000(0000) GS:ffff9585b9000000(0000)
> > knlGS:0000000000000000
> > [   65.275908] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > [   65.278629] CR2: 00005631aee8b000 CR3: 0000000116c0e001 CR4: 0000000000370ef0
> > [   65.282002] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > [   65.285386] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > [   65.288730] Call Trace:
> > [   65.289958]  <IRQ>
> > [   65.290965]  ? watchdog_timer_fn+0x275/0x310
> > [   65.293185]  ? __pfx_watchdog_timer_fn+0x10/0x10
> > [   65.295379]  ? __hrtimer_run_queues+0x190/0x3b0
> > [   65.297795]  ? hrtimer_interrupt+0xf9/0x230
> > [   65.299782]  ? __sysvec_apic_timer_interrupt+0x82/0x210
> > [   65.302243]  ? sysvec_apic_timer_interrupt+0x98/0xc0
> > [   65.304590]  </IRQ>
> > [   65.305658]  <TASK>
> > [   65.306708]  ? asm_sysvec_apic_timer_interrupt+0x16/0x20
> > [   65.309206]  ? psi_avgs_work+0x3c/0xb0
> > [   65.311001]  ? collect_percpu_times+0xff/0x310
> > [   65.313153]  psi_avgs_work+0x3c/0xb0
> > [   65.314864]  process_one_work+0x1fe/0x700
> > [   65.316782]  ? lock_is_held_type+0xcd/0x120
> > [   65.318782]  worker_thread+0x1c7/0x3b0
> > [   65.320571]  ? __pfx_worker_thread+0x10/0x10
> > [   65.322626]  kthread+0xe0/0x110
> > [   65.324103]  ? __pfx_kthread+0x10/0x10
> > [   65.325853]  ret_from_fork+0x28/0x40
> > [   65.327512]  ? __pfx_kthread+0x10/0x10
> > [   65.329255]  ret_from_fork_asm+0x1a/0x30
> > [   65.331073]  </TASK>
> > [   65.332119] Kernel panic - not syncing: softlockup: hung tasks
> > 
> > Where collect_percpu_times+0xff/0x310:
> > __seqprop_sequence at include/linux/seqlock.h:211 (discriminator 2)
> > (inlined by) get_recent_times at kernel/sched/psi.c:261 (discriminator 2)
> > (inlined by) collect_percpu_times at kernel/sched/psi.c:359 (discriminator 2)
> > 
> > Which looks like its getting stuck in the seqlock loop, and the only
> > way I can see that catching right off, is if we're in some sort of
> > livelock where the calls to psi_account_irqtime(curr, NULL,
> > &rq->psi_irq_time) is coming in frequently enough to change the seqno
> > each iteration through the reader.  But from my initial trace_printk
> > debugging, it seems like from a kworker we enter the loop in
> > get_recent_times(), hit an irq and somehow never really come back out
> > of irq context. Though we continue to get ticks on the task and
> > continue to call psi_account_irqtime().  I was worried we were somehow
> > getting stuck in the 'while ((group = group->parent));' loop in
> > psi_account_irqtime(), but that doesn't seem to be the case.
> > 
> > [  238.297094] kworker/-798       0..... 200647713us :
> > collect_percpu_times: JDB: get_recent_times at top of loop 0!
> > [  238.301705] kworker/-798       0..... 200647767us :
> > collect_percpu_times: JDB: get_recent_times done with read (looped:
> > 1)!
> > [  238.306689] kworker/-798       0..... 200647768us :
> > collect_percpu_times: JDB: get_recent_times at top of loop 0!
> > [  238.311313] kworker/-798       0..... 200647769us :
> > collect_percpu_times: JDB: get_recent_times done with read (looped:
> > 1)!
> > [  238.316318] kworker/-798       0d..2. 200647786us :
> > psi_group_change: JDB: psi_group_change seqwrite
> > <normal behavior above>
> > [  238.320460] kworker/-10        0..... 200647790us :
> > collect_percpu_times: JDB: get_recent_times at top of loop 0!
> > [  238.325131] kworker/-10        0d.h.. 200648408us :
> > psi_account_irqtime: JDB: psi_account_irqtime seqwrite (loop count: 0)
> > [  238.330149] kworker/-10        0d.h.. 200649406us :
> > psi_account_irqtime: JDB: psi_account_irqtime seqwrite (loop count: 0)
> > [  238.335140] kworker/-10        0d.h.. 200650405us :
> > psi_account_irqtime: JDB: psi_account_irqtime seqwrite (loop count: 0)
> > ...
> > With the psi_account_irqtime just repeating there each ms/tick.
> > 
> > I'm still digging a bit here to understand what's going on. But I
> > wanted to share.
> 
> Urgh, that's not making any sense to me either. Johannes, considering:
> 
>  https://lkml.kernel.org/20240614094833.GM8774@noisy.programming.kicks-ass.net

This might be the culprit:

@@ -5459,6 +5458,8 @@ void sched_tick(void)
 
 	sched_clock_tick();
 
+	psi_account_irqtime(curr, NULL, &rq->psi_irq_time);
+
 	rq_lock(rq, &rf);
 
 	update_rq_clock(rq);

rq->lock is required for these state updates because it also
serializes against tasks moving between cgroups. If another CPU
changes the task->cgroup association during the update, AFAICS it
could call write_seqcount_begin() one group 1 and _end() on group 2,
leaving both groups' seqcounters odd.

Can you try moving the accounting call below rq_lock()?

Re: [PATCH] RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock

Posted by John Stultz 1 year, 7 months ago

On Tue, Jun 18, 2024 at 10:59 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Tue, Jun 18, 2024 at 10:12:36AM +0200, Peter Zijlstra wrote:
> > On Mon, Jun 17, 2024 at 05:42:25PM -0700, John Stultz wrote:
> > > On Fri, Jun 14, 2024 at 2:48 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > Which then gets me something like the (completely untested) below..
> > > >
> > > > Hmm?
> > > >
> > > > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > > > index 0935f9d4bb7b..36aed99d6a6c 100644
> > > > --- a/kernel/sched/core.c
> > > > +++ b/kernel/sched/core.c
> > > > @@ -724,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
> > > >
> > > >         rq->prev_irq_time += irq_delta;
> > > >         delta -= irq_delta;
> > > > -       psi_account_irqtime(rq->curr, irq_delta);
> > > >         delayacct_irq(rq->curr, irq_delta);
> > > >  #endif
> > > >  #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> > > > @@ -5459,6 +5458,8 @@ void sched_tick(void)
> > > >
> > > >         sched_clock_tick();
> > > >
> > > > +       psi_account_irqtime(curr, NULL, &rq->psi_irq_time);
> > > > +
> > > >         rq_lock(rq, &rf);
> > > >
> > > >         update_rq_clock(rq);
> > > > @@ -6521,6 +6524,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
> > > >                 ++*switch_count;
> > > >
> > > >                 migrate_disable_switch(rq, prev);
> > > > +               psi_account_irqtime(prev, next, &rq->psi_irq_time);
> > >
> > > FYI: These references to psi_irq_time hit build issues if
> > > CONFIG_IRQ_TIME_ACCOUNTING is disabled.
> >
> > Ah, well, I did say it was untested...
> >
> > > Also, separately, while I didn't see this earlier testing on physical
> > > devices, when running virtualized, I can pretty easily trip over the
> > > following:
> > >
> > > [   65.207340] watchdog: BUG: soft lockup - CPU#0 stuck for 26s!
> > > [kworker/0:3:374]
> > > [   65.211107] irq event stamp: 118664
> > > [   65.212786] hardirqs last  enabled at (118663):
> > > [<ffffffff97a00e46>] asm_sysvec_apic_timer_interrupt+0x16/0x20
> > > [   65.218440] hardirqs last disabled at (118664):
> > > [<ffffffff977fdeca>] sysvec_apic_timer_interrupt+0xa/0xc0
> > > [   65.223074] softirqs last  enabled at (118546):
> > > [<ffffffff9676db78>] __irq_exit_rcu+0x88/0xe0
> > > [   65.227118] softirqs last disabled at (118541):
> > > [<ffffffff9676db78>] __irq_exit_rcu+0x88/0xe0
> > > [   65.231137] CPU: 0 PID: 374 Comm: kworker/0:3 Not tainted
> > > 6.10.0-rc4-dirty #4393
> > > [   65.234625] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> > > BIOS 1.16.3-debian-1.16.3-2 04/01/2014
> > > [   65.239089] Workqueue: events psi_avgs_work
> > > [   65.241122] RIP: 0010:collect_percpu_times+0xff/0x310
> > > [   65.243525] Code: b9 02 00 00 00 48 89 df e8 8e a4 01 00 48 8b b4
> > > 24 d0 00 00 00 48 89 df e8 5e 9e 01 00 58 45 8b 34 24 41 f6 c6 01 74
> > > 0c f3 0
> > > [   65.252926] RSP: 0018:ffff958501263d50 EFLAGS: 00000202
> > > [   65.255433] RAX: 0000000000017b61 RBX: ffff9585b901d848 RCX: 0000000000000006
> > > [   65.258755] RDX: ffffffff967eb6ac RSI: ffffffff9819e305 RDI: ffffffff98177748
> > > [   65.262113] RBP: ffff958501263db0 R08: 0000000000000001 R09: 0000000000000000
> > > [   65.265475] R10: 0000000000000001 R11: 0000000000000001 R12: ffff9585b901d840
> > > [   65.268785] R13: ffff9585b901d884 R14: 0000000000033d8b R15: 0000000000000000
> > > [   65.272146] FS:  0000000000000000(0000) GS:ffff9585b9000000(0000)
> > > knlGS:0000000000000000
> > > [   65.275908] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > [   65.278629] CR2: 00005631aee8b000 CR3: 0000000116c0e001 CR4: 0000000000370ef0
> > > [   65.282002] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > > [   65.285386] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > > [   65.288730] Call Trace:
> > > [   65.289958]  <IRQ>
> > > [   65.290965]  ? watchdog_timer_fn+0x275/0x310
> > > [   65.293185]  ? __pfx_watchdog_timer_fn+0x10/0x10
> > > [   65.295379]  ? __hrtimer_run_queues+0x190/0x3b0
> > > [   65.297795]  ? hrtimer_interrupt+0xf9/0x230
> > > [   65.299782]  ? __sysvec_apic_timer_interrupt+0x82/0x210
> > > [   65.302243]  ? sysvec_apic_timer_interrupt+0x98/0xc0
> > > [   65.304590]  </IRQ>
> > > [   65.305658]  <TASK>
> > > [   65.306708]  ? asm_sysvec_apic_timer_interrupt+0x16/0x20
> > > [   65.309206]  ? psi_avgs_work+0x3c/0xb0
> > > [   65.311001]  ? collect_percpu_times+0xff/0x310
> > > [   65.313153]  psi_avgs_work+0x3c/0xb0
> > > [   65.314864]  process_one_work+0x1fe/0x700
> > > [   65.316782]  ? lock_is_held_type+0xcd/0x120
> > > [   65.318782]  worker_thread+0x1c7/0x3b0
> > > [   65.320571]  ? __pfx_worker_thread+0x10/0x10
> > > [   65.322626]  kthread+0xe0/0x110
> > > [   65.324103]  ? __pfx_kthread+0x10/0x10
> > > [   65.325853]  ret_from_fork+0x28/0x40
> > > [   65.327512]  ? __pfx_kthread+0x10/0x10
> > > [   65.329255]  ret_from_fork_asm+0x1a/0x30
> > > [   65.331073]  </TASK>
> > > [   65.332119] Kernel panic - not syncing: softlockup: hung tasks
> > >
> > > Where collect_percpu_times+0xff/0x310:
> > > __seqprop_sequence at include/linux/seqlock.h:211 (discriminator 2)
> > > (inlined by) get_recent_times at kernel/sched/psi.c:261 (discriminator 2)
> > > (inlined by) collect_percpu_times at kernel/sched/psi.c:359 (discriminator 2)
> > >
> > > Which looks like its getting stuck in the seqlock loop, and the only
> > > way I can see that catching right off, is if we're in some sort of
> > > livelock where the calls to psi_account_irqtime(curr, NULL,
> > > &rq->psi_irq_time) is coming in frequently enough to change the seqno
> > > each iteration through the reader.  But from my initial trace_printk
> > > debugging, it seems like from a kworker we enter the loop in
> > > get_recent_times(), hit an irq and somehow never really come back out
> > > of irq context. Though we continue to get ticks on the task and
> > > continue to call psi_account_irqtime().  I was worried we were somehow
> > > getting stuck in the 'while ((group = group->parent));' loop in
> > > psi_account_irqtime(), but that doesn't seem to be the case.
> > >
> > > [  238.297094] kworker/-798       0..... 200647713us :
> > > collect_percpu_times: JDB: get_recent_times at top of loop 0!
> > > [  238.301705] kworker/-798       0..... 200647767us :
> > > collect_percpu_times: JDB: get_recent_times done with read (looped:
> > > 1)!
> > > [  238.306689] kworker/-798       0..... 200647768us :
> > > collect_percpu_times: JDB: get_recent_times at top of loop 0!
> > > [  238.311313] kworker/-798       0..... 200647769us :
> > > collect_percpu_times: JDB: get_recent_times done with read (looped:
> > > 1)!
> > > [  238.316318] kworker/-798       0d..2. 200647786us :
> > > psi_group_change: JDB: psi_group_change seqwrite
> > > <normal behavior above>
> > > [  238.320460] kworker/-10        0..... 200647790us :
> > > collect_percpu_times: JDB: get_recent_times at top of loop 0!
> > > [  238.325131] kworker/-10        0d.h.. 200648408us :
> > > psi_account_irqtime: JDB: psi_account_irqtime seqwrite (loop count: 0)
> > > [  238.330149] kworker/-10        0d.h.. 200649406us :
> > > psi_account_irqtime: JDB: psi_account_irqtime seqwrite (loop count: 0)
> > > [  238.335140] kworker/-10        0d.h.. 200650405us :
> > > psi_account_irqtime: JDB: psi_account_irqtime seqwrite (loop count: 0)
> > > ...
> > > With the psi_account_irqtime just repeating there each ms/tick.
> > >
> > > I'm still digging a bit here to understand what's going on. But I
> > > wanted to share.
> >
> > Urgh, that's not making any sense to me either. Johannes, considering:
> >
> >  https://lkml.kernel.org/20240614094833.GM8774@noisy.programming.kicks-ass.net
>
> This might be the culprit:
>
> @@ -5459,6 +5458,8 @@ void sched_tick(void)
>
>         sched_clock_tick();
>
> +       psi_account_irqtime(curr, NULL, &rq->psi_irq_time);
> +
>         rq_lock(rq, &rf);
>
>         update_rq_clock(rq);
>
> rq->lock is required for these state updates because it also
> serializes against tasks moving between cgroups. If another CPU
> changes the task->cgroup association during the update, AFAICS it
> could call write_seqcount_begin() one group 1 and _end() on group 2,
> leaving both groups' seqcounters odd.
>
> Can you try moving the accounting call below rq_lock()?

Ah, Yeah! This very much aligns with my testing.
Putting it under the rq_lock avoids the issue, so thank you for the
explanation of why!

The other part that was confounding to me as I was debugging this was
what seemed like we were never returning out of irq context, but in
fact that wasn't the case. I realize now the read_seqcount_begin()
will internally spin if the seqno is odd, and my expectation was
*very* dated that we always entered the loop and repeated while it was
odd, so my debug trace message in the loop wouldn't be expected to
print.

So that nicely resolves it.

Peter: I'll add the fix and write up the change log and send it along.

thanks
-john

Re: [PATCH] RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock

Posted by John Stultz 1 year, 7 months ago

On Mon, Jun 17, 2024 at 5:42 PM John Stultz <jstultz@google.com> wrote:
>
> Also, separately, while I didn't see this earlier testing on physical
> devices, when running virtualized, I can pretty easily trip over the
> following:
>
> [   65.207340] watchdog: BUG: soft lockup - CPU#0 stuck for 26s!
> [kworker/0:3:374]
> [   65.211107] irq event stamp: 118664
> [   65.212786] hardirqs last  enabled at (118663):
> [<ffffffff97a00e46>] asm_sysvec_apic_timer_interrupt+0x16/0x20
> [   65.218440] hardirqs last disabled at (118664):
> [<ffffffff977fdeca>] sysvec_apic_timer_interrupt+0xa/0xc0
> [   65.223074] softirqs last  enabled at (118546):
> [<ffffffff9676db78>] __irq_exit_rcu+0x88/0xe0
> [   65.227118] softirqs last disabled at (118541):
> [<ffffffff9676db78>] __irq_exit_rcu+0x88/0xe0
> [   65.231137] CPU: 0 PID: 374 Comm: kworker/0:3 Not tainted
> 6.10.0-rc4-dirty #4393
> [   65.234625] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS 1.16.3-debian-1.16.3-2 04/01/2014
> [   65.239089] Workqueue: events psi_avgs_work
> [   65.241122] RIP: 0010:collect_percpu_times+0xff/0x310
...
> [   65.288730] Call Trace:
> [   65.289958]  <IRQ>
> [   65.290965]  ? watchdog_timer_fn+0x275/0x310
> [   65.293185]  ? __pfx_watchdog_timer_fn+0x10/0x10
> [   65.295379]  ? __hrtimer_run_queues+0x190/0x3b0
> [   65.297795]  ? hrtimer_interrupt+0xf9/0x230
> [   65.299782]  ? __sysvec_apic_timer_interrupt+0x82/0x210
> [   65.302243]  ? sysvec_apic_timer_interrupt+0x98/0xc0
> [   65.304590]  </IRQ>
> [   65.305658]  <TASK>
> [   65.306708]  ? asm_sysvec_apic_timer_interrupt+0x16/0x20
> [   65.309206]  ? psi_avgs_work+0x3c/0xb0
> [   65.311001]  ? collect_percpu_times+0xff/0x310
> [   65.313153]  psi_avgs_work+0x3c/0xb0
> [   65.314864]  process_one_work+0x1fe/0x700
> [   65.316782]  ? lock_is_held_type+0xcd/0x120
> [   65.318782]  worker_thread+0x1c7/0x3b0
> [   65.320571]  ? __pfx_worker_thread+0x10/0x10
> [   65.322626]  kthread+0xe0/0x110
> [   65.324103]  ? __pfx_kthread+0x10/0x10
> [   65.325853]  ret_from_fork+0x28/0x40
> [   65.327512]  ? __pfx_kthread+0x10/0x10
> [   65.329255]  ret_from_fork_asm+0x1a/0x30
> [   65.331073]  </TASK>
> [   65.332119] Kernel panic - not syncing: softlockup: hung tasks

As I've been digging on this some more, I'm still a bit baffled.  The
kworker seems to get stuck once the irq lands and we don't seem to go
back to the workqueue work that was in progress. That cpu ends up not
doing anything but occasionally handling ticks until we crash from the
softlockup watchdog.

Interestingly, if I move the psi_account_irqtime() call to be under
the rq_lock in sched_tick(), as I suggested earlier, I have yet to
reproduce the problem.
But I'm still trying to piece together exactly why.

thanks
-john

Re: [PATCH] RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock

Posted by Qais Yousef 1 year, 7 months ago

On 06/17/24 21:45, John Stultz wrote:
> On Mon, Jun 17, 2024 at 5:42 PM John Stultz <jstultz@google.com> wrote:
> >
> > Also, separately, while I didn't see this earlier testing on physical
> > devices, when running virtualized, I can pretty easily trip over the
> > following:
> >
> > [   65.207340] watchdog: BUG: soft lockup - CPU#0 stuck for 26s!
> > [kworker/0:3:374]
> > [   65.211107] irq event stamp: 118664
> > [   65.212786] hardirqs last  enabled at (118663):
> > [<ffffffff97a00e46>] asm_sysvec_apic_timer_interrupt+0x16/0x20
> > [   65.218440] hardirqs last disabled at (118664):
> > [<ffffffff977fdeca>] sysvec_apic_timer_interrupt+0xa/0xc0
> > [   65.223074] softirqs last  enabled at (118546):
> > [<ffffffff9676db78>] __irq_exit_rcu+0x88/0xe0
> > [   65.227118] softirqs last disabled at (118541):
> > [<ffffffff9676db78>] __irq_exit_rcu+0x88/0xe0
> > [   65.231137] CPU: 0 PID: 374 Comm: kworker/0:3 Not tainted
> > 6.10.0-rc4-dirty #4393
> > [   65.234625] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> > BIOS 1.16.3-debian-1.16.3-2 04/01/2014
> > [   65.239089] Workqueue: events psi_avgs_work
> > [   65.241122] RIP: 0010:collect_percpu_times+0xff/0x310
> ...
> > [   65.288730] Call Trace:
> > [   65.289958]  <IRQ>
> > [   65.290965]  ? watchdog_timer_fn+0x275/0x310
> > [   65.293185]  ? __pfx_watchdog_timer_fn+0x10/0x10
> > [   65.295379]  ? __hrtimer_run_queues+0x190/0x3b0
> > [   65.297795]  ? hrtimer_interrupt+0xf9/0x230
> > [   65.299782]  ? __sysvec_apic_timer_interrupt+0x82/0x210
> > [   65.302243]  ? sysvec_apic_timer_interrupt+0x98/0xc0
> > [   65.304590]  </IRQ>
> > [   65.305658]  <TASK>
> > [   65.306708]  ? asm_sysvec_apic_timer_interrupt+0x16/0x20
> > [   65.309206]  ? psi_avgs_work+0x3c/0xb0
> > [   65.311001]  ? collect_percpu_times+0xff/0x310
> > [   65.313153]  psi_avgs_work+0x3c/0xb0
> > [   65.314864]  process_one_work+0x1fe/0x700
> > [   65.316782]  ? lock_is_held_type+0xcd/0x120
> > [   65.318782]  worker_thread+0x1c7/0x3b0
> > [   65.320571]  ? __pfx_worker_thread+0x10/0x10
> > [   65.322626]  kthread+0xe0/0x110
> > [   65.324103]  ? __pfx_kthread+0x10/0x10
> > [   65.325853]  ret_from_fork+0x28/0x40
> > [   65.327512]  ? __pfx_kthread+0x10/0x10
> > [   65.329255]  ret_from_fork_asm+0x1a/0x30
> > [   65.331073]  </TASK>
> > [   65.332119] Kernel panic - not syncing: softlockup: hung tasks
> 
> As I've been digging on this some more, I'm still a bit baffled.  The
> kworker seems to get stuck once the irq lands and we don't seem to go
> back to the workqueue work that was in progress. That cpu ends up not
> doing anything but occasionally handling ticks until we crash from the
> softlockup watchdog.
> 
> Interestingly, if I move the psi_account_irqtime() call to be under
> the rq_lock in sched_tick(), as I suggested earlier, I have yet to
> reproduce the problem.
> But I'm still trying to piece together exactly why.

A bit of a guess, but should irq_time_read() be done with interrupts disabled?

Re: [PATCH] RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock

Posted by Qais Yousef 1 year, 8 months ago

On 06/14/24 11:48, Peter Zijlstra wrote:
> On Thu, Jun 13, 2024 at 12:51:42PM +0100, Qais Yousef wrote:
> > On 06/13/24 12:04, Peter Zijlstra wrote:
> > 
> > > ---
> > > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > > index 0935f9d4bb7b..d4b87539d72a 100644
> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -724,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
> > >  
> > >  	rq->prev_irq_time += irq_delta;
> > >  	delta -= irq_delta;
> > > -	psi_account_irqtime(rq->curr, irq_delta);
> > >  	delayacct_irq(rq->curr, irq_delta);
> > >  #endif
> > >  #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> > > @@ -5459,6 +5458,8 @@ void sched_tick(void)
> > >  
> > >  	sched_clock_tick();
> > >  
> > > +	psi_account_irqtime(curr, &rq->psi_irq_time);
> > > +
> > 
> > If wakeup preemption causes a context switch, wouldn't we lose this
> > information then? I *think* active migration might cause this information to be
> > lost too.
> 
> I'm not sure what would be lost ?! the accounting is per cpu, not per
> task afaict. That said,...

Yes sorry I mixed this with per task accounting..

> 
> > pick_next_task() might be a better place to do the accounting?
> 
> Additionally, when there has been an effective cgroup switch. Only on
> switch doesn't work for long running tasks, then the PSI information
> will be artitrarily long out of date.
> 
> Which then gets me something like the (completely untested) below..
> 
> Hmm?
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 0935f9d4bb7b..36aed99d6a6c 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -724,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
>  
>  	rq->prev_irq_time += irq_delta;
>  	delta -= irq_delta;
> -	psi_account_irqtime(rq->curr, irq_delta);
>  	delayacct_irq(rq->curr, irq_delta);
>  #endif
>  #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> @@ -5459,6 +5458,8 @@ void sched_tick(void)
>  
>  	sched_clock_tick();
>  
> +	psi_account_irqtime(curr, NULL, &rq->psi_irq_time);
> +
>  	rq_lock(rq, &rf);
>  
>  	update_rq_clock(rq);
> @@ -6521,6 +6524,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
>  		++*switch_count;
>  
>  		migrate_disable_switch(rq, prev);
> +		psi_account_irqtime(prev, next, &rq->psi_irq_time);

Hmm are prev and next swapped here? next == curr in my view if there's no
subtly I missed

>  		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
>  
>  		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 146baa91d104..65bba162408f 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -991,22 +991,31 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
>  }
>  
>  #ifdef CONFIG_IRQ_TIME_ACCOUNTING
> -void psi_account_irqtime(struct task_struct *task, u32 delta)
> +void psi_account_irqtime(struct task_struct *curr, struct task_struct *prev, u64 *time)
>  {
> -	int cpu = task_cpu(task);
> +	int cpu = task_cpu(curr);
>  	struct psi_group *group;
>  	struct psi_group_cpu *groupc;
> -	u64 now;
> +	u64 now, irq;
> +	s64 delta;
>  
>  	if (static_branch_likely(&psi_disabled))
>  		return;
>  
> -	if (!task->pid)
> +	if (!curr->pid)
> +		return;
> +
> +	group = task_psi_group(curr);
> +	if( prev && task_psi_group(prev) == group)

nit: whitespace misplaced

LGTM otherwise.

Reviewed-by: Qais Yousef <qyousef@layalina.io>

>  		return;
>  
>  	now = cpu_clock(cpu);
> +	irq = irq_time_read(cpu);
> +	delta = (s64)(irq - *time);
> +	if (delta < 0)
> +		return;
> +	*time = irq;
>  
> -	group = task_psi_group(task);
>  	do {
>  		if (!group->enabled)
>  			continue;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 62fd8bc6fd08..a63eb546bc4a 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1133,6 +1133,7 @@ struct rq {
>  
>  #ifdef CONFIG_IRQ_TIME_ACCOUNTING
>  	u64			prev_irq_time;
> +	u64			psi_irq_time;
>  #endif
>  #ifdef CONFIG_PARAVIRT
>  	u64			prev_steal_time;
> diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
> index d1445410840a..1e290054c5db 100644
> --- a/kernel/sched/stats.h
> +++ b/kernel/sched/stats.h
> @@ -110,7 +110,7 @@ __schedstats_from_se(struct sched_entity *se)
>  void psi_task_change(struct task_struct *task, int clear, int set);
>  void psi_task_switch(struct task_struct *prev, struct task_struct *next,
>  		     bool sleep);
> -void psi_account_irqtime(struct task_struct *task, u32 delta);
> +void psi_account_irqtime(struct task_struct *curr, struct task_struct *prev, u64 *time);
>  
>  /*
>   * PSI tracks state that persists across sleeps, such as iowaits and
> @@ -192,7 +192,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
>  static inline void psi_sched_switch(struct task_struct *prev,
>  				    struct task_struct *next,
>  				    bool sleep) {}
> -static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
> +static inline void psi_account_irqtime(struct task_struct *curr, struct task_struct *prev, u64 *time) {}
>  #endif /* CONFIG_PSI */
>  
>  #ifdef CONFIG_SCHED_INFO
> 
> 
>

Re: [PATCH] RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock

Posted by Peter Zijlstra 1 year, 7 months ago

On Sun, Jun 16, 2024 at 11:36:16PM +0100, Qais Yousef wrote:

> > Which then gets me something like the (completely untested) below..
> > 
> > Hmm?
> > 
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 0935f9d4bb7b..36aed99d6a6c 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -724,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
> >  
> >  	rq->prev_irq_time += irq_delta;
> >  	delta -= irq_delta;
> > -	psi_account_irqtime(rq->curr, irq_delta);
> >  	delayacct_irq(rq->curr, irq_delta);
> >  #endif
> >  #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> > @@ -5459,6 +5458,8 @@ void sched_tick(void)
> >  
> >  	sched_clock_tick();
> >  
> > +	psi_account_irqtime(curr, NULL, &rq->psi_irq_time);
> > +
> >  	rq_lock(rq, &rf);
> >  
> >  	update_rq_clock(rq);
> > @@ -6521,6 +6524,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
> >  		++*switch_count;
> >  
> >  		migrate_disable_switch(rq, prev);
> > +		psi_account_irqtime(prev, next, &rq->psi_irq_time);
> 
> Hmm are prev and next swapped here? next == curr in my view if there's no
> subtly I missed

This is before context_switch() so prev == current at this point.
However, more importantly, the PSI thing accounts to its 'curr' group
and that should very much be the outgoing task's group in this case.

That is, we need to make sure the outgoing group is up-to-date before
switching to a new group.

Makes sense?

> >  		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
> >  
> >  		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
> > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > index 146baa91d104..65bba162408f 100644
> > --- a/kernel/sched/psi.c
> > +++ b/kernel/sched/psi.c
> > @@ -991,22 +991,31 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
> >  }
> >  
> >  #ifdef CONFIG_IRQ_TIME_ACCOUNTING
> > -void psi_account_irqtime(struct task_struct *task, u32 delta)
> > +void psi_account_irqtime(struct task_struct *curr, struct task_struct *prev, u64 *time)
> >  {
> > -	int cpu = task_cpu(task);
> > +	int cpu = task_cpu(curr);
> >  	struct psi_group *group;
> >  	struct psi_group_cpu *groupc;
> > -	u64 now;
> > +	u64 now, irq;
> > +	s64 delta;
> >  
> >  	if (static_branch_likely(&psi_disabled))
> >  		return;
> >  
> > -	if (!task->pid)
> > +	if (!curr->pid)
> > +		return;
> > +
> > +	group = task_psi_group(curr);
> > +	if( prev && task_psi_group(prev) == group)
> 
> nit: whitespace misplaced

Ha!, is that's all and it all works in one go it's awesome :-)

I'm still trying to learn to type again after switching keyboards. I've
used a thinkpad keyboard (either on an actual laptop or the travel
version on my desktop for nearly 20 years... Now I've picked up a split
keyboard out of necessity (UHK 60 v2 for those interested) and muscle
memory is still cursing me every single day.

As a result, I now also cannot type on my laptop anymore, so lose-lose I
suppose ... urgh.

> LGTM otherwise.
> 
> Reviewed-by: Qais Yousef <qyousef@layalina.io>

Thanks!

John, can you write up a changelog with some pretty numbers and all
that? Also, when you re-post, can you make sure to Cc the PSI folks
(johannes and suren iirc, get_maintainers.pl seems to find them).

Re: [PATCH] RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock

Posted by Qais Yousef 1 year, 7 months ago

On 06/18/24 09:04, Peter Zijlstra wrote:
> On Sun, Jun 16, 2024 at 11:36:16PM +0100, Qais Yousef wrote:
> 
> > > Which then gets me something like the (completely untested) below..
> > > 
> > > Hmm?
> > > 
> > > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > > index 0935f9d4bb7b..36aed99d6a6c 100644
> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -724,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
> > >  
> > >  	rq->prev_irq_time += irq_delta;
> > >  	delta -= irq_delta;
> > > -	psi_account_irqtime(rq->curr, irq_delta);
> > >  	delayacct_irq(rq->curr, irq_delta);
> > >  #endif
> > >  #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> > > @@ -5459,6 +5458,8 @@ void sched_tick(void)
> > >  
> > >  	sched_clock_tick();
> > >  
> > > +	psi_account_irqtime(curr, NULL, &rq->psi_irq_time);
> > > +
> > >  	rq_lock(rq, &rf);
> > >  
> > >  	update_rq_clock(rq);
> > > @@ -6521,6 +6524,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
> > >  		++*switch_count;
> > >  
> > >  		migrate_disable_switch(rq, prev);
> > > +		psi_account_irqtime(prev, next, &rq->psi_irq_time);
> > 
> > Hmm are prev and next swapped here? next == curr in my view if there's no
> > subtly I missed
> 
> This is before context_switch() so prev == current at this point.
> However, more importantly, the PSI thing accounts to its 'curr' group
> and that should very much be the outgoing task's group in this case.
> 
> That is, we need to make sure the outgoing group is up-to-date before
> switching to a new group.
> 
> Makes sense?

Yes, thanks! After reviewing the patch though I wondered if we care to handle
when the group changes while we're RUNNING. There will be no __schedule() then.
But it's a matter of how accurate this is supposed to be. In sched_move_task()
we dequeue with DEQUEUE_NOCLOCK which IIUC would have not caused
psi_account_irqtime() to be called on the move already. So we're doing the
same.

We can still maybe move this to sched_change_group() (keeping the psi group
change check you added)? This still LGTM anyway, but thought I'll spill it out.

> 
> > >  		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
> > >  
> > >  		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
> > > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > > index 146baa91d104..65bba162408f 100644
> > > --- a/kernel/sched/psi.c
> > > +++ b/kernel/sched/psi.c
> > > @@ -991,22 +991,31 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
> > >  }
> > >  
> > >  #ifdef CONFIG_IRQ_TIME_ACCOUNTING
> > > -void psi_account_irqtime(struct task_struct *task, u32 delta)
> > > +void psi_account_irqtime(struct task_struct *curr, struct task_struct *prev, u64 *time)
> > >  {
> > > -	int cpu = task_cpu(task);
> > > +	int cpu = task_cpu(curr);
> > >  	struct psi_group *group;
> > >  	struct psi_group_cpu *groupc;
> > > -	u64 now;
> > > +	u64 now, irq;
> > > +	s64 delta;
> > >  
> > >  	if (static_branch_likely(&psi_disabled))
> > >  		return;
> > >  
> > > -	if (!task->pid)
> > > +	if (!curr->pid)
> > > +		return;
> > > +
> > > +	group = task_psi_group(curr);
> > > +	if( prev && task_psi_group(prev) == group)
> > 
> > nit: whitespace misplaced
> 
> Ha!, is that's all and it all works in one go it's awesome :-)
> 
> I'm still trying to learn to type again after switching keyboards. I've
> used a thinkpad keyboard (either on an actual laptop or the travel
> version on my desktop for nearly 20 years... Now I've picked up a split
> keyboard out of necessity (UHK 60 v2 for those interested) and muscle
> memory is still cursing me every single day.

I failed to use those split keyboards..

> 
> As a result, I now also cannot type on my laptop anymore, so lose-lose I
> suppose ... urgh.

You mean you're not using AI yet?! :-O

> 
> > LGTM otherwise.
> > 
> > Reviewed-by: Qais Yousef <qyousef@layalina.io>
> 
> Thanks!
> 
> John, can you write up a changelog with some pretty numbers and all
> that? Also, when you re-post, can you make sure to Cc the PSI folks
> (johannes and suren iirc, get_maintainers.pl seems to find them).

Re: [PATCH] RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock

Posted by John Stultz 1 year, 8 months ago

On Fri, Jun 14, 2024 at 2:48 AM Peter Zijlstra <peterz@infradead.org> wrote:
> On Thu, Jun 13, 2024 at 12:51:42PM +0100, Qais Yousef wrote:
> > On 06/13/24 12:04, Peter Zijlstra wrote:
> > > @@ -5459,6 +5458,8 @@ void sched_tick(void)
> > >
> > >     sched_clock_tick();
> > >
> > > +   psi_account_irqtime(curr, &rq->psi_irq_time);
> > > +
> >
> > If wakeup preemption causes a context switch, wouldn't we lose this
> > information then? I *think* active migration might cause this information to be
> > lost too.
>
> I'm not sure what would be lost ?! the accounting is per cpu, not per
> task afaict. That said,...
>
> > pick_next_task() might be a better place to do the accounting?
>
> Additionally, when there has been an effective cgroup switch. Only on
> switch doesn't work for long running tasks, then the PSI information
> will be artitrarily long out of date.
>
> Which then gets me something like the (completely untested) below..
>
> Hmm?

I applied and booted with this. It still takes the accounting out of
the hotpath for the CLOCK_THREAD_CPUTIME_ID the microbenchmark
performance is back to 5.10 numbers.

I don't have any correctness tests for irqtime measurements, so I'll
have to try to work something up for that next week.

thanks
-john

Re: [PATCH] RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock

Posted by John Stultz 1 year, 8 months ago

On Wed, Jun 12, 2024 at 6:58 PM John Stultz <jstultz@google.com> wrote:
>
> I recently got a bug report that
> clock_gettime(CLOCK_THREAD_CPUTIME_ID,...) had regressed between
> 5.10 and 6.1. Its not a huge regression in absolute time
> (~30-40ns), but is >10% change.
>
> I narrowed the cause down to the addition of
> psi_account_irqtime() in update_rq_clock_task(), in commit
> 52b1364ba0b1 ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ
> pressure")
>
> So that explains the behavior change, but it also seems odd that
> we're doing psi irq accounting from a syscall that is just
> trying to read the thread's cputime.
>
> Thinking about it more, it seems the re-use of update_rq_clock()
> to handle accounting for any in-progress time for the current
> task has the potential for side effects and unnecessary work.
>
> So instead rework the logic so we calculate the current cpu
> runtime in a read-only fashion.
>
> This has the side benefit of improving
> clock_gettime(CLOCK_THREAD_CPUTIME_ID,...) performance by ~12%
> over the behavior in 5.10, and ~21% over the 6.1 behavior.
>
> NOTE: I'm not 100% sure this is correct yet. There may be some
> edge cases I've overlooked, so I'd greatly appreciate any
> review or feedback.

Actually, I've definitely overlooked something as I'm seeing
inconsistencies in CLOCK_THREAD_CPUTIME_ID with testing.
I'll have to stare at this some more and resend once I've sorted it out.

Even so, I'd appreciate thoughts on the general approach of avoiding
update_rq_clock() in the clock_gettime() path.

thanks
-john