Currently the tick subsystem stores the idle cputime accounting in
private fields, allowing cohabitation with architecture idle vtime
accounting. The former is fetched on online CPUs, the latter on offline
CPUs.
For consolidation purposes, architecture vtime accounting will continue
to account the cputime but will pause while the idle tick is
stopped. The dyntick cputime accounting will then be relayed by the tick
subsystem so that the idle cputime is still seen advancing coherently
even when the tick isn't there to flush the idle vtime.
Prepare for that and introduce three new APIs which will be used in
subsequent patches:
- vtime_dyntick_start() is meant to be called when idle enters
dyntick mode. The idle cputime that elapsed so far is accumulated
and accounted. Also, vtime idle time accounting is suspended from then on.
- vtime_dyntick_stop() is meant to be called when idle exits from
dyntick mode. The vtime entry clocks are fast-forwarded to the current time
so that idle accounting restarts elapsing from now. Also, idle time
accounting is resumed.
- vtime_reset() is meant to be called from dynticks idle IRQ entry to
fast-forward the clocks to the current time so that the IRQ time is still
accounted by vtime while nohz cputime is paused.
Also, accumulated vtime won't be flushed from dyntick-idle ticks, to avoid
accounting the idle cputime twice, alongside the nohz accounting.
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
arch/s390/include/asm/idle.h | 11 +++---
arch/s390/kernel/idle.c | 13 ++++++--
arch/s390/kernel/vtime.c | 65 ++++++++++++++++++++++++++++++------
3 files changed, 71 insertions(+), 18 deletions(-)
diff --git a/arch/s390/include/asm/idle.h b/arch/s390/include/asm/idle.h
index 09f763b9eb40..2770c4f761e1 100644
--- a/arch/s390/include/asm/idle.h
+++ b/arch/s390/include/asm/idle.h
@@ -12,11 +12,12 @@
#include <linux/device.h>
struct s390_idle_data {
- unsigned long idle_count;
- unsigned long idle_time;
- unsigned long clock_idle_enter;
- unsigned long timer_idle_enter;
- unsigned long mt_cycles_enter[8];
+ bool idle_dyntick;
+ unsigned long idle_count;
+ unsigned long idle_time;
+ unsigned long clock_idle_enter;
+ unsigned long timer_idle_enter;
+ unsigned long mt_cycles_enter[8];
};
extern struct device_attribute dev_attr_idle_count;
diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c
index 39cb8d0ae348..54bb932184dd 100644
--- a/arch/s390/kernel/idle.c
+++ b/arch/s390/kernel/idle.c
@@ -35,6 +35,12 @@ void account_idle_time_irq(void)
this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]);
}
+ WRITE_ONCE(idle->idle_count, READ_ONCE(idle->idle_count) + 1);
+
+ /* Dyntick idle time accounted by nohz/scheduler */
+ if (idle->idle_dyntick)
+ return;
+
idle_time = lc->int_clock - idle->clock_idle_enter;
lc->steal_timer += idle->clock_idle_enter - lc->last_update_clock;
@@ -45,7 +51,6 @@ void account_idle_time_irq(void)
/* Account time spent with enabled wait psw loaded as idle time. */
WRITE_ONCE(idle->idle_time, READ_ONCE(idle->idle_time) + idle_time);
- WRITE_ONCE(idle->idle_count, READ_ONCE(idle->idle_count) + 1);
account_idle_time(cputime_to_nsecs(idle_time));
}
@@ -61,8 +66,10 @@ void noinstr arch_cpu_idle(void)
set_cpu_flag(CIF_ENABLED_WAIT);
if (smp_cpu_mtid)
stcctm(MT_DIAG, smp_cpu_mtid, (u64 *)&idle->mt_cycles_enter);
- idle->clock_idle_enter = get_tod_clock_fast();
- idle->timer_idle_enter = get_cpu_timer();
+ if (!idle->idle_dyntick) {
+ idle->clock_idle_enter = get_tod_clock_fast();
+ idle->timer_idle_enter = get_cpu_timer();
+ }
bpon();
__load_psw_mask(psw_mask);
}
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 234a0ba30510..c19528eb4ee3 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -17,6 +17,7 @@
#include <asm/vtimer.h>
#include <asm/vtime.h>
#include <asm/cpu_mf.h>
+#include <asm/idle.h>
#include <asm/smp.h>
#include "entry.h"
@@ -111,23 +112,30 @@ static void account_system_index_scaled(struct task_struct *p, u64 cputime,
account_system_index_time(p, cputime_to_nsecs(cputime), index);
}
-/*
- * Update process times based on virtual cpu times stored by entry.S
- * to the lowcore fields user_timer, system_timer & steal_clock.
- */
-static int do_account_vtime(struct task_struct *tsk)
+static inline void vtime_reset_last_update(struct lowcore *lc)
{
- u64 timer, clock, user, guest, system, hardirq, softirq;
- struct lowcore *lc = get_lowcore();
-
- timer = lc->last_update_timer;
- clock = lc->last_update_clock;
asm volatile(
" stpt %0\n" /* Store current cpu timer value */
" stckf %1" /* Store current tod clock value */
: "=Q" (lc->last_update_timer),
"=Q" (lc->last_update_clock)
: : "cc");
+}
+
+/*
+ * Update process times based on virtual cpu times stored by entry.S
+ * to the lowcore fields user_timer, system_timer & steal_clock.
+ */
+static int do_account_vtime(struct task_struct *tsk)
+{
+ u64 timer, clock, user, guest, system, hardirq, softirq;
+ struct lowcore *lc = get_lowcore();
+
+ timer = lc->last_update_timer;
+ clock = lc->last_update_clock;
+
+ vtime_reset_last_update(lc);
+
clock = lc->last_update_clock - clock;
timer -= lc->last_update_timer;
@@ -261,6 +269,43 @@ void vtime_account_hardirq(struct task_struct *tsk)
virt_timer_forward(delta);
}
+#ifdef CONFIG_NO_HZ_COMMON
+/**
+ * vtime_reset - Fast forward vtime entry clocks
+ *
+ * Called from dynticks idle IRQ entry to fast-forward the clocks to current time
+ * so that the IRQ time is still accounted by vtime while nohz cputime is paused.
+ */
+void vtime_reset(void)
+{
+ vtime_reset_last_update(get_lowcore());
+}
+
+/**
+ * vtime_dyntick_start - Inform vtime about entry to idle-dynticks
+ *
+ * Called when idle enters in dyntick mode. The idle cputime that elapsed so far
+ * is flushed and the tick subsystem takes over the idle cputime accounting.
+ */
+void vtime_dyntick_start(void)
+{
+ __this_cpu_write(s390_idle.idle_dyntick, true);
+ vtime_flush(current);
+}
+
+/**
+ * vtime_dyntick_stop - Inform vtime about exit from idle-dynticks
+ *
+ * Called when idle exits from dyntick mode. The vtime entry clocks are
+ * fast-forward to current time and idle accounting resumes.
+ */
+void vtime_dyntick_stop(void)
+{
+ vtime_reset_last_update(get_lowcore());
+ __this_cpu_write(s390_idle.idle_dyntick, false);
+}
+#endif /* CONFIG_NO_HZ_COMMON */
+
/*
* Sorted add to a list. List is linear searched until first bigger
* element is found.
--
2.51.1
On Fri, Jan 16, 2026 at 03:51:58PM +0100, Frederic Weisbecker wrote:
> diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c
> index 39cb8d0ae348..54bb932184dd 100644
> --- a/arch/s390/kernel/idle.c
> +++ b/arch/s390/kernel/idle.c
> @@ -35,6 +35,12 @@ void account_idle_time_irq(void)
> this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]);
> }
>
> + WRITE_ONCE(idle->idle_count, READ_ONCE(idle->idle_count) + 1);
> +
> + /* Dyntick idle time accounted by nohz/scheduler */
> + if (idle->idle_dyntick)
> + return;
> +
> idle_time = lc->int_clock - idle->clock_idle_enter;
>
> lc->steal_timer += idle->clock_idle_enter - lc->last_update_clock;
> @@ -45,7 +51,6 @@ void account_idle_time_irq(void)
>
> /* Account time spent with enabled wait psw loaded as idle time. */
> WRITE_ONCE(idle->idle_time, READ_ONCE(idle->idle_time) + idle_time);
> - WRITE_ONCE(idle->idle_count, READ_ONCE(idle->idle_count) + 1);
> account_idle_time(cputime_to_nsecs(idle_time));
> }
This breaks idle time reporting (aka enabled wait psw time) via the per-cpu
sysfs files (see show_idle_time()). That is: the second WRITE_ONCE() should
also go above the early return statement; but of course this leads to other
dependencies...
Not sure what to do with this. I thought about removing those sysfs files
already in the past, since they are of very limited use; and most likely
nothing in user space would miss them.
Anyway, you need to integrate the trivial patch below, so everything compiles
for s390. It also _seems_ to work.
Guess I need to spend some more time on accounting and see what it would take
to convert to VIRT_CPU_ACCOUNTING_GEN, while keeping the current precision and
functionality.
diff --git a/arch/s390/include/asm/idle.h b/arch/s390/include/asm/idle.h
index 2770c4f761e1..285b3da318d6 100644
--- a/arch/s390/include/asm/idle.h
+++ b/arch/s390/include/asm/idle.h
@@ -8,6 +8,7 @@
#ifndef _S390_IDLE_H
#define _S390_IDLE_H
+#include <linux/percpu-defs.h>
#include <linux/types.h>
#include <linux/device.h>
@@ -20,6 +21,8 @@ struct s390_idle_data {
unsigned long mt_cycles_enter[8];
};
+DECLARE_PER_CPU(struct s390_idle_data, s390_idle);
+
extern struct device_attribute dev_attr_idle_count;
extern struct device_attribute dev_attr_idle_time_us;
diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c
index 54bb932184dd..e3fe64e7adbe 100644
--- a/arch/s390/kernel/idle.c
+++ b/arch/s390/kernel/idle.c
@@ -19,7 +19,7 @@
#include <asm/smp.h>
#include "entry.h"
-static DEFINE_PER_CPU(struct s390_idle_data, s390_idle);
+DEFINE_PER_CPU(struct s390_idle_data, s390_idle);
void account_idle_time_irq(void)
{
Le Wed, Jan 21, 2026 at 01:17:48PM +0100, Heiko Carstens a écrit :
> On Fri, Jan 16, 2026 at 03:51:58PM +0100, Frederic Weisbecker wrote:
> > diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c
> > index 39cb8d0ae348..54bb932184dd 100644
> > --- a/arch/s390/kernel/idle.c
> > +++ b/arch/s390/kernel/idle.c
> > @@ -35,6 +35,12 @@ void account_idle_time_irq(void)
> > this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]);
> > }
> >
> > + WRITE_ONCE(idle->idle_count, READ_ONCE(idle->idle_count) + 1);
> > +
> > + /* Dyntick idle time accounted by nohz/scheduler */
> > + if (idle->idle_dyntick)
> > + return;
> > +
> > idle_time = lc->int_clock - idle->clock_idle_enter;
> >
> > lc->steal_timer += idle->clock_idle_enter - lc->last_update_clock;
> > @@ -45,7 +51,6 @@ void account_idle_time_irq(void)
> >
> > /* Account time spent with enabled wait psw loaded as idle time. */
> > WRITE_ONCE(idle->idle_time, READ_ONCE(idle->idle_time) + idle_time);
> > - WRITE_ONCE(idle->idle_count, READ_ONCE(idle->idle_count) + 1);
> > account_idle_time(cputime_to_nsecs(idle_time));
> > }
>
> This breaks idle time reporting (aka enabled wait psw time) via the per-cpu
> sysfs files (see show_idle_time()). That is: the second WRITE_ONCE() should
> also go above the early return statement; but of course this leads to other
> dependencies...
Oh right! Will fix that.
BTW here is a question for you, does the timer (as in get_cpu_timer()) still
decrement while in idle? I would assume not, given how lc->system_timer
is updated in account_idle_time_irq().
And another question in this same function is this :
lc->steal_timer += idle->clock_idle_enter - lc->last_update_clock;
clock_idle_enter is updated right before halting the CPU. But when was
last_update_clock updated last? Could be either task switch to idle, or
a previous idle tick interrupt or a previous idle IRQ entry. In any case
I'm not sure the difference is meaningful as steal time.
I must be missing something.
> Not sure what to do with this. I thought about removing those sysfs files
> already in the past, since they are of very limited use; and most likely
> nothing in user space would miss them.
Perhaps but this file is a good comparison point against /proc/stat because
s390 vtime is much closer to measuring the actual CPU halted time than what
the generic nohz accounting does (which includes more idle code execution).
>
> Anyway, you need to integrate the trivial patch below, so everything compiles
> for s390. It also _seems_ to work.
Thanks, I'll include that.
>
> Guess I need to spend some more time on accounting and see what it would take
> to convert to VIRT_CPU_ACCOUNTING_GEN, while keeping the current precision and
> functionality.
I would expect more overhead with VIRT_CPU_ACCOUNTING_GEN, though that has yet
to be measured. In any case you'll lose some idle cputime precision (but
you need to read that through s390 sysfs files) if what we want to measure
here is the actual halted time.
Perhaps we could enhance VIRT_CPU_ACCOUNTING_GEN and nohz idle cputime
accounting to match s390 precision. Though I expect some cost
accessing the clock inevitably more often on some machines.
Thanks.
--
Frederic Weisbecker
SUSE Labs
On Wed, Jan 21, 2026 at 07:04:35PM +0100, Frederic Weisbecker wrote: > BTW here is a question for you, does the timer (as in get_cpu_timer()) still > decrements while in idle? I would assume not, given how lc->system_timer > is updated in account_idle_time_irq(). It is not decremented while in idle (or when the hypervisor schedules the virtual cpu away). We use the fact that the cpu timer is not decremented when the virtual cpu is not running vs the real time-of-day clock to calculate steal time. > And another question in this same function is this : > > lc->steal_timer += idle->clock_idle_enter - lc->last_update_clock; > > clock_idle_enter is updated right before halting the CPU. But when was > last_update_clock updated last? Could be either task switch to idle, or > a previous idle tick interrupt or a previous idle IRQ entry. In any case > I'm not sure the difference is meaningful as steal time. > > I must be missing something. "It has been like that forever" :) However I do agree that this doesn't seem to make any sense. At least with the current implementation I cannot see how that makes sense, since the difference of two time stamps, which do not include any steal time are added. Maybe it broke by some of all the changes over the years, or it was always wrong, or I am missing something too. Will investigate and address it if required. Thank you for bringing this up! > > Not sure what to do with this. I thought about removing those sysfs files > > already in the past, since they are of very limited use; and most likely > > nothing in user space would miss them. > > Perhaps but this file is a good comparison point against /proc/stat because > s390 vtime is much closer to measuring the actual CPU halted time than what > the generic nohz accounting does (which includes more idle code execution). Yes, while comparing those files I also see an unexpected difference of several seconds after two days of uptime; that is before your changes. 
In theory the sum of idle and iowait in /proc/stat should be the same like the per-cpu idle_time_us sysfs file. But there is a difference, which shouldn't be there as far as I can tell. Yet another thing to look into. > > Guess I need to spend some more time on accounting and see what it would take > > to convert to VIRT_CPU_ACCOUNTING_GEN, while keeping the current precision and > > functionality. > > I would expect more overhead with VIRT_CPU_ACCOUNTING_GEN, though that has yet > to be measured. In any case you'll lose some idle cputime precision (but > you need to read that through s390 sysfs files) if what we want to measure > here is the actual halted time. > > Perhaps we could enhance VIRT_CPU_ACCOUNTING_GEN and nohz idle cputime > accounting to match s390 precision. Though I expect some cost > accessing the clock inevitably more often on some machines. Let me experiment with that, but first I want to understand the oddities pointed out above.
Le Thu, Jan 22, 2026 at 03:40:45PM +0100, Heiko Carstens a écrit : > On Wed, Jan 21, 2026 at 07:04:35PM +0100, Frederic Weisbecker wrote: > > BTW here is a question for you, does the timer (as in get_cpu_timer()) still > > decrements while in idle? I would assume not, given how lc->system_timer > > is updated in account_idle_time_irq(). > > It is not decremented while in idle (or when the hypervisor schedules > the virtual cpu away). We use the fact that the cpu timer is not > decremented when the virtual cpu is not running vs the real > time-of-day clock to calculate steal time. Ok, good then! > > > And another question in this same function is this : > > > > lc->steal_timer += idle->clock_idle_enter - lc->last_update_clock; > > > > clock_idle_enter is updated right before halting the CPU. But when was > > last_update_clock updated last? Could be either task switch to idle, or > > a previous idle tick interrupt or a previous idle IRQ entry. In any case > > I'm not sure the difference is meaningful as steal time. > > > > I must be missing something. > > "It has been like that forever" :) However I do agree that this doesn't seem > to make any sense. At least with the current implementation I cannot see how > that makes sense, since the difference of two time stamps, which do not > include any steal time are added. > > Maybe it broke by some of all the changes over the years, or it was always > wrong, or I am missing something too. > > Will investigate and address it if required. Thank you for bringing this up! Ok, I take some relief from the fact it's not only unclear to me :-) > > > > Not sure what to do with this. I thought about removing those sysfs files > > > already in the past, since they are of very limited use; and most likely > > > nothing in user space would miss them. 
> > > > Perhaps but this file is a good comparison point against /proc/stat because > > s390 vtime is much closer to measuring the actual CPU halted time than what > > the generic nohz accounting does (which includes more idle code execution). > > Yes, while comparing those files I also see an unexpected difference of > several seconds after two days of uptime; that is before your changes. > > In theory the sum of idle and iowait in /proc/stat should be the same like the > per-cpu idle_time_us sysfs file. But there is a difference, which shouldn't be > there as far as I can tell. Yet another thing to look into. Yes and that's expected both before and after my changes. * /proc/stat is the time spent between tick_nohz_idle_enter() and tick_nohz_idle_exit() (to simplify, because there are some pause during idle IRQs). * The s390 idle sysfs file depicts more closely the time spent while the CPU is really idle (and not executing idle code). Different semantics and this is why you observe different results. I guess /proc/stat has higher values (with idle + iowait) and that is expected. Thanks. -- Frederic Weisbecker SUSE Labs
© 2016 - 2026 Red Hat, Inc.