[PATCH 03/15] sched/cputime: Correctly support generic vtime idle time

Posted by Frederic Weisbecker 3 weeks, 1 day ago
Currently, whether or not generic vtime is running, the idle cputime is
fetched from the nohz accounting.

However, generic vtime already does its own idle cputime accounting; only
the kernel stat accessors are not wired up to report it.

Read the generic vtime idle cputime when it is running. This will later
allow a cleaner split between nohz and vtime cputime accounting.
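
For illustration only (this merely mirrors the fs/proc/stat.c hunk below,
assuming the kcpustat_field(CPUTIME_IDLE, cpu) accessor form used by this
series, and read_idle_ns() is just a made-up name), a reader is then
expected to fall back on kcpustat whenever the nohz helper opts out:

	/* Hypothetical helper, only to show the intended reader flow */
	static u64 read_idle_ns(int cpu)
	{
		u64 idle_usecs = -1ULL;

		if (cpu_online(cpu))
			idle_usecs = get_cpu_idle_time_us(cpu, NULL);

		/* -1ULL: !NO_HZ, cpu offline, or generic vtime on this cpu */
		if (idle_usecs == -1ULL)
			return kcpustat_field(CPUTIME_IDLE, cpu);

		return idle_usecs * NSEC_PER_USEC;
	}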

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 fs/proc/stat.c           |  8 ++++----
 include/linux/vtime.h    |  7 ++++++-
 kernel/sched/cputime.c   | 38 +++++++++++++++++++++++++++++++-------
 kernel/time/tick-sched.c |  2 +-
 4 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 8b444e862319..6ac2a13b8be5 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -30,8 +30,8 @@ u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
 		idle_usecs = get_cpu_idle_time_us(cpu, NULL);
 
 	if (idle_usecs == -1ULL)
-		/* !NO_HZ or cpu offline so we can rely on cpustat.idle */
-		idle = kcs->cpustat[CPUTIME_IDLE];
+		/* !NO_HZ or cpu offline or vtime so we can rely on cpustat.idle */
+		idle = kcpustat_field(CPUTIME_IDLE, cpu);
 	else
 		idle = idle_usecs * NSEC_PER_USEC;
 
@@ -46,8 +46,8 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
 		iowait_usecs = get_cpu_iowait_time_us(cpu, NULL);
 
 	if (iowait_usecs == -1ULL)
-		/* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
-		iowait = kcs->cpustat[CPUTIME_IOWAIT];
+		/* !NO_HZ or cpu offline or vtime so we can rely on cpustat.iowait */
+		iowait = kcpustat_field(CPUTIME_IOWAIT, cpu);
 	else
 		iowait = iowait_usecs * NSEC_PER_USEC;
 
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index 29dd5b91dd7d..737930f66c3e 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -27,6 +27,11 @@ static inline void vtime_guest_exit(struct task_struct *tsk) { }
 static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { }
 #endif
 
+static inline bool vtime_generic_enabled_cpu(int cpu)
+{
+	return context_tracking_enabled_cpu(cpu);
+}
+
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset);
 extern void vtime_account_softirq(struct task_struct *tsk);
@@ -74,7 +79,7 @@ static inline bool vtime_accounting_enabled(void)
 
 static inline bool vtime_accounting_enabled_cpu(int cpu)
 {
-	return context_tracking_enabled_cpu(cpu);
+	return vtime_generic_enabled_cpu(cpu);
 }
 
 static inline bool vtime_accounting_enabled_this_cpu(void)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5dcb0f2e01bc..f32c169da11a 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -761,7 +761,11 @@ EXPORT_SYMBOL_GPL(vtime_guest_exit);
 
 void vtime_account_idle(struct task_struct *tsk)
 {
-	account_idle_time(get_vtime_delta(&tsk->vtime));
+	struct vtime *vtime = &tsk->vtime;
+
+	write_seqcount_begin(&vtime->seqcount);
+	account_idle_time(get_vtime_delta(vtime));
+	write_seqcount_end(&vtime->seqcount);
 }
 
 void vtime_task_switch_generic(struct task_struct *prev)
@@ -912,6 +916,7 @@ static int kcpustat_field_vtime(u64 *cpustat,
 				int cpu, u64 *val)
 {
 	struct vtime *vtime = &tsk->vtime;
+	struct rq *rq = cpu_rq(cpu);
 	unsigned int seq;
 
 	do {
@@ -953,6 +958,14 @@ static int kcpustat_field_vtime(u64 *cpustat,
 			if (state == VTIME_GUEST && task_nice(tsk) > 0)
 				*val += vtime->gtime + vtime_delta(vtime);
 			break;
+		case CPUTIME_IDLE:
+			if (state == VTIME_IDLE && !atomic_read(&rq->nr_iowait))
+				*val += vtime_delta(vtime);
+			break;
+		case CPUTIME_IOWAIT:
+			if (state == VTIME_IDLE && atomic_read(&rq->nr_iowait) > 0)
+				*val += vtime_delta(vtime);
+			break;
 		default:
 			break;
 		}
@@ -1015,8 +1028,8 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
 		*dst = *src;
 		cpustat = dst->cpustat;
 
-		/* Task is sleeping, dead or idle, nothing to add */
-		if (state < VTIME_SYS)
+		/* Task is sleeping or dead, nothing to add */
+		if (state < VTIME_IDLE)
 			continue;
 
 		delta = vtime_delta(vtime);
@@ -1025,15 +1038,17 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
 		 * Task runs either in user (including guest) or kernel space,
 		 * add pending nohz time to the right place.
 		 */
-		if (state == VTIME_SYS) {
+		switch (vtime->state) {
+		case VTIME_SYS:
 			cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
-		} else if (state == VTIME_USER) {
+			break;
+		case VTIME_USER:
 			if (task_nice(tsk) > 0)
 				cpustat[CPUTIME_NICE] += vtime->utime + delta;
 			else
 				cpustat[CPUTIME_USER] += vtime->utime + delta;
-		} else {
-			WARN_ON_ONCE(state != VTIME_GUEST);
+			break;
+		case VTIME_GUEST:
 			if (task_nice(tsk) > 0) {
 				cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
 				cpustat[CPUTIME_NICE] += vtime->gtime + delta;
@@ -1041,6 +1056,15 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
 				cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
 				cpustat[CPUTIME_USER] += vtime->gtime + delta;
 			}
+			break;
+		case VTIME_IDLE:
+			if (atomic_read(&cpu_rq(cpu)->nr_iowait) > 0)
+				cpustat[CPUTIME_IOWAIT] += delta;
+			else
+				cpustat[CPUTIME_IDLE] += delta;
+			break;
+		default:
+			WARN_ON_ONCE(1);
 		}
 	} while (read_seqcount_retry(&vtime->seqcount, seq));
 
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 8ddf74e705d3..f1d07a0276a5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -780,7 +780,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
 	ktime_t now, idle;
 	unsigned int seq;
 
-	if (!tick_nohz_active)
+	if (!tick_nohz_active || vtime_generic_enabled_cpu(cpu))
 		return -1;
 
 	now = ktime_get();
-- 
2.51.1
Re: [PATCH 03/15] sched/cputime: Correctly support generic vtime idle time
Posted by Peter Zijlstra 2 weeks, 5 days ago
On Fri, Jan 16, 2026 at 03:51:56PM +0100, Frederic Weisbecker wrote:

> diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
> index 8ddf74e705d3..f1d07a0276a5 100644
> --- a/kernel/time/tick-sched.c
> +++ b/kernel/time/tick-sched.c
> @@ -780,7 +780,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
>  	ktime_t now, idle;
>  	unsigned int seq;
>  
> -	if (!tick_nohz_active)
> +	if (!tick_nohz_active || vtime_generic_enabled_cpu(cpu))
>  		return -1;
>  
>  	now = ktime_get();

Is this not broken? IIUC this means that you can no longer use
get_cpu_{idle,iowait}_time_us() the moment you have context tracking
enabled.
Re: [PATCH 03/15] sched/cputime: Correctly support generic vtime idle time
Posted by Frederic Weisbecker 2 weeks, 5 days ago
On Mon, Jan 19, 2026 at 02:02:22PM +0100, Peter Zijlstra wrote:
> On Fri, Jan 16, 2026 at 03:51:56PM +0100, Frederic Weisbecker wrote:
> 
> > diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
> > index 8ddf74e705d3..f1d07a0276a5 100644
> > --- a/kernel/time/tick-sched.c
> > +++ b/kernel/time/tick-sched.c
> > @@ -780,7 +780,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
> >  	ktime_t now, idle;
> >  	unsigned int seq;
> >  
> > -	if (!tick_nohz_active)
> > +	if (!tick_nohz_active || vtime_generic_enabled_cpu(cpu))
> >  		return -1;
> >  
> >  	now = ktime_get();
> 
> Is this not broken? IIUC this means that you can no longer use
> get_cpu_{idle,iowait}_time_us() the moment you have context tracking
> enabled.

It is supported again in patch 13/15. And it's not exactly breaking
bisection in the meantime because the sole user is cpufreq and cpufreq
shouldn't be relevant with nohz_full.

Ok, a few subsystems rely on the resulting cpufreq API get_cpu_idle_time():

- the legacy drivers/macintosh/rack-meter.c
- drivers/scsi/lpfc/lpfc_init.c

But in the worst case cpufreq falls back to a low-resolution version for
nohz_full (again, until 13/15).
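
For reference, that low-resolution fallback is, as far as I recall, the
jiffies based path in cpufreq's get_cpu_idle_time(), which looks roughly
like this (quoting from memory, details may differ):

	u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy)
	{
		u64 idle_time = get_cpu_idle_time_us(cpu, io_busy ? wall : NULL);

		/* nohz opted out (or generic vtime, after this patch): tick based fallback */
		if (idle_time == -1ULL)
			return get_cpu_idle_time_jiffy(cpu, wall);
		else if (!io_busy)
			idle_time += get_cpu_iowait_time_us(cpu, wall);

		return idle_time;
	}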

Hmm, but you're right, this is confusing. I think I should be able to fix
that in this patch.

Thanks.

-- 
Frederic Weisbecker
SUSE Labs