Per-task and per-cpu run_delay accounting is already tracked by
t->sched_info.run_delay and rq->rq_sched_info.run_delay respectively.
Extend this to also cover cgroups.
The "some" field of the cpu.pressure indicator can lose insight into
how severely a cgroup is stalled on a given cpu, because PSI tracks
stall time for each cpu through:
tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
which reduces nr_delayed_tasks[cpu] to a boolean value. Together with
this cgroup-level run_delay accounting, the scheduling behavior of
cgroups is better illustrated.
Only cgroup v2 is supported. Similar to the task accounting, the cgroup
accounting requires that CONFIG_SCHED_INFO is enabled.
Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
---
include/linux/cgroup-defs.h | 3 +++
include/linux/kernel_stat.h | 14 ++++++++++++++
kernel/cgroup/rstat.c | 25 +++++++++++++++++++++++++
kernel/sched/cputime.c | 12 ++++++++++++
kernel/sched/stats.h | 3 +++
5 files changed, 57 insertions(+)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1b20d2d8ef7c..287366e60414 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -328,6 +328,9 @@ struct cgroup_base_stat {
u64 forceidle_sum;
#endif
u64 ntime;
+#ifdef CONFIG_SCHED_INFO
+ u64 run_delay;
+#endif
};
/*
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index b97ce2df376f..e2ac42a166c1 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -31,6 +31,15 @@ enum cpu_usage_stat {
CPUTIME_FORCEIDLE,
#endif
NR_STATS,
+
+#ifdef CONFIG_SCHED_INFO
+ /*
+ * Instead of cputime, run_delay is tracked through
+ * sched_info by task and rq, so there is no need to
+ * extend the cpustat[] array.
+ */
+ CPUTIME_RUN_DELAY,
+#endif
};
struct kernel_cpustat {
@@ -141,4 +150,9 @@ extern void account_idle_ticks(unsigned long ticks);
extern void __account_forceidle_time(struct task_struct *tsk, u64 delta);
#endif
+#ifdef CONFIG_SCHED_INFO
+extern void account_run_delay_time(struct task_struct *tsk, u64 delta);
+extern u64 get_cpu_run_delay(int cpu);
+#endif
+
#endif /* _LINUX_KERNEL_STAT_H */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index c2784c317cdd..504da76553ee 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -445,6 +445,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
dst_bstat->ntime += src_bstat->ntime;
+#ifdef CONFIG_SCHED_INFO
+ dst_bstat->run_delay += src_bstat->run_delay;
+#endif
}
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -457,6 +460,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
dst_bstat->ntime -= src_bstat->ntime;
+#ifdef CONFIG_SCHED_INFO
+ dst_bstat->run_delay -= src_bstat->run_delay;
+#endif
}
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
@@ -551,6 +557,11 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
case CPUTIME_FORCEIDLE:
rstatc->bstat.forceidle_sum += delta_exec;
break;
+#endif
+#ifdef CONFIG_SCHED_INFO
+ case CPUTIME_RUN_DELAY:
+ rstatc->bstat.run_delay += delta_exec;
+ break;
#endif
default:
break;
@@ -596,6 +607,9 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
bstat->ntime += cpustat[CPUTIME_NICE];
+#ifdef CONFIG_SCHED_INFO
+ bstat->run_delay += get_cpu_run_delay(i);
+#endif
}
}
@@ -610,6 +624,16 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat
#endif
}
+/*
+ * Emit the run_delay held in @bstat as a "run_delay_usec" line on @seq.
+ * Prints nothing unless CONFIG_SCHED_INFO is enabled.
+ */
+static void cgroup_run_delay_show(struct seq_file *seq, struct cgroup_base_stat *bstat)
+{
+#ifdef CONFIG_SCHED_INFO
+ u64 delay_usec = bstat->run_delay;
+
+ /* do_div() divides in place: scale by NSEC_PER_USEC before printing */
+ do_div(delay_usec, NSEC_PER_USEC);
+ seq_printf(seq, "run_delay_usec %llu\n", delay_usec);
+#endif
+}
+
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
@@ -640,6 +664,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
bstat.ntime);
cgroup_force_idle_show(seq, &bstat);
+ cgroup_run_delay_show(seq, &bstat);
}
/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5d9143dd0879..e6be57cdb54e 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -243,6 +243,18 @@ void __account_forceidle_time(struct task_struct *p, u64 delta)
}
#endif
+#ifdef CONFIG_SCHED_INFO
+/*
+ * Report @delta of run delay for task @p to the cgroup accounting code,
+ * tagged as CPUTIME_RUN_DELAY.
+ */
+void account_run_delay_time(struct task_struct *p, u64 delta)
+{
+ cgroup_account_cputime_field(p, CPUTIME_RUN_DELAY, delta);
+}
+
+u64 get_cpu_run_delay(int cpu)
+{
+ return cpu_rq(cpu)->rq_sched_info.run_delay;
+}
+#endif
+
/*
* When a guest is interrupted for a longer amount of time, missed clock
* ticks are not redelivered later. Due to that, this function may on
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 19cdbe96f93d..fdfd04a89b05 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -252,7 +252,9 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
t->sched_info.max_run_delay = delta;
if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
t->sched_info.min_run_delay = delta;
+
rq_sched_info_dequeue(rq, delta);
+ account_run_delay_time(t, delta);
}
/*
@@ -279,6 +281,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
t->sched_info.min_run_delay = delta;
rq_sched_info_arrive(rq, delta);
+ account_run_delay_time(t, delta);
}
/*
--
2.37.3
Hi Abel,
kernel test robot noticed the following build errors:
[auto build test ERROR on tj-cgroup/for-next]
[also build test ERROR on tip/sched/core akpm-mm/mm-everything linus/master v6.14-rc1 next-20250207]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Abel-Wu/cgroup-rstat-Fix-forceidle-time-in-cpu-stat/20250207-121257
base: https://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git for-next
patch link: https://lore.kernel.org/r/20250207041012.89192-3-wuyun.abel%40bytedance.com
patch subject: [PATCH v3 2/2] cgroup/rstat: Add run_delay accounting for cgroups
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20250208/202502081318.c9fYNNx8-lkp@intel.com/config)
compiler: clang version 19.1.3 (https://github.com/llvm/llvm-project ab51eccf88f5321e7c60591c5546b254b6afab99)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250208/202502081318.c9fYNNx8-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202502081318.c9fYNNx8-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from kernel/sched/build_policy.c:19:
In file included from include/linux/sched/isolation.h:5:
In file included from include/linux/cpuset.h:17:
In file included from include/linux/mm.h:2224:
include/linux/vmstat.h:504:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
504 | return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
| ~~~~~~~~~~~~~~~~~~~~~ ^
505 | item];
| ~~~~
include/linux/vmstat.h:511:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
511 | return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
| ~~~~~~~~~~~~~~~~~~~~~ ^
512 | NR_VM_NUMA_EVENT_ITEMS +
| ~~~~~~~~~~~~~~~~~~~~~~
include/linux/vmstat.h:524:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
524 | return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
| ~~~~~~~~~~~~~~~~~~~~~ ^
525 | NR_VM_NUMA_EVENT_ITEMS +
| ~~~~~~~~~~~~~~~~~~~~~~
In file included from kernel/sched/build_policy.c:59:
>> kernel/sched/cputime.c:254:22: error: no member named 'rq_sched_info' in 'struct rq'
254 | return cpu_rq(cpu)->rq_sched_info.run_delay;
| ~~~~~~~~~~~ ^
3 warnings and 1 error generated.
vim +254 kernel/sched/cputime.c
251
252 u64 get_cpu_run_delay(int cpu)
253 {
> 254 return cpu_rq(cpu)->rq_sched_info.run_delay;
255 }
256 #endif
257
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On 2/8/25 1:47 PM, kernel test robot Wrote:
> Hi Abel,
>
> kernel test robot noticed the following build errors:
>
> [auto build test ERROR on tj-cgroup/for-next]
> [also build test ERROR on tip/sched/core akpm-mm/mm-everything linus/master v6.14-rc1 next-20250207]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> url: https://github.com/intel-lab-lkp/linux/commits/Abel-Wu/cgroup-rstat-Fix-forceidle-time-in-cpu-stat/20250207-121257
> base: https://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git for-next
> patch link: https://lore.kernel.org/r/20250207041012.89192-3-wuyun.abel%40bytedance.com
> patch subject: [PATCH v3 2/2] cgroup/rstat: Add run_delay accounting for cgroups
> config: x86_64-kexec (https://download.01.org/0day-ci/archive/20250208/202502081318.c9fYNNx8-lkp@intel.com/config)
> compiler: clang version 19.1.3 (https://github.com/llvm/llvm-project ab51eccf88f5321e7c60591c5546b254b6afab99)
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250208/202502081318.c9fYNNx8-lkp@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202502081318.c9fYNNx8-lkp@intel.com/
>
> All errors (new ones prefixed by >>):
>
> In file included from kernel/sched/build_policy.c:19:
> In file included from include/linux/sched/isolation.h:5:
> In file included from include/linux/cpuset.h:17:
> In file included from include/linux/mm.h:2224:
> include/linux/vmstat.h:504:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
> 504 | return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
> | ~~~~~~~~~~~~~~~~~~~~~ ^
> 505 | item];
> | ~~~~
> include/linux/vmstat.h:511:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
> 511 | return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
> | ~~~~~~~~~~~~~~~~~~~~~ ^
> 512 | NR_VM_NUMA_EVENT_ITEMS +
> | ~~~~~~~~~~~~~~~~~~~~~~
> include/linux/vmstat.h:524:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
> 524 | return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
> | ~~~~~~~~~~~~~~~~~~~~~ ^
> 525 | NR_VM_NUMA_EVENT_ITEMS +
> | ~~~~~~~~~~~~~~~~~~~~~~
> In file included from kernel/sched/build_policy.c:59:
>>> kernel/sched/cputime.c:254:22: error: no member named 'rq_sched_info' in 'struct rq'
> 254 | return cpu_rq(cpu)->rq_sched_info.run_delay;
> | ~~~~~~~~~~~ ^
> 3 warnings and 1 error generated.
Oops.. SCHED_INFO can be selected by either TASK_DELAY_ACCT or SCHEDSTATS.
Will fix. Thanks a lot!
>
>
> vim +254 kernel/sched/cputime.c
>
> 251
> 252 u64 get_cpu_run_delay(int cpu)
> 253 {
> > 254 return cpu_rq(cpu)->rq_sched_info.run_delay;
> 255 }
> 256 #endif
> 257
>
© 2016 - 2026 Red Hat, Inc.