Given that psi_group_change() can be called rather frequently from the
scheduler task switching code, let's streamline it a bit to reduce the
number of loops and conditionals on the typical invocation.
First thing is that we replace the open-coded mask walks with the standard
for_each_set_bit(). This makes the source code a bit more readable and
also enables use of the efficient CPU-specific zero-bit-skip
instructions.
In doing so we also remove the need to mask out the special TSK_ONCPU bit
from the set and clear masks, since for_each_set_bit() now directly limits
the array index to the safe range.
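To illustrate the difference outside of kernel context, here is a
standalone userspace sketch; __builtin_ctzl() stands in for the kernel's
arch-optimised __ffs(), and the mask value and array size are made up
for the example:

#include <stdio.h>

#define NR_TASK_COUNTS 4 /* stands in for ARRAY_SIZE(groupc->tasks) */

int main(void)
{
	unsigned long mask = 0x9; /* bits 0 and 3 set */
	unsigned long t, m;

	/* Open-coded walk: advances one bit position per iteration. */
	for (t = 0, m = mask; m; m &= ~(1UL << t), t++) {
		if (!(m & (1UL << t)))
			continue;
		printf("open-coded walk: bit %lu\n", t);
	}

	/*
	 * ctz-based walk, roughly the shape for_each_set_bit()
	 * compiles down to: count-trailing-zeros jumps straight to
	 * each set bit, and the bound check clamps the index to the
	 * safe range.
	 */
	for (m = mask; m; m &= m - 1) {
		t = __builtin_ctzl(m);
		if (t >= NR_TASK_COUNTS)
			break;
		printf("ctz-based walk:  bit %lu\n", t);
	}

	return 0;
}

Both loops visit bits 0 and 3 and nothing else; the second one just
never touches the zero bits in between.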
As the last remaining step we can now easily move the new state mask
computation to only run when required.
End result is hopefully more readable code and a very small but measurable
reduction in task switching CPU overhead.
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
---
kernel/sched/psi.c | 48 ++++++++++++++++++++--------------------------
1 file changed, 21 insertions(+), 27 deletions(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 59fdb7ebbf22..fe19aeef8dbd 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -798,39 +798,26 @@ static void psi_group_change(struct psi_group *group, int cpu,
u64 now, bool wake_clock)
{
struct psi_group_cpu *groupc;
- unsigned int t, m;
+ unsigned long t, m;
u32 state_mask;
lockdep_assert_rq_held(cpu_rq(cpu));
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
- * Start with TSK_ONCPU, which doesn't have a corresponding
- * task count - it's just a boolean flag directly encoded in
- * the state mask. Clear, set, or carry the current state if
- * no changes are requested.
+ * TSK_ONCPU does not have a corresponding task count - it's just a
+ * boolean flag directly encoded in the state mask. Clear, set, or carry
+ * the current state if no changes are requested.
+ *
+ * The rest of the state mask is calculated based on the task counts.
+ * Update those first, then construct the mask.
*/
- if (unlikely(clear & TSK_ONCPU)) {
- state_mask = 0;
- clear &= ~TSK_ONCPU;
- } else if (unlikely(set & TSK_ONCPU)) {
- state_mask = PSI_ONCPU;
- set &= ~TSK_ONCPU;
- } else {
- state_mask = groupc->state_mask & PSI_ONCPU;
- }
-
- /*
- * The rest of the state mask is calculated based on the task
- * counts. Update those first, then construct the mask.
- */
- for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
- if (!(m & (1 << t)))
- continue;
- if (groupc->tasks[t]) {
+ m = clear;
+ for_each_set_bit(t, &m, ARRAY_SIZE(groupc->tasks)) {
+ if (likely(groupc->tasks[t])) {
groupc->tasks[t]--;
} else if (!psi_bug) {
- printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
+ printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%lu tasks=[%u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
groupc->tasks[3], clear, set);
@@ -838,9 +825,9 @@ static void psi_group_change(struct psi_group *group, int cpu,
}
}
- for (t = 0; set; set &= ~(1 << t), t++)
- if (set & (1 << t))
- groupc->tasks[t]++;
+ m = set;
+ for_each_set_bit(t, &m, ARRAY_SIZE(groupc->tasks))
+ groupc->tasks[t]++;
if (!group->enabled) {
/*
@@ -853,6 +840,13 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
record_times(groupc, now);
+ if (unlikely(clear & TSK_ONCPU))
+ state_mask = 0;
+ else if (unlikely(set & TSK_ONCPU))
+ state_mask = PSI_ONCPU;
+ else
+ state_mask = groupc->state_mask & PSI_ONCPU;
+
groupc->state_mask = state_mask;
return;
--
2.51.1
Hi Tvrtko,
kernel test robot noticed the following build warnings:
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Tvrtko-Ursulin/sched-psi-Streamline-the-flow-in-psi_group_change/20251113-203009
base: tip/sched/core
patch link: https://lore.kernel.org/r/20251113122254.40445-1-tvrtko.ursulin%40igalia.com
patch subject: [PATCH] sched/psi: Streamline the flow in psi_group_change
config: sh-randconfig-r071-20251115 (https://download.01.org/0day-ci/archive/20251116/202511161642.NB7XYtH9-lkp@intel.com/config)
compiler: sh4-linux-gcc (GCC) 15.1.0
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
| Closes: https://lore.kernel.org/r/202511161642.NB7XYtH9-lkp@intel.com/
smatch warnings:
kernel/sched/psi.c:855 psi_group_change() error: uninitialized symbol 'state_mask'.
vim +/state_mask +855 kernel/sched/psi.c
36b238d5717279 Johannes Weiner 2020-03-16 796 static void psi_group_change(struct psi_group *group, int cpu,
3840cbe24cf060 Johannes Weiner 2024-10-03 797 unsigned int clear, unsigned int set,
570c8efd5eb79c Peter Zijlstra 2025-05-23 798 u64 now, bool wake_clock)
eb414681d5a07d Johannes Weiner 2018-10-26 799 {
eb414681d5a07d Johannes Weiner 2018-10-26 800 struct psi_group_cpu *groupc;
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 801 unsigned long t, m;
71dbdde7914d32 Johannes Weiner 2022-08-26 802 u32 state_mask;
eb414681d5a07d Johannes Weiner 2018-10-26 803
ddae0ca2a8fe12 John Stultz 2024-06-18 804 lockdep_assert_rq_held(cpu_rq(cpu));
eb414681d5a07d Johannes Weiner 2018-10-26 805 groupc = per_cpu_ptr(group->pcpu, cpu);
eb414681d5a07d Johannes Weiner 2018-10-26 806
71dbdde7914d32 Johannes Weiner 2022-08-26 807 /*
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 808 * TSK_ONCPU does not have a corresponding task count - it's just a
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 809 * boolean flag directly encoded in the state mask. Clear, set, or carry
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 810 * the current state if no changes are requested.
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 811 *
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 812 * The rest of the state mask is calculated based on the task counts.
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 813 * Update those first, then construct the mask.
71dbdde7914d32 Johannes Weiner 2022-08-26 814 */
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 815 m = clear;
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 816 for_each_set_bit(t, &m, ARRAY_SIZE(groupc->tasks)) {
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 817 if (likely(groupc->tasks[t])) {
9d10a13d1e4c34 Charan Teja Reddy 2021-04-16 818 groupc->tasks[t]--;
9d10a13d1e4c34 Charan Teja Reddy 2021-04-16 819 } else if (!psi_bug) {
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 820 printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%lu tasks=[%u %u %u %u] clear=%x set=%x\n",
eb414681d5a07d Johannes Weiner 2018-10-26 821 cpu, t, groupc->tasks[0],
eb414681d5a07d Johannes Weiner 2018-10-26 822 groupc->tasks[1], groupc->tasks[2],
71dbdde7914d32 Johannes Weiner 2022-08-26 823 groupc->tasks[3], clear, set);
eb414681d5a07d Johannes Weiner 2018-10-26 824 psi_bug = 1;
eb414681d5a07d Johannes Weiner 2018-10-26 825 }
eb414681d5a07d Johannes Weiner 2018-10-26 826 }
eb414681d5a07d Johannes Weiner 2018-10-26 827
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 828 m = set;
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 829 for_each_set_bit(t, &m, ARRAY_SIZE(groupc->tasks))
eb414681d5a07d Johannes Weiner 2018-10-26 830 groupc->tasks[t]++;
eb414681d5a07d Johannes Weiner 2018-10-26 831
34f26a15611afb Chengming Zhou 2022-09-07 832 if (!group->enabled) {
34f26a15611afb Chengming Zhou 2022-09-07 833 /*
34f26a15611afb Chengming Zhou 2022-09-07 834 * On the first group change after disabling PSI, conclude
34f26a15611afb Chengming Zhou 2022-09-07 835 * the current state and flush its time. This is unlikely
34f26a15611afb Chengming Zhou 2022-09-07 836 * to matter to the user, but aggregation (get_recent_times)
34f26a15611afb Chengming Zhou 2022-09-07 837 * may have already incorporated the live state into times_prev;
34f26a15611afb Chengming Zhou 2022-09-07 838 * avoid a delta sample underflow when PSI is later re-enabled.
34f26a15611afb Chengming Zhou 2022-09-07 839 */
34f26a15611afb Chengming Zhou 2022-09-07 840 if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
34f26a15611afb Chengming Zhou 2022-09-07 841 record_times(groupc, now);
34f26a15611afb Chengming Zhou 2022-09-07 842
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 843 if (unlikely(clear & TSK_ONCPU))
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 844 state_mask = 0;
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 845 else if (unlikely(set & TSK_ONCPU))
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 846 state_mask = PSI_ONCPU;
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 847 else
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 848 state_mask = groupc->state_mask & PSI_ONCPU;
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 849
34f26a15611afb Chengming Zhou 2022-09-07 850 groupc->state_mask = state_mask;
34f26a15611afb Chengming Zhou 2022-09-07 851
34f26a15611afb Chengming Zhou 2022-09-07 852 return;
We return on this path, where state_mask is set.
34f26a15611afb Chengming Zhou 2022-09-07 853 }
34f26a15611afb Chengming Zhou 2022-09-07 854
0ec208ce983492 Tvrtko Ursulin 2024-06-25 @855 state_mask = test_states(groupc->tasks, state_mask);
^^^^^^^^^^
state_mask is totally uninitialized here.
7fae6c8171d20a Chengming Zhou 2021-03-03 856
7fae6c8171d20a Chengming Zhou 2021-03-03 857 /*
7fae6c8171d20a Chengming Zhou 2021-03-03 858 * Since we care about lost potential, a memstall is FULL
7fae6c8171d20a Chengming Zhou 2021-03-03 859 * when there are no other working tasks, but also when
7fae6c8171d20a Chengming Zhou 2021-03-03 860 * the CPU is actively reclaiming and nothing productive
7fae6c8171d20a Chengming Zhou 2021-03-03 861 * could run even if it were runnable. So when the current
7fae6c8171d20a Chengming Zhou 2021-03-03 862 * task in a cgroup is in_memstall, the corresponding groupc
7fae6c8171d20a Chengming Zhou 2021-03-03 863 * on that cpu is in PSI_MEM_FULL state.
7fae6c8171d20a Chengming Zhou 2021-03-03 864 */
71dbdde7914d32 Johannes Weiner 2022-08-26 865 if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
7fae6c8171d20a Chengming Zhou 2021-03-03 866 state_mask |= (1 << PSI_MEM_FULL);
7fae6c8171d20a Chengming Zhou 2021-03-03 867
34f26a15611afb Chengming Zhou 2022-09-07 868 record_times(groupc, now);
34f26a15611afb Chengming Zhou 2022-09-07 869
33b2d6302abc4c Suren Baghdasaryan 2019-05-14 870 groupc->state_mask = state_mask;
33b2d6302abc4c Suren Baghdasaryan 2019-05-14 871
65457b74aa9437 Domenico Cerasuolo 2023-03-30 872 if (state_mask & group->rtpoll_states)
65457b74aa9437 Domenico Cerasuolo 2023-03-30 873 psi_schedule_rtpoll_work(group, 1, false);
36b238d5717279 Johannes Weiner 2020-03-16 874
36b238d5717279 Johannes Weiner 2020-03-16 875 if (wake_clock && !delayed_work_pending(&group->avgs_work))
36b238d5717279 Johannes Weiner 2020-03-16 876 schedule_delayed_work(&group->avgs_work, PSI_FREQ);
eb414681d5a07d Johannes Weiner 2018-10-26 877 }
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Tvrtko,
kernel test robot noticed the following build warnings:
[auto build test WARNING on tip/sched/core]
[also build test WARNING on linus/master v6.18-rc5 next-20251113]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Tvrtko-Ursulin/sched-psi-Streamline-the-flow-in-psi_group_change/20251113-203009
base: tip/sched/core
patch link: https://lore.kernel.org/r/20251113122254.40445-1-tvrtko.ursulin%40igalia.com
patch subject: [PATCH] sched/psi: Streamline the flow in psi_group_change
config: arm-randconfig-003-20251114 (https://download.01.org/0day-ci/archive/20251114/202511141344.rntAQaRB-lkp@intel.com/config)
compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project 0bba1e76581bad04e7d7f09f5115ae5e2989e0d9)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251114/202511141344.rntAQaRB-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202511141344.rntAQaRB-lkp@intel.com/
All warnings (new ones prefixed by >>):
In file included from kernel/sched/build_utility.c:93:
>> kernel/sched/psi.c:855:42: warning: variable 'state_mask' is uninitialized when used here [-Wuninitialized]
855 | state_mask = test_states(groupc->tasks, state_mask);
| ^~~~~~~~~~
kernel/sched/psi.c:802:16: note: initialize the variable 'state_mask' to silence this warning
802 | u32 state_mask;
| ^
| = 0
1 warning generated.
vim +/state_mask +855 kernel/sched/psi.c
eb414681d5a07d Johannes Weiner 2018-10-26 792
570c8efd5eb79c Peter Zijlstra 2025-05-23 793 #define for_each_group(iter, group) \
570c8efd5eb79c Peter Zijlstra 2025-05-23 794 for (typeof(group) iter = group; iter; iter = iter->parent)
570c8efd5eb79c Peter Zijlstra 2025-05-23 795
36b238d5717279 Johannes Weiner 2020-03-16 796 static void psi_group_change(struct psi_group *group, int cpu,
3840cbe24cf060 Johannes Weiner 2024-10-03 797 unsigned int clear, unsigned int set,
570c8efd5eb79c Peter Zijlstra 2025-05-23 798 u64 now, bool wake_clock)
eb414681d5a07d Johannes Weiner 2018-10-26 799 {
eb414681d5a07d Johannes Weiner 2018-10-26 800 struct psi_group_cpu *groupc;
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 801 unsigned long t, m;
71dbdde7914d32 Johannes Weiner 2022-08-26 802 u32 state_mask;
eb414681d5a07d Johannes Weiner 2018-10-26 803
ddae0ca2a8fe12 John Stultz 2024-06-18 804 lockdep_assert_rq_held(cpu_rq(cpu));
eb414681d5a07d Johannes Weiner 2018-10-26 805 groupc = per_cpu_ptr(group->pcpu, cpu);
eb414681d5a07d Johannes Weiner 2018-10-26 806
71dbdde7914d32 Johannes Weiner 2022-08-26 807 /*
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 808 * TSK_ONCPU does not have a corresponding task count - it's just a
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 809 * boolean flag directly encoded in the state mask. Clear, set, or carry
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 810 * the current state if no changes are requested.
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 811 *
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 812 * The rest of the state mask is calculated based on the task counts.
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 813 * Update those first, then construct the mask.
71dbdde7914d32 Johannes Weiner 2022-08-26 814 */
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 815 m = clear;
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 816 for_each_set_bit(t, &m, ARRAY_SIZE(groupc->tasks)) {
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 817 if (likely(groupc->tasks[t])) {
9d10a13d1e4c34 Charan Teja Reddy 2021-04-16 818 groupc->tasks[t]--;
9d10a13d1e4c34 Charan Teja Reddy 2021-04-16 819 } else if (!psi_bug) {
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 820 printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%lu tasks=[%u %u %u %u] clear=%x set=%x\n",
eb414681d5a07d Johannes Weiner 2018-10-26 821 cpu, t, groupc->tasks[0],
eb414681d5a07d Johannes Weiner 2018-10-26 822 groupc->tasks[1], groupc->tasks[2],
71dbdde7914d32 Johannes Weiner 2022-08-26 823 groupc->tasks[3], clear, set);
eb414681d5a07d Johannes Weiner 2018-10-26 824 psi_bug = 1;
eb414681d5a07d Johannes Weiner 2018-10-26 825 }
eb414681d5a07d Johannes Weiner 2018-10-26 826 }
eb414681d5a07d Johannes Weiner 2018-10-26 827
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 828 m = set;
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 829 for_each_set_bit(t, &m, ARRAY_SIZE(groupc->tasks))
eb414681d5a07d Johannes Weiner 2018-10-26 830 groupc->tasks[t]++;
eb414681d5a07d Johannes Weiner 2018-10-26 831
34f26a15611afb Chengming Zhou 2022-09-07 832 if (!group->enabled) {
34f26a15611afb Chengming Zhou 2022-09-07 833 /*
34f26a15611afb Chengming Zhou 2022-09-07 834 * On the first group change after disabling PSI, conclude
34f26a15611afb Chengming Zhou 2022-09-07 835 * the current state and flush its time. This is unlikely
34f26a15611afb Chengming Zhou 2022-09-07 836 * to matter to the user, but aggregation (get_recent_times)
34f26a15611afb Chengming Zhou 2022-09-07 837 * may have already incorporated the live state into times_prev;
34f26a15611afb Chengming Zhou 2022-09-07 838 * avoid a delta sample underflow when PSI is later re-enabled.
34f26a15611afb Chengming Zhou 2022-09-07 839 */
34f26a15611afb Chengming Zhou 2022-09-07 840 if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
34f26a15611afb Chengming Zhou 2022-09-07 841 record_times(groupc, now);
34f26a15611afb Chengming Zhou 2022-09-07 842
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 843 if (unlikely(clear & TSK_ONCPU))
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 844 state_mask = 0;
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 845 else if (unlikely(set & TSK_ONCPU))
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 846 state_mask = PSI_ONCPU;
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 847 else
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 848 state_mask = groupc->state_mask & PSI_ONCPU;
c6ccdabec91131 Tvrtko Ursulin 2025-11-13 849
34f26a15611afb Chengming Zhou 2022-09-07 850 groupc->state_mask = state_mask;
34f26a15611afb Chengming Zhou 2022-09-07 851
34f26a15611afb Chengming Zhou 2022-09-07 852 return;
34f26a15611afb Chengming Zhou 2022-09-07 853 }
34f26a15611afb Chengming Zhou 2022-09-07 854
0ec208ce983492 Tvrtko Ursulin 2024-06-25 @855 state_mask = test_states(groupc->tasks, state_mask);
7fae6c8171d20a Chengming Zhou 2021-03-03 856
7fae6c8171d20a Chengming Zhou 2021-03-03 857 /*
7fae6c8171d20a Chengming Zhou 2021-03-03 858 * Since we care about lost potential, a memstall is FULL
7fae6c8171d20a Chengming Zhou 2021-03-03 859 * when there are no other working tasks, but also when
7fae6c8171d20a Chengming Zhou 2021-03-03 860 * the CPU is actively reclaiming and nothing productive
7fae6c8171d20a Chengming Zhou 2021-03-03 861 * could run even if it were runnable. So when the current
7fae6c8171d20a Chengming Zhou 2021-03-03 862 * task in a cgroup is in_memstall, the corresponding groupc
7fae6c8171d20a Chengming Zhou 2021-03-03 863 * on that cpu is in PSI_MEM_FULL state.
7fae6c8171d20a Chengming Zhou 2021-03-03 864 */
71dbdde7914d32 Johannes Weiner 2022-08-26 865 if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
7fae6c8171d20a Chengming Zhou 2021-03-03 866 state_mask |= (1 << PSI_MEM_FULL);
7fae6c8171d20a Chengming Zhou 2021-03-03 867
34f26a15611afb Chengming Zhou 2022-09-07 868 record_times(groupc, now);
34f26a15611afb Chengming Zhou 2022-09-07 869
33b2d6302abc4c Suren Baghdasaryan 2019-05-14 870 groupc->state_mask = state_mask;
33b2d6302abc4c Suren Baghdasaryan 2019-05-14 871
65457b74aa9437 Domenico Cerasuolo 2023-03-30 872 if (state_mask & group->rtpoll_states)
65457b74aa9437 Domenico Cerasuolo 2023-03-30 873 psi_schedule_rtpoll_work(group, 1, false);
36b238d5717279 Johannes Weiner 2020-03-16 874
36b238d5717279 Johannes Weiner 2020-03-16 875 if (wake_clock && !delayed_work_pending(&group->avgs_work))
36b238d5717279 Johannes Weiner 2020-03-16 876 schedule_delayed_work(&group->avgs_work, PSI_FREQ);
eb414681d5a07d Johannes Weiner 2018-10-26 877 }
eb414681d5a07d Johannes Weiner 2018-10-26 878
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On Thu, Nov 13, 2025 at 12:22:54PM +0000, Tvrtko Ursulin wrote:
> Given that psi_group_change() can be called rather frequently from the
> scheduler task switching code, let's streamline it a bit to reduce the
> number of loops and conditionals on the typical invocation.
>
> First thing is that we replace the open-coded mask walks with the standard
> for_each_set_bit(). This makes the source code a bit more readable and
> also enables use of the efficient CPU-specific zero-bit-skip
> instructions.
>
> In doing so we also remove the need to mask out the special TSK_ONCPU bit
> from the set and clear masks, since for_each_set_bit() now directly limits
> the array index to the safe range.
>
> As the last remaining step we can now easily move the new state mask
> computation to only run when required.
>
> End result is hopefully more readable code and a very small but measurable
> reduction in task switching CPU overhead.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Suren Baghdasaryan <surenb@google.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: linux-kernel@vger.kernel.org
> ---
> kernel/sched/psi.c | 48 ++++++++++++++++++++--------------------------
> 1 file changed, 21 insertions(+), 27 deletions(-)
>
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 59fdb7ebbf22..fe19aeef8dbd 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -798,39 +798,26 @@ static void psi_group_change(struct psi_group *group, int cpu,
> u64 now, bool wake_clock)
> {
> struct psi_group_cpu *groupc;
> - unsigned int t, m;
> + unsigned long t, m;
> u32 state_mask;
>
> lockdep_assert_rq_held(cpu_rq(cpu));
> groupc = per_cpu_ptr(group->pcpu, cpu);
>
> /*
> - * Start with TSK_ONCPU, which doesn't have a corresponding
> - * task count - it's just a boolean flag directly encoded in
> - * the state mask. Clear, set, or carry the current state if
> - * no changes are requested.
> + * TSK_ONCPU does not have a corresponding task count - it's just a
> + * boolean flag directly encoded in the state mask. Clear, set, or carry
> + * the current state if no changes are requested.
> + *
> + * The rest of the state mask is calculated based on the task counts.
> + * Update those first, then construct the mask.
> */
> - if (unlikely(clear & TSK_ONCPU)) {
> - state_mask = 0;
> - clear &= ~TSK_ONCPU;
> - } else if (unlikely(set & TSK_ONCPU)) {
> - state_mask = PSI_ONCPU;
> - set &= ~TSK_ONCPU;
> - } else {
> - state_mask = groupc->state_mask & PSI_ONCPU;
> - }
This doesn't look right. Without PSI_ONCPU in state_mask, the results
of test_states() will be bogus, as well as the PSI_MEM_FULL special
case for an active reclaimer on the CPU.
> - /*
> - * The rest of the state mask is calculated based on the task
> - * counts. Update those first, then construct the mask.
> - */
> - for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
> - if (!(m & (1 << t)))
> - continue;
> - if (groupc->tasks[t]) {
> + m = clear;
> + for_each_set_bit(t, &m, ARRAY_SIZE(groupc->tasks)) {
The current version relies on !!m and doesn't need the range checks
for_each_set_bit() introduces. This seems less efficient. Did you
compare the generated code?
> + if (likely(groupc->tasks[t])) {
> groupc->tasks[t]--;
> } else if (!psi_bug) {
> - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
> + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%lu tasks=[%u %u %u %u] clear=%x set=%x\n",
> cpu, t, groupc->tasks[0],
> groupc->tasks[1], groupc->tasks[2],
> groupc->tasks[3], clear, set);
> @@ -838,9 +825,9 @@ static void psi_group_change(struct psi_group *group, int cpu,
> }
> }
>
> - for (t = 0; set; set &= ~(1 << t), t++)
> - if (set & (1 << t))
> - groupc->tasks[t]++;
> + m = set;
> + for_each_set_bit(t, &m, ARRAY_SIZE(groupc->tasks))
> + groupc->tasks[t]++;
>
> if (!group->enabled) {
> /*
> @@ -853,6 +840,13 @@ static void psi_group_change(struct psi_group *group, int cpu,
> if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
> record_times(groupc, now);
>
> + if (unlikely(clear & TSK_ONCPU))
> + state_mask = 0;
> + else if (unlikely(set & TSK_ONCPU))
> + state_mask = PSI_ONCPU;
> + else
> + state_mask = groupc->state_mask & PSI_ONCPU;
You moved it here, but this is the !group->enabled exception
only. What about the common case when the group is enabled?
On 13/11/2025 15:22, Johannes Weiner wrote:
> On Thu, Nov 13, 2025 at 12:22:54PM +0000, Tvrtko Ursulin wrote:
>> Given that psi_group_change() can be called rather frequently from the
>> scheduler task switching code, let's streamline it a bit to reduce the
>> number of loops and conditionals on the typical invocation.
>>
>> First thing is that we replace the open-coded mask walks with the standard
>> for_each_set_bit(). This makes the source code a bit more readable and
>> also enables use of the efficient CPU-specific zero-bit-skip
>> instructions.
>>
>> In doing so we also remove the need to mask out the special TSK_ONCPU bit
>> from the set and clear masks, since for_each_set_bit() now directly limits
>> the array index to the safe range.
>>
>> As the last remaining step we can now easily move the new state mask
>> computation to only run when required.
>>
>> End result is hopefully more readable code and a very small but measurable
>> reduction in task switching CPU overhead.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
>> Cc: Johannes Weiner <hannes@cmpxchg.org>
>> Cc: Suren Baghdasaryan <surenb@google.com>
>> Cc: Peter Zijlstra <peterz@infradead.org>
>> Cc: linux-kernel@vger.kernel.org
>> ---
>> kernel/sched/psi.c | 48 ++++++++++++++++++++--------------------------
>> 1 file changed, 21 insertions(+), 27 deletions(-)
>>
>> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
>> index 59fdb7ebbf22..fe19aeef8dbd 100644
>> --- a/kernel/sched/psi.c
>> +++ b/kernel/sched/psi.c
>> @@ -798,39 +798,26 @@ static void psi_group_change(struct psi_group *group, int cpu,
>> u64 now, bool wake_clock)
>> {
>> struct psi_group_cpu *groupc;
>> - unsigned int t, m;
>> + unsigned long t, m;
>> u32 state_mask;
>>
>> lockdep_assert_rq_held(cpu_rq(cpu));
>> groupc = per_cpu_ptr(group->pcpu, cpu);
>>
>> /*
>> - * Start with TSK_ONCPU, which doesn't have a corresponding
>> - * task count - it's just a boolean flag directly encoded in
>> - * the state mask. Clear, set, or carry the current state if
>> - * no changes are requested.
>> + * TSK_ONCPU does not have a corresponding task count - it's just a
>> + * boolean flag directly encoded in the state mask. Clear, set, or carry
>> + * the current state if no changes are requested.
>> + *
>> + * The rest of the state mask is calculated based on the task counts.
>> + * Update those first, then construct the mask.
>> */
>> - if (unlikely(clear & TSK_ONCPU)) {
>> - state_mask = 0;
>> - clear &= ~TSK_ONCPU;
>> - } else if (unlikely(set & TSK_ONCPU)) {
>> - state_mask = PSI_ONCPU;
>> - set &= ~TSK_ONCPU;
>> - } else {
>> - state_mask = groupc->state_mask & PSI_ONCPU;
>> - }
>
> This doesn't look right. Without PSI_ONCPU in state_mask, the results
> of test_states() will be bogus, as well as the PSI_MEM_FULL special
> case for an active reclaimer on the CPU.
You are completely right, I was sure the local state_mask was not used
outside the !group->enabled branch, but missed that it is an input
parameter to test_states().
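Rough and untested, but a v2 could simply keep computing the
carried-over PSI_ONCPU bit unconditionally, ahead of the group->enabled
check, so both paths see a valid state_mask — something along these
lines:

	/*
	 * Untested sketch: restore the unconditional TSK_ONCPU
	 * handling before the group->enabled branch, as the original
	 * code had it, so that both the !group->enabled early return
	 * and the test_states() call below receive a properly
	 * initialised state_mask.
	 */
	if (unlikely(clear & TSK_ONCPU))
		state_mask = 0;
	else if (unlikely(set & TSK_ONCPU))
		state_mask = PSI_ONCPU;
	else
		state_mask = groupc->state_mask & PSI_ONCPU;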
>
>> - /*
>> - * The rest of the state mask is calculated based on the task
>> - * counts. Update those first, then construct the mask.
>> - */
>> - for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
>> - if (!(m & (1 << t)))
>> - continue;
>> - if (groupc->tasks[t]) {
>> + m = clear;
>> + for_each_set_bit(t, &m, ARRAY_SIZE(groupc->tasks)) {
>
> The current version relies on !!m and doesn't need the range checks
> for_each_set_bit() introduces. This seems less efficient. Did you
> compare the generated code?
Yes, slightly more .text, but empirically it looks like a tiny bit fewer
cycles, which I thought was due to being able to use the CPU-specific
optimised __ffs variants. So it still bails out as soon as the last set
bit "goes away", just differently.
I will need to redo the tests with the state_mask breakage fixed.
>> + if (likely(groupc->tasks[t])) {
>> groupc->tasks[t]--;
>> } else if (!psi_bug) {
>> - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
>> + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%lu tasks=[%u %u %u %u] clear=%x set=%x\n",
>> cpu, t, groupc->tasks[0],
>> groupc->tasks[1], groupc->tasks[2],
>> groupc->tasks[3], clear, set);
>> @@ -838,9 +825,9 @@ static void psi_group_change(struct psi_group *group, int cpu,
>> }
>> }
>>
>> - for (t = 0; set; set &= ~(1 << t), t++)
>> - if (set & (1 << t))
>> - groupc->tasks[t]++;
>> + m = set;
>> + for_each_set_bit(t, &m, ARRAY_SIZE(groupc->tasks))
>> + groupc->tasks[t]++;
>>
>> if (!group->enabled) {
>> /*
>> @@ -853,6 +840,13 @@ static void psi_group_change(struct psi_group *group, int cpu,
>> if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
>> record_times(groupc, now);
>>
>> + if (unlikely(clear & TSK_ONCPU))
>> + state_mask = 0;
>> + else if (unlikely(set & TSK_ONCPU))
>> + state_mask = PSI_ONCPU;
>> + else
>> + state_mask = groupc->state_mask & PSI_ONCPU;
>
> You moved it here, but this is the !group->enabled exception
> only. What about the common case when the group is enabled?
Yep, I was blind. I will get back to you with a v2 if there are still
some CPU cycles to be saved.
Regards,
Tvrtko