include/linux/sched.h | 4 +++ kernel/sched/debug.c | 6 ++++ kernel/sched/fair.c | 69 ++++++++++++++++++++++++++++++++++++------- kernel/sched/sched.h | 3 ++ 4 files changed, 72 insertions(+), 10 deletions(-)
When a preferred LLC is selected and remains stable, task_cache_work does
not need to run frequently. Because it scans all system CPUs for
computation, high-frequency execution hurts performance. We thus reduce
the scan rate in such cases.
On the other hand, if the preferred node becomes suboptimal, we should
increase the scan frequency to quickly find a better placement. The scan
period is therefore dynamically adjusted.
Signed-off-by: Jianyong Wu <wujianyong@hygon.cn>
---
Hi ChenYu, Tim, Gengkun,
I have another approach to address this issue, based on the observation
that the scan work can be canceled if the preferred node is stable. This
patch merely demonstrates the idea, but still needs more testing to
verify its functionality. I'm sending it out early to gather feedback and
opinions.
Thanks
Jianyong
---
include/linux/sched.h | 4 +++
kernel/sched/debug.c | 6 ++++
kernel/sched/fair.c | 69 ++++++++++++++++++++++++++++++++++++-------
kernel/sched/sched.h | 3 ++
4 files changed, 72 insertions(+), 10 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e24b2b86aba4..87ce70ba6552 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2393,7 +2393,11 @@ struct sched_cache_stat {
struct sched_cache_time __percpu *pcpu_time;
raw_spinlock_t lock;
unsigned long epoch;
+ unsigned long last_reset_tick;
+ unsigned long next_scan;
+ unsigned long scan_period;
u64 nr_running_avg;
+ int need_scan;
int cpu;
} ____cacheline_aligned_in_smp;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4469e1c152c8..56ebc379127a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -679,6 +679,12 @@ static __init int sched_init_debug(void)
&llc_overaggr_pct);
debugfs_create_u32("imb_pct", 0644, llc,
&llc_imb_pct);
+ debugfs_create_u32("scan_period_max", 0644, llc,
+ &llc_scan_period_max);
+ debugfs_create_u32("scan_period_min", 0644, llc,
+ &llc_scan_period_min);
+ debugfs_create_u32("scan_period_threshold", 0644, llc,
+ &llc_scan_period_threshold);
#endif
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f446d755f3c5..974fe4b992ca 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1287,6 +1287,9 @@ __read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD;
__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
__read_mostly unsigned int llc_imb_pct = 20;
__read_mostly unsigned int llc_overaggr_pct = 50;
+__read_mostly unsigned int llc_scan_period_min = 1;
+__read_mostly unsigned int llc_scan_period_max = 64 * HZ;
+__read_mostly unsigned int llc_scan_period_threshold = HZ;
bool sched_cache_inuse(void)
{
@@ -1486,6 +1489,7 @@ void mm_init_sched(struct mm_struct *mm,
raw_spin_lock_init(&mm->sc_stat.lock);
mm->sc_stat.epoch = epoch;
mm->sc_stat.cpu = -1;
+ mm->sc_stat.scan_period = llc_scan_period_min;
/*
* The update to mm->sc_stat should not be reordered
@@ -1611,15 +1615,13 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
epoch = rq->cpu_epoch;
}
- /*
- * If this process hasn't hit task_cache_work() for a while, or it
- * has only 1 thread, invalidate its preferred state.
- */
+ /* If it has only 1 thread, invalidate its preferred state */
if (time_after(epoch,
- READ_ONCE(mm->sc_stat.epoch) + llc_epoch_affinity_timeout) ||
- get_nr_threads(p) <= 1 ||
+ READ_ONCE(mm->sc_stat.epoch) + llc_epoch_affinity_timeout) ||
+ get_nr_threads(p) <= 1 ||
exceed_llc_nr(mm, cpu_of(rq), p) ||
exceed_llc_capacity(mm, cpu_of(rq), p)) {
+ mm->sc_stat.scan_period = llc_scan_period_min;
if (mm->sc_stat.cpu != -1)
mm->sc_stat.cpu = -1;
}
@@ -1652,6 +1654,10 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p)
if (time_after_eq(mm->sc_stat.epoch, epoch))
return;
+ if (llc_scan_period_min < llc_scan_period_max && time_before(jiffies, mm->sc_stat.next_scan) &&
+ !mm->sc_stat.need_scan)
+ return;
+
guard(raw_spinlock)(&mm->sc_stat.lock);
if (work->next == work) {
@@ -1728,7 +1734,7 @@ static void task_cache_work(struct callback_head *work)
struct task_struct *p = current, *cur;
unsigned long curr_m_a_occ = 0;
struct mm_struct *mm = p->mm;
- unsigned long m_a_occ = 0;
+ unsigned long m_a_occ = 0, need_scan = 0, now;
cpumask_var_t cpus;
u64 t0, scan_cost;
@@ -1753,6 +1759,12 @@ static void task_cache_work(struct callback_head *work)
t0 = sched_clock_cpu(curr_cpu);
+ now = jiffies;
+ if (time_before(now, READ_ONCE(mm->sc_stat.next_scan)))
+ return;
+
+ WRITE_ONCE(mm->sc_stat.next_scan, (now + mm->sc_stat.scan_period));
+
scoped_guard (cpus_read_lock) {
get_scan_cpumasks(cpus, p);
@@ -1811,7 +1823,8 @@ static void task_cache_work(struct callback_head *work)
scan_cost = sched_clock_cpu(curr_cpu) - t0;
trace_sched_llc_scan(p, scan_cost);
- if (m_a_occ > (2 * curr_m_a_occ)) {
+ need_scan = READ_ONCE(mm->sc_stat.need_scan);
+ if (m_a_occ > (2 * curr_m_a_occ) || need_scan) {
/*
* Avoid switching sc_stat.cpu too fast.
* The reason to choose 2X is because:
@@ -1822,9 +1835,35 @@ static void task_cache_work(struct callback_head *work)
* 3. 2X is chosen based on test results, as it delivers
* the optimal performance gain so far.
*/
- mm->sc_stat.cpu = m_a_cpu;
+ if (m_a_occ > (2 * curr_m_a_occ))
+ mm->sc_stat.cpu = m_a_cpu;
+
+ if (!mm->sc_stat.last_reset_tick)
+ mm->sc_stat.last_reset_tick = now;
+
+ /* Change scan_period when preferred LLC changed */
+ if (((mm->sc_stat.cpu != -1) && (m_a_cpu != -1)
+ && (llc_id(mm->sc_stat.cpu) != llc_id(m_a_cpu)))
+ || need_scan) {
+ if (!need_scan)
+ need_scan = 1;
+
+ WRITE_ONCE(mm->sc_stat.scan_period,
+ max(mm->sc_stat.scan_period >> 1, llc_scan_period_min));
+ WRITE_ONCE(mm->sc_stat.last_reset_tick, now);
+ }
+ }
+
+ if ((now - READ_ONCE(mm->sc_stat.last_reset_tick) > llc_scan_period_threshold)
+ && !need_scan) {
+ WRITE_ONCE(mm->sc_stat.scan_period, min(mm->sc_stat.scan_period << 1,
+ llc_scan_period_max));
+ WRITE_ONCE(mm->sc_stat.last_reset_tick, now);
}
+ if (READ_ONCE(mm->sc_stat.need_scan))
+ WRITE_ONCE(mm->sc_stat.need_scan, 0);
+
update_avg_scale(&mm->sc_stat.nr_running_avg, nr_running);
free_cpumask_var(cpus);
}
@@ -10260,6 +10299,7 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
struct mm_struct *mm;
bool to_pref;
int cpu;
+ enum llc_mig ret;
mm = p->mm;
if (!mm)
@@ -10287,8 +10327,17 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
else
return mig_unrestricted;
- return can_migrate_llc(src_cpu, dst_cpu,
+ ret = can_migrate_llc(src_cpu, dst_cpu,
task_util(p), to_pref);
+
+ /*
+ * If the preferred node cannot accommodate the process,
+ * accelerate task_cache_work to find a better node.
+ */
+ if (to_pref && ret == mig_forbid)
+ mm->sc_stat.need_scan = 1;
+
+ return ret;
}
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b757812725f7..08462175f73f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -4043,6 +4043,9 @@ extern unsigned int llc_epoch_period;
extern unsigned int llc_epoch_affinity_timeout;
extern unsigned int llc_imb_pct;
extern unsigned int llc_overaggr_pct;
+extern unsigned int llc_scan_period_min;
+extern unsigned int llc_scan_period_max;
+extern unsigned int llc_scan_period_threshold;
static inline bool sched_cache_enabled(void)
{
--
2.34.1
On Mon, 2026-04-13 at 15:23 +0800, Jianyong Wu wrote:
> When a preferred LLC is selected and remains stable, task_cache_work does
> not need to run frequently. Because it scans all system CPUs for
> computation, high-frequency execution hurts performance. We thus reduce
> the scan rate in such cases.
>
Thanks for your patch proposal.
> On the other hand, if the preferred node becomes suboptimal, we should
You mean preferred LLC right? preferred node is from NUMA balancing.
> increase the scan frequency to quickly find a better placement. The scan
> period is therefore dynamically adjusted.
>
> Signed-off-by: Jianyong Wu <wujianyong@hygon.cn>
>
> ---
> Hi ChenYu, Tim, Gengkun,
>
> I have another approach to address this issue, based on the observation
> that the scan work can be canceled if the preferred node is stable.This
> patch merely demonstrates the idea, but still needs more testing to
> verify its functionality. I'm sending it out early to gather feedback and
> opinions.
>
>
<...>
> @@ -1822,9 +1835,35 @@ static void task_cache_work(struct callback_head *work)
> * 3. 2X is chosen based on test results, as it delivers
> * the optimal performance gain so far.
> */
> - mm->sc_stat.cpu = m_a_cpu;
> + if (m_a_occ > (2 * curr_m_a_occ))
> + mm->sc_stat.cpu = m_a_cpu;
> +
> + if (!mm->sc_stat.last_reset_tick)
> + mm->sc_stat.last_reset_tick = now;
> +
> + /* Change scan_period when preferred LLC changed */
> + if (((mm->sc_stat.cpu != -1) && (m_a_cpu != -1)
> + && (llc_id(mm->sc_stat.cpu) != llc_id(m_a_cpu)))
> + || need_scan) {
> + if (!need_scan)
> + need_scan = 1;
> +
> + WRITE_ONCE(mm->sc_stat.scan_period,
> + max(mm->sc_stat.scan_period >> 1, llc_scan_period_min));
> + WRITE_ONCE(mm->sc_stat.last_reset_tick, now);
> + }
> + }
> +
> + if ((now - READ_ONCE(mm->sc_stat.last_reset_tick) > llc_scan_period_threshold)
> + && !need_scan) {
> + WRITE_ONCE(mm->sc_stat.scan_period, min(mm->sc_stat.scan_period << 1,
> + llc_scan_period_max));
I think that llc_scan_period_max should be the same as llc_epoch_affinity_timeout.
We should not increase the scan period beyond that as that's the time scale
where we consider cache data relevant.
Tim
Hi Tim,
> On Mon, 2026-04-13 at 15:23 +0800, Jianyong Wu wrote:
> > When a preferred LLC is selected and remains stable, task_cache_work
> > does not need to run frequently. Because it scans all system CPUs for
> > computation, high-frequency execution hurts performance. We thus
> > reduce the scan rate in such cases.
> >
>
> Thanks for your patch proposal.
>
> > On the other hand, if the preferred node becomes suboptimal, we
> should
>
> You mean preferred LLC right? preferred node is from NUMA balancing.
Sorry for the misunderstanding. Yes, I meant the preferred LLC here.
>
>
> > @@ -1822,9 +1835,35 @@ static void task_cache_work(struct
> callback_head *work)
> > * 3. 2X is chosen based on test results, as it delivers
> > * the optimal performance gain so far.
> > */
> > - mm->sc_stat.cpu = m_a_cpu;
> > + if (m_a_occ > (2 * curr_m_a_occ))
> > + mm->sc_stat.cpu = m_a_cpu;
> > +
> > + if (!mm->sc_stat.last_reset_tick)
> > + mm->sc_stat.last_reset_tick = now;
> > +
> > + /* Change scan_period when preferred LLC changed */
> > + if (((mm->sc_stat.cpu != -1) && (m_a_cpu != -1)
> > + && (llc_id(mm->sc_stat.cpu) != llc_id(m_a_cpu)))
> > + || need_scan) {
> > + if (!need_scan)
> > + need_scan = 1;
> > +
> > + WRITE_ONCE(mm->sc_stat.scan_period,
> > + max(mm->sc_stat.scan_period >> 1,
> llc_scan_period_min));
> > + WRITE_ONCE(mm->sc_stat.last_reset_tick, now);
> > + }
> > + }
> > +
> > + if ((now - READ_ONCE(mm->sc_stat.last_reset_tick) >
> llc_scan_period_threshold)
> > + && !need_scan) {
> > + WRITE_ONCE(mm->sc_stat.scan_period,
> min(mm->sc_stat.scan_period << 1,
> > + llc_scan_period_max));
>
> I think that llc_scan_period_max should be the same as
> llc_epoch_affinity_timeout.
> We should not increase the scan period beyond that as that's the time
> scale where we consider cache data relevant.
Sounds reasonable. I'll run some tests to verify if this is sufficient.
Thanks
Jianyong
>
> Tim
>
>
Hi Jianyong,
On 4/13/2026 3:23 PM, Jianyong Wu wrote:
> When a preferred LLC is selected and remains stable, task_cache_work does
> not need to run frequently. Because it scans all system CPUs for
> computation, high-frequency execution hurts performance. We thus reduce
> the scan rate in such cases.
>
> On the other hand, if the preferred node becomes suboptimal, we should
> increase the scan frequency to quickly find a better placement. The scan
> period is therefore dynamically adjusted.
>
> Signed-off-by: Jianyong Wu <wujianyong@hygon.cn>
>
> ---
> Hi ChenYu, Tim, Gengkun,
>
> I have another approach to address this issue, based on the observation
> that the scan work can be canceled if the preferred node is stable.This
> patch merely demonstrates the idea, but still needs more testing to
> verify its functionality. I'm sending it out early to gather feedback and
> opinions.
>
Thanks for providing this patch.
> if (work->next == work) {
> @@ -1728,7 +1734,7 @@ static void task_cache_work(struct callback_head *work)
> struct task_struct *p = current, *cur;
> unsigned long curr_m_a_occ = 0;
> struct mm_struct *mm = p->mm;
> - unsigned long m_a_occ = 0;
> + unsigned long m_a_occ = 0, need_scan = 0, now;
> cpumask_var_t cpus;
> u64 t0, scan_cost;
>
> @@ -1753,6 +1759,12 @@ static void task_cache_work(struct callback_head *work)
>
> t0 = sched_clock_cpu(curr_cpu);
>
> + now = jiffies;
> + if (time_before(now, READ_ONCE(mm->sc_stat.next_scan)))
> + return;
> +
I agree that limiting the scan rate would be useful,
and your above change is actually similar to what NUMA balancing
did in task_numa_work(). It allows only one thread within the
same process to perform the statistics calculation, which avoids
redundant computation.
> + WRITE_ONCE(mm->sc_stat.next_scan, (now + mm->sc_stat.scan_period));
> +
I suppose the above should be try_cmpxchg()?
That is to say, with your above change, we have already limited the scan
ratio for multi-threaded processes significantly. There appears to be no
need to perform adaptive adjustment of scan_period - the benefit of
introducing an adaptive scan_period may not offset the overhead of
frequent writing to the "global" mm->sc_stat.scan_period due to c2c?
thanks,
Chenyu
Hi Chenyu,
> -----Original Message-----
> From: Chen, Yu C <yu.c.chen@intel.com>
> Sent: Monday, April 13, 2026 4:38 PM
> To: Jianyong Wu <wujianyong@hygon.cn>
> Cc: peterz@infradead.org; kprateek.nayak@amd.com; mingo@redhat.com;
> vincent.guittot@linaro.org; juri.lelli@redhat.com;
> dietmar.eggemann@arm.com; rostedt@goodmis.org; bsegall@google.com;
> mgorman@suse.de; vschneid@redhat.com; vineethr@linux.ibm.com;
> hdanton@sina.com; sshegde@linux.ibm.com; jianyong.wu@outlook.com;
> cyy@cyyself.name; tingyin.duan@gmail.com; vernhao@tencent.com;
> haoxing990@gmail.com; len.brown@intel.com; aubrey.li@intel.com;
> zhao1.liu@intel.com; adamli@os.amperecomputing.com;
> tim.c.chen@linux.intel.com; ziqianlu@bytedance.com;
> tim.c.chen@intel.com; joshdon@google.com; gavinguo@igalia.com;
> qyousef@layalina.io; libchen@purestorage.com;
> linux-kernel@vger.kernel.org; Huangsj <huangsj@hygon.cn>;
> luogengkun2@huawei.com
> Subject: Re: [RFC PATCH] sched/fair: dynamically scale the period of cache
> work
>
> Hi Jianyong,
>
> On 4/13/2026 3:23 PM, Jianyong Wu wrote:
> > When a preferred LLC is selected and remains stable, task_cache_work
> > does not need to run frequently. Because it scans all system CPUs for
> > computation, high-frequency execution hurts performance. We thus
> > reduce the scan rate in such cases.
> >
> > On the other hand, if the preferred node becomes suboptimal, we
> should
> > increase the scan frequency to quickly find a better placement. The
> > scan period is therefore dynamically adjusted.
> >
> > Signed-off-by: Jianyong Wu <wujianyong@hygon.cn>
> >
> > ---
> > Hi ChenYu, Tim, Gengkun,
> >
> > I have another approach to address this issue, based on the
> > observation that the scan work can be canceled if the preferred node
> > is stable.This patch merely demonstrates the idea, but still needs
> > more testing to verify its functionality. I'm sending it out early to
> > gather feedback and opinions.
> >
>
> Thanks for providing this patch.
>
> > if (work->next == work) {
> > @@ -1728,7 +1734,7 @@ static void task_cache_work(struct
> callback_head *work)
> > struct task_struct *p = current, *cur;
> > unsigned long curr_m_a_occ = 0;
> > struct mm_struct *mm = p->mm;
> > - unsigned long m_a_occ = 0;
> > + unsigned long m_a_occ = 0, need_scan = 0, now;
> > cpumask_var_t cpus;
> > u64 t0, scan_cost;
> >
> > @@ -1753,6 +1759,12 @@ static void task_cache_work(struct
> > callback_head *work)
> >
> > t0 = sched_clock_cpu(curr_cpu);
> >
> > + now = jiffies;
> > + if (time_before(now, READ_ONCE(mm->sc_stat.next_scan)))
> > + return;
> > +
>
> I agree that limiting the scan rate would be useful, and your above change
> is actually similar to what NUMA balancing did in task_numa_work(). It
> allows only one thread within the same process to perform the statistics
> calculation, which avoids redundant computation.
>
> > + WRITE_ONCE(mm->sc_stat.next_scan, (now +
> mm->sc_stat.scan_period));
> > +
>
> I suppose the above should be try_cmpxchg()?
Even though the update is not observed by others, it may not be a big problem.
However, using try_cmpxchg may incur more overhead than WRITE_ONCE.
So, I wonder if we can tolerate such a loss of "correctness" for the sake of performance.
>
> That is to say, with your above change, we have already limited the scan
> ratio for multi-threaded processes significantly. There appears to be no
> need to perform adaptive adjustment of scan_period - the benefit of
> introducing an adaptive scan_period may not offset the overhead of
> frequent writing to the "global" mm->sc_stat.scan_period due to c2c?
>
If we can increase the scan period, the operations inside the scan work will
not be executed frequently. Thus, there is little overhead from writing to the global variable.
This only occurs when the preferred node is unstable and the scan work runs
frequently. If the preferred node remains stable most of the time, we can still benefit from it.
Thanks
Jianyong
> thanks,
> Chenyu
>
>
>
Hi Jianyong,
On 4/13/2026 7:27 PM, Jianyong Wu wrote:
> Hi Chenyu,
>
[ ... ]
>>> + WRITE_ONCE(mm->sc_stat.next_scan, (now +
>> mm->sc_stat.scan_period));
>>> +
>>
>> I suppose the above should be try_cmpxchg()?
>
> Even though the update is not observed by others, it may not be a big problem.
> However, using try_cmpxchg may incur more overhead than WRITE_ONCE.
> So, I wonder if we can tolerate such a loss of "correctness" for the sake of performance.
>
try_cmpxchg is triggered not very frequently(10 ms) so the overhead might
not be that high? try_cmpxchg "strictly" avoid two threads entering the
same loop, and it seems that in the end of task_cache_work() there is a
update_avg_scale()
which involves
u64 *avg += xxx
which is not atomic so maybe try_cmpxchg could help with that?
>>
>> That is to say, with your above change, we have already limited the scan
>> ratio for multi-threaded processes significantly. There appears to be no
>> need to perform adaptive adjustment of scan_period - the benefit of
>> introducing an adaptive scan_period may not offset the overhead of
>> frequent writing to the "global" mm->sc_stat.scan_period due to c2c?
>>
>
> If we can increase the scan period, the operations inside the scan work will
> not be executed frequently. Thus, there is little overhead from writing to the global variable.
> This only occurs when the preferred node is unstable and the scan work runs
> frequently. If the preferred node remains stable most of the time, we can still benefit from it.
>
I see. BTW, why mm->sc_stat.need_scan is needed? With sc_stat.next_scan
and sc_stat.scan_period, we should be able to adjust the timeout.

Is it because you want other condition to shrink the scan_period?
Like below:
+ if (to_pref && ret == mig_forbid)
+ mm->sc_stat.need_scan = 1;

thanks,
Chenyu
Hi Chenyu,
> try_cmpxchg is triggered not very frequently(10 ms) so the overhead might
> not be that high? try_cmpxchg "strictly" avoid two threads entering the
> same loop, and it seems that in the end of task_cache_work() there is a
> update_avg_scale()
> which involves
> u64 *avg += xxx
> which is not atomic so maybe try_cmpxchg could help with that?
>
Ok, looks like try_cmpxchg has little performance overhead. I'll use it in the next version.
> >>
> >> That is to say, with your above change, we have already limited the
> >> scan ratio for multi-threaded processes significantly. There appears
> >> to be no need to perform adaptive adjustment of scan_period - the
> >> benefit of introducing an adaptive scan_period may not offset the
> >> overhead of frequent writing to the "global" mm->sc_stat.scan_period
> due to c2c?
> >>
> >
> > If we can increase the scan period, the operations inside the scan
> > work will not be executed frequently. Thus, there is little overhead from
> writing to the global variable.
> > This only occurs when the preferred node is unstable and the scan work
> > runs frequently. If the preferred node remains stable most of the time,
> we can still benefit from it.
> >
>
> I see. BTW, why mm->sc_stat.need_scan is needed? With sc_stat.next_scan
> and sc_stat.scan_period, we should be able to adjust the timeout.
>
> Is it because you want other condition to shrink the scan_period?
> Like below:
> + if (to_pref && ret == mig_forbid)
> + mm->sc_stat.need_scan = 1;
>
Yes, here I intend to shorten the scan period once the current preferred LLC may no longer be suitable for the task.
Thanks
Jianyong
> thanks,
> Chenyu
© 2016 - 2026 Red Hat, Inc.