include/linux/sched.h | 1 + kernel/sched/debug.c | 50 +++++++++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 25 +++++++++++++++++++--- kernel/sched/sched.h | 6 ++++++ 4 files changed, 79 insertions(+), 3 deletions(-)
The overhead of task_cache_work is high, especially on multi-NUMA systems.
Currently, task_cache_work tries to find the pref_llc by scanning all CPUs in
the system. However, most of these scans are meaningless, such as those for
CPUs that have never been visited or were last visited a long time ago.
To address this problem, this patch introduces visited_cpus to track the
visited cpus and uses llc_epoch_visited_timeout to evict cpus that have
timed out.
Signed-off-by: Luo Gengkun <luogengkun2@huawei.com>
---
Thanks for the reviews. I've updated the patch based on your feedback.
v2 Changes:
1. Added a pre-check before set/clear visited_cpus to avoid C2C overhead.
2. Optimized llc_epoch_visited_timeout by using a static key to minimize overhead.
---
include/linux/sched.h | 1 +
kernel/sched/debug.c | 50 +++++++++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 25 +++++++++++++++++++---
kernel/sched/sched.h | 6 ++++++
4 files changed, 79 insertions(+), 3 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dfa4bfd099c6..f2327a13fda8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2390,6 +2390,7 @@ struct sched_cache_time {
struct sched_cache_stat {
struct sched_cache_time __percpu *pcpu_sched;
+ struct cpumask visited_cpus;
raw_spinlock_t lock;
unsigned long epoch;
u64 nr_running_avg;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4469e1c152c8..46aa73939f9e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -247,6 +247,54 @@ static const struct file_operations sched_cache_enable_fops = {
.llseek = seq_lseek,
.release = single_release,
};
+
+static void sched_cache_timeout_set(void)
+{
+ if (llc_epoch_visited_timeout) {
+ if (!static_branch_likely(&sched_cache_timeout))
+ static_branch_enable(&sched_cache_timeout);
+ } else {
+ if (static_branch_likely(&sched_cache_timeout))
+ static_branch_disable(&sched_cache_timeout);
+ }
+}
+
+static ssize_t
+sched_cache_timeout_enable_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ int val, ret;
+
+ ret = kstrtouint_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ llc_epoch_visited_timeout = val;
+
+ sched_cache_timeout_set();
+
+ return cnt;
+}
+
+static int sched_cache_timeout_enable_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%d\n", llc_epoch_visited_timeout);
+ return 0;
+}
+
+static int sched_cache_timeout_enable_open(struct inode *inode,
+ struct file *filp)
+{
+ return single_open(filp, sched_cache_timeout_enable_show, NULL);
+}
+
+static const struct file_operations sched_cache_timeout_enable_fops = {
+ .open = sched_cache_timeout_enable_open,
+ .write = sched_cache_timeout_enable_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif
#ifdef CONFIG_PREEMPT_DYNAMIC
@@ -669,6 +717,8 @@ static __init int sched_init_debug(void)
llc = debugfs_create_dir("llc_balancing", debugfs_sched);
debugfs_create_file("enabled", 0644, llc, NULL,
&sched_cache_enable_fops);
+ debugfs_create_file("epoch_visited_timeout", 0644, llc, NULL,
+ &sched_cache_timeout_enable_fops);
debugfs_create_u32("aggr_tolerance", 0644, llc,
&llc_aggr_tolerance);
debugfs_create_u32("epoch_period", 0644, llc,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e4e22696a0b1..89f44ea97fee 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1285,9 +1285,12 @@ static void set_next_buddy(struct sched_entity *se);
__read_mostly unsigned int llc_aggr_tolerance = 1;
__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD;
__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
+__read_mostly unsigned int llc_epoch_visited_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
__read_mostly unsigned int llc_imb_pct = 20;
__read_mostly unsigned int llc_overaggr_pct = 50;
+DEFINE_STATIC_KEY_TRUE(sched_cache_timeout);
+
static int llc_id(int cpu)
{
if (cpu < 0)
@@ -1466,6 +1469,7 @@ void mm_init_sched(struct mm_struct *mm,
raw_spin_lock_init(&mm->sc_stat.lock);
mm->sc_stat.epoch = epoch;
mm->sc_stat.cpu = -1;
+ cpumask_clear(&mm->sc_stat.visited_cpus);
/*
* The update to mm->sc_stat should not be reordered
@@ -1582,6 +1586,9 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
pcpu_sched->runtime += delta_exec;
rq->cpu_runtime += delta_exec;
epoch = rq->cpu_epoch;
+ if (sched_cache_timeout_enabled() &&
+ !cpumask_test_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus))
+ cpumask_set_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus);
}
/*
@@ -1724,7 +1731,10 @@ static void task_cache_work(struct callback_head *work)
return;
scoped_guard (cpus_read_lock) {
- get_scan_cpumasks(cpus, p);
+ if (!sched_cache_timeout_enabled())
+ get_scan_cpumasks(cpus, p);
+ else
+ cpumask_and(cpus, cpu_online_mask, &mm->sc_stat.visited_cpus);
for_each_cpu(cpu, cpus) {
/* XXX sched_cluster_active */
@@ -1736,8 +1746,17 @@ static void task_cache_work(struct callback_head *work)
continue;
for_each_cpu(i, sched_domain_span(sd)) {
- occ = fraction_mm_sched(cpu_rq(i),
- per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
+ struct rq *rq = cpu_rq(i);
+ struct sched_cache_time *pcpu_sched = per_cpu_ptr(mm->sc_stat.pcpu_sched, i);
+ /* Skip the rq that has not been hit for a long time */
+ if (sched_cache_timeout_enabled() &&
+ cpumask_test_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus) &&
+ (rq->cpu_epoch - pcpu_sched->epoch) >
+ llc_epoch_visited_timeout) {
+ cpumask_clear_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus);
+ continue;
+ }
+ occ = fraction_mm_sched(rq, pcpu_sched);
a_occ += occ;
if (occ > m_occ) {
m_occ = occ;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b757812725f7..2ba09e9567af 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -4037,10 +4037,12 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct
#ifdef CONFIG_SCHED_CACHE
DECLARE_STATIC_KEY_FALSE(sched_cache_present);
DECLARE_STATIC_KEY_FALSE(sched_cache_active);
+DECLARE_STATIC_KEY_TRUE(sched_cache_timeout);
extern int sysctl_sched_cache_user;
extern unsigned int llc_aggr_tolerance;
extern unsigned int llc_epoch_period;
extern unsigned int llc_epoch_affinity_timeout;
+extern unsigned int llc_epoch_visited_timeout;
extern unsigned int llc_imb_pct;
extern unsigned int llc_overaggr_pct;
@@ -4051,6 +4053,10 @@ static inline bool sched_cache_enabled(void)
extern void sched_cache_active_set_unlocked(void);
+static inline bool sched_cache_timeout_enabled(void)
+{
+ return static_branch_unlikely(&sched_cache_timeout);
+}
#endif
void sched_domains_free_llc_id(int cpu);
--
2.34.1
Hi Gengkun,
On 4/14/2026 11:07 PM, Luo Gengkun wrote:
> The overhead of task_cache_work is high, espeically in multi-NUMA system.
> Currently, task_cache_work try to find the pref_llc by scan all cpus in the
> system. However, most of these scans are meaningless, such as those for
> cpus that have never been visited or were accessed a long time ago.
>
> To address this problem, this patch introduces visited_cpus to track the
> visited cpus and uses llc_epoch_visited_timeout to evict cpus that have
> timed out.
>
> Signed-off-by: Luo Gengkun <luogengkun2@huawei.com>
> ---
> Thanks for the reviews. I've updated the patch based on your feedback.
>
> v2 Changes:
> 1. Added a pre-check before set/clear visited_cpus to avoid C2C overhead.
> 2. Optimized llc_epoch_visited_timeout by using a static key to minimize overhead.
Since the visited CPUs optimization should help reduce the scan cost,
I wonder if we should enable it by default, regardless of the timeout
value set by the user. This mainly helps avoid introducing extra debugfs
controls/static key.
> #ifdef CONFIG_PREEMPT_DYNAMIC
> @@ -669,6 +717,8 @@ static __init int sched_init_debug(void)
> llc = debugfs_create_dir("llc_balancing", debugfs_sched);
> debugfs_create_file("enabled", 0644, llc, NULL,
> &sched_cache_enable_fops);
> + debugfs_create_file("epoch_visited_timeout", 0644, llc, NULL,
> + &sched_cache_timeout_enable_fops);
Is it possible to reuse llc_epoch_affinity_timeout without introducing
epoch_visited_timeout? The idea is that if a task has not run on that CPU
for 10 ms (by default), its footprint will be cleared.
[ ... ]
> @@ -1736,8 +1746,17 @@ static void task_cache_work(struct callback_head *work)
> continue;
>
> for_each_cpu(i, sched_domain_span(sd)) {
> - occ = fraction_mm_sched(cpu_rq(i),
> - per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
> + struct rq *rq = cpu_rq(i);
> + struct sched_cache_time *pcpu_sched = per_cpu_ptr(mm->sc_stat.pcpu_sched, i);
> + /* Skip the rq that has not been hit for a long time */
> + if (sched_cache_timeout_enabled() &&
> + cpumask_test_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus) &&
cpumask_test_cpu(i) should be fine. The rq access above doesn't hold
cpu_epoch_lock.
I wonder if we can safely calculate rq->cpu_epoch - pcpu_sched->epoch
inside fraction_mm_sched while holding the lock?
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
I'll test your patch after fixing the bug reported by sashiko.dev.
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
thanks,
Chenyu
On 2026/4/15 11:10, Chen, Yu C wrote:
> Hi Gengkun,
>
> On 4/14/2026 11:07 PM, Luo Gengkun wrote:
>> The overhead of task_cache_work is high, espeically in multi-NUMA system.
>> Currently, task_cache_work try to find the pref_llc by scan all cpus in the
>> system. However, most of these scans are meaningless, such as those for
>> cpus that have never been visited or were accessed a long time ago.
>>
>> To address this problem, this patch introduces visited_cpus to track the
>> visited cpus and uses llc_epoch_visited_timeout to evict cpus that have
>> timed out.
>>
>> Signed-off-by: Luo Gengkun <luogengkun2@huawei.com>
>> ---
>> Thanks for the reviews. I've updated the patch based on your feedback.
>>
>> v2 Changes:
>> 1. Added a pre-check before set/clear visited_cpus to avoid C2C overhead.
>> 2. Optimized llc_epoch_visited_timeout by using a static key to minimize overhead.
>
> Since the visited CPUs optimization should help reduce the scan cost,
> I wonder if we should enable it by default, regardless of the timeout
> value set by the user. This mainly helps avoid introducing extra debugfs
> controls/static key.
I would be happy to do this.
>
>> #ifdef CONFIG_PREEMPT_DYNAMIC
>> @@ -669,6 +717,8 @@ static __init int sched_init_debug(void)
>> llc = debugfs_create_dir("llc_balancing", debugfs_sched);
>> debugfs_create_file("enabled", 0644, llc, NULL,
>> &sched_cache_enable_fops);
>> + debugfs_create_file("epoch_visited_timeout", 0644, llc, NULL,
>> + &sched_cache_timeout_enable_fops);
>
> Is it possible to reuse llc_epoch_affinity_timeout without introducing
> epoch_visited_timeout? The idea is that if a task has not run on that CPU
> for 10 ms (by default), its footprint will be cleared.
I think this is also acceptable, because visited_timeout is inspired by
affinity_timeout.
>
> [ ... ]
>
>> @@ -1736,8 +1746,17 @@ static void task_cache_work(struct callback_head *work)
>> continue;
>> for_each_cpu(i, sched_domain_span(sd)) {
>> - occ = fraction_mm_sched(cpu_rq(i),
>> - per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
>> + struct rq *rq = cpu_rq(i);
>> + struct sched_cache_time *pcpu_sched = per_cpu_ptr(mm->sc_stat.pcpu_sched, i);
>> + /* Skip the rq that has not been hit for a long time */
>> + if (sched_cache_timeout_enabled() &&
>> + cpumask_test_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus) &&
>
> cpumask_test_cpu(i) should be fine. The rq access above doesn't hold cpu_epoch_lock.
> I wonder if we can safely calculate rq->cpu_epoch - pcpu_sched->epoch
> inside fraction_mm_sched while holding the lock?
Do we really need to access rq->cpu_epoch under the lock for read scenarios?
I noticed task_tick_cache accesses it directly. Plus, moving this access outside
the lock would help reduce lock contention.
thanks,
Luo Gengkun
>
> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> I'll test your patch after fixing the bug reported by sashiko.dev.
> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>
> thanks,
> Chenyu
>
>
>
>
>
On 4/18/2026 5:01 PM, Luo Gengkun wrote:
>
>
> On 2026/4/15 11:10, Chen, Yu C wrote:
>> Hi Gengkun,
>>
[ ... ]
>>> @@ -1736,8 +1746,17 @@ static void task_cache_work(struct
>>> callback_head *work)
>>> continue;
>>> for_each_cpu(i, sched_domain_span(sd)) {
>>> - occ = fraction_mm_sched(cpu_rq(i),
>>> - per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
>>> + struct rq *rq = cpu_rq(i);
>>> + struct sched_cache_time *pcpu_sched =
>>> per_cpu_ptr(mm->sc_stat.pcpu_sched, i);
>>> + /* Skip the rq that has not been hit for a long time */
>>> + if (sched_cache_timeout_enabled() &&
>>> + cpumask_test_cpu(cpu_of(rq), &mm-
>>> >sc_stat.visited_cpus) &&
>>
>> cpumask_test_cpu(i) should be fine. The rq access above doesn't hold
>> cpu_epoch_lock.
>> I wonder if we can safely calculate rq->cpu_epoch - pcpu_sched->epoch
>> inside fraction_mm_sched while holding the lock?
> Do we really need to access rq->cpu_epoch under the lock for read
> scenarios?
> I noticed task_tick_cache accesses it directly. Plus, moving this access
> outside
> the lock would help reduce lock contention.
>
Good question. task_tick_cache() accesses the local rq->cpu_epoch with
rq->lock held and irqs disabled, while task_cache_work() runs with irqs
enabled and without any rq->lock held, and might not run on the local
rq - see __exit_to_user_mode_loop(): it checks _TIF_NEED_RESCHED before
_TIF_NOTIFY_RESUME, so p could be switched out, woken up, and then run
task_cache_work() on a different CPU.
That is to say, I just wonder if there could be a race window that
introduces inconsistency between the two reads in
rq->cpu_epoch - pcpu_sched->epoch - not necessarily a critical issue
though.
thanks,
Chenyu
The overhead of task_cache_work is high, especially on multi-NUMA systems.
Currently, task_cache_work tries to find the pref_llc by scanning all CPUs in
the system. However, most of these scans are meaningless, such as those for
CPUs that have never been visited or were last visited a long time ago.
To address this problem, this patch introduces visited_cpus to track the
visited cpus and uses llc_epoch_affinity_timeout to evict cpus that have
timed out.
Signed-off-by: Luo Gengkun <luogengkun2@huawei.com>
---
Changes history
**v3 Changes:**
1. Remove the static key and enable this feature by default.
2. Reuse llc_epoch_affinity_timeout instead of introducing
llc_epoch_visited_timeout.
3. Move the calculation of rq->cpu_epoch - pcpu_sched->epoch into
fraction_mm_sched() to avoid race between task_cache_work() and
__update_mm_sched().
4. Reset work->next at the end of task_cache_work() to prevent concurrent
executions by multiple threads within the same process.
**v2 Changes:**
1. Added a pre-check before set/clear visited_cpus to avoid C2C overhead.
2. Optimized llc_epoch_visited_timeout by using a static key to minimize overhead.
---
include/linux/sched.h | 1 +
kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++++--------
2 files changed, 31 insertions(+), 8 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dfa4bfd099c6..f2327a13fda8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2390,6 +2390,7 @@ struct sched_cache_time {
struct sched_cache_stat {
struct sched_cache_time __percpu *pcpu_sched;
+ struct cpumask visited_cpus;
raw_spinlock_t lock;
unsigned long epoch;
u64 nr_running_avg;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e4e22696a0b1..49369f656d53 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1466,6 +1466,7 @@ void mm_init_sched(struct mm_struct *mm,
raw_spin_lock_init(&mm->sc_stat.lock);
mm->sc_stat.epoch = epoch;
mm->sc_stat.cpu = -1;
+ cpumask_clear(&mm->sc_stat.visited_cpus);
/*
* The update to mm->sc_stat should not be reordered
@@ -1507,11 +1508,18 @@ static inline void __update_mm_sched(struct rq *rq,
}
}
-static unsigned long fraction_mm_sched(struct rq *rq,
- struct sched_cache_time *pcpu_sched)
+static unsigned long fraction_mm_sched(int cpu, struct mm_struct *mm)
{
+ struct rq *rq = cpu_rq(cpu);
+ struct sched_cache_time *pcpu_sched = per_cpu_ptr(mm->sc_stat.pcpu_sched, cpu);
guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock);
+ /* Skip the rq that has not been hit for a long time */
+ if ((rq->cpu_epoch - pcpu_sched->epoch) > llc_epoch_affinity_timeout) {
+ cpumask_clear_cpu(cpu, &mm->sc_stat.visited_cpus);
+ return 0;
+ }
+
__update_mm_sched(rq, pcpu_sched);
/*
@@ -1582,6 +1590,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
pcpu_sched->runtime += delta_exec;
rq->cpu_runtime += delta_exec;
epoch = rq->cpu_epoch;
+ if (!cpumask_test_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus))
+ cpumask_set_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus);
}
/*
@@ -1627,7 +1637,11 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p)
guard(raw_spinlock)(&mm->sc_stat.lock);
- if (work->next == work) {
+ /*
+ * Pairs with smp_store_release in task_cache_work() to ensure that
+ * task_cache_work() has finished before re-queueing the work.
+ */
+ if (smp_load_acquire(&work->next) == work) {
task_work_add(p, work, TWA_RESUME);
WRITE_ONCE(mm->sc_stat.epoch, epoch);
}
@@ -1695,6 +1709,8 @@ static inline void update_avg_scale(u64 *avg, u64 sample)
*avg += div64_s64(diff, divisor);
}
+DEFINE_FREE(reset_work, struct callback_head *, smp_store_release(&_T->next, _T))
+
static void task_cache_work(struct callback_head *work)
{
int cpu, m_a_cpu = -1, nr_running = 0, curr_cpu;
@@ -1703,11 +1719,14 @@ static void task_cache_work(struct callback_head *work)
struct mm_struct *mm = p->mm;
unsigned long m_a_occ = 0;
cpumask_var_t cpus;
+ /*
+ * Reset work->next at the end to avoid race between threads
+ * within a process.
+ */
+ struct callback_head *_w __free(reset_work) = work;
WARN_ON_ONCE(work != &p->cache_work);
- work->next = work;
-
if (p->flags & PF_EXITING)
return;
@@ -1725,6 +1744,7 @@ static void task_cache_work(struct callback_head *work)
scoped_guard (cpus_read_lock) {
get_scan_cpumasks(cpus, p);
+ cpumask_and(cpus, cpus, &mm->sc_stat.visited_cpus);
for_each_cpu(cpu, cpus) {
/* XXX sched_cluster_active */
@@ -1735,9 +1755,11 @@ static void task_cache_work(struct callback_head *work)
if (!sd)
continue;
- for_each_cpu(i, sched_domain_span(sd)) {
- occ = fraction_mm_sched(cpu_rq(i),
- per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
+ for_each_cpu_and(i, sched_domain_span(sd), &mm->sc_stat.visited_cpus) {
+ occ = fraction_mm_sched(i, mm);
+ if (occ == 0)
+ continue;
+
a_occ += occ;
if (occ > m_occ) {
m_occ = occ;
--
2.34.1
© 2016 - 2026 Red Hat, Inc.