[v2] sched/cache: Reduce the overhead of task_cache_work by only scan the visisted cpus.

[PATCH v2] sched/cache: Reduce the overhead of task_cache_work by only scan the visisted cpus.

Posted by Luo Gengkun 2 months ago

The overhead of task_cache_work is high, especially on multi-NUMA systems.
Currently, task_cache_work tries to find the pref_llc by scanning all CPUs in
the system. However, most of these scans are meaningless, such as those for
CPUs that have never been visited or were last visited a long time ago.

To address this problem, this patch introduces visited_cpus to track the
visited cpus and uses llc_epoch_visited_timeout to evict cpus that have
timed out.

Signed-off-by: Luo Gengkun <luogengkun2@huawei.com>
---
Thanks for the reviews. I've updated the patch based on your feedback.

v2 Changes:
1. Added a pre-check before set/clear visited_cpus to avoid C2C overhead.
2. Optimized llc_epoch_visited_timeout by using a static key to minimize overhead.
---
 include/linux/sched.h |  1 +
 kernel/sched/debug.c  | 50 +++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c   | 25 +++++++++++++++++++---
 kernel/sched/sched.h  |  6 ++++++
 4 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index dfa4bfd099c6..f2327a13fda8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2390,6 +2390,7 @@ struct sched_cache_time {
 
 struct sched_cache_stat {
 	struct sched_cache_time __percpu *pcpu_sched;
+	struct cpumask visited_cpus;
 	raw_spinlock_t lock;
 	unsigned long epoch;
 	u64 nr_running_avg;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4469e1c152c8..46aa73939f9e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -247,6 +247,54 @@ static const struct file_operations sched_cache_enable_fops = {
 	.llseek         = seq_lseek,
 	.release        = single_release,
 };
+
+/*
+ * Sync sched_cache_timeout with llc_epoch_visited_timeout; zero disables it.
+ */
+static void sched_cache_timeout_set(void)
+{
+	if (llc_epoch_visited_timeout)
+		static_branch_enable(&sched_cache_timeout);
+	else
+		static_branch_disable(&sched_cache_timeout);
+}
+
+static ssize_t
+sched_cache_timeout_enable_write(struct file *filp, const char __user *ubuf,
+				 size_t cnt, loff_t *ppos)
+{
+	unsigned int val;
+	int ret;
+
+	ret = kstrtouint_from_user(ubuf, cnt, 10, &val);
+	if (ret)
+		return ret;
+
+	/* Zero turns visited-CPU timeout filtering off via the static key. */
+	llc_epoch_visited_timeout = val;
+	sched_cache_timeout_set();
+	return cnt;
+}
+
+static int sched_cache_timeout_enable_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%u\n", llc_epoch_visited_timeout);
+	return 0;
+}
+
+static int sched_cache_timeout_enable_open(struct inode *inode,
+					   struct file *filp)
+{
+	return single_open(filp, sched_cache_timeout_enable_show, NULL);
+}
+
+static const struct file_operations sched_cache_timeout_enable_fops = {
+	.open           = sched_cache_timeout_enable_open,
+	.write          = sched_cache_timeout_enable_write,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = single_release,
+};
 #endif
 
 #ifdef CONFIG_PREEMPT_DYNAMIC
@@ -669,6 +717,8 @@ static __init int sched_init_debug(void)
 	llc = debugfs_create_dir("llc_balancing", debugfs_sched);
 	debugfs_create_file("enabled", 0644, llc, NULL,
 			    &sched_cache_enable_fops);
+	debugfs_create_file("epoch_visited_timeout", 0644, llc, NULL,
+			    &sched_cache_timeout_enable_fops);
 	debugfs_create_u32("aggr_tolerance", 0644, llc,
 			   &llc_aggr_tolerance);
 	debugfs_create_u32("epoch_period", 0644, llc,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e4e22696a0b1..89f44ea97fee 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1285,9 +1285,12 @@ static void set_next_buddy(struct sched_entity *se);
 __read_mostly unsigned int llc_aggr_tolerance	= 1;
 __read_mostly unsigned int llc_epoch_period	= EPOCH_PERIOD;
 __read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
+__read_mostly unsigned int llc_epoch_visited_timeout  = EPOCH_LLC_AFFINITY_TIMEOUT;
 __read_mostly unsigned int llc_imb_pct		= 20;
 __read_mostly unsigned int llc_overaggr_pct	= 50;
 
+DEFINE_STATIC_KEY_TRUE(sched_cache_timeout);
+
 static int llc_id(int cpu)
 {
 	if (cpu < 0)
@@ -1466,6 +1469,7 @@ void mm_init_sched(struct mm_struct *mm,
 	raw_spin_lock_init(&mm->sc_stat.lock);
 	mm->sc_stat.epoch = epoch;
 	mm->sc_stat.cpu = -1;
+	cpumask_clear(&mm->sc_stat.visited_cpus);
 
 	/*
 	 * The update to mm->sc_stat should not be reordered
@@ -1582,6 +1586,9 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
 		pcpu_sched->runtime += delta_exec;
 		rq->cpu_runtime += delta_exec;
 		epoch = rq->cpu_epoch;
+		if (sched_cache_timeout_enabled() &&
+		    !cpumask_test_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus))
+			cpumask_set_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus);
 	}
 
 	/*
@@ -1724,7 +1731,10 @@ static void task_cache_work(struct callback_head *work)
 		return;
 
 	scoped_guard (cpus_read_lock) {
-		get_scan_cpumasks(cpus, p);
+		if (!sched_cache_timeout_enabled())
+			get_scan_cpumasks(cpus, p);
+		else
+			cpumask_and(cpus, cpu_online_mask, &mm->sc_stat.visited_cpus);
 
 		for_each_cpu(cpu, cpus) {
 			/* XXX sched_cluster_active */
@@ -1736,8 +1746,17 @@ static void task_cache_work(struct callback_head *work)
 				continue;
 
 			for_each_cpu(i, sched_domain_span(sd)) {
-				occ = fraction_mm_sched(cpu_rq(i),
-							per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
+				struct rq *rq = cpu_rq(i);
+				struct sched_cache_time *pcpu_sched = per_cpu_ptr(mm->sc_stat.pcpu_sched, i);
+				/* Skip the rq that has not been hit for a long time */
+				if (sched_cache_timeout_enabled() &&
+				    cpumask_test_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus) &&
+				    (rq->cpu_epoch - pcpu_sched->epoch) >
+				    llc_epoch_visited_timeout) {
+					cpumask_clear_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus);
+					continue;
+				}
+				occ = fraction_mm_sched(rq, pcpu_sched);
 				a_occ += occ;
 				if (occ > m_occ) {
 					m_occ = occ;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b757812725f7..2ba09e9567af 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -4037,10 +4037,12 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct
 #ifdef CONFIG_SCHED_CACHE
 DECLARE_STATIC_KEY_FALSE(sched_cache_present);
 DECLARE_STATIC_KEY_FALSE(sched_cache_active);
+DECLARE_STATIC_KEY_TRUE(sched_cache_timeout);
 extern int sysctl_sched_cache_user;
 extern unsigned int llc_aggr_tolerance;
 extern unsigned int llc_epoch_period;
 extern unsigned int llc_epoch_affinity_timeout;
+extern unsigned int llc_epoch_visited_timeout;
 extern unsigned int llc_imb_pct;
 extern unsigned int llc_overaggr_pct;
 
@@ -4051,6 +4053,10 @@ static inline bool sched_cache_enabled(void)
 
 extern void sched_cache_active_set_unlocked(void);
 
+static inline bool sched_cache_timeout_enabled(void)
+{
+	return static_branch_likely(&sched_cache_timeout);
+}
 #endif
 
 void sched_domains_free_llc_id(int cpu);
-- 
2.34.1

Re: [PATCH v2] sched/cache: Reduce the overhead of task_cache_work by only scan the visisted cpus.

Posted by Chen, Yu C 2 months ago

Hi Gengkun,

On 4/14/2026 11:07 PM, Luo Gengkun wrote:
> The overhead of task_cache_work is high, espeically in multi-NUMA system.
> Currently, task_cache_work try to find the pref_llc by scan all cpus in the
> system. However, most of these scans are meaningless, such as those for
> cpus that have never been visited or were accessed a long time ago.
> 
> To address this problem, this patch introduces visited_cpus to track the
> visited cpus and uses llc_epoch_visited_timeout to evict cpus that have
> timed out.
> 
> Signed-off-by: Luo Gengkun <luogengkun2@huawei.com>
> ---
> Thanks for the reviews. I've updated the patch based on your feedback.
> 
> v2 Changes:
> 1. Added a pre-check before set/clear visited_cpus to avoid C2C overhead.
> 2. Optimized llc_epoch_visited_timeout by using a static key to minimize overhead.

Since the visited CPUs optimization should help reduce the scan cost,
I wonder if we should enable it by default, regardless of the timeout
value set by the user. This mainly helps avoid introducing extra debugfs
controls/static key.

>   #ifdef CONFIG_PREEMPT_DYNAMIC
> @@ -669,6 +717,8 @@ static __init int sched_init_debug(void)
>   	llc = debugfs_create_dir("llc_balancing", debugfs_sched);
>   	debugfs_create_file("enabled", 0644, llc, NULL,
>   			    &sched_cache_enable_fops);
> +	debugfs_create_file("epoch_visited_timeout", 0644, llc, NULL,
> +			    &sched_cache_timeout_enable_fops);

Is it possible to reuse llc_epoch_affinity_timeout without introducing
epoch_visited_timeout? The idea is that if a task has not run on that CPU
for 10 ms (by default), its footprint will be cleared.

[ ... ]

> @@ -1736,8 +1746,17 @@ static void task_cache_work(struct callback_head *work)
>   				continue;
>   
>   			for_each_cpu(i, sched_domain_span(sd)) {
> -				occ = fraction_mm_sched(cpu_rq(i),
> -							per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
> +				struct rq *rq = cpu_rq(i);
> +				struct sched_cache_time *pcpu_sched = per_cpu_ptr(mm->sc_stat.pcpu_sched, i);
> +				/* Skip the rq that has not been hit for a long time */
> +				if (sched_cache_timeout_enabled() &&
> +				    cpumask_test_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus) &&

cpumask_test_cpu(i) should be fine. The rq access above doesn't hold 
cpu_epoch_lock.
I wonder if we can safely calculate rq->cpu_epoch - pcpu_sched->epoch
inside fraction_mm_sched while holding the lock?

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
I'll test your patch after fixing the bug reported by sashiko.dev.
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

thanks,
Chenyu

Re: [PATCH v2] sched/cache: Reduce the overhead of task_cache_work by only scan the visisted cpus.

Posted by Luo Gengkun 1 month, 4 weeks ago


On 2026/4/15 11:10, Chen, Yu C wrote:
> Hi Gengkun,
> 
> On 4/14/2026 11:07 PM, Luo Gengkun wrote:
>> The overhead of task_cache_work is high, espeically in multi-NUMA system.
>> Currently, task_cache_work try to find the pref_llc by scan all cpus in the
>> system. However, most of these scans are meaningless, such as those for
>> cpus that have never been visited or were accessed a long time ago.
>>
>> To address this problem, this patch introduces visited_cpus to track the
>> visited cpus and uses llc_epoch_visited_timeout to evict cpus that have
>> timed out.
>>
>> Signed-off-by: Luo Gengkun <luogengkun2@huawei.com>
>> ---
>> Thanks for the reviews. I've updated the patch based on your feedback.
>>
>> v2 Changes:
>> 1. Added a pre-check before set/clear visited_cpus to avoid C2C overhead.
>> 2. Optimized llc_epoch_visited_timeout by using a static key to minimize overhead.
> 
> Since the visited CPUs optimization should help reduce the scan cost,
> I wonder if we should enable it by default, regardless of the timeout
> value set by the user. This mainly helps avoid introducing extra debugfs
> controls/static key.

I would be happy to do this.

> 
>>   #ifdef CONFIG_PREEMPT_DYNAMIC
>> @@ -669,6 +717,8 @@ static __init int sched_init_debug(void)
>>       llc = debugfs_create_dir("llc_balancing", debugfs_sched);
>>       debugfs_create_file("enabled", 0644, llc, NULL,
>>                   &sched_cache_enable_fops);
>> +    debugfs_create_file("epoch_visited_timeout", 0644, llc, NULL,
>> +                &sched_cache_timeout_enable_fops);
> 
> Is it possible to reuse llc_epoch_affinity_timeout without introducing
> epoch_visited_timeout? The idea is that if a task has not run on that CPU
> for 10 ms (by default), its footprint will be cleared.

I think this is also acceptable, because visited_timeout is inspired by
affinity_timeout.
> 
> [ ... ]
> 
>> @@ -1736,8 +1746,17 @@ static void task_cache_work(struct callback_head *work)
>>                   continue;
>>               for_each_cpu(i, sched_domain_span(sd)) {
>> -                occ = fraction_mm_sched(cpu_rq(i),
>> -                            per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
>> +                struct rq *rq = cpu_rq(i);
>> +                struct sched_cache_time *pcpu_sched = per_cpu_ptr(mm->sc_stat.pcpu_sched, i);
>> +                /* Skip the rq that has not been hit for a long time */
>> +                if (sched_cache_timeout_enabled() &&
>> +                    cpumask_test_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus) &&
> 
> cpumask_test_cpu(i) should be fine. The rq access above doesn't hold cpu_epoch_lock.
> I wonder if we can safely calculate rq->cpu_epoch - pcpu_sched->epoch
> inside fraction_mm_sched while holding the lock?
Do we really need to access rq->cpu_epoch under the lock for read scenarios?
I noticed task_tick_cache accesses it directly. Plus, moving this access outside
the lock would help reduce lock contention.

thanks,
Luo Gengkun
> 
> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> I'll test your patch after fixing the bug reported by sashiko.dev.
> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 
> thanks,
> Chenyu
> 
> 
> 
> 
>

Re: [PATCH v2] sched/cache: Reduce the overhead of task_cache_work by only scan the visisted cpus.

Posted by Chen, Yu C 1 month, 3 weeks ago

On 4/18/2026 5:01 PM, Luo Gengkun wrote:
> 
> 
> On 2026/4/15 11:10, Chen, Yu C wrote:
>> Hi Gengkun,
>>

[ ... ]

>>> @@ -1736,8 +1746,17 @@ static void task_cache_work(struct 
>>> callback_head *work)
>>>                   continue;
>>>               for_each_cpu(i, sched_domain_span(sd)) {
>>> -                occ = fraction_mm_sched(cpu_rq(i),
>>> -                            per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
>>> +                struct rq *rq = cpu_rq(i);
>>> +                struct sched_cache_time *pcpu_sched = 
>>> per_cpu_ptr(mm->sc_stat.pcpu_sched, i);
>>> +                /* Skip the rq that has not been hit for a long time */
>>> +                if (sched_cache_timeout_enabled() &&
>>> +                    cpumask_test_cpu(cpu_of(rq), &mm- 
>>> >sc_stat.visited_cpus) &&
>>
>> cpumask_test_cpu(i) should be fine. The rq access above doesn't hold 
>> cpu_epoch_lock.
>> I wonder if we can safely calculate rq->cpu_epoch - pcpu_sched->epoch
>> inside fraction_mm_sched while holding the lock?
> Do we really need to access rq->cpu_epoch under the lock for read 
> scenarios?
> I noticed task_tick_cache accesses it directly. Plus, moving this access 
> outside
> the lock would help reduce lock contention.
> 

Good question. task_tick_cache() accesses the local rq->cpu_epoch with
rq->lock held and IRQs disabled, while task_cache_work() runs with IRQs
enabled and without holding any rq->lock, and it might not run on the local
rq: see __exit_to_user_mode_loop(), which checks _TIF_NEED_RESCHED before
_TIF_NOTIFY_RESUME, so p could be switched out, woken up, and then run
task_cache_work() on a different CPU.
That is to say, I just wonder whether there could be a race window that
makes the two reads in rq->cpu_epoch - pcpu_sched->epoch inconsistent with
each other - not necessarily a critical issue, though.

thanks,
Chenyu

[PATCH v3] sched/cache: Reduce the overhead of task_cache_work by only scan the visisted cpus.

Posted by Luo Gengkun 1 month, 3 weeks ago

The overhead of task_cache_work is high, especially on multi-NUMA systems.
Currently, task_cache_work tries to find the pref_llc by scanning all CPUs in
the system. However, most of these scans are meaningless, such as those for
CPUs that have never been visited or were last visited a long time ago.

To address this problem, this patch introduces visited_cpus to track the
visited cpus and uses llc_epoch_affinity_timeout to evict cpus that have
timed out.

Signed-off-by: Luo Gengkun <luogengkun2@huawei.com>
---
Changes history
**v3 Changes:**
1. Remove the static key and enable this feature by default.
2. Reuse llc_epoch_affinity_timeout instead of introducing
llc_epoch_visited_timeout.
3. Move the calculation of rq->cpu_epoch - pcpu_sched->epoch into
fraction_mm_sched() to avoid a race between task_cache_work() and
__update_mm_sched().
4. Reset work->next at the end of task_cache_work() to prevent concurrent
executions by multiple threads within the same process.


**v2 Changes:**
1. Added a pre-check before set/clear visited_cpus to avoid C2C overhead.
2. Optimized llc_epoch_visited_timeout by using a static key to minimize overhead.
---
 include/linux/sched.h |  1 +
 kernel/sched/fair.c   | 38 ++++++++++++++++++++++++++++++--------
 2 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index dfa4bfd099c6..f2327a13fda8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2390,6 +2390,7 @@ struct sched_cache_time {
 
 struct sched_cache_stat {
 	struct sched_cache_time __percpu *pcpu_sched;
+	struct cpumask visited_cpus;
 	raw_spinlock_t lock;
 	unsigned long epoch;
 	u64 nr_running_avg;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e4e22696a0b1..49369f656d53 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1466,6 +1466,7 @@ void mm_init_sched(struct mm_struct *mm,
 	raw_spin_lock_init(&mm->sc_stat.lock);
 	mm->sc_stat.epoch = epoch;
 	mm->sc_stat.cpu = -1;
+	cpumask_clear(&mm->sc_stat.visited_cpus);
 
 	/*
 	 * The update to mm->sc_stat should not be reordered
@@ -1507,11 +1508,18 @@ static inline void __update_mm_sched(struct rq *rq,
 	}
 }
 
-static unsigned long fraction_mm_sched(struct rq *rq,
-				       struct sched_cache_time *pcpu_sched)
+static unsigned long fraction_mm_sched(int cpu, struct mm_struct *mm)
 {
+	struct rq *rq = cpu_rq(cpu);
+	struct sched_cache_time *pcpu_sched = per_cpu_ptr(mm->sc_stat.pcpu_sched, cpu);
 	guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock);
 
+	/* Skip the rq that has not been hit for a long time */
+	if ((rq->cpu_epoch - pcpu_sched->epoch) > llc_epoch_affinity_timeout) {
+		cpumask_clear_cpu(cpu, &mm->sc_stat.visited_cpus);
+		return 0;
+	}
+
 	__update_mm_sched(rq, pcpu_sched);
 
 	/*
@@ -1582,6 +1590,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
 		pcpu_sched->runtime += delta_exec;
 		rq->cpu_runtime += delta_exec;
 		epoch = rq->cpu_epoch;
+		if (!cpumask_test_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus))
+			cpumask_set_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus);
 	}
 
 	/*
@@ -1627,7 +1637,11 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p)
 
 	guard(raw_spinlock)(&mm->sc_stat.lock);
 
-	if (work->next == work) {
+	/*
+	 * Pairs with smp_store_release in task_cache_work() to ensure that
+	 * task_cache_work() has finished before re-queueing the work.
+	 */
+	if (smp_load_acquire(&work->next) == work) {
 		task_work_add(p, work, TWA_RESUME);
 		WRITE_ONCE(mm->sc_stat.epoch, epoch);
 	}
@@ -1695,6 +1709,8 @@ static inline void update_avg_scale(u64 *avg, u64 sample)
 	*avg += div64_s64(diff, divisor);
 }
 
+DEFINE_FREE(reset_work, struct callback_head *, smp_store_release(&_T->next, _T))
+
 static void task_cache_work(struct callback_head *work)
 {
 	int cpu, m_a_cpu = -1, nr_running = 0, curr_cpu;
@@ -1703,11 +1719,14 @@ static void task_cache_work(struct callback_head *work)
 	struct mm_struct *mm = p->mm;
 	unsigned long m_a_occ = 0;
 	cpumask_var_t cpus;
+	/*
+	 * Reset work->next at the end to avoid race between threads
+	 * within a process.
+	 */
+	struct callback_head *_w __free(reset_work) = work;
 
 	WARN_ON_ONCE(work != &p->cache_work);
 
-	work->next = work;
-
 	if (p->flags & PF_EXITING)
 		return;
 
@@ -1725,6 +1744,7 @@ static void task_cache_work(struct callback_head *work)
 
 	scoped_guard (cpus_read_lock) {
 		get_scan_cpumasks(cpus, p);
+		cpumask_and(cpus, cpus, &mm->sc_stat.visited_cpus);
 
 		for_each_cpu(cpu, cpus) {
 			/* XXX sched_cluster_active */
@@ -1735,9 +1755,11 @@ static void task_cache_work(struct callback_head *work)
 			if (!sd)
 				continue;
 
-			for_each_cpu(i, sched_domain_span(sd)) {
-				occ = fraction_mm_sched(cpu_rq(i),
-							per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
+			for_each_cpu_and(i, sched_domain_span(sd), &mm->sc_stat.visited_cpus) {
+				occ = fraction_mm_sched(i, mm);
+				if (occ == 0)
+					continue;
+
 				a_occ += occ;
 				if (occ > m_occ) {
 					m_occ = occ;
-- 
2.34.1