From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Add infrastructure to enable cache-aware load balancing,
which improves cache locality by grouping tasks that share resources
within the same cache domain. This reduces cache misses and improves
overall data access efficiency.
In this initial implementation, threads belonging to the same process
are treated as entities that likely share working sets. The mechanism
tracks per-process CPU occupancy across cache domains and attempts to
migrate threads toward cache-hot domains where their process already
has active threads, thereby enhancing locality.
This provides a basic model for cache affinity. While the current code
targets the last-level cache (LLC), the approach could be extended to
other domain types such as clusters (L2) or node-internal groupings.
At present, the mechanism selects the CPU within an LLC that has the
highest recent runtime. Subsequent patches in this series will use this
information in the load-balancing path to guide task placement toward
preferred LLCs.
In the future, more advanced policies could be integrated through NUMA
balancing: for example, migrating a task to its preferred LLC when spare
capacity exists, or swapping tasks across LLCs to improve cache affinity.
Task grouping could also be generalized from a process to a NUMA group,
or made user configurable.
Originally-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
Notes:
v1->v2:
Restore the original CPU scan to cover all online CPUs,
rather than scanning within the preferred NUMA node.
(Peter Zijlstra)
Use rq->curr instead of rq->donor. (K Prateek Nayak)
Minor fix in task_tick_cache() to use
if (mm->mm_sched_epoch >= rq->cpu_epoch)
to avoid mm_sched_epoch going backwards.
include/linux/mm_types.h | 44 +++++++
include/linux/sched.h | 11 ++
init/Kconfig | 11 ++
kernel/fork.c | 6 +
kernel/sched/core.c | 6 +
kernel/sched/fair.c | 258 +++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 8 ++
7 files changed, 344 insertions(+)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 90e5790c318f..1ea16ef90566 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -939,6 +939,11 @@ typedef struct {
DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS);
} __private mm_flags_t;
+struct mm_sched {
+ u64 runtime;
+ unsigned long epoch;
+};
+
struct kioctx_table;
struct iommu_mm_data;
struct mm_struct {
@@ -1029,6 +1034,17 @@ struct mm_struct {
*/
raw_spinlock_t cpus_allowed_lock;
#endif
+#ifdef CONFIG_SCHED_CACHE
+ /*
+ * Track per-cpu-per-process occupancy as a proxy for cache residency.
+ * See account_mm_sched() and ...
+ */
+ struct mm_sched __percpu *pcpu_sched;
+ raw_spinlock_t mm_sched_lock;
+ unsigned long mm_sched_epoch;
+ int mm_sched_cpu;
+#endif
+
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes; /* size of all page tables */
#endif
@@ -1487,6 +1503,34 @@ static inline unsigned int mm_cid_size(void)
static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
#endif /* CONFIG_SCHED_MM_CID */
+#ifdef CONFIG_SCHED_CACHE
+void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched);
+
+static inline int mm_alloc_sched_noprof(struct mm_struct *mm)
+{
+ struct mm_sched __percpu *pcpu_sched = alloc_percpu_noprof(struct mm_sched);
+
+ if (!pcpu_sched)
+ return -ENOMEM;
+
+ mm_init_sched(mm, pcpu_sched);
+ return 0;
+}
+
+#define mm_alloc_sched(...) alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__))
+
+static inline void mm_destroy_sched(struct mm_struct *mm)
+{
+ free_percpu(mm->pcpu_sched);
+ mm->pcpu_sched = NULL;
+}
+#else /* !CONFIG_SCHED_CACHE */
+
+static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; }
+static inline void mm_destroy_sched(struct mm_struct *mm) { }
+
+#endif /* CONFIG_SCHED_CACHE */
+
struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b469878de25c..278b529c91df 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1406,6 +1406,10 @@ struct task_struct {
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_SCHED_CACHE
+ struct callback_head cache_work;
+#endif
+
#ifdef CONFIG_RSEQ
struct rseq __user *rseq;
u32 rseq_len;
@@ -2428,4 +2432,11 @@ extern void migrate_enable(void);
DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
+#ifdef CONFIG_SCHED_CACHE
+static inline bool sched_cache_enabled(void)
+{
+ return false;
+}
+#endif
+
#endif
diff --git a/init/Kconfig b/init/Kconfig
index cab3ad28ca49..88556ef8cfd1 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -983,6 +983,17 @@ config NUMA_BALANCING
This system will be inactive on UMA systems.
+config SCHED_CACHE
+ bool "Cache aware load balance"
+ default y
+ depends on SMP
+ help
+ When enabled, the scheduler will attempt to aggregate tasks from
+ the same process onto a single Last Level Cache (LLC) domain when
+ possible. This improves cache locality by keeping tasks that share
+ resources within the same cache domain, reducing cache misses and
+ lowering data access latency.
+
config NUMA_BALANCING_DEFAULT_ENABLED
bool "Automatically enable NUMA aware memory/task placement"
default y
diff --git a/kernel/fork.c b/kernel/fork.c
index 3da0f08615a9..aae5053d1e30 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -680,6 +680,7 @@ void __mmdrop(struct mm_struct *mm)
cleanup_lazy_tlbs(mm);
WARN_ON_ONCE(mm == current->active_mm);
+ mm_destroy_sched(mm);
mm_free_pgd(mm);
mm_free_id(mm);
destroy_context(mm);
@@ -1083,6 +1084,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (mm_alloc_cid(mm, p))
goto fail_cid;
+ if (mm_alloc_sched(mm))
+ goto fail_sched;
+
if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
NR_MM_COUNTERS))
goto fail_pcpu;
@@ -1092,6 +1096,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
return mm;
fail_pcpu:
+ mm_destroy_sched(mm);
+fail_sched:
mm_destroy_cid(mm);
fail_cid:
destroy_context(mm);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f754a60de848..e8bdf03a4b7f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4488,6 +4488,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p)
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
init_sched_mm_cid(p);
+ init_sched_mm(p);
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -8791,6 +8792,11 @@ void __init sched_init(void)
rq->core_cookie = 0UL;
#endif
+#ifdef CONFIG_SCHED_CACHE
+ raw_spin_lock_init(&rq->cpu_epoch_lock);
+ rq->cpu_epoch_next = jiffies;
+#endif
+
zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5b752324270b..cb82f558dc5b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1152,6 +1152,8 @@ void post_init_entity_util_avg(struct task_struct *p)
sa->runnable_avg = sa->util_avg;
}
+static inline void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec);
+
static s64 update_se(struct rq *rq, struct sched_entity *se)
{
u64 now = rq_clock_task(rq);
@@ -1174,6 +1176,7 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
trace_sched_stat_runtime(running, delta_exec);
account_group_exec_runtime(running, delta_exec);
+ account_mm_sched(rq, running, delta_exec);
/* cgroup time is always accounted against the donor */
cgroup_account_cputime(donor, delta_exec);
@@ -1193,6 +1196,259 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
return delta_exec;
}
+#ifdef CONFIG_SCHED_CACHE
+
+/*
+ * XXX numbers come from a place the sun don't shine -- probably wants to be SD
+ * tunable or so.
+ */
+#define EPOCH_PERIOD (HZ / 100) /* 10 ms */
+#define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */
+
+static int llc_id(int cpu)
+{
+ if (cpu < 0)
+ return -1;
+
+ return per_cpu(sd_llc_id, cpu);
+}
+
+void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
+{
+ unsigned long epoch;
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct mm_sched *pcpu_sched = per_cpu_ptr(_pcpu_sched, i);
+ struct rq *rq = cpu_rq(i);
+
+ pcpu_sched->runtime = 0;
+ pcpu_sched->epoch = rq->cpu_epoch;
+ epoch = rq->cpu_epoch;
+ }
+
+ raw_spin_lock_init(&mm->mm_sched_lock);
+ mm->mm_sched_epoch = epoch;
+ mm->mm_sched_cpu = -1;
+
+ /*
+ * The store to mm->pcpu_sched must not be reordered
+ * before the initialization of mm's other fields; otherwise
+ * readers could observe an invalid mm_sched_epoch, etc.
+ */
+ smp_store_release(&mm->pcpu_sched, _pcpu_sched);
+}
+
+/* because why would C be fully specified */
+static __always_inline void __shr_u64(u64 *val, unsigned int n)
+{
+ if (n >= 64) {
+ *val = 0;
+ return;
+ }
+ *val >>= n;
+}
+
+static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
+{
+ lockdep_assert_held(&rq->cpu_epoch_lock);
+
+ unsigned long n, now = jiffies;
+ long delta = now - rq->cpu_epoch_next;
+
+ if (delta > 0) {
+ n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD;
+ rq->cpu_epoch += n;
+ rq->cpu_epoch_next += n * EPOCH_PERIOD;
+ __shr_u64(&rq->cpu_runtime, n);
+ }
+
+ n = rq->cpu_epoch - pcpu_sched->epoch;
+ if (n) {
+ pcpu_sched->epoch += n;
+ __shr_u64(&pcpu_sched->runtime, n);
+ }
+}
+
+static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
+{
+ guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock);
+
+ __update_mm_sched(rq, pcpu_sched);
+
+ /*
+ * Runtime is a geometric series (r=0.5) and as such will sum to twice
+ * the accumulation period; this means the multiplication here should
+ * not overflow.
+ */
+ return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1);
+}
+
+static inline
+void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
+{
+ struct mm_struct *mm = p->mm;
+ struct mm_sched *pcpu_sched;
+ unsigned long epoch;
+
+ if (!sched_cache_enabled())
+ return;
+
+ if (p->sched_class != &fair_sched_class)
+ return;
+ /*
+ * init_task and kthreads don't have an mm
+ */
+ if (!mm || !mm->pcpu_sched)
+ return;
+
+ pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq));
+
+ scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
+ __update_mm_sched(rq, pcpu_sched);
+ pcpu_sched->runtime += delta_exec;
+ rq->cpu_runtime += delta_exec;
+ epoch = rq->cpu_epoch;
+ }
+
+ /*
+ * If this task hasn't hit task_cache_work() for a while, or it
+ * has only 1 thread, invalidate its preferred state.
+ */
+ if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT ||
+ get_nr_threads(p) <= 1) {
+ if (mm->mm_sched_cpu != -1)
+ mm->mm_sched_cpu = -1;
+ }
+}
+
+static void task_tick_cache(struct rq *rq, struct task_struct *p)
+{
+ struct callback_head *work = &p->cache_work;
+ struct mm_struct *mm = p->mm;
+
+ if (!sched_cache_enabled())
+ return;
+
+ if (!mm || !mm->pcpu_sched)
+ return;
+
+ /* avoid moving backwards */
+ if (mm->mm_sched_epoch >= rq->cpu_epoch)
+ return;
+
+ guard(raw_spinlock)(&mm->mm_sched_lock);
+
+ if (work->next == work) {
+ task_work_add(p, work, TWA_RESUME);
+ WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch);
+ }
+}
+
+static void __no_profile task_cache_work(struct callback_head *work)
+{
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ unsigned long m_a_occ = 0;
+ unsigned long curr_m_a_occ = 0;
+ int cpu, m_a_cpu = -1;
+ cpumask_var_t cpus;
+
+ WARN_ON_ONCE(work != &p->cache_work);
+
+ work->next = work;
+
+ if (p->flags & PF_EXITING)
+ return;
+
+ if (!zalloc_cpumask_var(&cpus, GFP_KERNEL))
+ return;
+
+ scoped_guard (cpus_read_lock) {
+ cpumask_copy(cpus, cpu_online_mask);
+
+ for_each_cpu(cpu, cpus) {
+ /* XXX sched_cluster_active */
+ struct sched_domain *sd = per_cpu(sd_llc, cpu);
+ unsigned long occ, m_occ = 0, a_occ = 0;
+ int m_cpu = -1, i;
+
+ if (!sd)
+ continue;
+
+ for_each_cpu(i, sched_domain_span(sd)) {
+ occ = fraction_mm_sched(cpu_rq(i),
+ per_cpu_ptr(mm->pcpu_sched, i));
+ a_occ += occ;
+ if (occ > m_occ) {
+ m_occ = occ;
+ m_cpu = i;
+ }
+ }
+
+ /*
+ * Compare the accumulated occupancy of each LLC. The
+ * reason for using accumulated occupancy rather than average
+ * per CPU occupancy is that it works better in asymmetric LLC
+ * scenarios.
+ * For example, if there are 2 threads in a 4CPU LLC and 3
+ * threads in an 8CPU LLC, it might be better to choose the one
+ * with 3 threads. However, this would not be the case if the
+ * occupancy is divided by the number of CPUs in an LLC (i.e.,
+ * if average per CPU occupancy is used).
+ * Besides, NUMA balancing fault statistics behave similarly:
+ * the total number of faults per node is compared rather than
+ * the average number of faults per CPU. This strategy is also
+ * followed here.
+ */
+ if (a_occ > m_a_occ) {
+ m_a_occ = a_occ;
+ m_a_cpu = m_cpu;
+ }
+
+ if (llc_id(cpu) == llc_id(mm->mm_sched_cpu))
+ curr_m_a_occ = a_occ;
+
+ cpumask_andnot(cpus, cpus, sched_domain_span(sd));
+ }
+ }
+
+ if (m_a_occ > (2 * curr_m_a_occ)) {
+ /*
+ * Avoid switching mm_sched_cpu too fast.
+ * 2X was chosen because:
+ * 1. It is better to keep the preferred LLC stable,
+ * rather than changing it frequently and causing migrations.
+ * 2. 2X means the new preferred LLC has at least 1 more
+ * busy CPU than the old one (e.g., 200% vs 100%).
+ * 3. 2X is chosen based on test results, as it delivers
+ * the best performance gain observed so far.
+ */
+ mm->mm_sched_cpu = m_a_cpu;
+ }
+
+ free_cpumask_var(cpus);
+}
+
+void init_sched_mm(struct task_struct *p)
+{
+ struct callback_head *work = &p->cache_work;
+
+ init_task_work(work, task_cache_work);
+ work->next = work;
+}
+
+#else
+
+static inline void account_mm_sched(struct rq *rq, struct task_struct *p,
+ s64 delta_exec) { }
+
+void init_sched_mm(struct task_struct *p) { }
+
+static void task_tick_cache(struct rq *rq, struct task_struct *p) { }
+
+#endif
+
/*
* Used by other classes to account runtime.
*/
@@ -13124,6 +13380,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
+ task_tick_cache(rq, curr);
+
update_misfit_status(curr, rq);
check_update_overutilized_status(task_rq(curr));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index adfb6e3409d7..84118b522f22 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1194,6 +1194,12 @@ struct rq {
u64 clock_pelt_idle_copy;
u64 clock_idle_copy;
#endif
+#ifdef CONFIG_SCHED_CACHE
+ raw_spinlock_t cpu_epoch_lock ____cacheline_aligned;
+ u64 cpu_runtime;
+ unsigned long cpu_epoch;
+ unsigned long cpu_epoch_next;
+#endif
atomic_t nr_iowait;
@@ -3819,6 +3825,8 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
static inline void init_sched_mm_cid(struct task_struct *t) { }
#endif /* !CONFIG_SCHED_MM_CID */
+extern void init_sched_mm(struct task_struct *p);
+
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
static inline
--
2.32.0
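
A quick illustration of the aging performed by __update_mm_sched() above: the
per-epoch right shift turns the accumulated runtime into a geometric series
(r = 0.5), so it stays bounded by roughly twice one epoch's worth of runtime,
which is why the multiplication in fraction_mm_sched() cannot overflow. Below
is a standalone userspace sketch of that bound (not kernel code; the 10 ms
epoch mirrors EPOCH_PERIOD in the patch):

#include <stdio.h>
#include <stdint.h>

#define EPOCH_PERIOD_NS (10ULL * 1000 * 1000)   /* 10 ms, matching EPOCH_PERIOD */

/* Mirrors __shr_u64(): shifts of 64 or more bits are undefined in C. */
static void shr_u64(uint64_t *val, unsigned int n)
{
        if (n >= 64) {
                *val = 0;
                return;
        }
        *val >>= n;
}

int main(void)
{
        uint64_t runtime = 0;
        int epoch;

        /* A CPU that is 100% busy: accumulate a full epoch, then age once. */
        for (epoch = 0; epoch < 20; epoch++) {
                runtime += EPOCH_PERIOD_NS;
                printf("epoch %2d: accumulated %llu ns (bound %llu ns)\n",
                       epoch, (unsigned long long)runtime,
                       (unsigned long long)(2 * EPOCH_PERIOD_NS));
                shr_u64(&runtime, 1);   /* halved at each epoch boundary */
        }
        return 0;
}

The printed value approaches, but never exceeds, twice EPOCH_PERIOD_NS.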
Hi, Peter, Chen Yu and Tim:
On 2025/12/4 07:07, Tim Chen wrote:
> From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
>
> Adds infrastructure to enable cache-aware load balancing,
> which improves cache locality by grouping tasks that share resources
> within the same cache domain. This reduces cache misses and improves
> overall data access efficiency.
>
> In this initial implementation, threads belonging to the same process
> are treated as entities that likely share working sets. The mechanism
> tracks per-process CPU occupancy across cache domains and attempts to
> migrate threads toward cache-hot domains where their process already
> has active threads, thereby enhancing locality.
>
> This provides a basic model for cache affinity. While the current code
> targets the last-level cache (LLC), the approach could be extended to
> other domain types such as clusters (L2) or node-internal groupings.
>
> At present, the mechanism selects the CPU within an LLC that has the
> highest recent runtime. Subsequent patches in this series will use this
> information in the load-balancing path to guide task placement toward
> preferred LLCs.
>
> In the future, more advanced policies could be integrated through NUMA
> balancing-for example, migrating a task to its preferred LLC when spare
> capacity exists, or swapping tasks across LLCs to improve cache affinity.
> Grouping of tasks could also be generalized from that of a process
> to be that of a NUMA group, or be user configurable.
>
> Originally-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
> ---
>
> Notes:
> v1->v2:
> Restore the original CPU scan to cover all online CPUs,
> rather than scanning within the preferred NUMA node.
> (Peter Zijlstra)
>
> Use rq->curr instead of rq->donor. (K Prateek Nayak)
>
> Minor fix in task_tick_cache() to use
> if (mm->mm_sched_epoch >= rq->cpu_epoch)
> to avoid mm_sched_epoch going backwards.
>
> include/linux/mm_types.h | 44 +++++++
> include/linux/sched.h | 11 ++
> init/Kconfig | 11 ++
> kernel/fork.c | 6 +
> kernel/sched/core.c | 6 +
> kernel/sched/fair.c | 258 +++++++++++++++++++++++++++++++++++++++
> kernel/sched/sched.h | 8 ++
> 7 files changed, 344 insertions(+)
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 90e5790c318f..1ea16ef90566 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -939,6 +939,11 @@ typedef struct {
> DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS);
> } __private mm_flags_t;
>
> +struct mm_sched {
> + u64 runtime;
> + unsigned long epoch;
> +};
> +
> struct kioctx_table;
> struct iommu_mm_data;
> struct mm_struct {
> @@ -1029,6 +1034,17 @@ struct mm_struct {
> */
> raw_spinlock_t cpus_allowed_lock;
> #endif
> +#ifdef CONFIG_SCHED_CACHE
> + /*
> + * Track per-cpu-per-process occupancy as a proxy for cache residency.
> + * See account_mm_sched() and ...
> + */
> + struct mm_sched __percpu *pcpu_sched;
> + raw_spinlock_t mm_sched_lock;
> + unsigned long mm_sched_epoch;
> + int mm_sched_cpu;
As we discussed earlier, I continue to believe that dedicating
'mm_sched_cpu' to the aggregated hotspot of all threads is
inappropriate, as the multiple threads lack the necessary correlation in
our real application.

So I was wondering if we could move this variable into struct
task_struct. That would allow us to better monitor the hotspot CPU of
each thread, although some details need consideration.
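
To make the suggestion concrete, here is a purely hypothetical sketch of what a
per-task hint could look like; the type and field names are invented for
illustration and are not part of this patch:

/*
 * Hypothetical only: a per-task variant of the occupancy hint, living
 * in task_struct instead of mm_struct, so each thread could track its
 * own hot CPU/LLC. Names are invented for illustration.
 */
struct task_cache_hint {
        unsigned long   epoch;          /* last epoch this thread was scanned */
        int             preferred_cpu;  /* hottest CPU for this thread, -1 if none */
};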
On 12/11/2025 5:03 PM, Vern Hao wrote:
> Hi, Peter, Chen Yu and Tim:
>
> As we discussed earlier, I continue to believe that dedicating
> 'mm_sched_cpu' to the aggregated hotspot of all threads is
> inappropriate, as the multiple threads lack the necessary correlation
> in our real application.
>
> So I was wondering if we could move this variable into struct
> task_struct. That would allow us to better monitor the hotspot CPU of
> each thread, although some details need consideration.
>
I suppose you are suggesting fine-grained control for a set of tasks.
Process-scope aggregation could be a start as the default strategy
(conservative, benefits multi-threaded workloads that share data per
process, and does not introduce regressions).

On top of that, I wonder if we could provide task-scope control via
something like sched_setattr(), similar to the core-scheduling cookie
mechanism, for users that want aggressive aggregation. But before doing
that, we need a mechanism that leverages a monitoring facility (like
the PMU) to figure out whether putting these tasks together would bring
a benefit (if I understand Steven's suggestion at LPC correctly), or
that detects tasks which share resources, and then perhaps leverage QoS
interfaces to enable the cache-aware aggregation (something Qias
mentioned at LPC).

thanks,
Chenyu
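
For reference, the core-scheduling cookie mechanism mentioned above is driven
from userspace roughly as in the sketch below (this is the existing prctl()
interface); a cache-aware grouping attribute would be a new, not-yet-defined
interface, so the example is shown only as the analogy:

#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

int main(void)
{
        /*
         * Existing core-scheduling API: create a cookie covering the
         * calling task's whole thread group; tasks sharing a cookie may
         * share an SMT core. A cache-aware variant could use a similar
         * per-group tag for threads that share a working set, but no
         * such interface exists today.
         */
        if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0,
                  PR_SCHED_CORE_SCOPE_THREAD_GROUP, 0))
                perror("PR_SCHED_CORE_CREATE");

        return 0;
}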
On 2025/12/16 14:12, Chen, Yu C wrote:
> On 12/11/2025 5:03 PM, Vern Hao wrote:
>> Hi, Peter, Chen Yu and Tim:
>>
>> As we discussed earlier, I continue to believe that dedicating
>> 'mm_sched_cpu' to the aggregated hotspot of all threads is
>> inappropriate, as the multiple threads lack the necessary correlation
>> in our real application.
>>
>> So I was wondering if we could move this variable into struct
>> task_struct. That would allow us to better monitor the hotspot CPU of
>> each thread, although some details need consideration.
>>
>
> I suppose you are suggesting fine-grained control for a set of tasks.
> Process-scope aggregation could be a start as the default strategy
> (conservative, benefits multi-threaded workloads that share data per
> process, and does not introduce regressions).
Yes, in our real-world business scenarios at Tencent, I have indeed
encountered this issue: multiple threads are divided into several
categories to handle different transactions, so they do not share the
hot data and 'mm_sched_cpu' does not represent all of their tasks.
Adding a control interface, via cgroup or something similar, would be
a good idea.
>
> On top of that, I wonder if we could provide task-scope control via
> something like sched_setattr(), similar to the core-scheduling cookie
> mechanism, for users that want aggressive aggregation. But before doing
> that, we need a mechanism that leverages a monitoring facility (like
> the PMU) to figure out
There may be trouble if the environment is running in a VM. We could
use tags to differentiate these tasks and run some tests to verify the
performance difference between unifying mm_sched_cpu and not unifying
it.
> whether putting these tasks together would bring a benefit (if I
> understand Steven's suggestion at LPC correctly), or that detects tasks
> which share resources, and then perhaps leverage QoS interfaces to
> enable the cache-aware aggregation (something Qias mentioned at LPC).
>
> thanks,
> Chenyu
>
On Wed, Dec 03, 2025 at 03:07:20PM -0800, Tim Chen wrote:

> +static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
> +static void __no_profile task_cache_work(struct callback_head *work)

What's with the random __no_profile things?
On 12/10/2025 6:37 PM, Peter Zijlstra wrote:
> On Wed, Dec 03, 2025 at 03:07:20PM -0800, Tim Chen wrote:
>
>> +static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
>> +static void __no_profile task_cache_work(struct callback_head *work)
>
> What's with the random __no_profile things?

In the early version without this tag we got some error reports by gcov.
We will check if this issue still exists and do some investigations.

thanks,
Chenyu
On Wed, Dec 10, 2025 at 10:57:27PM +0900, Chen, Yu C wrote:
> On 12/10/2025 6:37 PM, Peter Zijlstra wrote:
>> On Wed, Dec 03, 2025 at 03:07:20PM -0800, Tim Chen wrote:
>>
>>> +static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
>>> +static void __no_profile task_cache_work(struct callback_head *work)
>>
>> What's with the random __no_profile things?
>
> In the early version without this tag we got some error reports by gcov.
> We will check if this issue still exists and do some investigations.

That would be weird, nothing in the scheduler has anything like it. So
yeah, please see if you can find out where that came from.
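
For readers following along: assuming this is the __no_profile defined in
include/linux/compiler_attributes.h, the annotation suppresses gcov/PGO
profiling instrumentation for the marked function on compilers that support
it, roughly as sketched below (the macro name here is invented; whether the
annotation is needed at all is exactly what is being questioned):

/*
 * Sketch only: on GCC 7+ and recent clang, the attribute below stops
 * the compiler from emitting profiling/coverage instrumentation for
 * the marked function. The kernel's __no_profile macro wraps this
 * attribute; the macro name used here is invented.
 */
#define no_profile_sketch __attribute__((no_profile_instrument_function))

static unsigned long no_profile_sketch sampled_in_hot_path(void)
{
        /* no gcov or -fprofile-generate hooks are emitted for this body */
        return 0;
}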
On Wed, Dec 03, 2025 at 03:07:20PM -0800, Tim Chen wrote:
> Minor fix in task_tick_cache() to use
> if (mm->mm_sched_epoch >= rq->cpu_epoch)
> to avoid mm_sched_epoch going backwards.
> +static void task_tick_cache(struct rq *rq, struct task_struct *p)
> +{
> + struct callback_head *work = &p->cache_work;
> + struct mm_struct *mm = p->mm;
> +
> + if (!sched_cache_enabled())
> + return;
> +
> + if (!mm || !mm->pcpu_sched)
> + return;
> +
> + /* avoid moving backwards */
> + if (mm->mm_sched_epoch >= rq->cpu_epoch)
> + return;
IIRC this was supposed to be able to wrap; which then means you should
write it like:
if ((mm->mm_sched_epoch - rq->cpu_epoch) >= 0)
return;
or somesuch.
> +
> + guard(raw_spinlock)(&mm->mm_sched_lock);
> +
> + if (work->next == work) {
> + task_work_add(p, work, TWA_RESUME);
> + WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch);
> + }
> +}
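
For concreteness, a minimal sketch of the wrap-safe comparison suggested above;
the helper name is invented, and the kernel's time_after()/time_before()
macros in include/linux/jiffies.h rely on the same signed-difference trick:

/*
 * Sketch only: comparing free-running epoch counters so the check keeps
 * working after they wrap. Casting the unsigned difference to a signed
 * type makes "a is at or ahead of b" well defined as long as the two
 * values are within LONG_MAX of each other.
 */
static inline int epoch_at_or_after(unsigned long a, unsigned long b)
{
        return (long)(a - b) >= 0;
}

/*
 * In task_tick_cache() this would read something like:
 *
 *      if (epoch_at_or_after(mm->mm_sched_epoch, rq->cpu_epoch))
 *              return;
 */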
On Tue, 2025-12-09 at 12:12 +0100, Peter Zijlstra wrote:
> On Wed, Dec 03, 2025 at 03:07:20PM -0800, Tim Chen wrote:
>
> > Minor fix in task_tick_cache() to use
> > if (mm->mm_sched_epoch >= rq->cpu_epoch)
> > to avoid mm_sched_epoch going backwards.
>
> > +static void task_tick_cache(struct rq *rq, struct task_struct *p)
> > +{
> > + struct callback_head *work = &p->cache_work;
> > + struct mm_struct *mm = p->mm;
> > +
> > + if (!sched_cache_enabled())
> > + return;
> > +
> > + if (!mm || !mm->pcpu_sched)
> > + return;
> > +
> > + /* avoid moving backwards */
> > + if (mm->mm_sched_epoch >= rq->cpu_epoch)
> > + return;
>
> IIRC this was supposed to be able to wrap; which then means you should
> write it like:
>
> if ((mm->mm_sched_epoch - rq->cpu_epoch) >= 0)
> return;
>
> or somesuch.
Okay. Got it.
Tim
>
> > +
> > + guard(raw_spinlock)(&mm->mm_sched_lock);
> > +
> > + if (work->next == work) {
> > + task_work_add(p, work, TWA_RESUME);
> > + WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch);
> > + }
> > +}