[RFC][PATCH] sched: Cache aware load-balancing

Posted by Peter Zijlstra 8 months, 4 weeks ago
Hi all,

One of the many things on the eternal todo list has been finishing the
below hackery.

It is an attempt at modelling cache affinity -- and while the patch
really only targets LLC, it could very well be extended to also apply to
clusters (L2). Specifically any case of multiple cache domains inside a
node.

Anyway, I wrote this about a year ago, and I mentioned this at the
recent OSPM conf where Gautham and Prateek expressed interest in playing
with this code.

So here goes, very rough and largely unproven code ahead :-)

It applies to current tip/master, but I know it will fail the __percpu
validation that sits in -next, although that shouldn't be terribly hard
to fix up.

As is, it only computes a CPU inside the LLC that has the highest recent
runtime; this CPU is then used in the wake-up path to steer wakeups towards
this LLC and in task_hot() to limit migrations away from it.

More elaborate things could be done; notably, there is an XXX in there
somewhere about finding the best LLC inside a NODE (interaction with
NUMA_BALANCING).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/mm_types.h |  44 +++++++
 include/linux/sched.h    |   4 +
 init/Kconfig             |   4 +
 kernel/fork.c            |   5 +
 kernel/sched/core.c      |  13 +-
 kernel/sched/fair.c      | 330 ++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h     |   8 ++
 7 files changed, 388 insertions(+), 20 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0234f14f2aa6..3ed8dd225eb9 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -800,6 +800,12 @@ struct mm_cid {
 };
 #endif
 
+struct mm_sched {
+	u64 runtime;
+	unsigned long epoch;
+	unsigned long occ;
+};
+
 struct kioctx_table;
 struct iommu_mm_data;
 struct mm_struct {
@@ -890,6 +896,17 @@ struct mm_struct {
 		 */
 		raw_spinlock_t cpus_allowed_lock;
 #endif
+#ifdef CONFIG_SCHED_CACHE
+		/*
+		 * Track per-cpu-per-process occupancy as a proxy for cache residency.
+		 * See account_mm_sched() and ...
+		 */
+		struct mm_sched __percpu *pcpu_sched;
+		raw_spinlock_t mm_sched_lock;
+		unsigned long mm_sched_epoch;
+		int mm_sched_cpu;
+#endif
+
 #ifdef CONFIG_MMU
 		atomic_long_t pgtables_bytes;	/* size of all page tables */
 #endif
@@ -1296,6 +1313,33 @@ static inline unsigned int mm_cid_size(void)
 static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
 #endif /* CONFIG_SCHED_MM_CID */
 
+#ifdef CONFIG_SCHED_CACHE
+extern void mm_init_sched(struct mm_struct *mm, struct mm_sched *pcpu_sched);
+
+static inline int mm_alloc_sched_noprof(struct mm_struct *mm)
+{
+	struct mm_sched *pcpu_sched = alloc_percpu_noprof(struct mm_sched);
+	if (!pcpu_sched)
+		return -ENOMEM;
+
+	mm_init_sched(mm, pcpu_sched);
+	return 0;
+}
+
+#define mm_alloc_sched(...)	alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__))
+
+static inline void mm_destroy_sched(struct mm_struct *mm)
+{
+	free_percpu(mm->pcpu_sched);
+	mm->pcpu_sched = NULL;
+}
+#else /* !CONFIG_SCHED_CACHE */
+
+static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; }
+static inline void mm_destroy_sched(struct mm_struct *mm) { }
+
+#endif /* CONFIG_SCHED_CACHE */
+
 struct mmu_gather;
 extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
 extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e5c38718ff5..f8eafe440369 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1379,6 +1379,10 @@ struct task_struct {
 	unsigned long			numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_SCHED_CACHE
+	struct callback_head		cache_work;
+#endif
+
 #ifdef CONFIG_RSEQ
 	struct rseq __user *rseq;
 	u32 rseq_len;
diff --git a/init/Kconfig b/init/Kconfig
index 681f38ee68db..14b15215318f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -950,6 +950,10 @@ config NUMA_BALANCING
 
 	  This system will be inactive on UMA systems.
 
+config SCHED_CACHE
+	bool "Cache aware scheduler"
+	default y
+
 config NUMA_BALANCING_DEFAULT_ENABLED
 	bool "Automatically enable NUMA aware memory/task placement"
 	default y
diff --git a/kernel/fork.c b/kernel/fork.c
index 1b659b07ecd5..bc9d7dbfd980 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1314,6 +1314,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	if (mm_alloc_cid(mm, p))
 		goto fail_cid;
 
+	if (mm_alloc_sched(mm))
+		goto fail_sched;
+
 	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
 				     NR_MM_COUNTERS))
 		goto fail_pcpu;
@@ -1323,6 +1326,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	return mm;
 
 fail_pcpu:
+	mm_destroy_sched(mm);
+fail_sched:
 	mm_destroy_cid(mm);
 fail_cid:
 	destroy_context(mm);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 87540217fc09..649db6ea41ea 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4514,6 +4514,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->migration_pending = NULL;
 #endif
 	init_sched_mm_cid(p);
+	init_sched_mm(p);
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -8505,6 +8506,7 @@ static struct kmem_cache *task_group_cache __ro_after_init;
 
 void __init sched_init(void)
 {
+	unsigned long now = jiffies;
 	unsigned long ptr = 0;
 	int i;
 
@@ -8579,7 +8581,7 @@ void __init sched_init(void)
 		raw_spin_lock_init(&rq->__lock);
 		rq->nr_running = 0;
 		rq->calc_load_active = 0;
-		rq->calc_load_update = jiffies + LOAD_FREQ;
+		rq->calc_load_update = now + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs);
 		init_rt_rq(&rq->rt);
 		init_dl_rq(&rq->dl);
@@ -8623,7 +8625,7 @@ void __init sched_init(void)
 		rq->cpu_capacity = SCHED_CAPACITY_SCALE;
 		rq->balance_callback = &balance_push_callback;
 		rq->active_balance = 0;
-		rq->next_balance = jiffies;
+		rq->next_balance = now;
 		rq->push_cpu = 0;
 		rq->cpu = i;
 		rq->online = 0;
@@ -8635,7 +8637,7 @@ void __init sched_init(void)
 
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ_COMMON
-		rq->last_blocked_load_update_tick = jiffies;
+		rq->last_blocked_load_update_tick = now;
 		atomic_set(&rq->nohz_flags, 0);
 
 		INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
@@ -8660,6 +8662,11 @@ void __init sched_init(void)
 
 		rq->core_cookie = 0UL;
 #endif
+#ifdef CONFIG_SCHED_CACHE
+		raw_spin_lock_init(&rq->cpu_epoch_lock);
+		rq->cpu_epoch_next = now;
+#endif
+
 		zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
 	}
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e43993a4e580..943af076e09c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1166,10 +1166,229 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
 	return delta_exec;
 }
 
-static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
+#ifdef CONFIG_SCHED_CACHE
+
+/*
+ * XXX numbers come from a place the sun don't shine -- probably wants to be SD
+ * tunable or so.
+ */
+#define EPOCH_PERIOD	(HZ/100)	/* 10 ms */
+#define EPOCH_OLD	5		/* 50 ms */
+
+void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched)
+{
+	unsigned long epoch;
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct mm_sched *pcpu_sched = per_cpu_ptr(_pcpu_sched, i);
+		struct rq *rq = cpu_rq(i);
+
+		pcpu_sched->runtime = 0;
+		pcpu_sched->epoch = epoch = rq->cpu_epoch;
+		pcpu_sched->occ = -1;
+	}
+
+	raw_spin_lock_init(&mm->mm_sched_lock);
+	mm->mm_sched_epoch = epoch;
+	mm->mm_sched_cpu = -1;
+
+	smp_store_release(&mm->pcpu_sched, _pcpu_sched);
+}
+
+/* because why would C be fully specified */
+static __always_inline void __shr_u64(u64 *val, unsigned int n)
+{
+	if (n >= 64) {
+		*val = 0;
+		return;
+	}
+	*val >>= n;
+}
+
+static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
+{
+	lockdep_assert_held(&rq->cpu_epoch_lock);
+
+	unsigned long n, now = jiffies;
+	long delta = now - rq->cpu_epoch_next;
+
+	if (delta > 0) {
+		n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD;
+		rq->cpu_epoch += n;
+		rq->cpu_epoch_next += n * EPOCH_PERIOD;
+		__shr_u64(&rq->cpu_runtime, n);
+	}
+
+	n = rq->cpu_epoch - pcpu_sched->epoch;
+	if (n) {
+		pcpu_sched->epoch += n;
+		__shr_u64(&pcpu_sched->runtime, n);
+	}
+}
+
+static unsigned long fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
+{
+	guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock);
+
+	__update_mm_sched(rq, pcpu_sched);
+
+	/*
+	 * Runtime is a geometric series (r=0.5) and as such will sum to twice
+	 * the accumulation period, this means the multiplcation here should
+	 * not overflow.
+	 */
+	return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1);
+}
+
+static inline
+void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
+{
+	struct mm_struct *mm = p->mm;
+	struct mm_sched *pcpu_sched;
+	unsigned long epoch;
+
+	/*
+	 * init_task and kthreads don't be having no mm
+	 */
+	if (!mm || !mm->pcpu_sched)
+		return;
+
+	pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched);
+
+	scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
+		__update_mm_sched(rq, pcpu_sched);
+		pcpu_sched->runtime += delta_exec;
+		rq->cpu_runtime += delta_exec;
+		epoch = rq->cpu_epoch;
+	}
+
+	/*
+	 * If this task hasn't hit task_cache_work() for a while, invalidate
+	 * it's preferred state.
+	 */
+	if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) {
+		mm->mm_sched_cpu = -1;
+		pcpu_sched->occ = -1;
+	}
+}
+
+static void task_tick_cache(struct rq *rq, struct task_struct *p)
+{
+	struct callback_head *work = &p->cache_work;
+	struct mm_struct *mm = p->mm;
+
+	if (!mm || !mm->pcpu_sched)
+		return;
+
+	if (mm->mm_sched_epoch == rq->cpu_epoch)
+		return;
+
+	guard(raw_spinlock)(&mm->mm_sched_lock);
+
+	if (mm->mm_sched_epoch == rq->cpu_epoch)
+		return;
+
+	if (work->next == work) {
+		task_work_add(p, work, TWA_RESUME);
+		WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch);
+	}
+}
+
+static void task_cache_work(struct callback_head *work)
+{
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+	unsigned long m_a_occ = 0;
+	int cpu, m_a_cpu = -1;
+	cpumask_var_t cpus;
+
+	WARN_ON_ONCE(work != &p->cache_work);
+
+	work->next = work;
+
+	if (p->flags & PF_EXITING)
+		return;
+
+	if (!alloc_cpumask_var(&cpus, GFP_KERNEL))
+		return;
+
+	scoped_guard (cpus_read_lock) {
+		cpumask_copy(cpus, cpu_online_mask);
+
+		for_each_cpu(cpu, cpus) {
+			/* XXX sched_cluster_active */
+			struct sched_domain *sd = per_cpu(sd_llc, cpu);
+			unsigned long occ, m_occ = 0, a_occ = 0;
+			int m_cpu = -1, nr = 0, i;
+
+			for_each_cpu(i, sched_domain_span(sd)) {
+				occ = fraction_mm_sched(cpu_rq(i),
+							per_cpu_ptr(mm->pcpu_sched, i));
+				a_occ += occ;
+				if (occ > m_occ) {
+					m_occ = occ;
+					m_cpu = i;
+				}
+				nr++;
+				trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n",
+					     per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr);
+			}
+
+			a_occ /= nr;
+			if (a_occ > m_a_occ) {
+				m_a_occ = a_occ;
+				m_a_cpu = m_cpu;
+			}
+
+			trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n",
+				     per_cpu(sd_llc_id, cpu), a_occ, m_a_occ);
+
+			for_each_cpu(i, sched_domain_span(sd)) {
+				/* XXX threshold ? */
+				per_cpu_ptr(mm->pcpu_sched, i)->occ = a_occ;
+			}
+
+			cpumask_andnot(cpus, cpus, sched_domain_span(sd));
+		}
+	}
+
+	/*
+	 * If the max average cache occupancy is 'small' we don't care.
+	 */
+	if (m_a_occ < (NICE_0_LOAD >> EPOCH_OLD))
+		m_a_cpu = -1;
+
+	mm->mm_sched_cpu = m_a_cpu;
+
+	free_cpumask_var(cpus);
+}
+
+void init_sched_mm(struct task_struct *p)
+{
+	struct callback_head *work = &p->cache_work;
+	init_task_work(work, task_cache_work);
+	work->next = work;
+}
+
+#else
+
+static inline void account_mm_sched(struct rq *rq, struct task_struct *p,
+				    s64 delta_exec) { }
+
+
+void init_sched_mm(struct task_struct *p) { }
+
+static void task_tick_cache(struct rq *rq, struct task_struct *p) { }
+
+#endif
+
+static inline
+void update_curr_task(struct rq *rq, struct task_struct *p, s64 delta_exec)
 {
 	trace_sched_stat_runtime(p, delta_exec);
 	account_group_exec_runtime(p, delta_exec);
+	account_mm_sched(rq, p, delta_exec);
 	cgroup_account_cputime(p, delta_exec);
 }
 
@@ -1215,7 +1434,7 @@ s64 update_curr_common(struct rq *rq)
 
 	delta_exec = update_curr_se(rq, &donor->se);
 	if (likely(delta_exec > 0))
-		update_curr_task(donor, delta_exec);
+		update_curr_task(rq, donor, delta_exec);
 
 	return delta_exec;
 }
@@ -1244,7 +1463,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	if (entity_is_task(curr)) {
 		struct task_struct *p = task_of(curr);
 
-		update_curr_task(p, delta_exec);
+		update_curr_task(rq, p, delta_exec);
 
 		/*
 		 * If the fair_server is active, we need to account for the
@@ -7850,7 +8069,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 * per-cpu select_rq_mask usage
 	 */
 	lockdep_assert_irqs_disabled();
-
+again:
 	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
 	    asym_fits_cpu(task_util, util_min, util_max, target))
 		return target;
@@ -7888,7 +8107,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	/* Check a recently used CPU as a potential idle candidate: */
 	recent_used_cpu = p->recent_used_cpu;
 	p->recent_used_cpu = prev;
-	if (recent_used_cpu != prev &&
+	if (prev == p->wake_cpu &&
+	    recent_used_cpu != prev &&
 	    recent_used_cpu != target &&
 	    cpus_share_cache(recent_used_cpu, target) &&
 	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
@@ -7941,6 +8161,18 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	if ((unsigned)i < nr_cpumask_bits)
 		return i;
 
+	if (prev != p->wake_cpu && !cpus_share_cache(prev, p->wake_cpu)) {
+		/*
+		 * Most likely select_cache_cpu() will have re-directed
+		 * the wakeup, but getting here means the preferred cache is
+		 * too busy, so re-try with the actual previous.
+		 *
+		 * XXX wake_affine is lost for this pass.
+		 */
+		prev = target = p->wake_cpu;
+		goto again;
+	}
+
 	/*
 	 * For cluster machines which have lower sharing cache like L2 or
 	 * LLC Tag, we tend to find an idle CPU in the target's cluster
@@ -8563,6 +8795,40 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	return target;
 }
 
+#ifdef CONFIG_SCHED_CACHE
+static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle);
+
+static int select_cache_cpu(struct task_struct *p, int prev_cpu)
+{
+	struct mm_struct *mm = p->mm;
+	int cpu;
+
+	if (!mm || p->nr_cpus_allowed == 1)
+		return prev_cpu;
+
+	cpu = mm->mm_sched_cpu;
+	if (cpu < 0)
+		return prev_cpu;
+
+
+	if (static_branch_likely(&sched_numa_balancing) &&
+	    __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) {
+		/*
+		 * XXX look for max occupancy inside prev_cpu's node
+		 */
+		return prev_cpu;
+	}
+
+	return cpu;
+}
+#else
+static int select_cache_cpu(struct task_struct *p, int prev_cpu)
+{
+	return prev_cpu;
+}
+#endif
+
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
@@ -8588,6 +8854,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 	 * required for stable ->cpus_allowed
 	 */
 	lockdep_assert_held(&p->pi_lock);
+	guard(rcu)();
+
 	if (wake_flags & WF_TTWU) {
 		record_wakee(p);
 
@@ -8595,6 +8863,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 		    cpumask_test_cpu(cpu, p->cpus_ptr))
 			return cpu;
 
+		new_cpu = prev_cpu = select_cache_cpu(p, prev_cpu);
+
 		if (!is_rd_overutilized(this_rq()->rd)) {
 			new_cpu = find_energy_efficient_cpu(p, prev_cpu);
 			if (new_cpu >= 0)
@@ -8605,7 +8875,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
 	}
 
-	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
 		/*
 		 * If both 'cpu' and 'prev_cpu' are part of this domain,
@@ -8638,7 +8907,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 		/* Fast path */
 		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 	}
-	rcu_read_unlock();
 
 	return new_cpu;
 }
@@ -9288,6 +9556,17 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
 	if (sysctl_sched_migration_cost == 0)
 		return 0;
 
+#ifdef CONFIG_SCHED_CACHE
+	if (p->mm && p->mm->pcpu_sched) {
+		/*
+		 * XXX things like Skylake have non-inclusive L3 and might not
+		 * like this L3 centric view. What to do about L2 stickyness ?
+		 */
+		return per_cpu_ptr(p->mm->pcpu_sched, env->src_cpu)->occ >
+		       per_cpu_ptr(p->mm->pcpu_sched, env->dst_cpu)->occ;
+	}
+#endif
+
 	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
 
 	return delta < (s64)sysctl_sched_migration_cost;
@@ -9299,27 +9578,25 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
  * Returns 0, if task migration is not affected by locality.
  * Returns a negative value, if task migration improves locality i.e migration preferred.
  */
-static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
+static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle)
 {
 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
 	unsigned long src_weight, dst_weight;
 	int src_nid, dst_nid, dist;
 
-	if (!static_branch_likely(&sched_numa_balancing))
-		return 0;
-
-	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
+	if (!p->numa_faults)
 		return 0;
 
-	src_nid = cpu_to_node(env->src_cpu);
-	dst_nid = cpu_to_node(env->dst_cpu);
+	src_nid = cpu_to_node(src_cpu);
+	dst_nid = cpu_to_node(dst_cpu);
 
 	if (src_nid == dst_nid)
 		return 0;
 
 	/* Migrating away from the preferred node is always bad. */
 	if (src_nid == p->numa_preferred_nid) {
-		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
+		struct rq *src_rq = cpu_rq(src_cpu);
+		if (src_rq->nr_running > src_rq->nr_preferred_running)
 			return 1;
 		else
 			return 0;
@@ -9330,7 +9607,7 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 		return -1;
 
 	/* Leaving a core idle is often worse than degrading locality. */
-	if (env->idle == CPU_IDLE)
+	if (idle)
 		return 0;
 
 	dist = node_distance(src_nid, dst_nid);
@@ -9345,7 +9622,24 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 	return src_weight - dst_weight;
 }
 
+static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
+{
+	if (!static_branch_likely(&sched_numa_balancing))
+		return 0;
+
+	if (!(env->sd->flags & SD_NUMA))
+		return 0;
+
+	return __migrate_degrades_locality(p, env->src_cpu, env->dst_cpu,
+					   env->idle == CPU_IDLE);
+}
+
 #else
+static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle)
+{
+	return 0;
+}
+
 static inline long migrate_degrades_locality(struct task_struct *p,
 					     struct lb_env *env)
 {
@@ -13104,8 +13398,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
  */
 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 {
-	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &curr->se;
+	struct cfs_rq *cfs_rq;
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
@@ -13115,6 +13409,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 	if (static_branch_unlikely(&sched_numa_balancing))
 		task_tick_numa(rq, curr);
 
+	task_tick_cache(rq, curr);
+
 	update_misfit_status(curr, rq);
 	check_update_overutilized_status(task_rq(curr));
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 47972f34ea70..d16ccd66ca07 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1171,6 +1171,12 @@ struct rq {
 	u64			clock_pelt_idle_copy;
 	u64			clock_idle_copy;
 #endif
+#ifdef CONFIG_SCHED_CACHE
+	raw_spinlock_t		cpu_epoch_lock;
+	u64			cpu_runtime;
+	unsigned long		cpu_epoch;
+	unsigned long		cpu_epoch_next;
+#endif
 
 	atomic_t		nr_iowait;
 
@@ -3861,6 +3867,8 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
 static inline void init_sched_mm_cid(struct task_struct *t) { }
 #endif /* !CONFIG_SCHED_MM_CID */
 
+extern void init_sched_mm(struct task_struct *p);
+
 extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
 extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
 #ifdef CONFIG_SMP
Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Libo Chen 8 months, 2 weeks ago

On 3/25/25 05:09, Peter Zijlstra wrote:

> +		for_each_cpu(cpu, cpus) {
> +			/* XXX sched_cluster_active */
> +			struct sched_domain *sd = per_cpu(sd_llc, cpu);

Hi Peter,

I understand that this targets the LLC, but I just want to point out that sd
here could be NULL for an arch without an LLC domain, and then this can cause
a NULL pointer dereference in sched_domain_span(sd).
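
A minimal guard (untested sketch against this hunk) would be to simply skip
such CPUs before touching the span:

+			/* no LLC domain on this CPU; skip it (sketch) */
+			if (!sd)
+				continue;
+
 			for_each_cpu(i, sched_domain_span(sd)) {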

Libo

> +			unsigned long occ, m_occ = 0, a_occ = 0;
> +			int m_cpu = -1, nr = 0, i;
> +
> +			for_each_cpu(i, sched_domain_span(sd)) {
> +				occ = fraction_mm_sched(cpu_rq(i),
> +							per_cpu_ptr(mm->pcpu_sched, i));
> +				a_occ += occ;
> +				if (occ > m_occ) {
> +					m_occ = occ;
> +					m_cpu = i;
> +				}
> +				nr++;
> +				trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n",
> +					     per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr);
> +			}
> +
> +			a_occ /= nr;
> +			if (a_occ > m_a_occ) {
> +				m_a_occ = a_occ;
> +				m_a_cpu = m_cpu;
> +			}
> +
> +			trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n",
> +				     per_cpu(sd_llc_id, cpu), a_occ, m_a_occ);
> +
> +			for_each_cpu(i, sched_domain_span(sd)) {
> +				/* XXX threshold ? */
> +				per_cpu_ptr(mm->pcpu_sched, i)->occ = a_occ;
> +			}
> +
> +			cpumask_andnot(cpus, cpus, sched_domain_span(sd));
> +		}
> +	}
> +
> +	/*
> +	 * If the max average cache occupancy is 'small' we don't care.
> +	 */
> +	if (m_a_occ < (NICE_0_LOAD >> EPOCH_OLD))
> +		m_a_cpu = -1;
> +
> +	mm->mm_sched_cpu = m_a_cpu;
> +
> +	free_cpumask_var(cpus);
> +}
> +
> +void init_sched_mm(struct task_struct *p)
> +{
> +	struct callback_head *work = &p->cache_work;
> +	init_task_work(work, task_cache_work);
> +	work->next = work;
> +}
> +
> +#else
> +
> +static inline void account_mm_sched(struct rq *rq, struct task_struct *p,
> +				    s64 delta_exec) { }
> +
> +
> +void init_sched_mm(struct task_struct *p) { }
> +
> +static void task_tick_cache(struct rq *rq, struct task_struct *p) { }
> +
> +#endif
> +
> +static inline
> +void update_curr_task(struct rq *rq, struct task_struct *p, s64 delta_exec)
>  {
>  	trace_sched_stat_runtime(p, delta_exec);
>  	account_group_exec_runtime(p, delta_exec);
> +	account_mm_sched(rq, p, delta_exec);
>  	cgroup_account_cputime(p, delta_exec);
>  }
>  
> @@ -1215,7 +1434,7 @@ s64 update_curr_common(struct rq *rq)
>  
>  	delta_exec = update_curr_se(rq, &donor->se);
>  	if (likely(delta_exec > 0))
> -		update_curr_task(donor, delta_exec);
> +		update_curr_task(rq, donor, delta_exec);
>  
>  	return delta_exec;
>  }
> @@ -1244,7 +1463,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
>  	if (entity_is_task(curr)) {
>  		struct task_struct *p = task_of(curr);
>  
> -		update_curr_task(p, delta_exec);
> +		update_curr_task(rq, p, delta_exec);
>  
>  		/*
>  		 * If the fair_server is active, we need to account for the
> @@ -7850,7 +8069,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  	 * per-cpu select_rq_mask usage
>  	 */
>  	lockdep_assert_irqs_disabled();
> -
> +again:
>  	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
>  	    asym_fits_cpu(task_util, util_min, util_max, target))
>  		return target;
> @@ -7888,7 +8107,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  	/* Check a recently used CPU as a potential idle candidate: */
>  	recent_used_cpu = p->recent_used_cpu;
>  	p->recent_used_cpu = prev;
> -	if (recent_used_cpu != prev &&
> +	if (prev == p->wake_cpu &&
> +	    recent_used_cpu != prev &&
>  	    recent_used_cpu != target &&
>  	    cpus_share_cache(recent_used_cpu, target) &&
>  	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
> @@ -7941,6 +8161,18 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  	if ((unsigned)i < nr_cpumask_bits)
>  		return i;
>  
> +	if (prev != p->wake_cpu && !cpus_share_cache(prev, p->wake_cpu)) {
> +		/*
> +		 * Most likely select_cache_cpu() will have re-directed
> +		 * the wakeup, but getting here means the preferred cache is
> +		 * too busy, so re-try with the actual previous.
> +		 *
> +		 * XXX wake_affine is lost for this pass.
> +		 */
> +		prev = target = p->wake_cpu;
> +		goto again;
> +	}
> +
>  	/*
>  	 * For cluster machines which have lower sharing cache like L2 or
>  	 * LLC Tag, we tend to find an idle CPU in the target's cluster
> @@ -8563,6 +8795,40 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>  	return target;
>  }
>  
> +#ifdef CONFIG_SCHED_CACHE
> +static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle);
> +
> +static int select_cache_cpu(struct task_struct *p, int prev_cpu)
> +{
> +	struct mm_struct *mm = p->mm;
> +	int cpu;
> +
> +	if (!mm || p->nr_cpus_allowed == 1)
> +		return prev_cpu;
> +
> +	cpu = mm->mm_sched_cpu;
> +	if (cpu < 0)
> +		return prev_cpu;
> +
> +
> +	if (static_branch_likely(&sched_numa_balancing) &&
> +	    __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) {
> +		/*
> +		 * XXX look for max occupancy inside prev_cpu's node
> +		 */
> +		return prev_cpu;
> +	}
> +
> +	return cpu;
> +}
> +#else
> +static int select_cache_cpu(struct task_struct *p, int prev_cpu)
> +{
> +	return prev_cpu;
> +}
> +#endif
> +
> +
>  /*
>   * select_task_rq_fair: Select target runqueue for the waking task in domains
>   * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
> @@ -8588,6 +8854,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>  	 * required for stable ->cpus_allowed
>  	 */
>  	lockdep_assert_held(&p->pi_lock);
> +	guard(rcu)();
> +
>  	if (wake_flags & WF_TTWU) {
>  		record_wakee(p);
>  
> @@ -8595,6 +8863,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>  		    cpumask_test_cpu(cpu, p->cpus_ptr))
>  			return cpu;
>  
> +		new_cpu = prev_cpu = select_cache_cpu(p, prev_cpu);
> +
>  		if (!is_rd_overutilized(this_rq()->rd)) {
>  			new_cpu = find_energy_efficient_cpu(p, prev_cpu);
>  			if (new_cpu >= 0)
> @@ -8605,7 +8875,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>  		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
>  	}
>  
> -	rcu_read_lock();
>  	for_each_domain(cpu, tmp) {
>  		/*
>  		 * If both 'cpu' and 'prev_cpu' are part of this domain,
> @@ -8638,7 +8907,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>  		/* Fast path */
>  		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
>  	}
> -	rcu_read_unlock();
>  
>  	return new_cpu;
>  }
> @@ -9288,6 +9556,17 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>  	if (sysctl_sched_migration_cost == 0)
>  		return 0;
>  
> +#ifdef CONFIG_SCHED_CACHE
> +	if (p->mm && p->mm->pcpu_sched) {
> +		/*
> +		 * XXX things like Skylake have non-inclusive L3 and might not
> +		 * like this L3 centric view. What to do about L2 stickyness ?
> +		 */
> +		return per_cpu_ptr(p->mm->pcpu_sched, env->src_cpu)->occ >
> +		       per_cpu_ptr(p->mm->pcpu_sched, env->dst_cpu)->occ;
> +	}
> +#endif
> +
>  	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
>  
>  	return delta < (s64)sysctl_sched_migration_cost;
> @@ -9299,27 +9578,25 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>   * Returns 0, if task migration is not affected by locality.
>   * Returns a negative value, if task migration improves locality i.e migration preferred.
>   */
> -static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
> +static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle)
>  {
>  	struct numa_group *numa_group = rcu_dereference(p->numa_group);
>  	unsigned long src_weight, dst_weight;
>  	int src_nid, dst_nid, dist;
>  
> -	if (!static_branch_likely(&sched_numa_balancing))
> -		return 0;
> -
> -	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
> +	if (!p->numa_faults)
>  		return 0;
>  
> -	src_nid = cpu_to_node(env->src_cpu);
> -	dst_nid = cpu_to_node(env->dst_cpu);
> +	src_nid = cpu_to_node(src_cpu);
> +	dst_nid = cpu_to_node(dst_cpu);
>  
>  	if (src_nid == dst_nid)
>  		return 0;
>  
>  	/* Migrating away from the preferred node is always bad. */
>  	if (src_nid == p->numa_preferred_nid) {
> -		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
> +		struct rq *src_rq = cpu_rq(src_cpu);
> +		if (src_rq->nr_running > src_rq->nr_preferred_running)
>  			return 1;
>  		else
>  			return 0;
> @@ -9330,7 +9607,7 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
>  		return -1;
>  
>  	/* Leaving a core idle is often worse than degrading locality. */
> -	if (env->idle == CPU_IDLE)
> +	if (idle)
>  		return 0;
>  
>  	dist = node_distance(src_nid, dst_nid);
> @@ -9345,7 +9622,24 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
>  	return src_weight - dst_weight;
>  }
>  
> +static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
> +{
> +	if (!static_branch_likely(&sched_numa_balancing))
> +		return 0;
> +
> +	if (!(env->sd->flags & SD_NUMA))
> +		return 0;
> +
> +	return __migrate_degrades_locality(p, env->src_cpu, env->dst_cpu,
> +					   env->idle == CPU_IDLE);
> +}
> +
>  #else
> +static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle)
> +{
> +	return 0;
> +}
> +
>  static inline long migrate_degrades_locality(struct task_struct *p,
>  					     struct lb_env *env)
>  {
> @@ -13104,8 +13398,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
>   */
>  static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
>  {
> -	struct cfs_rq *cfs_rq;
>  	struct sched_entity *se = &curr->se;
> +	struct cfs_rq *cfs_rq;
>  
>  	for_each_sched_entity(se) {
>  		cfs_rq = cfs_rq_of(se);
> @@ -13115,6 +13409,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
>  	if (static_branch_unlikely(&sched_numa_balancing))
>  		task_tick_numa(rq, curr);
>  
> +	task_tick_cache(rq, curr);
> +
>  	update_misfit_status(curr, rq);
>  	check_update_overutilized_status(task_rq(curr));
>  
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 47972f34ea70..d16ccd66ca07 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1171,6 +1171,12 @@ struct rq {
>  	u64			clock_pelt_idle_copy;
>  	u64			clock_idle_copy;
>  #endif
> +#ifdef CONFIG_SCHED_CACHE
> +	raw_spinlock_t		cpu_epoch_lock;
> +	u64			cpu_runtime;
> +	unsigned long		cpu_epoch;
> +	unsigned long		cpu_epoch_next;
> +#endif
>  
>  	atomic_t		nr_iowait;
>  
> @@ -3861,6 +3867,8 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
>  static inline void init_sched_mm_cid(struct task_struct *t) { }
>  #endif /* !CONFIG_SCHED_MM_CID */
>  
> +extern void init_sched_mm(struct task_struct *p);
> +
>  extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
>  extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
>  #ifdef CONFIG_SMP
>
Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Tim Chen 8 months, 3 weeks ago
On Tue, 2025-03-25 at 13:09 +0100, Peter Zijlstra wrote:
> Hi all,
> 
> One of the many things on the eternal todo list has been finishing the
> below hackery.
> 
> It is an attempt at modelling cache affinity -- and while the patch
> really only targets LLC, it could very well be extended to also apply to
> clusters (L2). Specifically any case of multiple cache domains inside a
> node.
> 
> Anyway, I wrote this about a year ago, and I mentioned this at the
> recent OSPM conf where Gautham and Prateek expressed interest in playing
> with this code.
> 
> So here goes, very rough and largely unproven code ahead :-)
> 
> It applies to current tip/master, but I know it will fail the __percpu
> validation that sits in -next, although that shouldn't be terribly hard
> to fix up.
> 
> As is, it only computes a CPU inside the LLC that has the highest recent
> runtime; this CPU is then used in the wake-up path to steer wakeups towards
> this LLC and in task_hot() to limit migrations away from it.
> 
> More elaborate things could be done; notably, there is an XXX in there
> somewhere about finding the best LLC inside a NODE (interaction with
> NUMA_BALANCING).

Thanks for sharing the patch.

> +
> +static void task_cache_work(struct callback_head *work)
> +{
> +	struct task_struct *p = current;
> +	struct mm_struct *mm = p->mm;
> +	unsigned long m_a_occ = 0;
> +	int cpu, m_a_cpu = -1;
> +	cpumask_var_t cpus;
> +
> +	WARN_ON_ONCE(work != &p->cache_work);
> +
> +	work->next = work;
> +
> +	if (p->flags & PF_EXITING)
> +		return;
> +
> +	if (!alloc_cpumask_var(&cpus, GFP_KERNEL))
> +		return;
> +
> +	scoped_guard (cpus_read_lock) {
> +		cpumask_copy(cpus, cpu_online_mask);
> +

We can constrain the preferred LLC to the preferred node chosen by NUMA
balancing. Then NUMA balancing and the preferred LLC won't fight each other
if the preferred LLC falls on a different node.

Perhaps something like this here:

+#ifdef CONFIG_NUMA_BALANCING
+               if (static_branch_likely(&sched_numa_balancing) &&
+                       p->numa_preferred_nid != NUMA_NO_NODE)
+                       cpumask_and(cpus, cpus, cpumask_of_node(p->numa_preferred_nid));
+#endif


> +		for_each_cpu(cpu, cpus) {
> +			/* XXX sched_cluster_active */
> +			struct sched_domain *sd = per_cpu(sd_llc, cpu);
> +			unsigned long occ, m_occ = 0, a_occ = 0;
> +			int m_cpu = -1, nr = 0, i;
> +
> +			for_each_cpu(i, sched_domain_span(sd)) {
> +				occ = fraction_mm_sched(cpu_rq(i),
> +							per_cpu_ptr(mm->pcpu_sched, i));
> +				a_occ += occ;
> +				if (occ > m_occ) {
> +					m_occ = occ;
> +					m_cpu = i;
> +				}
> +				nr++;
> +				trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n",
> +					     per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr);
> +			}
> +
> +			a_occ /= nr;

Is it necessary to divide by nr (the number of CPUs in the LLC)? Suppose we
have an uneven number of CPUs between LLCs but the same total occupancy; this
division will skew the choice towards the smaller LLC, which may not be
desirable. That could happen if some CPUs are offlined. I think the preferred
LLC could be the one with the most accumulated occupancy.
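
Roughly (untested, against the hunk above), drop the division and compare the
raw sums:

-			a_occ /= nr;
+			/*
+			 * Compare accumulated occupancy directly; the 'small
+			 * occupancy' threshold further down would then need to
+			 * scale with the LLC size.
+			 */
 			if (a_occ > m_a_occ) {
 				m_a_occ = a_occ;
 				m_a_cpu = m_cpu;
 			}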

> +			if (a_occ > m_a_occ) {
> +				m_a_occ = a_occ;
> +				m_a_cpu = m_cpu;
> +			}
> +
> +			trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n",
> +				     per_cpu(sd_llc_id, cpu), a_occ, m_a_occ);
> +
> +			for_each_cpu(i, sched_domain_span(sd)) {
> +				/* XXX threshold ? */
> +				per_cpu_ptr(mm->pcpu_sched, i)->occ = a_occ;
> +			}
> +
> +			cpumask_andnot(cpus, cpus, sched_domain_span(sd));
> +		}
> +	}
> +
> +	/*
> +	 * If the max average cache occupancy is 'small' we don't care.
> +	 */
> +	if (m_a_occ < (NICE_0_LOAD >> EPOCH_OLD))
> +		m_a_cpu = -1;
> +
> +	mm->mm_sched_cpu = m_a_cpu;
> +
> +	free_cpumask_var(cpus);
> +}
> +
> +void init_sched_mm(struct task_struct *p)
> +{
> +	struct callback_head *work = &p->cache_work;
> +	init_task_work(work, task_cache_work);
> +	work->next = work;
> +}
> +
> +#else
> +
> +static inline void account_mm_sched(struct rq *rq, struct task_struct *p,
> +				    s64 delta_exec) { }
> +
> +
> +void init_sched_mm(struct task_struct *p) { }
> +
> +static void task_tick_cache(struct rq *rq, struct task_struct *p) { }
> +
> +#endif
> +
> +static inline
> +void update_curr_task(struct rq *rq, struct task_struct *p, s64 delta_exec)
>  {
>  	trace_sched_stat_runtime(p, delta_exec);
>  	account_group_exec_runtime(p, delta_exec);
> +	account_mm_sched(rq, p, delta_exec);
>  	cgroup_account_cputime(p, delta_exec);
>  }
>  
> @@ -1215,7 +1434,7 @@ s64 update_curr_common(struct rq *rq)
>  
>  	delta_exec = update_curr_se(rq, &donor->se);
>  	if (likely(delta_exec > 0))
> -		update_curr_task(donor, delta_exec);
> +		update_curr_task(rq, donor, delta_exec);
>  
>  	return delta_exec;
>  }
> @@ -1244,7 +1463,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
>  	if (entity_is_task(curr)) {
>  		struct task_struct *p = task_of(curr);
>  
> -		update_curr_task(p, delta_exec);
> +		update_curr_task(rq, p, delta_exec);
>  
>  		/*
>  		 * If the fair_server is active, we need to account for the
> @@ -7850,7 +8069,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  	 * per-cpu select_rq_mask usage
>  	 */
>  	lockdep_assert_irqs_disabled();
> -
> +again:
>  	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
>  	    asym_fits_cpu(task_util, util_min, util_max, target))
>  		return target;
> @@ -7888,7 +8107,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  	/* Check a recently used CPU as a potential idle candidate: */
>  	recent_used_cpu = p->recent_used_cpu;
>  	p->recent_used_cpu = prev;
> -	if (recent_used_cpu != prev &&
> +	if (prev == p->wake_cpu &&
> +	    recent_used_cpu != prev &&
>  	    recent_used_cpu != target &&
>  	    cpus_share_cache(recent_used_cpu, target) &&
>  	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
> @@ -7941,6 +8161,18 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  	if ((unsigned)i < nr_cpumask_bits)
>  		return i;
>  
> +	if (prev != p->wake_cpu && !cpus_share_cache(prev, p->wake_cpu)) {
> +		/*
> +		 * Most likely select_cache_cpu() will have re-directed
> +		 * the wakeup, but getting here means the preferred cache is
> +		 * too busy, so re-try with the actual previous.
> +		 *
> +		 * XXX wake_affine is lost for this pass.
> +		 */
> +		prev = target = p->wake_cpu;
> +		goto again;
> +	}
> +
>  	/*
>  	 * For cluster machines which have lower sharing cache like L2 or
>  	 * LLC Tag, we tend to find an idle CPU in the target's cluster
> @@ -8563,6 +8795,40 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>  	return target;
>  }
>  
> +#ifdef CONFIG_SCHED_CACHE
> +static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle);
> +
> +static int select_cache_cpu(struct task_struct *p, int prev_cpu)
> +{
> +	struct mm_struct *mm = p->mm;
> +	int cpu;
> +
> +	if (!mm || p->nr_cpus_allowed == 1)
> +		return prev_cpu;
> +
> +	cpu = mm->mm_sched_cpu;

Some regressions were seen when the preferred LLC has significant load,
causing frequent task migrations between the preferred and other LLCs. For
those cases, tasks should stay with prev_cpu to avoid migration overhead.
Perhaps we could consider the load in the preferred LLC to decide whether to
move a task there. Say, if the preferred LLC is heavily loaded (e.g.
nr_running in the LLC >= number of CPUs in the LLC), we stick with prev_cpu
and don't attempt to switch LLCs.
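
Something like the below, perhaps (untested sketch; llc_overloaded() is a
made-up helper for illustration):

+/* Very rough 'is the preferred LLC already saturated' check (sketch). */
+static bool llc_overloaded(int cpu)
+{
+	struct sched_domain *sd = rcu_dereference(per_cpu(sd_llc, cpu));
+	unsigned int nr = 0;
+	int i;
+
+	if (!sd)
+		return false;
+
+	for_each_cpu(i, sched_domain_span(sd))
+		nr += cpu_rq(i)->nr_running;
+
+	return nr >= per_cpu(sd_llc_size, cpu);
+}

and then in select_cache_cpu():

 	cpu = mm->mm_sched_cpu;
 	if (cpu < 0)
 		return prev_cpu;
 
+	/* Preferred LLC has no spare capacity; don't pile on. */
+	if (llc_overloaded(cpu))
+		return prev_cpu;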


> +	if (cpu < 0)
> +		return prev_cpu;
> +
> +
> +	if (static_branch_likely(&sched_numa_balancing) &&
> +	    __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) {
> +		/*
> +		 * XXX look for max occupancy inside prev_cpu's node
> +		 */
> +		return prev_cpu;
> +	}
> +
> +	return cpu;
> +}
> +#else
> +static int select_cache_cpu(struct task_struct *p, int prev_cpu)
> +{
> +	return prev_cpu;
> +}
> +#endif
> +
> +
>  /*
>   * select_task_rq_fair: Select target runqueue for the waking task in domains
>   * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
> @@ -8588,6 +8854,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>  	 * required for stable ->cpus_allowed
>  	 */
>  	lockdep_assert_held(&p->pi_lock);
> +	guard(rcu)();

Tim
Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Abel Wu 8 months, 3 weeks ago
Hi Peter,

On 3/25/25 8:09 PM, Peter Zijlstra wrote:
> Hi all,
> 
> One of the many things on the eternal todo list has been finishing the
> below hackery.
> 
> It is an attempt at modelling cache affinity -- and while the patch
> really only targets LLC, it could very well be extended to also apply to
> clusters (L2). Specifically any case of multiple cache domains inside a
> node.
> 
> Anyway, I wrote this about a year ago, and I mentioned this at the
> recent OSPM conf where Gautham and Prateek expressed interest in playing
> with this code.
> 
> So here goes, very rough and largely unproven code ahead :-)
> 
> It applies to current tip/master, but I know it will fail the __percpu
> validation that sits in -next, although that shouldn't be terribly hard
> to fix up.
> 
> As is, it only computes a CPU inside the LLC that has the highest recent
> runtime; this CPU is then used in the wake-up path to steer wakeups towards
> this LLC and in task_hot() to limit migrations away from it.
> 
> More elaborate things could be done; notably, there is an XXX in there
> somewhere about finding the best LLC inside a NODE (interaction with
> NUMA_BALANCING).
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>   include/linux/mm_types.h |  44 +++++++
>   include/linux/sched.h    |   4 +
>   init/Kconfig             |   4 +
>   kernel/fork.c            |   5 +
>   kernel/sched/core.c      |  13 +-
>   kernel/sched/fair.c      | 330 ++++++++++++++++++++++++++++++++++++++++++++---
>   kernel/sched/sched.h     |   8 ++
>   7 files changed, 388 insertions(+), 20 deletions(-)
> 
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 0234f14f2aa6..3ed8dd225eb9 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -800,6 +800,12 @@ struct mm_cid {
>   };
>   #endif
>   
> +struct mm_sched {
> +	u64 runtime;
> +	unsigned long epoch;
> +	unsigned long occ;
> +};
> +
>   struct kioctx_table;
>   struct iommu_mm_data;
>   struct mm_struct {
> @@ -890,6 +896,17 @@ struct mm_struct {
>   		 */
>   		raw_spinlock_t cpus_allowed_lock;
>   #endif
> +#ifdef CONFIG_SCHED_CACHE
> +		/*
> +		 * Track per-cpu-per-process occupancy as a proxy for cache residency.
> +		 * See account_mm_sched() and ...
> +		 */
> +		struct mm_sched __percpu *pcpu_sched;
> +		raw_spinlock_t mm_sched_lock;
> +		unsigned long mm_sched_epoch;
> +		int mm_sched_cpu;
> +#endif
> +
>   #ifdef CONFIG_MMU
>   		atomic_long_t pgtables_bytes;	/* size of all page tables */
>   #endif
> @@ -1296,6 +1313,33 @@ static inline unsigned int mm_cid_size(void)
>   static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
>   #endif /* CONFIG_SCHED_MM_CID */
>   
> +#ifdef CONFIG_SCHED_CACHE
> +extern void mm_init_sched(struct mm_struct *mm, struct mm_sched *pcpu_sched);
> +
> +static inline int mm_alloc_sched_noprof(struct mm_struct *mm)
> +{
> +	struct mm_sched *pcpu_sched = alloc_percpu_noprof(struct mm_sched);
> +	if (!pcpu_sched)
> +		return -ENOMEM;
> +
> +	mm_init_sched(mm, pcpu_sched);
> +	return 0;
> +}
> +
> +#define mm_alloc_sched(...)	alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__))
> +
> +static inline void mm_destroy_sched(struct mm_struct *mm)
> +{
> +	free_percpu(mm->pcpu_sched);
> +	mm->pcpu_sched = NULL;
> +}
> +#else /* !CONFIG_SCHED_CACHE */
> +
> +static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; }
> +static inline void mm_destroy_sched(struct mm_struct *mm) { }
> +
> +#endif /* CONFIG_SCHED_CACHE */
> +
>   struct mmu_gather;
>   extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
>   extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 6e5c38718ff5..f8eafe440369 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1379,6 +1379,10 @@ struct task_struct {
>   	unsigned long			numa_pages_migrated;
>   #endif /* CONFIG_NUMA_BALANCING */
>   
> +#ifdef CONFIG_SCHED_CACHE
> +	struct callback_head		cache_work;
> +#endif

IIUC this work updates stats for the whole mm, so it seems unnecessary for
each task of the process to repeat the same thing. Hence it would be better
to move this work to mm_struct.
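
i.e. something like the below (rough sketch, field name made up), with
task_tick_cache()/task_cache_work() then using &mm->mm_sched_work instead of
&p->cache_work, still serialized by mm_sched_lock:

 		struct mm_sched __percpu *pcpu_sched;
 		raw_spinlock_t mm_sched_lock;
 		unsigned long mm_sched_epoch;
 		int mm_sched_cpu;
+		/* one pending scan per mm rather than per task (sketch) */
+		struct callback_head mm_sched_work;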

> +
>   #ifdef CONFIG_RSEQ
>   	struct rseq __user *rseq;
>   	u32 rseq_len;
> diff --git a/init/Kconfig b/init/Kconfig
> index 681f38ee68db..14b15215318f 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -950,6 +950,10 @@ config NUMA_BALANCING
>   
>   	  This system will be inactive on UMA systems.
>   
> +config SCHED_CACHE
> +	bool "Cache aware scheduler"
> +	default y
> +
>   config NUMA_BALANCING_DEFAULT_ENABLED
>   	bool "Automatically enable NUMA aware memory/task placement"
>   	default y
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 1b659b07ecd5..bc9d7dbfd980 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1314,6 +1314,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>   	if (mm_alloc_cid(mm, p))
>   		goto fail_cid;
>   
> +	if (mm_alloc_sched(mm))
> +		goto fail_sched;
> +
>   	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
>   				     NR_MM_COUNTERS))
>   		goto fail_pcpu;
> @@ -1323,6 +1326,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>   	return mm;
>   
>   fail_pcpu:
> +	mm_destroy_sched(mm);
> +fail_sched:
>   	mm_destroy_cid(mm);
>   fail_cid:
>   	destroy_context(mm);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 87540217fc09..649db6ea41ea 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4514,6 +4514,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
>   	p->migration_pending = NULL;
>   #endif
>   	init_sched_mm_cid(p);
> +	init_sched_mm(p);
>   }
>   
>   DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
> @@ -8505,6 +8506,7 @@ static struct kmem_cache *task_group_cache __ro_after_init;
>   
>   void __init sched_init(void)
>   {
> +	unsigned long now = jiffies;
>   	unsigned long ptr = 0;
>   	int i;
>   
> @@ -8579,7 +8581,7 @@ void __init sched_init(void)
>   		raw_spin_lock_init(&rq->__lock);
>   		rq->nr_running = 0;
>   		rq->calc_load_active = 0;
> -		rq->calc_load_update = jiffies + LOAD_FREQ;
> +		rq->calc_load_update = now + LOAD_FREQ;
>   		init_cfs_rq(&rq->cfs);
>   		init_rt_rq(&rq->rt);
>   		init_dl_rq(&rq->dl);
> @@ -8623,7 +8625,7 @@ void __init sched_init(void)
>   		rq->cpu_capacity = SCHED_CAPACITY_SCALE;
>   		rq->balance_callback = &balance_push_callback;
>   		rq->active_balance = 0;
> -		rq->next_balance = jiffies;
> +		rq->next_balance = now;
>   		rq->push_cpu = 0;
>   		rq->cpu = i;
>   		rq->online = 0;
> @@ -8635,7 +8637,7 @@ void __init sched_init(void)
>   
>   		rq_attach_root(rq, &def_root_domain);
>   #ifdef CONFIG_NO_HZ_COMMON
> -		rq->last_blocked_load_update_tick = jiffies;
> +		rq->last_blocked_load_update_tick = now;
>   		atomic_set(&rq->nohz_flags, 0);
>   
>   		INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
> @@ -8660,6 +8662,11 @@ void __init sched_init(void)
>   
>   		rq->core_cookie = 0UL;
>   #endif
> +#ifdef CONFIG_SCHED_CACHE
> +		raw_spin_lock_init(&rq->cpu_epoch_lock);
> +		rq->cpu_epoch_next = now;
> +#endif
> +
>   		zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
>   	}
>   
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e43993a4e580..943af076e09c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1166,10 +1166,229 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
>   	return delta_exec;
>   }
>   
> -static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
> +#ifdef CONFIG_SCHED_CACHE
> +
> +/*
> + * XXX numbers come from a place the sun don't shine -- probably wants to be SD
> + * tunable or so.
> + */
> +#define EPOCH_PERIOD	(HZ/100)	/* 10 ms */
> +#define EPOCH_OLD	5		/* 50 ms */
> +
> +void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched)
> +{
> +	unsigned long epoch;
> +	int i;
> +
> +	for_each_possible_cpu(i) {
> +		struct mm_sched *pcpu_sched = per_cpu_ptr(_pcpu_sched, i);
> +		struct rq *rq = cpu_rq(i);
> +
> +		pcpu_sched->runtime = 0;
> +		pcpu_sched->epoch = epoch = rq->cpu_epoch;
> +		pcpu_sched->occ = -1;
> +	}
> +
> +	raw_spin_lock_init(&mm->mm_sched_lock);
> +	mm->mm_sched_epoch = epoch;
> +	mm->mm_sched_cpu = -1;
> +
> +	smp_store_release(&mm->pcpu_sched, _pcpu_sched);
> +}
> +
> +/* because why would C be fully specified */
> +static __always_inline void __shr_u64(u64 *val, unsigned int n)
> +{
> +	if (n >= 64) {
> +		*val = 0;
> +		return;
> +	}
> +	*val >>= n;
> +}
> +
> +static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
> +{
> +	lockdep_assert_held(&rq->cpu_epoch_lock);
> +
> +	unsigned long n, now = jiffies;
> +	long delta = now - rq->cpu_epoch_next;
> +
> +	if (delta > 0) {
> +		n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD;
> +		rq->cpu_epoch += n;
> +		rq->cpu_epoch_next += n * EPOCH_PERIOD;
> +		__shr_u64(&rq->cpu_runtime, n);
> +	}
> +
> +	n = rq->cpu_epoch - pcpu_sched->epoch;
> +	if (n) {
> +		pcpu_sched->epoch += n;
> +		__shr_u64(&pcpu_sched->runtime, n);
> +	}
> +}
> +
> +static unsigned long fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
> +{
> +	guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock);
> +
> +	__update_mm_sched(rq, pcpu_sched);
> +
> +	/*
> +	 * Runtime is a geometric series (r=0.5) and as such will sum to twice
> +	 * the accumulation period, this means the multiplcation here should
> +	 * not overflow.
> +	 */
> +	return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1);

Should the actual cpu capacity also be taken into consideration?
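
E.g. (untested sketch) weight the per-CPU fraction by its capacity when
aggregating in task_cache_work(), so the same fraction on a bigger core
counts for more:

 				occ = fraction_mm_sched(cpu_rq(i),
 							per_cpu_ptr(mm->pcpu_sched, i));
+				/* weight by CPU capacity (sketch) */
+				occ = occ * arch_scale_cpu_capacity(i) >>
+				      SCHED_CAPACITY_SHIFT;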

> +}
> +
> +static inline
> +void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
> +{
> +	struct mm_struct *mm = p->mm;
> +	struct mm_sched *pcpu_sched;
> +	unsigned long epoch;
> +
> +	/*
> +	 * init_task and kthreads don't be having no mm
> +	 */
> +	if (!mm || !mm->pcpu_sched)
> +		return;
> +
> +	pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched);
> +
> +	scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
> +		__update_mm_sched(rq, pcpu_sched);
> +		pcpu_sched->runtime += delta_exec;
> +		rq->cpu_runtime += delta_exec;
> +		epoch = rq->cpu_epoch;
> +	}
> +
> +	/*
> +	 * If this task hasn't hit task_cache_work() for a while, invalidate
> +	 * it's preferred state.
> +	 */
> +	if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) {
> +		mm->mm_sched_cpu = -1;
> +		pcpu_sched->occ = -1;
> +	}

This seems too late. account_mm_sched() is called while p is runnable, so if
the whole process sleeps for a while before being woken up, ttwu will see the
outdated value.
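
One way around that (rough sketch; mm_sched_stamp would be a new unsigned long
in mm_struct, name made up) could be to stamp the scan and have the wakeup
path ignore a stale hint. In task_cache_work(), when publishing the result:

 	mm->mm_sched_cpu = m_a_cpu;
+	WRITE_ONCE(mm->mm_sched_stamp, jiffies);	/* made-up field */

and in select_cache_cpu(), before using the hint:

+	if (time_after(jiffies, READ_ONCE(mm->mm_sched_stamp) +
+				EPOCH_OLD * EPOCH_PERIOD))
+		return prev_cpu;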

> +}
> +
> +static void task_tick_cache(struct rq *rq, struct task_struct *p)
> +{
> +	struct callback_head *work = &p->cache_work;
> +	struct mm_struct *mm = p->mm;
> +
> +	if (!mm || !mm->pcpu_sched)
> +		return;
> +
> +	if (mm->mm_sched_epoch == rq->cpu_epoch)
> +		return;
> +
> +	guard(raw_spinlock)(&mm->mm_sched_lock);
> +
> +	if (mm->mm_sched_epoch == rq->cpu_epoch)
> +		return;
> +
> +	if (work->next == work) {
> +		task_work_add(p, work, TWA_RESUME);
> +		WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch);
> +	}
> +}
> +
> +static void task_cache_work(struct callback_head *work)
> +{
> +	struct task_struct *p = current;
> +	struct mm_struct *mm = p->mm;
> +	unsigned long m_a_occ = 0;
> +	int cpu, m_a_cpu = -1;
> +	cpumask_var_t cpus;
> +
> +	WARN_ON_ONCE(work != &p->cache_work);
> +
> +	work->next = work;
> +
> +	if (p->flags & PF_EXITING)
> +		return;
> +
> +	if (!alloc_cpumask_var(&cpus, GFP_KERNEL))
> +		return;
> +
> +	scoped_guard (cpus_read_lock) {
> +		cpumask_copy(cpus, cpu_online_mask);
> +
> +		for_each_cpu(cpu, cpus) {
> +			/* XXX sched_cluster_active */
> +			struct sched_domain *sd = per_cpu(sd_llc, cpu);
> +			unsigned long occ, m_occ = 0, a_occ = 0;
> +			int m_cpu = -1, nr = 0, i;
> +
> +			for_each_cpu(i, sched_domain_span(sd)) {
> +				occ = fraction_mm_sched(cpu_rq(i),
> +							per_cpu_ptr(mm->pcpu_sched, i));
> +				a_occ += occ;
> +				if (occ > m_occ) {
> +					m_occ = occ;
> +					m_cpu = i;
> +				}

This could cause task stacking on the hint cpu, since the hint is updated
much less frequently than wakeups happen.

And although the occupancy heuristic looks reasonable, IMHO it doesn't make
much sense to compare between cpus that share the LLC; a non-hint cpu with a
warmer L1/L2$ in the same LLC as the hint cpu seems preferable.

Do you think it's appropriate to only hint on the hottest LLC, so that tasks
can hopefully be woken up on the 'right' LLC, on the premise that this
wouldn't cause much imbalance between LLCs?
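
For instance (rough sketch in select_cache_cpu()), keep prev_cpu whenever it
already sits inside the preferred LLC, so the hint only steers the LLC choice
and select_idle_sibling() can still pick a warmer sibling:

 	cpu = mm->mm_sched_cpu;
 	if (cpu < 0)
 		return prev_cpu;
 
+	/* prev_cpu is already in the preferred LLC; keep its warmer L1/L2. */
+	if (cpus_share_cache(prev_cpu, cpu))
+		return prev_cpu;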

I will do some tests and return with more feedback.

Thanks!
	Abel

> +				nr++;
> +				trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n",
> +					     per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr);
> +			}
> +
> +			a_occ /= nr;
> +			if (a_occ > m_a_occ) {
> +				m_a_occ = a_occ;
> +				m_a_cpu = m_cpu;
> +			}
> +
> +			trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n",
> +				     per_cpu(sd_llc_id, cpu), a_occ, m_a_occ);
> +
> +			for_each_cpu(i, sched_domain_span(sd)) {
> +				/* XXX threshold ? */
> +				per_cpu_ptr(mm->pcpu_sched, i)->occ = a_occ;
> +			}
> +
> +			cpumask_andnot(cpus, cpus, sched_domain_span(sd));
> +		}
> +	}
> +
> +	/*
> +	 * If the max average cache occupancy is 'small' we don't care.
> +	 */
> +	if (m_a_occ < (NICE_0_LOAD >> EPOCH_OLD))
> +		m_a_cpu = -1;
> +
> +	mm->mm_sched_cpu = m_a_cpu;
> +
> +	free_cpumask_var(cpus);
> +}
> +
> +void init_sched_mm(struct task_struct *p)
> +{
> +	struct callback_head *work = &p->cache_work;
> +	init_task_work(work, task_cache_work);
> +	work->next = work;
> +}
> +
> +#else
> +
> +static inline void account_mm_sched(struct rq *rq, struct task_struct *p,
> +				    s64 delta_exec) { }
> +
> +
> +void init_sched_mm(struct task_struct *p) { }
> +
> +static void task_tick_cache(struct rq *rq, struct task_struct *p) { }
> +
> +#endif
> +
> +static inline
> +void update_curr_task(struct rq *rq, struct task_struct *p, s64 delta_exec)
>   {
>   	trace_sched_stat_runtime(p, delta_exec);
>   	account_group_exec_runtime(p, delta_exec);
> +	account_mm_sched(rq, p, delta_exec);
>   	cgroup_account_cputime(p, delta_exec);
>   }
>   
> @@ -1215,7 +1434,7 @@ s64 update_curr_common(struct rq *rq)
>   
>   	delta_exec = update_curr_se(rq, &donor->se);
>   	if (likely(delta_exec > 0))
> -		update_curr_task(donor, delta_exec);
> +		update_curr_task(rq, donor, delta_exec);
>   
>   	return delta_exec;
>   }
> @@ -1244,7 +1463,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
>   	if (entity_is_task(curr)) {
>   		struct task_struct *p = task_of(curr);
>   
> -		update_curr_task(p, delta_exec);
> +		update_curr_task(rq, p, delta_exec);
>   
>   		/*
>   		 * If the fair_server is active, we need to account for the
> @@ -7850,7 +8069,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>   	 * per-cpu select_rq_mask usage
>   	 */
>   	lockdep_assert_irqs_disabled();
> -
> +again:
>   	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
>   	    asym_fits_cpu(task_util, util_min, util_max, target))
>   		return target;
> @@ -7888,7 +8107,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>   	/* Check a recently used CPU as a potential idle candidate: */
>   	recent_used_cpu = p->recent_used_cpu;
>   	p->recent_used_cpu = prev;
> -	if (recent_used_cpu != prev &&
> +	if (prev == p->wake_cpu &&
> +	    recent_used_cpu != prev &&
>   	    recent_used_cpu != target &&
>   	    cpus_share_cache(recent_used_cpu, target) &&
>   	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
> @@ -7941,6 +8161,18 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>   	if ((unsigned)i < nr_cpumask_bits)
>   		return i;
>   
> +	if (prev != p->wake_cpu && !cpus_share_cache(prev, p->wake_cpu)) {
> +		/*
> +		 * Most likely select_cache_cpu() will have re-directed
> +		 * the wakeup, but getting here means the preferred cache is
> +		 * too busy, so re-try with the actual previous.
> +		 *
> +		 * XXX wake_affine is lost for this pass.
> +		 */
> +		prev = target = p->wake_cpu;
> +		goto again;
> +	}
> +
>   	/*
>   	 * For cluster machines which have lower sharing cache like L2 or
>   	 * LLC Tag, we tend to find an idle CPU in the target's cluster
> @@ -8563,6 +8795,40 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>   	return target;
>   }
>   
> +#ifdef CONFIG_SCHED_CACHE
> +static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle);
> +
> +static int select_cache_cpu(struct task_struct *p, int prev_cpu)
> +{
> +	struct mm_struct *mm = p->mm;
> +	int cpu;
> +
> +	if (!mm || p->nr_cpus_allowed == 1)
> +		return prev_cpu;
> +
> +	cpu = mm->mm_sched_cpu;
> +	if (cpu < 0)
> +		return prev_cpu;
> +
> +
> +	if (static_branch_likely(&sched_numa_balancing) &&
> +	    __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) {
> +		/*
> +		 * XXX look for max occupancy inside prev_cpu's node
> +		 */
> +		return prev_cpu;
> +	}
> +
> +	return cpu;
> +}
> +#else
> +static int select_cache_cpu(struct task_struct *p, int prev_cpu)
> +{
> +	return prev_cpu;
> +}
> +#endif
> +
> +
>   /*
>    * select_task_rq_fair: Select target runqueue for the waking task in domains
>    * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
> @@ -8588,6 +8854,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>   	 * required for stable ->cpus_allowed
>   	 */
>   	lockdep_assert_held(&p->pi_lock);
> +	guard(rcu)();
> +
>   	if (wake_flags & WF_TTWU) {
>   		record_wakee(p);
>   
> @@ -8595,6 +8863,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>   		    cpumask_test_cpu(cpu, p->cpus_ptr))
>   			return cpu;
>   
> +		new_cpu = prev_cpu = select_cache_cpu(p, prev_cpu);
> +
>   		if (!is_rd_overutilized(this_rq()->rd)) {
>   			new_cpu = find_energy_efficient_cpu(p, prev_cpu);
>   			if (new_cpu >= 0)
> @@ -8605,7 +8875,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>   		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
>   	}
>   
> -	rcu_read_lock();
>   	for_each_domain(cpu, tmp) {
>   		/*
>   		 * If both 'cpu' and 'prev_cpu' are part of this domain,
> @@ -8638,7 +8907,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>   		/* Fast path */
>   		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
>   	}
> -	rcu_read_unlock();
>   
>   	return new_cpu;
>   }
> @@ -9288,6 +9556,17 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>   	if (sysctl_sched_migration_cost == 0)
>   		return 0;
>   
> +#ifdef CONFIG_SCHED_CACHE
> +	if (p->mm && p->mm->pcpu_sched) {
> +		/*
> +		 * XXX things like Skylake have non-inclusive L3 and might not
> +		 * like this L3 centric view. What to do about L2 stickyness ?
> +		 */
> +		return per_cpu_ptr(p->mm->pcpu_sched, env->src_cpu)->occ >
> +		       per_cpu_ptr(p->mm->pcpu_sched, env->dst_cpu)->occ;
> +	}
> +#endif
> +
>   	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
>   
>   	return delta < (s64)sysctl_sched_migration_cost;
> @@ -9299,27 +9578,25 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>    * Returns 0, if task migration is not affected by locality.
>    * Returns a negative value, if task migration improves locality i.e migration preferred.
>    */
> -static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
> +static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle)
>   {
>   	struct numa_group *numa_group = rcu_dereference(p->numa_group);
>   	unsigned long src_weight, dst_weight;
>   	int src_nid, dst_nid, dist;
>   
> -	if (!static_branch_likely(&sched_numa_balancing))
> -		return 0;
> -
> -	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
> +	if (!p->numa_faults)
>   		return 0;
>   
> -	src_nid = cpu_to_node(env->src_cpu);
> -	dst_nid = cpu_to_node(env->dst_cpu);
> +	src_nid = cpu_to_node(src_cpu);
> +	dst_nid = cpu_to_node(dst_cpu);
>   
>   	if (src_nid == dst_nid)
>   		return 0;
>   
>   	/* Migrating away from the preferred node is always bad. */
>   	if (src_nid == p->numa_preferred_nid) {
> -		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
> +		struct rq *src_rq = cpu_rq(src_cpu);
> +		if (src_rq->nr_running > src_rq->nr_preferred_running)
>   			return 1;
>   		else
>   			return 0;
> @@ -9330,7 +9607,7 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
>   		return -1;
>   
>   	/* Leaving a core idle is often worse than degrading locality. */
> -	if (env->idle == CPU_IDLE)
> +	if (idle)
>   		return 0;
>   
>   	dist = node_distance(src_nid, dst_nid);
> @@ -9345,7 +9622,24 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
>   	return src_weight - dst_weight;
>   }
>   
> +static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
> +{
> +	if (!static_branch_likely(&sched_numa_balancing))
> +		return 0;
> +
> +	if (!(env->sd->flags & SD_NUMA))
> +		return 0;
> +
> +	return __migrate_degrades_locality(p, env->src_cpu, env->dst_cpu,
> +					   env->idle == CPU_IDLE);
> +}
> +
>   #else
> +static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle)
> +{
> +	return 0;
> +}
> +
>   static inline long migrate_degrades_locality(struct task_struct *p,
>   					     struct lb_env *env)
>   {
> @@ -13104,8 +13398,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
>    */
>   static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
>   {
> -	struct cfs_rq *cfs_rq;
>   	struct sched_entity *se = &curr->se;
> +	struct cfs_rq *cfs_rq;
>   
>   	for_each_sched_entity(se) {
>   		cfs_rq = cfs_rq_of(se);
> @@ -13115,6 +13409,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
>   	if (static_branch_unlikely(&sched_numa_balancing))
>   		task_tick_numa(rq, curr);
>   
> +	task_tick_cache(rq, curr);
> +
>   	update_misfit_status(curr, rq);
>   	check_update_overutilized_status(task_rq(curr));
>   
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 47972f34ea70..d16ccd66ca07 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1171,6 +1171,12 @@ struct rq {
>   	u64			clock_pelt_idle_copy;
>   	u64			clock_idle_copy;
>   #endif
> +#ifdef CONFIG_SCHED_CACHE
> +	raw_spinlock_t		cpu_epoch_lock;
> +	u64			cpu_runtime;
> +	unsigned long		cpu_epoch;
> +	unsigned long		cpu_epoch_next;
> +#endif
>   
>   	atomic_t		nr_iowait;
>   
> @@ -3861,6 +3867,8 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
>   static inline void init_sched_mm_cid(struct task_struct *t) { }
>   #endif /* !CONFIG_SCHED_MM_CID */
>   
> +extern void init_sched_mm(struct task_struct *p);
> +
>   extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
>   extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
>   #ifdef CONFIG_SMP
>
Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Chen, Yu C 8 months, 3 weeks ago
On 3/28/2025 9:57 PM, Abel Wu wrote:
> Hi Peter,
> 
> On 3/25/25 8:09 PM, Peter Zijlstra wrote:
>>   struct mmu_gather;
>>   extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct 
>> *mm);
>>   extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct 
>> mm_struct *mm);
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 6e5c38718ff5..f8eafe440369 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1379,6 +1379,10 @@ struct task_struct {
>>       unsigned long            numa_pages_migrated;
>>   #endif /* CONFIG_NUMA_BALANCING */
>> +#ifdef CONFIG_SCHED_CACHE
>> +    struct callback_head        cache_work;
>> +#endif
> 
> IIUC this work updates stats for the whole mm and seems not
> necessary for each task of the process to repeat same thing.
> Hence would be better move this work to mm_struct.
> 

It seems that the per-task cache_work is used to avoid queueing
duplicate task_cache_work() entries on task->task_works, see
task_tick_cache()'s check
  if (work->next == work)
	task_work_add()

To make task_cache_work() exclusive and only allow 1 task
in the process to do the calculation, maybe introduce a
mechanism similar to task_numa_work(), something like this:

if (!try_cmpxchg(&mm->cache_next_scan, &calc, next_scan))
	return;
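
Spelled out a bit more, the gate could look roughly like this (untested
sketch; mm->cache_next_scan would be a new field mirroring
mm->numa_next_scan, and the 10 ms period is just EPOCH_PERIOD):

static void task_tick_cache(struct rq *rq, struct task_struct *p)
{
	struct mm_struct *mm = p->mm;
	unsigned long now = jiffies;
	unsigned long calc, next_scan;

	if (!mm || !mm->pcpu_sched)
		return;

	calc = READ_ONCE(mm->cache_next_scan);
	if (time_before(now, calc))
		return;

	next_scan = now + msecs_to_jiffies(10);
	/* Only one thread of the process wins the right to queue the work. */
	if (!try_cmpxchg(&mm->cache_next_scan, &calc, next_scan))
		return;

	if (p->cache_work.next == &p->cache_work)
		task_work_add(p, &p->cache_work, TWA_RESUME);
}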

>> +
>> +static inline
>> +void account_mm_sched(struct rq *rq, struct task_struct *p, s64 
>> delta_exec)
>> +{
>> +    struct mm_struct *mm = p->mm;
>> +    struct mm_sched *pcpu_sched;
>> +    unsigned long epoch;
>> +
>> +    /*
>> +     * init_task and kthreads don't be having no mm
>> +     */
>> +    if (!mm || !mm->pcpu_sched)
>> +        return;
>> +
>> +    pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched);
>> +
>> +    scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
>> +        __update_mm_sched(rq, pcpu_sched);
>> +        pcpu_sched->runtime += delta_exec;
>> +        rq->cpu_runtime += delta_exec;
>> +        epoch = rq->cpu_epoch;
>> +    }
>> +
>> +    /*
>> +     * If this task hasn't hit task_cache_work() for a while, invalidate
>> +     * it's preferred state.
>> +     */
>> +    if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) {
>> +        mm->mm_sched_cpu = -1;
>> +        pcpu_sched->occ = -1;
>> +    }
> 
> This seems too late. account_mm_sched() is called when p is runnable,
> so if the whole process sleeps for a while before woken up, ttwu will
> take the out-dated value.
> 

Yup, there seems to be a problem. It would be better if we could reset
mm_sched_cpu to -1 after the last thread of the process falls
asleep. But considering that all threads are sleeping, even if ttwu
tries to enqueue the first newly-woken thread on an outdated idle
mm_sched_cpu, does it matter? I guess it would not be a serious problem,
because all of the process's cache might have been evicted during the
long sleep.

>> +
>> +static void task_cache_work(struct callback_head *work)
>> +{
>> +    struct task_struct *p = current;
>> +    struct mm_struct *mm = p->mm;
>> +    unsigned long m_a_occ = 0;
>> +    int cpu, m_a_cpu = -1;
>> +    cpumask_var_t cpus;
>> +
>> +    WARN_ON_ONCE(work != &p->cache_work);
>> +
>> +    work->next = work;
>> +
>> +    if (p->flags & PF_EXITING)
>> +        return;
>> +
>> +    if (!alloc_cpumask_var(&cpus, GFP_KERNEL))
>> +        return;
>> +
>> +    scoped_guard (cpus_read_lock) {
>> +        cpumask_copy(cpus, cpu_online_mask);
>> +
>> +        for_each_cpu(cpu, cpus) {
>> +            /* XXX sched_cluster_active */
>> +            struct sched_domain *sd = per_cpu(sd_llc, cpu);
>> +            unsigned long occ, m_occ = 0, a_occ = 0;
>> +            int m_cpu = -1, nr = 0, i;
>> +
>> +            for_each_cpu(i, sched_domain_span(sd)) {
>> +                occ = fraction_mm_sched(cpu_rq(i),
>> +                            per_cpu_ptr(mm->pcpu_sched, i));
>> +                a_occ += occ;
>> +                if (occ > m_occ) {
>> +                    m_occ = occ;
>> +                    m_cpu = i;
>> +                }
> 
> It would be possible to cause task stacking on this hint cpu
> due to its less frequently updated compared to wakeup.
> 

The SIS would overwrite the prev CPU with this hint (cached) CPU and
use that cached CPU as a starting point to search for an idle CPU, so
ideally it should not cause task stacking. But there is a race condition
where multiple wakeup paths might find the same cached "idle" CPU and
queue wakees on it; this usually happens with frequent context
switches (wakeups) plus short-duration tasks.


> And although the occupancy heuristic looks reasonable, IMHO it
> doesn't make much sense to compare between cpus as they share
> the LLC, and a non-hint cpu with warmer L1/L2$ in same LLC with
> the hint cpu seems more preferred.
> 
> Do you think it's appropriate or not to only hint on the hottest
> LLC? So the tasks can hopefully wokenup on 'right' LLC on the
> premise that wouldn't cause much imbalance between LLCs.
> 
 > I will do some tests and return with more feedback.
 >

Finding an idle CPU in the wakee's hottest LLC seems plausible.
The benchmark data might point us to the proper way.

thanks,
Chenyu


Re: Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Abel Wu 8 months, 3 weeks ago
On 3/29/25 11:06 PM, Chen, Yu C wrote:
> On 3/28/2025 9:57 PM, Abel Wu wrote:
>>
>> IIUC this work updates stats for the whole mm and seems not
>> necessary for each task of the process to repeat same thing.
>> Hence would be better move this work to mm_struct.
>>
> 
> It seems that the per task cache_work is used to avoid
> duplicated task_cache_work() in task->task_works queue, see
> task_tick_cache()'s check
>   if (work->next == work)
>      task_work_add()

Yes, this check avoids corrupting task_works through double
adding of the work, no matter whether the work is task- or mm-specific.
So if it moves to mm_struct, this check becomes false once any task
takes the work, and other tasks cannot do the same job until that task
finishes by setting work->next back to work.

BTW, moving it to mm_struct would also save some memory footprint?

> 
> To do exclusive task_cache_work() and only allow 1 task
> in the process to do the calculation, maybe introducing similar mechanism like task_numa_work(), something like this:
> 
> if (!try_cmpxchg(&mm->cache_next_scan, &calc, next_scan))
>      return;

Yes, this looks good to me. Peter used the epoch to regulate
the frequency of this work, which is a lighter way but allows some
inaccuracy, which is then fixed up by a double check after taking
mm->mm_sched_lock.

I plan to use a trylock on mm->mm_sched_lock first. If the trylock
fails then someone else is adding the work and we can skip early.

static void task_tick_cache(struct rq *rq, struct task_struct *p)
{
	...

	if (mm->mm_sched_epoch == rq->cpu_epoch)
		return;

	guard(raw_spinlock)(&mm->mm_sched_lock);  <-- trylock

	...
}
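
Expanded a bit, the idea would look roughly like this (untested):

static void task_tick_cache(struct rq *rq, struct task_struct *p)
{
	struct callback_head *work = &p->cache_work;
	struct mm_struct *mm = p->mm;

	if (!mm || !mm->pcpu_sched)
		return;

	if (mm->mm_sched_epoch == rq->cpu_epoch)
		return;

	/* Someone else is queueing the work for this mm; skip early. */
	if (!raw_spin_trylock(&mm->mm_sched_lock))
		return;

	if (mm->mm_sched_epoch != rq->cpu_epoch && work->next == work) {
		task_work_add(p, work, TWA_RESUME);
		WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch);
	}

	raw_spin_unlock(&mm->mm_sched_lock);
}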

> 
>>> +
>>> +static inline
>>> +void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>>> +{
>>> +    struct mm_struct *mm = p->mm;
>>> +    struct mm_sched *pcpu_sched;
>>> +    unsigned long epoch;
>>> +
>>> +    /*
>>> +     * init_task and kthreads don't be having no mm
>>> +     */
>>> +    if (!mm || !mm->pcpu_sched)
>>> +        return;
>>> +
>>> +    pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched);
>>> +
>>> +    scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
>>> +        __update_mm_sched(rq, pcpu_sched);
>>> +        pcpu_sched->runtime += delta_exec;
>>> +        rq->cpu_runtime += delta_exec;
>>> +        epoch = rq->cpu_epoch;
>>> +    }
>>> +
>>> +    /*
>>> +     * If this task hasn't hit task_cache_work() for a while, invalidate
>>> +     * it's preferred state.
>>> +     */
>>> +    if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) {
>>> +        mm->mm_sched_cpu = -1;
>>> +        pcpu_sched->occ = -1;
>>> +    }
>>
>> This seems too late. account_mm_sched() is called when p is runnable,
>> so if the whole process sleeps for a while before woken up, ttwu will
>> take the out-dated value.
>>
> 
> Yup, there seems to be a problem. It would be better if we could reset the mm_sched_cpu to -1 after the last thread of the process falls asleep. But considering that all threads are sleeping, even if the ttwu tries to enqueue the first newly-woken thread to an out-dated idle mm_sched_cpu, does it matter? I guess it would not be a serious problem, because all the cache of the process might have been evicted due to long sleep.

Yes, it seems not a big deal if mm_sched_cpu not overwrites any better
choice.

> 
>>> +
>>> +static void task_cache_work(struct callback_head *work)
>>> +{
>>> +    struct task_struct *p = current;
>>> +    struct mm_struct *mm = p->mm;
>>> +    unsigned long m_a_occ = 0;
>>> +    int cpu, m_a_cpu = -1;
>>> +    cpumask_var_t cpus;
>>> +
>>> +    WARN_ON_ONCE(work != &p->cache_work);
>>> +
>>> +    work->next = work;
>>> +
>>> +    if (p->flags & PF_EXITING)
>>> +        return;
>>> +
>>> +    if (!alloc_cpumask_var(&cpus, GFP_KERNEL))
>>> +        return;
>>> +
>>> +    scoped_guard (cpus_read_lock) {
>>> +        cpumask_copy(cpus, cpu_online_mask);
>>> +
>>> +        for_each_cpu(cpu, cpus) {
>>> +            /* XXX sched_cluster_active */
>>> +            struct sched_domain *sd = per_cpu(sd_llc, cpu);
>>> +            unsigned long occ, m_occ = 0, a_occ = 0;
>>> +            int m_cpu = -1, nr = 0, i;
>>> +
>>> +            for_each_cpu(i, sched_domain_span(sd)) {
>>> +                occ = fraction_mm_sched(cpu_rq(i),
>>> +                            per_cpu_ptr(mm->pcpu_sched, i));
>>> +                a_occ += occ;
>>> +                if (occ > m_occ) {
>>> +                    m_occ = occ;
>>> +                    m_cpu = i;
>>> +                }
>>
>> It would be possible to cause task stacking on this hint cpu
>> due to its less frequently updated compared to wakeup.
>>
> 
> The SIS would overwrite the prev CPU with this hint(cached) CPU, and use that cached CPU as a hint to search for an idle CPU, so ideally it should not cause task stacking. But there is a race condition that multiple wakeup path might find the same cached "idle" CPU and queue wakees on it, this usually happens when there is frequent context switch(wakeup)+short duration tasks.

Ideally mm_sched_cpu is updated every EPOCH_PERIOD, which defaults
to 10ms, so I'm afraid the race window is not small.

> 
> 
>> And although the occupancy heuristic looks reasonable, IMHO it
>> doesn't make much sense to compare between cpus as they share
>> the LLC, and a non-hint cpu with warmer L1/L2$ in same LLC with
>> the hint cpu seems more preferred.
>>
>> Do you think it's appropriate or not to only hint on the hottest
>> LLC? So the tasks can hopefully wokenup on 'right' LLC on the
>> premise that wouldn't cause much imbalance between LLCs.
>>
>  > I will do some tests and return with more feedback.
>  >
> 
> Find an idle CPU in the wakee's hostest LLC seems to be plausible.
> The benchmark data might indicate a proper way.
> 
> thanks,
> Chenyu
> 
> 

Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Chen, Yu C 8 months, 3 weeks ago
On 3/30/2025 4:46 PM, Abel Wu wrote:
> On 3/29/25 11:06 PM, Chen, Yu C wrote:
>> On 3/28/2025 9:57 PM, Abel Wu wrote:
>>>
>>> IIUC this work updates stats for the whole mm and seems not
>>> necessary for each task of the process to repeat same thing.
>>> Hence would be better move this work to mm_struct.
>>>
>>
>> It seems that the per task cache_work is used to avoid
>> duplicated task_cache_work() in task->task_works queue, see
>> task_tick_cache()'s check
>>   if (work->next == work)
>>      task_work_add()
> 
> Yes, this check avoids task_works being corrupt caused by double
> adding the work, no matter this work is task- or mm-specific. So
> if moving to mm_struct, this check become false once any task takes
> this work, and other tasks can not do the same job until that task
> finishes by setting work->next to work.

I see. What I previously thought was that checking work->next == work
cannot prevent two tasks from doing the same statistics calculation
in task_cache_work(), because work->next = work is invoked
at the beginning of task_cache_work() - maybe we need to move
it to the end of task_cache_work(), as sketched below?
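
Roughly like this (untested; note the early-return paths then also need
to re-arm the work, otherwise it would never get queued again):

static void task_cache_work(struct callback_head *work)
{
	struct task_struct *p = current;
	cpumask_var_t cpus;

	WARN_ON_ONCE(work != &p->cache_work);

	if (p->flags & PF_EXITING)
		goto rearm;

	if (!alloc_cpumask_var(&cpus, GFP_KERNEL))
		goto rearm;

	/* ... occupancy scan and mm->mm_sched_cpu update as before ... */

	free_cpumask_var(cpus);
rearm:
	/* Re-arm only once the scan is done, so ticks cannot queue it twice. */
	work->next = work;
}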

> 
> BTW, moving to mm_struct will save some memory footprint?

Yes, this would help.

>>
>> To do exclusive task_cache_work() and only allow 1 task
>> in the process to do the calculation, maybe introducing similar 
>> mechanism like task_numa_work(), something like this:
>>
>> if (!try_cmpxchg(&mm->cache_next_scan, &calc, next_scan))
>>      return;
> 
> Yes, this looks good to me. While Peter used epoch to regulate
> the frequency of this work, which is a ligher way but allows some
> inaccuracy which is further fixed by a double check after holding
> mm->mm_sched_lock.
> 
> I plan to use trylock on mm->mm_sched_lock first. If trylock fails
> then someone is adding the work and we can skip early.
> 
> static void task_tick_cache(struct rq *rq, struct task_struct *p)
> {
>      ...
> 
>      if (mm->mm_sched_epoch == rq->cpu_epoch)
>          return;
> 
>      guard(raw_spinlock)(&mm->mm_sched_lock);  <-- trylock
> 
>      ...
> }
> 

This lock is intended to ensure that no two tasks enqueue the same
per-mm_struct work at the same time, right? And for the task work
execution phase, maybe we also need to move work->next = work to the end
of task_cache_work()?

>>
>>>> +
>>>> +static inline
>>>> +void account_mm_sched(struct rq *rq, struct task_struct *p, s64 
>>>> delta_exec)
>>>> +{
>>>> +    struct mm_struct *mm = p->mm;
>>>> +    struct mm_sched *pcpu_sched;
>>>> +    unsigned long epoch;
>>>> +
>>>> +    /*
>>>> +     * init_task and kthreads don't be having no mm
>>>> +     */
>>>> +    if (!mm || !mm->pcpu_sched)
>>>> +        return;
>>>> +
>>>> +    pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched);
>>>> +
>>>> +    scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
>>>> +        __update_mm_sched(rq, pcpu_sched);
>>>> +        pcpu_sched->runtime += delta_exec;
>>>> +        rq->cpu_runtime += delta_exec;
>>>> +        epoch = rq->cpu_epoch;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * If this task hasn't hit task_cache_work() for a while, 
>>>> invalidate
>>>> +     * it's preferred state.
>>>> +     */
>>>> +    if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) {
>>>> +        mm->mm_sched_cpu = -1;
>>>> +        pcpu_sched->occ = -1;
>>>> +    }
>>>
>>> This seems too late. account_mm_sched() is called when p is runnable,
>>> so if the whole process sleeps for a while before woken up, ttwu will
>>> take the out-dated value.
>>>
>>
>> Yup, there seems to be a problem. It would be better if we could reset 
>> the mm_sched_cpu to -1 after the last thread of the process falls 
>> asleep. But considering that all threads are sleeping, even if the 
>> ttwu tries to enqueue the first newly-woken thread to an out-dated 
>> idle mm_sched_cpu, does it matter? I guess it would not be a serious 
>> problem, because all the cache of the process might have been evicted 
>> due to long sleep.
> 
> Yes, it seems not a big deal if mm_sched_cpu not overwrites any better
> choice.
> 
>>
>>>> +
>>>> +static void task_cache_work(struct callback_head *work)
>>>> +{
>>>> +    struct task_struct *p = current;
>>>> +    struct mm_struct *mm = p->mm;
>>>> +    unsigned long m_a_occ = 0;
>>>> +    int cpu, m_a_cpu = -1;
>>>> +    cpumask_var_t cpus;
>>>> +
>>>> +    WARN_ON_ONCE(work != &p->cache_work);
>>>> +
>>>> +    work->next = work;
>>>> +
>>>> +    if (p->flags & PF_EXITING)
>>>> +        return;
>>>> +
>>>> +    if (!alloc_cpumask_var(&cpus, GFP_KERNEL))
>>>> +        return;
>>>> +
>>>> +    scoped_guard (cpus_read_lock) {
>>>> +        cpumask_copy(cpus, cpu_online_mask);
>>>> +
>>>> +        for_each_cpu(cpu, cpus) {
>>>> +            /* XXX sched_cluster_active */
>>>> +            struct sched_domain *sd = per_cpu(sd_llc, cpu);
>>>> +            unsigned long occ, m_occ = 0, a_occ = 0;
>>>> +            int m_cpu = -1, nr = 0, i;
>>>> +
>>>> +            for_each_cpu(i, sched_domain_span(sd)) {
>>>> +                occ = fraction_mm_sched(cpu_rq(i),
>>>> +                            per_cpu_ptr(mm->pcpu_sched, i));
>>>> +                a_occ += occ;
>>>> +                if (occ > m_occ) {
>>>> +                    m_occ = occ;
>>>> +                    m_cpu = i;
>>>> +                }
>>>
>>> It would be possible to cause task stacking on this hint cpu
>>> due to its less frequently updated compared to wakeup.
>>>
>>
>> The SIS would overwrite the prev CPU with this hint(cached) CPU, and 
>> use that cached CPU as a hint to search for an idle CPU, so ideally it 
>> should not cause task stacking. But there is a race condition that 
>> multiple wakeup path might find the same cached "idle" CPU and queue 
>> wakees on it, this usually happens when there is frequent context 
>> switch(wakeup)+short duration tasks.
> 
> Ideally mm_sched_cpu is updated every EPOCH_PERIOD which is default
> to 10ms, so I'm afraid the race window is not small.
> 
OK, understood. My thought was that, if many wakers start searching for
an idle CPU from the same mm_sched_cpu (because mm_sched_cpu has not
changed for 10ms), and waker1 succeeds in enqueueing a long-running
wakee1 on that mm_sched_cpu, the other wakers might stop choosing that
mm_sched_cpu in SIS. But if all the wakees are short-duration ones doing
frequent context switches, many wakers would think that they found an
"idle" mm_sched_cpu, while actually it is heavily contended by many
wakers.

thanks,
Chenyu

>>
>>
>>> And although the occupancy heuristic looks reasonable, IMHO it
>>> doesn't make much sense to compare between cpus as they share
>>> the LLC, and a non-hint cpu with warmer L1/L2$ in same LLC with
>>> the hint cpu seems more preferred.
>>>
>>> Do you think it's appropriate or not to only hint on the hottest
>>> LLC? So the tasks can hopefully wokenup on 'right' LLC on the
>>> premise that wouldn't cause much imbalance between LLCs.
>>>
>>  > I will do some tests and return with more feedback.
>>  >
>>
>> Find an idle CPU in the wakee's hostest LLC seems to be plausible.
>> The benchmark data might indicate a proper way.
>>
>> thanks,
>> Chenyu
>>
>>
Re: Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Abel Wu 8 months, 3 weeks ago
On 3/31/25 1:25 PM, Chen, Yu C wrote:
> On 3/30/2025 4:46 PM, Abel Wu wrote:
>> On 3/29/25 11:06 PM, Chen, Yu C wrote:
>>> On 3/28/2025 9:57 PM, Abel Wu wrote:
>>>>
>>>> IIUC this work updates stats for the whole mm and seems not
>>>> necessary for each task of the process to repeat same thing.
>>>> Hence would be better move this work to mm_struct.
>>>>
>>>
>>> It seems that the per task cache_work is used to avoid
>>> duplicated task_cache_work() in task->task_works queue, see
>>> task_tick_cache()'s check
>>>   if (work->next == work)
>>>      task_work_add()
>>
>> Yes, this check avoids task_works being corrupt caused by double
>> adding the work, no matter this work is task- or mm-specific. So
>> if moving to mm_struct, this check become false once any task takes
>> this work, and other tasks can not do the same job until that task
>> finishes by setting work->next to work.
> 
> I see. What I previous thought was that, checking work->next == work
> can not avoid two tasks doing the same statistic calculation
> in task_cache_work(), because work->next = work is invoked
> at the beginning of task_cache_work() - maybe need to put
> this at the end of task_cache_work()?

LGTM.

> 
>>
>> BTW, moving to mm_struct will save some memory footprint?
>> Yes, this would help.
> 
>>>
>>> To do exclusive task_cache_work() and only allow 1 task
>>> in the process to do the calculation, maybe introducing similar mechanism like task_numa_work(), something like this:
>>>
>>> if (!try_cmpxchg(&mm->cache_next_scan, &calc, next_scan))
>>>      return;
>>
>> Yes, this looks good to me. While Peter used epoch to regulate
>> the frequency of this work, which is a ligher way but allows some
>> inaccuracy which is further fixed by a double check after holding
>> mm->mm_sched_lock.
>>
>> I plan to use trylock on mm->mm_sched_lock first. If trylock fails
>> then someone is adding the work and we can skip early.
>>
>> static void task_tick_cache(struct rq *rq, struct task_struct *p)
>> {
>>      ...
>>
>>      if (mm->mm_sched_epoch == rq->cpu_epoch)
>>          return;
>>
>>      guard(raw_spinlock)(&mm->mm_sched_lock);  <-- trylock
>>
>>      ...
>> }
>>
> 
> This lock is intended to protect that no two tasks enqueuing the same per-mm_struct work at the same time, right? And for the task work execution phase, maybe we also need to put work->next = work at the end of task_cache_work()?

The lock here is needed to protect adding the work; what I meant is to
return early if the trylock fails, since someone else is already the
owner doing the work.

> 
>>>
>>>>> +
>>>>> +static inline
>>>>> +void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>>>>> +{
>>>>> +    struct mm_struct *mm = p->mm;
>>>>> +    struct mm_sched *pcpu_sched;
>>>>> +    unsigned long epoch;
>>>>> +
>>>>> +    /*
>>>>> +     * init_task and kthreads don't be having no mm
>>>>> +     */
>>>>> +    if (!mm || !mm->pcpu_sched)
>>>>> +        return;
>>>>> +
>>>>> +    pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched);
>>>>> +
>>>>> +    scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
>>>>> +        __update_mm_sched(rq, pcpu_sched);
>>>>> +        pcpu_sched->runtime += delta_exec;
>>>>> +        rq->cpu_runtime += delta_exec;
>>>>> +        epoch = rq->cpu_epoch;
>>>>> +    }
>>>>> +
>>>>> +    /*
>>>>> +     * If this task hasn't hit task_cache_work() for a while, invalidate
>>>>> +     * it's preferred state.
>>>>> +     */
>>>>> +    if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) {
>>>>> +        mm->mm_sched_cpu = -1;
>>>>> +        pcpu_sched->occ = -1;
>>>>> +    }
>>>>
>>>> This seems too late. account_mm_sched() is called when p is runnable,
>>>> so if the whole process sleeps for a while before woken up, ttwu will
>>>> take the out-dated value.
>>>>
>>>
>>> Yup, there seems to be a problem. It would be better if we could reset the mm_sched_cpu to -1 after the last thread of the process falls asleep. But considering that all threads are sleeping, even if the ttwu tries to enqueue the first newly-woken thread to an out-dated idle mm_sched_cpu, does it matter? I guess it would not be a serious problem, because all the cache of the process might have been evicted due to long sleep.
>>
>> Yes, it seems not a big deal if mm_sched_cpu not overwrites any better
>> choice.
>>
>>>
>>>>> +
>>>>> +static void task_cache_work(struct callback_head *work)
>>>>> +{
>>>>> +    struct task_struct *p = current;
>>>>> +    struct mm_struct *mm = p->mm;
>>>>> +    unsigned long m_a_occ = 0;
>>>>> +    int cpu, m_a_cpu = -1;
>>>>> +    cpumask_var_t cpus;
>>>>> +
>>>>> +    WARN_ON_ONCE(work != &p->cache_work);
>>>>> +
>>>>> +    work->next = work;
>>>>> +
>>>>> +    if (p->flags & PF_EXITING)
>>>>> +        return;
>>>>> +
>>>>> +    if (!alloc_cpumask_var(&cpus, GFP_KERNEL))
>>>>> +        return;
>>>>> +
>>>>> +    scoped_guard (cpus_read_lock) {
>>>>> +        cpumask_copy(cpus, cpu_online_mask);
>>>>> +
>>>>> +        for_each_cpu(cpu, cpus) {
>>>>> +            /* XXX sched_cluster_active */
>>>>> +            struct sched_domain *sd = per_cpu(sd_llc, cpu);
>>>>> +            unsigned long occ, m_occ = 0, a_occ = 0;
>>>>> +            int m_cpu = -1, nr = 0, i;
>>>>> +
>>>>> +            for_each_cpu(i, sched_domain_span(sd)) {
>>>>> +                occ = fraction_mm_sched(cpu_rq(i),
>>>>> +                            per_cpu_ptr(mm->pcpu_sched, i));
>>>>> +                a_occ += occ;
>>>>> +                if (occ > m_occ) {
>>>>> +                    m_occ = occ;
>>>>> +                    m_cpu = i;
>>>>> +                }
>>>>
>>>> It would be possible to cause task stacking on this hint cpu
>>>> due to its less frequently updated compared to wakeup.
>>>>
>>>
>>> The SIS would overwrite the prev CPU with this hint(cached) CPU, and use that cached CPU as a hint to search for an idle CPU, so ideally it should not cause task stacking. But there is a race condition that multiple wakeup path might find the same cached "idle" CPU and queue wakees on it, this usually happens when there is frequent context switch(wakeup)+short duration tasks.
>>
>> Ideally mm_sched_cpu is updated every EPOCH_PERIOD which is default
>> to 10ms, so I'm afraid the race window is not small.
>>
> OK, understood. My thought was that, if many wakers start searching for idle CPU from the same mm_sched_cpu(because mm_sched_cpu has not been changed for 10ms), if waker1 succeeds to enqueue a long-running wakee1 on that mm_sched_cpu, other wakers might stop choosing that mm_sched_cpu in SIS. But if all the wakees are short-duration ones and doing frequent context switches, many wakers would think that they find an "idle" mm_sched_cpu, but actually it is heavily contended by many wakers.

OK, it seems I misunderstood the race condition you previously mentioned.
Yes, it's not clear yet whether mm_sched_cpu would cause stacking or not;
I will do some more tests before further discussion.

Thanks,
	Abel

Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Madadi Vineeth Reddy 8 months, 3 weeks ago
Hi Peter,

On 25/03/25 17:39, Peter Zijlstra wrote:
> Hi all,
> 
> One of the many things on the eternal todo list has been finishing the
> below hackery.
> 
> It is an attempt at modelling cache affinity -- and while the patch
> really only targets LLC, it could very well be extended to also apply to
> clusters (L2). Specifically any case of multiple cache domains inside a
> node.
> 
> Anyway, I wrote this about a year ago, and I mentioned this at the
> recent OSPM conf where Gautham and Prateek expressed interest in playing
> with this code.
> 
> So here goes, very rough and largely unproven code ahead :-)
> 
> It applies to current tip/master, but I know it will fail the __percpu
> validation that sits in -next, although that shouldn't be terribly hard
> to fix up.
> 
> As is, it only computes a CPU inside the LLC that has the highest recent
> runtime, this CPU is then used in the wake-up path to steer towards this
> LLC and in task_hot() to limit migrations away from it.
> 
> More elaborate things could be done, notably there is an XXX in there
> somewhere about finding the best LLC inside a NODE (interaction with
> NUMA_BALANCING).

Tested the patch on a 12-core, 96-thread Power10 system using a real-life
workload, DayTrader.

Here is a summary of the runs:

Users | Instances | Throughput vs Base | Avg Resp. Time vs Base
---------------------------------------------------------------
30    | 1         | -25.3%             | +50%
60    | 1         | -25.1%             | +50%
30    | 3         | -22.8%             | +33%

As of now, the patch negatively impacts performance both in terms of
throughput and latency.

I will conduct more extensive testing with both microbenchmarks and
real-life workloads.

Thanks,
Madadi Vineeth Reddy

> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/mm_types.h |  44 +++++++
>  include/linux/sched.h    |   4 +
>  init/Kconfig             |   4 +
>  kernel/fork.c            |   5 +
>  kernel/sched/core.c      |  13 +-
>  kernel/sched/fair.c      | 330 ++++++++++++++++++++++++++++++++++++++++++++---
>  kernel/sched/sched.h     |   8 ++
>  7 files changed, 388 insertions(+), 20 deletions(-)
> 
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 0234f14f2aa6..3ed8dd225eb9 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -800,6 +800,12 @@ struct mm_cid {
>  };
>  #endif
>  
> +struct mm_sched {
> +	u64 runtime;
> +	unsigned long epoch;
> +	unsigned long occ;
> +};
> +
>  struct kioctx_table;
>  struct iommu_mm_data;
>  struct mm_struct {
> @@ -890,6 +896,17 @@ struct mm_struct {
>  		 */
>  		raw_spinlock_t cpus_allowed_lock;
>  #endif
> +#ifdef CONFIG_SCHED_CACHE
> +		/*
> +		 * Track per-cpu-per-process occupancy as a proxy for cache residency.
> +		 * See account_mm_sched() and ...
> +		 */
> +		struct mm_sched __percpu *pcpu_sched;
> +		raw_spinlock_t mm_sched_lock;
> +		unsigned long mm_sched_epoch;
> +		int mm_sched_cpu;
> +#endif
> +
>  #ifdef CONFIG_MMU
>  		atomic_long_t pgtables_bytes;	/* size of all page tables */
>  #endif
> @@ -1296,6 +1313,33 @@ static inline unsigned int mm_cid_size(void)
>  static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
>  #endif /* CONFIG_SCHED_MM_CID */
>  
> +#ifdef CONFIG_SCHED_CACHE
> +extern void mm_init_sched(struct mm_struct *mm, struct mm_sched *pcpu_sched);
> +
> +static inline int mm_alloc_sched_noprof(struct mm_struct *mm)
> +{
> +	struct mm_sched *pcpu_sched = alloc_percpu_noprof(struct mm_sched);
> +	if (!pcpu_sched)
> +		return -ENOMEM;
> +
> +	mm_init_sched(mm, pcpu_sched);
> +	return 0;
> +}
> +
> +#define mm_alloc_sched(...)	alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__))
> +
> +static inline void mm_destroy_sched(struct mm_struct *mm)
> +{
> +	free_percpu(mm->pcpu_sched);
> +	mm->pcpu_sched = NULL;
> +}
> +#else /* !CONFIG_SCHED_CACHE */
> +
> +static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; }
> +static inline void mm_destroy_sched(struct mm_struct *mm) { }
> +
> +#endif /* CONFIG_SCHED_CACHE */
> +
>  struct mmu_gather;
>  extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
>  extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 6e5c38718ff5..f8eafe440369 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1379,6 +1379,10 @@ struct task_struct {
>  	unsigned long			numa_pages_migrated;
>  #endif /* CONFIG_NUMA_BALANCING */
>  
> +#ifdef CONFIG_SCHED_CACHE
> +	struct callback_head		cache_work;
> +#endif
> +
>  #ifdef CONFIG_RSEQ
>  	struct rseq __user *rseq;
>  	u32 rseq_len;
> diff --git a/init/Kconfig b/init/Kconfig
> index 681f38ee68db..14b15215318f 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -950,6 +950,10 @@ config NUMA_BALANCING
>  
>  	  This system will be inactive on UMA systems.
>  
> +config SCHED_CACHE
> +	bool "Cache aware scheduler"
> +	default y
> +
>  config NUMA_BALANCING_DEFAULT_ENABLED
>  	bool "Automatically enable NUMA aware memory/task placement"
>  	default y
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 1b659b07ecd5..bc9d7dbfd980 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1314,6 +1314,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>  	if (mm_alloc_cid(mm, p))
>  		goto fail_cid;
>  
> +	if (mm_alloc_sched(mm))
> +		goto fail_sched;
> +
>  	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
>  				     NR_MM_COUNTERS))
>  		goto fail_pcpu;
> @@ -1323,6 +1326,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>  	return mm;
>  
>  fail_pcpu:
> +	mm_destroy_sched(mm);
> +fail_sched:
>  	mm_destroy_cid(mm);
>  fail_cid:
>  	destroy_context(mm);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 87540217fc09..649db6ea41ea 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4514,6 +4514,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
>  	p->migration_pending = NULL;
>  #endif
>  	init_sched_mm_cid(p);
> +	init_sched_mm(p);
>  }
>  
>  DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
> @@ -8505,6 +8506,7 @@ static struct kmem_cache *task_group_cache __ro_after_init;
>  
>  void __init sched_init(void)
>  {
> +	unsigned long now = jiffies;
>  	unsigned long ptr = 0;
>  	int i;
>  
> @@ -8579,7 +8581,7 @@ void __init sched_init(void)
>  		raw_spin_lock_init(&rq->__lock);
>  		rq->nr_running = 0;
>  		rq->calc_load_active = 0;
> -		rq->calc_load_update = jiffies + LOAD_FREQ;
> +		rq->calc_load_update = now + LOAD_FREQ;
>  		init_cfs_rq(&rq->cfs);
>  		init_rt_rq(&rq->rt);
>  		init_dl_rq(&rq->dl);
> @@ -8623,7 +8625,7 @@ void __init sched_init(void)
>  		rq->cpu_capacity = SCHED_CAPACITY_SCALE;
>  		rq->balance_callback = &balance_push_callback;
>  		rq->active_balance = 0;
> -		rq->next_balance = jiffies;
> +		rq->next_balance = now;
>  		rq->push_cpu = 0;
>  		rq->cpu = i;
>  		rq->online = 0;
> @@ -8635,7 +8637,7 @@ void __init sched_init(void)
>  
>  		rq_attach_root(rq, &def_root_domain);
>  #ifdef CONFIG_NO_HZ_COMMON
> -		rq->last_blocked_load_update_tick = jiffies;
> +		rq->last_blocked_load_update_tick = now;
>  		atomic_set(&rq->nohz_flags, 0);
>  
>  		INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
> @@ -8660,6 +8662,11 @@ void __init sched_init(void)
>  
>  		rq->core_cookie = 0UL;
>  #endif
> +#ifdef CONFIG_SCHED_CACHE
> +		raw_spin_lock_init(&rq->cpu_epoch_lock);
> +		rq->cpu_epoch_next = now;
> +#endif
> +
>  		zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
>  	}
>  
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e43993a4e580..943af076e09c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1166,10 +1166,229 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
>  	return delta_exec;
>  }
>  
> -static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
> +#ifdef CONFIG_SCHED_CACHE
> +
> +/*
> + * XXX numbers come from a place the sun don't shine -- probably wants to be SD
> + * tunable or so.
> + */
> +#define EPOCH_PERIOD	(HZ/100)	/* 10 ms */
> +#define EPOCH_OLD	5		/* 50 ms */
> +
> +void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched)
> +{
> +	unsigned long epoch;
> +	int i;
> +
> +	for_each_possible_cpu(i) {
> +		struct mm_sched *pcpu_sched = per_cpu_ptr(_pcpu_sched, i);
> +		struct rq *rq = cpu_rq(i);
> +
> +		pcpu_sched->runtime = 0;
> +		pcpu_sched->epoch = epoch = rq->cpu_epoch;
> +		pcpu_sched->occ = -1;
> +	}
> +
> +	raw_spin_lock_init(&mm->mm_sched_lock);
> +	mm->mm_sched_epoch = epoch;
> +	mm->mm_sched_cpu = -1;
> +
> +	smp_store_release(&mm->pcpu_sched, _pcpu_sched);
> +}
> +
> +/* because why would C be fully specified */
> +static __always_inline void __shr_u64(u64 *val, unsigned int n)
> +{
> +	if (n >= 64) {
> +		*val = 0;
> +		return;
> +	}
> +	*val >>= n;
> +}
> +
> +static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
> +{
> +	lockdep_assert_held(&rq->cpu_epoch_lock);
> +
> +	unsigned long n, now = jiffies;
> +	long delta = now - rq->cpu_epoch_next;
> +
> +	if (delta > 0) {
> +		n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD;
> +		rq->cpu_epoch += n;
> +		rq->cpu_epoch_next += n * EPOCH_PERIOD;
> +		__shr_u64(&rq->cpu_runtime, n);
> +	}
> +
> +	n = rq->cpu_epoch - pcpu_sched->epoch;
> +	if (n) {
> +		pcpu_sched->epoch += n;
> +		__shr_u64(&pcpu_sched->runtime, n);
> +	}
> +}
> +
> +static unsigned long fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
> +{
> +	guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock);
> +
> +	__update_mm_sched(rq, pcpu_sched);
> +
> +	/*
> +	 * Runtime is a geometric series (r=0.5) and as such will sum to twice
> +	 * the accumulation period, this means the multiplcation here should
> +	 * not overflow.
> +	 */
> +	return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1);
> +}
> +
> +static inline
> +void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
> +{
> +	struct mm_struct *mm = p->mm;
> +	struct mm_sched *pcpu_sched;
> +	unsigned long epoch;
> +
> +	/*
> +	 * init_task and kthreads don't be having no mm
> +	 */
> +	if (!mm || !mm->pcpu_sched)
> +		return;
> +
> +	pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched);
> +
> +	scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
> +		__update_mm_sched(rq, pcpu_sched);
> +		pcpu_sched->runtime += delta_exec;
> +		rq->cpu_runtime += delta_exec;
> +		epoch = rq->cpu_epoch;
> +	}
> +
> +	/*
> +	 * If this task hasn't hit task_cache_work() for a while, invalidate
> +	 * it's preferred state.
> +	 */
> +	if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) {
> +		mm->mm_sched_cpu = -1;
> +		pcpu_sched->occ = -1;
> +	}
> +}
> +
> +static void task_tick_cache(struct rq *rq, struct task_struct *p)
> +{
> +	struct callback_head *work = &p->cache_work;
> +	struct mm_struct *mm = p->mm;
> +
> +	if (!mm || !mm->pcpu_sched)
> +		return;
> +
> +	if (mm->mm_sched_epoch == rq->cpu_epoch)
> +		return;
> +
> +	guard(raw_spinlock)(&mm->mm_sched_lock);
> +
> +	if (mm->mm_sched_epoch == rq->cpu_epoch)
> +		return;
> +
> +	if (work->next == work) {
> +		task_work_add(p, work, TWA_RESUME);
> +		WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch);
> +	}
> +}
> +
> +static void task_cache_work(struct callback_head *work)
> +{
> +	struct task_struct *p = current;
> +	struct mm_struct *mm = p->mm;
> +	unsigned long m_a_occ = 0;
> +	int cpu, m_a_cpu = -1;
> +	cpumask_var_t cpus;
> +
> +	WARN_ON_ONCE(work != &p->cache_work);
> +
> +	work->next = work;
> +
> +	if (p->flags & PF_EXITING)
> +		return;
> +
> +	if (!alloc_cpumask_var(&cpus, GFP_KERNEL))
> +		return;
> +
> +	scoped_guard (cpus_read_lock) {
> +		cpumask_copy(cpus, cpu_online_mask);
> +
> +		for_each_cpu(cpu, cpus) {
> +			/* XXX sched_cluster_active */
> +			struct sched_domain *sd = per_cpu(sd_llc, cpu);
> +			unsigned long occ, m_occ = 0, a_occ = 0;
> +			int m_cpu = -1, nr = 0, i;
> +
> +			for_each_cpu(i, sched_domain_span(sd)) {
> +				occ = fraction_mm_sched(cpu_rq(i),
> +							per_cpu_ptr(mm->pcpu_sched, i));
> +				a_occ += occ;
> +				if (occ > m_occ) {
> +					m_occ = occ;
> +					m_cpu = i;
> +				}
> +				nr++;
> +				trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n",
> +					     per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr);
> +			}
> +
> +			a_occ /= nr;
> +			if (a_occ > m_a_occ) {
> +				m_a_occ = a_occ;
> +				m_a_cpu = m_cpu;
> +			}
> +
> +			trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n",
> +				     per_cpu(sd_llc_id, cpu), a_occ, m_a_occ);
> +
> +			for_each_cpu(i, sched_domain_span(sd)) {
> +				/* XXX threshold ? */
> +				per_cpu_ptr(mm->pcpu_sched, i)->occ = a_occ;
> +			}
> +
> +			cpumask_andnot(cpus, cpus, sched_domain_span(sd));
> +		}
> +	}
> +
> +	/*
> +	 * If the max average cache occupancy is 'small' we don't care.
> +	 */
> +	if (m_a_occ < (NICE_0_LOAD >> EPOCH_OLD))
> +		m_a_cpu = -1;
> +
> +	mm->mm_sched_cpu = m_a_cpu;
> +
> +	free_cpumask_var(cpus);
> +}
> +
> +void init_sched_mm(struct task_struct *p)
> +{
> +	struct callback_head *work = &p->cache_work;
> +	init_task_work(work, task_cache_work);
> +	work->next = work;
> +}
> +
> +#else
> +
> +static inline void account_mm_sched(struct rq *rq, struct task_struct *p,
> +				    s64 delta_exec) { }
> +
> +
> +void init_sched_mm(struct task_struct *p) { }
> +
> +static void task_tick_cache(struct rq *rq, struct task_struct *p) { }
> +
> +#endif
> +
> +static inline
> +void update_curr_task(struct rq *rq, struct task_struct *p, s64 delta_exec)
>  {
>  	trace_sched_stat_runtime(p, delta_exec);
>  	account_group_exec_runtime(p, delta_exec);
> +	account_mm_sched(rq, p, delta_exec);
>  	cgroup_account_cputime(p, delta_exec);
>  }
>  
> @@ -1215,7 +1434,7 @@ s64 update_curr_common(struct rq *rq)
>  
>  	delta_exec = update_curr_se(rq, &donor->se);
>  	if (likely(delta_exec > 0))
> -		update_curr_task(donor, delta_exec);
> +		update_curr_task(rq, donor, delta_exec);
>  
>  	return delta_exec;
>  }
> @@ -1244,7 +1463,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
>  	if (entity_is_task(curr)) {
>  		struct task_struct *p = task_of(curr);
>  
> -		update_curr_task(p, delta_exec);
> +		update_curr_task(rq, p, delta_exec);
>  
>  		/*
>  		 * If the fair_server is active, we need to account for the
> @@ -7850,7 +8069,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  	 * per-cpu select_rq_mask usage
>  	 */
>  	lockdep_assert_irqs_disabled();
> -
> +again:
>  	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
>  	    asym_fits_cpu(task_util, util_min, util_max, target))
>  		return target;
> @@ -7888,7 +8107,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  	/* Check a recently used CPU as a potential idle candidate: */
>  	recent_used_cpu = p->recent_used_cpu;
>  	p->recent_used_cpu = prev;
> -	if (recent_used_cpu != prev &&
> +	if (prev == p->wake_cpu &&
> +	    recent_used_cpu != prev &&
>  	    recent_used_cpu != target &&
>  	    cpus_share_cache(recent_used_cpu, target) &&
>  	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
> @@ -7941,6 +8161,18 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  	if ((unsigned)i < nr_cpumask_bits)
>  		return i;
>  
> +	if (prev != p->wake_cpu && !cpus_share_cache(prev, p->wake_cpu)) {
> +		/*
> +		 * Most likely select_cache_cpu() will have re-directed
> +		 * the wakeup, but getting here means the preferred cache is
> +		 * too busy, so re-try with the actual previous.
> +		 *
> +		 * XXX wake_affine is lost for this pass.
> +		 */
> +		prev = target = p->wake_cpu;
> +		goto again;
> +	}
> +
>  	/*
>  	 * For cluster machines which have lower sharing cache like L2 or
>  	 * LLC Tag, we tend to find an idle CPU in the target's cluster
> @@ -8563,6 +8795,40 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>  	return target;
>  }
>  
> +#ifdef CONFIG_SCHED_CACHE
> +static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle);
> +
> +static int select_cache_cpu(struct task_struct *p, int prev_cpu)
> +{
> +	struct mm_struct *mm = p->mm;
> +	int cpu;
> +
> +	if (!mm || p->nr_cpus_allowed == 1)
> +		return prev_cpu;
> +
> +	cpu = mm->mm_sched_cpu;
> +	if (cpu < 0)
> +		return prev_cpu;
> +
> +
> +	if (static_branch_likely(&sched_numa_balancing) &&
> +	    __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) {
> +		/*
> +		 * XXX look for max occupancy inside prev_cpu's node
> +		 */
> +		return prev_cpu;
> +	}
> +
> +	return cpu;
> +}
> +#else
> +static int select_cache_cpu(struct task_struct *p, int prev_cpu)
> +{
> +	return prev_cpu;
> +}
> +#endif
> +
> +
>  /*
>   * select_task_rq_fair: Select target runqueue for the waking task in domains
>   * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
> @@ -8588,6 +8854,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>  	 * required for stable ->cpus_allowed
>  	 */
>  	lockdep_assert_held(&p->pi_lock);
> +	guard(rcu)();
> +
>  	if (wake_flags & WF_TTWU) {
>  		record_wakee(p);
>  
> @@ -8595,6 +8863,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>  		    cpumask_test_cpu(cpu, p->cpus_ptr))
>  			return cpu;
>  
> +		new_cpu = prev_cpu = select_cache_cpu(p, prev_cpu);
> +
>  		if (!is_rd_overutilized(this_rq()->rd)) {
>  			new_cpu = find_energy_efficient_cpu(p, prev_cpu);
>  			if (new_cpu >= 0)
> @@ -8605,7 +8875,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>  		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
>  	}
>  
> -	rcu_read_lock();
>  	for_each_domain(cpu, tmp) {
>  		/*
>  		 * If both 'cpu' and 'prev_cpu' are part of this domain,
> @@ -8638,7 +8907,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>  		/* Fast path */
>  		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
>  	}
> -	rcu_read_unlock();
>  
>  	return new_cpu;
>  }
> @@ -9288,6 +9556,17 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>  	if (sysctl_sched_migration_cost == 0)
>  		return 0;
>  
> +#ifdef CONFIG_SCHED_CACHE
> +	if (p->mm && p->mm->pcpu_sched) {
> +		/*
> +		 * XXX things like Skylake have non-inclusive L3 and might not
> +		 * like this L3 centric view. What to do about L2 stickyness ?
> +		 */
> +		return per_cpu_ptr(p->mm->pcpu_sched, env->src_cpu)->occ >
> +		       per_cpu_ptr(p->mm->pcpu_sched, env->dst_cpu)->occ;
> +	}
> +#endif
> +
>  	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
>  
>  	return delta < (s64)sysctl_sched_migration_cost;
> @@ -9299,27 +9578,25 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>   * Returns 0, if task migration is not affected by locality.
>   * Returns a negative value, if task migration improves locality i.e migration preferred.
>   */
> -static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
> +static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle)
>  {
>  	struct numa_group *numa_group = rcu_dereference(p->numa_group);
>  	unsigned long src_weight, dst_weight;
>  	int src_nid, dst_nid, dist;
>  
> -	if (!static_branch_likely(&sched_numa_balancing))
> -		return 0;
> -
> -	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
> +	if (!p->numa_faults)
>  		return 0;
>  
> -	src_nid = cpu_to_node(env->src_cpu);
> -	dst_nid = cpu_to_node(env->dst_cpu);
> +	src_nid = cpu_to_node(src_cpu);
> +	dst_nid = cpu_to_node(dst_cpu);
>  
>  	if (src_nid == dst_nid)
>  		return 0;
>  
>  	/* Migrating away from the preferred node is always bad. */
>  	if (src_nid == p->numa_preferred_nid) {
> -		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
> +		struct rq *src_rq = cpu_rq(src_cpu);
> +		if (src_rq->nr_running > src_rq->nr_preferred_running)
>  			return 1;
>  		else
>  			return 0;
> @@ -9330,7 +9607,7 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
>  		return -1;
>  
>  	/* Leaving a core idle is often worse than degrading locality. */
> -	if (env->idle == CPU_IDLE)
> +	if (idle)
>  		return 0;
>  
>  	dist = node_distance(src_nid, dst_nid);
> @@ -9345,7 +9622,24 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
>  	return src_weight - dst_weight;
>  }
>  
> +static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
> +{
> +	if (!static_branch_likely(&sched_numa_balancing))
> +		return 0;
> +
> +	if (!(env->sd->flags & SD_NUMA))
> +		return 0;
> +
> +	return __migrate_degrades_locality(p, env->src_cpu, env->dst_cpu,
> +					   env->idle == CPU_IDLE);
> +}
> +
>  #else
> +static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle)
> +{
> +	return 0;
> +}
> +
>  static inline long migrate_degrades_locality(struct task_struct *p,
>  					     struct lb_env *env)
>  {
> @@ -13104,8 +13398,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
>   */
>  static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
>  {
> -	struct cfs_rq *cfs_rq;
>  	struct sched_entity *se = &curr->se;
> +	struct cfs_rq *cfs_rq;
>  
>  	for_each_sched_entity(se) {
>  		cfs_rq = cfs_rq_of(se);
> @@ -13115,6 +13409,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
>  	if (static_branch_unlikely(&sched_numa_balancing))
>  		task_tick_numa(rq, curr);
>  
> +	task_tick_cache(rq, curr);
> +
>  	update_misfit_status(curr, rq);
>  	check_update_overutilized_status(task_rq(curr));
>  
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 47972f34ea70..d16ccd66ca07 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1171,6 +1171,12 @@ struct rq {
>  	u64			clock_pelt_idle_copy;
>  	u64			clock_idle_copy;
>  #endif
> +#ifdef CONFIG_SCHED_CACHE
> +	raw_spinlock_t		cpu_epoch_lock;
> +	u64			cpu_runtime;
> +	unsigned long		cpu_epoch;
> +	unsigned long		cpu_epoch_next;
> +#endif
>  
>  	atomic_t		nr_iowait;
>  
> @@ -3861,6 +3867,8 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
>  static inline void init_sched_mm_cid(struct task_struct *t) { }
>  #endif /* !CONFIG_SCHED_MM_CID */
>  
> +extern void init_sched_mm(struct task_struct *p);
> +
>  extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
>  extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
>  #ifdef CONFIG_SMP
Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Chen, Yu C 8 months, 3 weeks ago
Hi Madadi,

On 3/27/2025 10:43 AM, Madadi Vineeth Reddy wrote:
> Hi Peter,
> 
> On 25/03/25 17:39, Peter Zijlstra wrote:
>> Hi all,
>>
>> One of the many things on the eternal todo list has been finishing the
>> below hackery.
>>
>> It is an attempt at modelling cache affinity -- and while the patch
>> really only targets LLC, it could very well be extended to also apply to
>> clusters (L2). Specifically any case of multiple cache domains inside a
>> node.
>>
>> Anyway, I wrote this about a year ago, and I mentioned this at the
>> recent OSPM conf where Gautham and Prateek expressed interest in playing
>> with this code.
>>
>> So here goes, very rough and largely unproven code ahead :-)
>>
>> It applies to current tip/master, but I know it will fail the __percpu
>> validation that sits in -next, although that shouldn't be terribly hard
>> to fix up.
>>
>> As is, it only computes a CPU inside the LLC that has the highest recent
>> runtime, this CPU is then used in the wake-up path to steer towards this
>> LLC and in task_hot() to limit migrations away from it.
>>
>> More elaborate things could be done, notably there is an XXX in there
>> somewhere about finding the best LLC inside a NODE (interaction with
>> NUMA_BALANCING).
> 
> Tested the patch on a 12-core, 96-thread Power10 system using a real-life
> workload, DayTrader.

Do all the cores share the same LLC within one node? If so, the
regression might be due to over-migration/task stacking within a
single LLC/node. IMO this patch should be modified so that cache-aware
load balancing/wakeup is not triggered when there is only one LLC
within the node.
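
Something like the following early-out in select_cache_cpu() might do
it (just a sketch; the exact placement is my assumption, the helpers
are the existing topology ones):

sd = rcu_dereference(per_cpu(sd_llc, prev_cpu));
/* the LLC spans the whole node: nothing to steer between */
if (sd && cpumask_equal(sched_domain_span(sd),
			cpumask_of_node(cpu_to_node(prev_cpu))))
	return prev_cpu;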

thanks,
Chenyu

> 
> Here is a summary of the runs:
> 
> Users | Instances | Throughput vs Base | Avg Resp. Time vs Base
> --------------------------------------------------------------
> 30    | 1        | -25.3%              | +50%
> 60    | 1        | -25.1%              | +50%
> 30    | 3        | -22.8%              | +33%
> 
> As of now, the patch negatively impacts performance both in terms of
> throughput and latency.
> 
> I will conduct more extensive testing with both microbenchmarks and
> real-life workloads.
> 
> Thanks,
> Madadi Vineeth Reddy
>
Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Madadi Vineeth Reddy 8 months, 3 weeks ago
Hi Chen Yu,

On 27/03/25 16:44, Chen, Yu C wrote:
> Hi Madadi,
> 
> On 3/27/2025 10:43 AM, Madadi Vineeth Reddy wrote:
>> Hi Peter,
>>
>> On 25/03/25 17:39, Peter Zijlstra wrote:
>>> Hi all,
>>>
>>> One of the many things on the eternal todo list has been finishing the
>>> below hackery.
>>>
>>> It is an attempt at modelling cache affinity -- and while the patch
>>> really only targets LLC, it could very well be extended to also apply to
>>> clusters (L2). Specifically any case of multiple cache domains inside a
>>> node.
>>>
>>> Anyway, I wrote this about a year ago, and I mentioned this at the
>>> recent OSPM conf where Gautham and Prateek expressed interest in playing
>>> with this code.
>>>
>>> So here goes, very rough and largely unproven code ahead :-)
>>>
>>> It applies to current tip/master, but I know it will fail the __percpu
>>> validation that sits in -next, although that shouldn't be terribly hard
>>> to fix up.
>>>
>>> As is, it only computes a CPU inside the LLC that has the highest recent
>>> runtime, this CPU is then used in the wake-up path to steer towards this
>>> LLC and in task_hot() to limit migrations away from it.
>>>
>>> More elaborate things could be done, notably there is an XXX in there
>>> somewhere about finding the best LLC inside a NODE (interaction with
>>> NUMA_BALANCING).
>>
>> Tested the patch on a 12-core, 96-thread Power10 system using a real-life
>> workload, DayTrader.
> 
> Do all the Cores share the same LLC within 1 node? If this is the case,
> the regression might be due to over-migration/task stacking within 1 LLC/node. This patch should be modified that cache aware load balancing/wakeup will not be triggered if there is only 1 LLC within the node IMO.

Are you asking whether LLC is shared at the node level?

In Power10, the LLC is at the small core level, covering 4 threads.

In my test setup, there were 4 nodes, each with 24 CPUs, meaning there
were 6 LLCs per node.

I went through the patch in more detail and will check whether task
stacking is an issue using micro-benchmarks.

Thanks for your feedback.

Thanks,
Madadi Vineeth Reddy

> 
> thanks,
> Chenyu
> 
>>
>> Here is a summary of the runs:
>>
>> Users | Instances | Throughput vs Base | Avg Resp. Time vs Base
>> --------------------------------------------------------------
>> 30    | 1        | -25.3%              | +50%
>> 60    | 1        | -25.1%              | +50%
>> 30    | 3        | -22.8%              | +33%
>>
>> As of now, the patch negatively impacts performance both in terms of
>> throughput and latency.
>>
>> I will conduct more extensive testing with both microbenchmarks and
>> real-life workloads.
>>
>> Thanks,
>> Madadi Vineeth Reddy
>>

Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Chen, Yu C 8 months, 3 weeks ago
Hi Peter,

Thanks for sending this out,

On 3/25/2025 8:09 PM, Peter Zijlstra wrote:
> Hi all,
> 
> One of the many things on the eternal todo list has been finishing the
> below hackery.
> 
> It is an attempt at modelling cache affinity -- and while the patch
> really only targets LLC, it could very well be extended to also apply to
> clusters (L2). Specifically any case of multiple cache domains inside a
> node.
> 
> Anyway, I wrote this about a year ago, and I mentioned this at the
> recent OSPM conf where Gautham and Prateek expressed interest in playing
> with this code.
> 
> So here goes, very rough and largely unproven code ahead :-)
> 
> It applies to current tip/master, but I know it will fail the __percpu
> validation that sits in -next, although that shouldn't be terribly hard
> to fix up.
> 
> As is, it only computes a CPU inside the LLC that has the highest recent
> runtime, this CPU is then used in the wake-up path to steer towards this
> LLC and in task_hot() to limit migrations away from it.
> 
> More elaborate things could be done, notably there is an XXX in there
> somewhere about finding the best LLC inside a NODE (interaction with
> NUMA_BALANCING).
> 

Besides the build-time control provided by CONFIG_SCHED_CACHE, could
we also introduce a sched_feat(SCHED_CACHE) to manage this feature, so
that it can be toggled dynamically? Similarly, we could introduce
other sched_feats for the load-balancing and NUMA-balancing parts for
fine-grained control.
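
A minimal sketch of what I mean, using the existing sched_feat
machinery (the feature name and check sites are only suggestions):

/* kernel/sched/features.h */
SCHED_FEAT(SCHED_CACHE, true)

/* e.g. in select_cache_cpu() */
if (!sched_feat(SCHED_CACHE))
	return prev_cpu;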

> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>   include/linux/mm_types.h |  44 +++++++
>   include/linux/sched.h    |   4 +
>   init/Kconfig             |   4 +
>   kernel/fork.c            |   5 +
>   kernel/sched/core.c      |  13 +-
>   kernel/sched/fair.c      | 330 ++++++++++++++++++++++++++++++++++++++++++++---
>   kernel/sched/sched.h     |   8 ++
>   7 files changed, 388 insertions(+), 20 deletions(-)
> 
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 0234f14f2aa6..3ed8dd225eb9 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -800,6 +800,12 @@ struct mm_cid {
>   };
>   #endif
>   
> +struct mm_sched {
> +	u64 runtime;
> +	unsigned long epoch;
> +	unsigned long occ;
> +};
> +
>   struct kioctx_table;
>   struct iommu_mm_data;
>   struct mm_struct {
> @@ -890,6 +896,17 @@ struct mm_struct {
>   		 */
>   		raw_spinlock_t cpus_allowed_lock;
>   #endif
> +#ifdef CONFIG_SCHED_CACHE
> +		/*
> +		 * Track per-cpu-per-process occupancy as a proxy for cache residency.
> +		 * See account_mm_sched() and ...
> +		 */
> +		struct mm_sched __percpu *pcpu_sched;
> +		raw_spinlock_t mm_sched_lock;
> +		unsigned long mm_sched_epoch;
> +		int mm_sched_cpu;
> +#endif
> +
>   #ifdef CONFIG_MMU
>   		atomic_long_t pgtables_bytes;	/* size of all page tables */
>   #endif
> @@ -1296,6 +1313,33 @@ static inline unsigned int mm_cid_size(void)
>   static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
>   #endif /* CONFIG_SCHED_MM_CID */
>   
> +#ifdef CONFIG_SCHED_CACHE
> +extern void mm_init_sched(struct mm_struct *mm, struct mm_sched *pcpu_sched);
> +
> +static inline int mm_alloc_sched_noprof(struct mm_struct *mm)
> +{
> +	struct mm_sched *pcpu_sched = alloc_percpu_noprof(struct mm_sched);
> +	if (!pcpu_sched)
> +		return -ENOMEM;
> +
> +	mm_init_sched(mm, pcpu_sched);
> +	return 0;
> +}
> +
> +#define mm_alloc_sched(...)	alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__))
> +
> +static inline void mm_destroy_sched(struct mm_struct *mm)
> +{
> +	free_percpu(mm->pcpu_sched);
> +	mm->pcpu_sched = NULL;
> +}
> +#else /* !CONFIG_SCHED_CACHE */
> +
> +static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; }
> +static inline void mm_destroy_sched(struct mm_struct *mm) { }
> +
> +#endif /* CONFIG_SCHED_CACHE */
> +
>   struct mmu_gather;
>   extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
>   extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 6e5c38718ff5..f8eafe440369 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1379,6 +1379,10 @@ struct task_struct {
>   	unsigned long			numa_pages_migrated;
>   #endif /* CONFIG_NUMA_BALANCING */
>   
> +#ifdef CONFIG_SCHED_CACHE
> +	struct callback_head		cache_work;
> +#endif
> +
>   #ifdef CONFIG_RSEQ
>   	struct rseq __user *rseq;
>   	u32 rseq_len;
> diff --git a/init/Kconfig b/init/Kconfig
> index 681f38ee68db..14b15215318f 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -950,6 +950,10 @@ config NUMA_BALANCING
>   
>   	  This system will be inactive on UMA systems.
>   
> +config SCHED_CACHE
> +	bool "Cache aware scheduler"
> +	default y
> +
>   config NUMA_BALANCING_DEFAULT_ENABLED
>   	bool "Automatically enable NUMA aware memory/task placement"
>   	default y
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 1b659b07ecd5..bc9d7dbfd980 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1314,6 +1314,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>   	if (mm_alloc_cid(mm, p))
>   		goto fail_cid;
>   
> +	if (mm_alloc_sched(mm))
> +		goto fail_sched;
> +
>   	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
>   				     NR_MM_COUNTERS))
>   		goto fail_pcpu;
> @@ -1323,6 +1326,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>   	return mm;
>   
>   fail_pcpu:
> +	mm_destroy_sched(mm);
> +fail_sched:
>   	mm_destroy_cid(mm);
>   fail_cid:
>   	destroy_context(mm);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 87540217fc09..649db6ea41ea 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4514,6 +4514,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
>   	p->migration_pending = NULL;
>   #endif
>   	init_sched_mm_cid(p);
> +	init_sched_mm(p);
>   }
>   
>   DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
> @@ -8505,6 +8506,7 @@ static struct kmem_cache *task_group_cache __ro_after_init;
>   
>   void __init sched_init(void)
>   {
> +	unsigned long now = jiffies;
>   	unsigned long ptr = 0;
>   	int i;
>   
> @@ -8579,7 +8581,7 @@ void __init sched_init(void)
>   		raw_spin_lock_init(&rq->__lock);
>   		rq->nr_running = 0;
>   		rq->calc_load_active = 0;
> -		rq->calc_load_update = jiffies + LOAD_FREQ;
> +		rq->calc_load_update = now + LOAD_FREQ;
>   		init_cfs_rq(&rq->cfs);
>   		init_rt_rq(&rq->rt);
>   		init_dl_rq(&rq->dl);
> @@ -8623,7 +8625,7 @@ void __init sched_init(void)
>   		rq->cpu_capacity = SCHED_CAPACITY_SCALE;
>   		rq->balance_callback = &balance_push_callback;
>   		rq->active_balance = 0;
> -		rq->next_balance = jiffies;
> +		rq->next_balance = now;
>   		rq->push_cpu = 0;
>   		rq->cpu = i;
>   		rq->online = 0;
> @@ -8635,7 +8637,7 @@ void __init sched_init(void)
>   
>   		rq_attach_root(rq, &def_root_domain);
>   #ifdef CONFIG_NO_HZ_COMMON
> -		rq->last_blocked_load_update_tick = jiffies;
> +		rq->last_blocked_load_update_tick = now;
>   		atomic_set(&rq->nohz_flags, 0);
>   
>   		INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
> @@ -8660,6 +8662,11 @@ void __init sched_init(void)
>   
>   		rq->core_cookie = 0UL;
>   #endif
> +#ifdef CONFIG_SCHED_CACHE
> +		raw_spin_lock_init(&rq->cpu_epoch_lock);
> +		rq->cpu_epoch_next = now;
> +#endif
> +
>   		zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
>   	}
>   
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e43993a4e580..943af076e09c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1166,10 +1166,229 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
>   	return delta_exec;
>   }
>   
> -static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
> +#ifdef CONFIG_SCHED_CACHE
> +
> +/*
> + * XXX numbers come from a place the sun don't shine -- probably wants to be SD
> + * tunable or so.
> + */
> +#define EPOCH_PERIOD	(HZ/100)	/* 10 ms */
> +#define EPOCH_OLD	5		/* 50 ms */
> +
> +void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched)
> +{
> +	unsigned long epoch;
> +	int i;
> +
> +	for_each_possible_cpu(i) {
> +		struct mm_sched *pcpu_sched = per_cpu_ptr(_pcpu_sched, i);
> +		struct rq *rq = cpu_rq(i);
> +
> +		pcpu_sched->runtime = 0;
> +		pcpu_sched->epoch = epoch = rq->cpu_epoch;
> +		pcpu_sched->occ = -1;
> +	}
> +
> +	raw_spin_lock_init(&mm->mm_sched_lock);
> +	mm->mm_sched_epoch = epoch;
> +	mm->mm_sched_cpu = -1;
> +
> +	smp_store_release(&mm->pcpu_sched, _pcpu_sched);
> +}
> +
> +/* because why would C be fully specified */
> +static __always_inline void __shr_u64(u64 *val, unsigned int n)
> +{
> +	if (n >= 64) {
> +		*val = 0;
> +		return;
> +	}
> +	*val >>= n;
> +}
> +
> +static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
> +{
> +	lockdep_assert_held(&rq->cpu_epoch_lock);
> +
> +	unsigned long n, now = jiffies;
> +	long delta = now - rq->cpu_epoch_next;
> +
> +	if (delta > 0) {
> +		n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD;
> +		rq->cpu_epoch += n;
> +		rq->cpu_epoch_next += n * EPOCH_PERIOD;
> +		__shr_u64(&rq->cpu_runtime, n);
> +	}
> +
> +	n = rq->cpu_epoch - pcpu_sched->epoch;
> +	if (n) {
> +		pcpu_sched->epoch += n;
> +		__shr_u64(&pcpu_sched->runtime, n);
> +	}
> +}
> +
> +static unsigned long fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
> +{
> +	guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock);
> +
> +	__update_mm_sched(rq, pcpu_sched);
> +
> +	/*
> +	 * Runtime is a geometric series (r=0.5) and as such will sum to twice
> +	 * the accumulation period, this means the multiplication here should
> +	 * not overflow.
> +	 */
> +	return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1);
> +}
> +
> +static inline
> +void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
> +{
> +	struct mm_struct *mm = p->mm;
> +	struct mm_sched *pcpu_sched;
> +	unsigned long epoch;
> +
> +	/*
> +	 * init_task and kthreads don't be having no mm
> +	 */
> +	if (!mm || !mm->pcpu_sched)
> +		return;
> +
> +	pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched);
> +
> +	scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
> +		__update_mm_sched(rq, pcpu_sched);
> +		pcpu_sched->runtime += delta_exec;
> +		rq->cpu_runtime += delta_exec;
> +		epoch = rq->cpu_epoch;
> +	}
> +
> +	/*
> +	 * If this task hasn't hit task_cache_work() for a while, invalidate
> +	 * it's preferred state.
> +	 * its preferred state.
> +	if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) {
> +		mm->mm_sched_cpu = -1;
> +		pcpu_sched->occ = -1;
> +	}
> +}
> +
> +static void task_tick_cache(struct rq *rq, struct task_struct *p)
> +{
> +	struct callback_head *work = &p->cache_work;
> +	struct mm_struct *mm = p->mm;
> +
> +	if (!mm || !mm->pcpu_sched)
> +		return;
> +
> +	if (mm->mm_sched_epoch == rq->cpu_epoch)
> +		return;
> +

[1]

> +	guard(raw_spinlock)(&mm->mm_sched_lock);
> +
> +	if (mm->mm_sched_epoch == rq->cpu_epoch)
> +		return;

Remove above duplicated [1] and keep the check here?

> +
> +	if (work->next == work) {
> +		task_work_add(p, work, TWA_RESUME);
> +		WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch);
> +	}
> +}
> +
> +static void task_cache_work(struct callback_head *work)
> +{
> +	struct task_struct *p = current;
> +	struct mm_struct *mm = p->mm;
> +	unsigned long m_a_occ = 0;
> +	int cpu, m_a_cpu = -1;
> +	cpumask_var_t cpus;
> +
> +	WARN_ON_ONCE(work != &p->cache_work);
> +
> +	work->next = work;
> +
> +	if (p->flags & PF_EXITING)
> +		return;
> +
> +	if (!alloc_cpumask_var(&cpus, GFP_KERNEL))
> +		return;
> +
> +	scoped_guard (cpus_read_lock) {
> +		cpumask_copy(cpus, cpu_online_mask);
> +
> +		for_each_cpu(cpu, cpus) {
> +			/* XXX sched_cluster_active */
> +			struct sched_domain *sd = per_cpu(sd_llc, cpu);
> +			unsigned long occ, m_occ = 0, a_occ = 0;
> +			int m_cpu = -1, nr = 0, i;
> +
> +			for_each_cpu(i, sched_domain_span(sd)) {
> +				occ = fraction_mm_sched(cpu_rq(i),
> +							per_cpu_ptr(mm->pcpu_sched, i));
> +				a_occ += occ;
> +				if (occ > m_occ) {
> +					m_occ = occ;
> +					m_cpu = i;
> +				}
> +				nr++;
> +				trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n",
> +					     per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr);
> +			}
> +
> +			a_occ /= nr;


In systems with a large number of CPUs within a single LLC, the
division may lose accuracy. Can we either use multiplication for the
comparison, or just compare the accumulated total CPU occupancy of
that LLC (by removing a_occ /= nr)?


> +			if (a_occ > m_a_occ) {
> +				m_a_occ = a_occ;
> +				m_a_cpu = m_cpu;
> +			}
> +
> +			trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n",
> +				     per_cpu(sd_llc_id, cpu), a_occ, m_a_occ);
> +
> +			for_each_cpu(i, sched_domain_span(sd)) {
> +				/* XXX threshold ? */
> +				per_cpu_ptr(mm->pcpu_sched, i)->occ = a_occ;
> +			}
> +
> +			cpumask_andnot(cpus, cpus, sched_domain_span(sd));
> +		}
> +	}
> +
> +	/*
> +	 * If the max average cache occupancy is 'small' we don't care.
> +	 */
> +	if (m_a_occ < (NICE_0_LOAD >> EPOCH_OLD))
> +		m_a_cpu = -1;
> +
> +	mm->mm_sched_cpu = m_a_cpu;


There might be an issue with mm_sched_cpu bouncing. Perhaps adding a
threshold to compare the old a_occ of mm->mm_sched_cpu with the new
a_occ of m_a_cpu could help. For example, only update mm->mm_sched_cpu
to the new m_a_cpu if the new a_occ is at least twice the old one;
otherwise, keep the old value.
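
Something along these lines, just for illustration (the comparison is
made up; per this patch, the occ of the old preferred CPU holds that
LLC's old average):

if (mm->mm_sched_cpu >= 0 && m_a_cpu != mm->mm_sched_cpu) {
	unsigned long old_occ =
		per_cpu_ptr(mm->pcpu_sched, mm->mm_sched_cpu)->occ;

	/* keep the old preference unless the new LLC is 2x better */
	if (old_occ != (unsigned long)-1 && m_a_occ < 2 * old_occ)
		m_a_cpu = mm->mm_sched_cpu;
}
mm->mm_sched_cpu = m_a_cpu;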

> +
> +	free_cpumask_var(cpus);
> +}
> +
> +void init_sched_mm(struct task_struct *p)
> +{
> +	struct callback_head *work = &p->cache_work;
> +	init_task_work(work, task_cache_work);
> +	work->next = work;
> +}
> +
> +#else
> +
> +static inline void account_mm_sched(struct rq *rq, struct task_struct *p,
> +				    s64 delta_exec) { }
> +
> +
> +void init_sched_mm(struct task_struct *p) { }
> +
> +static void task_tick_cache(struct rq *rq, struct task_struct *p) { }
> +
> +#endif
> +
> +static inline
> +void update_curr_task(struct rq *rq, struct task_struct *p, s64 delta_exec)
>   {
>   	trace_sched_stat_runtime(p, delta_exec);
>   	account_group_exec_runtime(p, delta_exec);
> +	account_mm_sched(rq, p, delta_exec);
>   	cgroup_account_cputime(p, delta_exec);
>   }
>   
> @@ -1215,7 +1434,7 @@ s64 update_curr_common(struct rq *rq)
>   
>   	delta_exec = update_curr_se(rq, &donor->se);
>   	if (likely(delta_exec > 0))
> -		update_curr_task(donor, delta_exec);
> +		update_curr_task(rq, donor, delta_exec);
>   
>   	return delta_exec;
>   }
> @@ -1244,7 +1463,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
>   	if (entity_is_task(curr)) {
>   		struct task_struct *p = task_of(curr);
>   
> -		update_curr_task(p, delta_exec);
> +		update_curr_task(rq, p, delta_exec);
>   
>   		/*
>   		 * If the fair_server is active, we need to account for the
> @@ -7850,7 +8069,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>   	 * per-cpu select_rq_mask usage
>   	 */
>   	lockdep_assert_irqs_disabled();
> -
> +again:
>   	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
>   	    asym_fits_cpu(task_util, util_min, util_max, target))
>   		return target;
> @@ -7888,7 +8107,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>   	/* Check a recently used CPU as a potential idle candidate: */
>   	recent_used_cpu = p->recent_used_cpu;
>   	p->recent_used_cpu = prev;
> -	if (recent_used_cpu != prev &&
> +	if (prev == p->wake_cpu &&
> +	    recent_used_cpu != prev &&
>   	    recent_used_cpu != target &&
>   	    cpus_share_cache(recent_used_cpu, target) &&
>   	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
> @@ -7941,6 +8161,18 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>   	if ((unsigned)i < nr_cpumask_bits)
>   		return i;
>   
> +	if (prev != p->wake_cpu && !cpus_share_cache(prev, p->wake_cpu)) {
> +		/*
> +		 * Most likely select_cache_cpu() will have re-directed
> +		 * the wakeup, but getting here means the preferred cache is
> +		 * too busy, so re-try with the actual previous.
> +		 *
> +		 * XXX wake_affine is lost for this pass.
> +		 */
> +		prev = target = p->wake_cpu;
> +		goto again;
> +	}
> +
>   	/*
>   	 * For cluster machines which have lower sharing cache like L2 or
>   	 * LLC Tag, we tend to find an idle CPU in the target's cluster
> @@ -8563,6 +8795,40 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>   	return target;
>   }
>   
> +#ifdef CONFIG_SCHED_CACHE
> +static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle);
> +
> +static int select_cache_cpu(struct task_struct *p, int prev_cpu)
> +{
> +	struct mm_struct *mm = p->mm;
> +	int cpu;
> +
> +	if (!mm || p->nr_cpus_allowed == 1)
> +		return prev_cpu;
> +
> +	cpu = mm->mm_sched_cpu;
> +	if (cpu < 0)
> +		return prev_cpu;
> +


We observed frequent task migrations in some context-switch-heavy
benchmarks, which led to a performance regression when the LLC was
saturated. Could we avoid task migration in cases where the previous
CPU and the preferred CPU are within the same LLC?

if (cpus_share_cache(prev_cpu, cpu))
	return prev_cpu;

> +
> +	if (static_branch_likely(&sched_numa_balancing) &&
> +	    __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) {
> +		/*
> +		 * XXX look for max occupancy inside prev_cpu's node
> +		 */
> +		return prev_cpu;
> +	}
> +

Tim found that spreading tasks within the preferred LLC might help avoid 
task stacking:

sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (likely(sd))
	return cpumask_any(sched_domain_span(sd));

> +	return cpu;
> +}
> +#else
> +static int select_cache_cpu(struct task_struct *p, int prev_cpu)
> +{
> +	return prev_cpu;
> +}
> +#endif
> +
> +
>   /*
>    * select_task_rq_fair: Select target runqueue for the waking task in domains
>    * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
> @@ -8588,6 +8854,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>   	 * required for stable ->cpus_allowed
>   	 */
>   	lockdep_assert_held(&p->pi_lock);
> +	guard(rcu)();
> +
>   	if (wake_flags & WF_TTWU) {
>   		record_wakee(p);
>   
> @@ -8595,6 +8863,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>   		    cpumask_test_cpu(cpu, p->cpus_ptr))
>   			return cpu;
>   
> +		new_cpu = prev_cpu = select_cache_cpu(p, prev_cpu);
> +
>   		if (!is_rd_overutilized(this_rq()->rd)) {
>   			new_cpu = find_energy_efficient_cpu(p, prev_cpu);
>   			if (new_cpu >= 0)
> @@ -8605,7 +8875,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>   		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
>   	}
>   
> -	rcu_read_lock();
>   	for_each_domain(cpu, tmp) {
>   		/*
>   		 * If both 'cpu' and 'prev_cpu' are part of this domain,
> @@ -8638,7 +8907,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>   		/* Fast path */
>   		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
>   	}
> -	rcu_read_unlock();
>   
>   	return new_cpu;
>   }
> @@ -9288,6 +9556,17 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>   	if (sysctl_sched_migration_cost == 0)
>   		return 0;
>   
> +#ifdef CONFIG_SCHED_CACHE
> +	if (p->mm && p->mm->pcpu_sched) {
> +		/*
> +		 * XXX things like Skylake have non-inclusive L3 and might not
> +		 * like this L3 centric view. What to do about L2 stickyness ?
> +		 */
> +		return per_cpu_ptr(p->mm->pcpu_sched, env->src_cpu)->occ >
> +		       per_cpu_ptr(p->mm->pcpu_sched, env->dst_cpu)->occ;

This might encourage more task migration within the preferred LLC, 
leading to some performance regression. Is it possible to raise the 
threshold for task migration within the preferred LLC and use the 
original delta time to determine whether a task is cache-hot?

if (p->mm && p->mm->pcpu_sched &&
     cpus_share_cache(env->dst_cpu, env->src_cpu))
	return  2*per_cpu_ptr(p->mm->pcpu_sched, env->src_cpu)->occ >
		  per_cpu_ptr(p->mm->pcpu_sched, env->dst_cpu)->occ;
}

thanks,
Chenyu

> +	}
> +#endif
> +
>   	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
>   
>   	return delta < (s64)sysctl_sched_migration_cost;
> @@ -9299,27 +9578,25 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>    * Returns 0, if task migration is not affected by locality.
>    * Returns a negative value, if task migration improves locality i.e migration preferred.
>    */
> -static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
> +static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle)
>   {
>   	struct numa_group *numa_group = rcu_dereference(p->numa_group);
>   	unsigned long src_weight, dst_weight;
>   	int src_nid, dst_nid, dist;
>   
> -	if (!static_branch_likely(&sched_numa_balancing))
> -		return 0;
> -
> -	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
> +	if (!p->numa_faults)
>   		return 0;
>   
> -	src_nid = cpu_to_node(env->src_cpu);
> -	dst_nid = cpu_to_node(env->dst_cpu);
> +	src_nid = cpu_to_node(src_cpu);
> +	dst_nid = cpu_to_node(dst_cpu);
>   
>   	if (src_nid == dst_nid)
>   		return 0;
>   
>   	/* Migrating away from the preferred node is always bad. */
>   	if (src_nid == p->numa_preferred_nid) {
> -		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
> +		struct rq *src_rq = cpu_rq(src_cpu);
> +		if (src_rq->nr_running > src_rq->nr_preferred_running)
>   			return 1;
>   		else
>   			return 0;
> @@ -9330,7 +9607,7 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
>   		return -1;
>   
>   	/* Leaving a core idle is often worse than degrading locality. */
> -	if (env->idle == CPU_IDLE)
> +	if (idle)
>   		return 0;
>   
>   	dist = node_distance(src_nid, dst_nid);
> @@ -9345,7 +9622,24 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
>   	return src_weight - dst_weight;
>   }
>   
> +static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
> +{
> +	if (!static_branch_likely(&sched_numa_balancing))
> +		return 0;
> +
> +	if (!(env->sd->flags & SD_NUMA))
> +		return 0;
> +
> +	return __migrate_degrades_locality(p, env->src_cpu, env->dst_cpu,
> +					   env->idle == CPU_IDLE);
> +}
> +
>   #else
> +static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle)
> +{
> +	return 0;
> +}
> +
>   static inline long migrate_degrades_locality(struct task_struct *p,
>   					     struct lb_env *env)
>   {
> @@ -13104,8 +13398,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
>    */
>   static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
>   {
> -	struct cfs_rq *cfs_rq;
>   	struct sched_entity *se = &curr->se;
> +	struct cfs_rq *cfs_rq;
>   
>   	for_each_sched_entity(se) {
>   		cfs_rq = cfs_rq_of(se);
> @@ -13115,6 +13409,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
>   	if (static_branch_unlikely(&sched_numa_balancing))
>   		task_tick_numa(rq, curr);
>   
> +	task_tick_cache(rq, curr);
> +
>   	update_misfit_status(curr, rq);
>   	check_update_overutilized_status(task_rq(curr));
>   
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 47972f34ea70..d16ccd66ca07 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1171,6 +1171,12 @@ struct rq {
>   	u64			clock_pelt_idle_copy;
>   	u64			clock_idle_copy;
>   #endif
> +#ifdef CONFIG_SCHED_CACHE
> +	raw_spinlock_t		cpu_epoch_lock;
> +	u64			cpu_runtime;
> +	unsigned long		cpu_epoch;
> +	unsigned long		cpu_epoch_next;
> +#endif
>   
>   	atomic_t		nr_iowait;
>   
> @@ -3861,6 +3867,8 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
>   static inline void init_sched_mm_cid(struct task_struct *t) { }
>   #endif /* !CONFIG_SCHED_MM_CID */
>   
> +extern void init_sched_mm(struct task_struct *p);
> +
>   extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
>   extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
>   #ifdef CONFIG_SMP
Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Peter Zijlstra 8 months, 3 weeks ago
On Tue, Mar 25, 2025 at 11:19:52PM +0800, Chen, Yu C wrote:

> > +static void task_tick_cache(struct rq *rq, struct task_struct *p)
> > +{
> > +	struct callback_head *work = &p->cache_work;
> > +	struct mm_struct *mm = p->mm;
> > +
> > +	if (!mm || !mm->pcpu_sched)
> > +		return;
> > +
> > +	if (mm->mm_sched_epoch == rq->cpu_epoch)
> > +		return;
> > +
> 
> [1]
> 
> > +	guard(raw_spinlock)(&mm->mm_sched_lock);
> > +
> > +	if (mm->mm_sched_epoch == rq->cpu_epoch)
> > +		return;
> 
> Remove above duplicated [1] and keep the check here?

Why? That just adds locking overhead for no reason.

> > +
> > +	if (work->next == work) {
> > +		task_work_add(p, work, TWA_RESUME);
> > +		WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch);
> > +	}
> > +}
> > +
> > +static void task_cache_work(struct callback_head *work)
> > +{
> > +	struct task_struct *p = current;
> > +	struct mm_struct *mm = p->mm;
> > +	unsigned long m_a_occ = 0;
> > +	int cpu, m_a_cpu = -1;
> > +	cpumask_var_t cpus;
> > +
> > +	WARN_ON_ONCE(work != &p->cache_work);
> > +
> > +	work->next = work;
> > +
> > +	if (p->flags & PF_EXITING)
> > +		return;
> > +
> > +	if (!alloc_cpumask_var(&cpus, GFP_KERNEL))
> > +		return;
> > +
> > +	scoped_guard (cpus_read_lock) {
> > +		cpumask_copy(cpus, cpu_online_mask);
> > +
> > +		for_each_cpu(cpu, cpus) {
> > +			/* XXX sched_cluster_active */
> > +			struct sched_domain *sd = per_cpu(sd_llc, cpu);
> > +			unsigned long occ, m_occ = 0, a_occ = 0;
> > +			int m_cpu = -1, nr = 0, i;
> > +
> > +			for_each_cpu(i, sched_domain_span(sd)) {
> > +				occ = fraction_mm_sched(cpu_rq(i),
> > +							per_cpu_ptr(mm->pcpu_sched, i));
> > +				a_occ += occ;
> > +				if (occ > m_occ) {
> > +					m_occ = occ;
> > +					m_cpu = i;
> > +				}
> > +				nr++;
> > +				trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n",
> > +					     per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr);
> > +			}
> > +
> > +			a_occ /= nr;
> 
> 
> In systems with a larger number of CPUs within a single LLC, the division
> may lose accuracy.

May, sure, but does it actually matter? We're tracking occupancy in ns
per EPOCH_PERIOD, which is 1e7 ns. Let's assume 'large' here is
something daft like 100 CPUs per LLC (let's just really hope nobody
actually goes and does that, please), then you're still only losing
about 1e2 ns from your average.

> Can we either use multiplication for comparison, or just use the accumulated
> total CPU occupancy
> of that LLC for comparison (by removing a_occ /= nr)?

You can't remove the division; you get into trouble when the LLCs do
not have an equal number of CPUs. You can carry that multiplication
around ofc, but again, why bother?

> > +			if (a_occ > m_a_occ) {
> > +				m_a_occ = a_occ;
> > +				m_a_cpu = m_cpu;
> > +			}
> > +
> > +			trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n",
> > +				     per_cpu(sd_llc_id, cpu), a_occ, m_a_occ);
> > +
> > +			for_each_cpu(i, sched_domain_span(sd)) {
> > +				/* XXX threshold ? */
> > +				per_cpu_ptr(mm->pcpu_sched, i)->occ = a_occ;
> > +			}
> > +
> > +			cpumask_andnot(cpus, cpus, sched_domain_span(sd));
> > +		}
> > +	}
> > +
> > +	/*
> > +	 * If the max average cache occupancy is 'small' we don't care.
> > +	 */
> > +	if (m_a_occ < (NICE_0_LOAD >> EPOCH_OLD))
> > +		m_a_cpu = -1;
> > +
> > +	mm->mm_sched_cpu = m_a_cpu;
> 
> There might be an issue with mm_sched_cpu bouncing. Perhaps adding a
> threshold to compare the old a_occ of mm->mm_sched_cpu with the new a_occ of
> m_a_cpu could help. For example, if the latter (new_a_occ) is twice larger
> than the former (old a_occ), we can update mm->mm_sched_cpu to the new
> m_a_cpu. Otherwise, we keep the old value.

Some hysteresis might make sense, but I don't think waiting for it to
double is the right threshold; that might be too aggressive.

> > +#ifdef CONFIG_SCHED_CACHE
> > +static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle);
> > +
> > +static int select_cache_cpu(struct task_struct *p, int prev_cpu)
> > +{
> > +	struct mm_struct *mm = p->mm;
> > +	int cpu;
> > +
> > +	if (!mm || p->nr_cpus_allowed == 1)
> > +		return prev_cpu;
> > +
> > +	cpu = mm->mm_sched_cpu;
> > +	if (cpu < 0)
> > +		return prev_cpu;
> > +
> 
> 
> We observed frequent task migrations during some highly context-switch
> benchmarks, which led to performance regression when the LLC was saturated.
> Could we avoid task migration in cases where the previous CPU and the
> preferred CPU are within the same LLC?
> 
> if (cpus_share_cache(prev_cpu, cpu))
> 	return prev_cpu;

Sure.

> > +
> > +	if (static_branch_likely(&sched_numa_balancing) &&
> > +	    __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) {
> > +		/*
> > +		 * XXX look for max occupancy inside prev_cpu's node
> > +		 */
> > +		return prev_cpu;
> > +	}
> > +
> 
> Tim found that spreading tasks within the preferred LLC might help avoid
> task stacking:
> 
> sd = rcu_dereference(per_cpu(sd_llc, cpu));
> if (likely(sd))
> 	return cpumask_any(sched_domain_span(sd));

You know that cpumask_any() is implemented using cpumask_first(),
right? If anything, you're causing stacking with that.

> > @@ -9288,6 +9556,17 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
> >   	if (sysctl_sched_migration_cost == 0)
> >   		return 0;
> > +#ifdef CONFIG_SCHED_CACHE
> > +	if (p->mm && p->mm->pcpu_sched) {
> > +		/*
> > +		 * XXX things like Skylake have non-inclusive L3 and might not
> > +		 * like this L3 centric view. What to do about L2 stickyness ?
> > +		 */
> > +		return per_cpu_ptr(p->mm->pcpu_sched, env->src_cpu)->occ >
> > +		       per_cpu_ptr(p->mm->pcpu_sched, env->dst_cpu)->occ;
> 
> This might encourage more task migration within the preferred LLC, leading
> to some performance regression. Is it possible to raise the threshold for
> task migration within the preferred LLC and use the original delta time to
> determine whether a task is cache-hot?
> 
> if (p->mm && p->mm->pcpu_sched &&
>     cpus_share_cache(env->dst_cpu, env->src_cpu))
> 	return  2*per_cpu_ptr(p->mm->pcpu_sched, env->src_cpu)->occ >
> 		  per_cpu_ptr(p->mm->pcpu_sched, env->dst_cpu)->occ;
> }

Nah, the saner thing to do is to preserve the topology averages and look
at those instead of the per-cpu values.

Eg. have task_cache_work() compute and store averages in the
sched_domain structure and then use those.
Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Chen, Yu C 8 months, 3 weeks ago
On 3/26/2025 5:38 PM, Peter Zijlstra wrote:
> On Tue, Mar 25, 2025 at 11:19:52PM +0800, Chen, Yu C wrote:
> 
>>> +static void task_tick_cache(struct rq *rq, struct task_struct *p)
>>> +{
>>> +	struct callback_head *work = &p->cache_work;
>>> +	struct mm_struct *mm = p->mm;
>>> +
>>> +	if (!mm || !mm->pcpu_sched)
>>> +		return;
>>> +
>>> +	if (mm->mm_sched_epoch == rq->cpu_epoch)
>>> +		return;
>>> +
>>
>> [1]
>>
>>> +	guard(raw_spinlock)(&mm->mm_sched_lock);
>>> +
>>> +	if (mm->mm_sched_epoch == rq->cpu_epoch)
>>> +		return;
>>
>> Remove above duplicated [1] and keep the check here?
> 
> Why? That just adds locking overhead for no reason.
> 

I thought mm->mm_sched_epoch is updated under the protection of
mm->mm_sched_lock, so shouldn't the read side also be protected by
this lock?

>>> +
>>> +	if (work->next == work) {
>>> +		task_work_add(p, work, TWA_RESUME);
>>> +		WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch);
>>> +	}
>>> +}
>>> +
>>> +static void task_cache_work(struct callback_head *work)
>>> +{
>>> +	struct task_struct *p = current;
>>> +	struct mm_struct *mm = p->mm;
>>> +	unsigned long m_a_occ = 0;
>>> +	int cpu, m_a_cpu = -1;
>>> +	cpumask_var_t cpus;
>>> +
>>> +	WARN_ON_ONCE(work != &p->cache_work);
>>> +
>>> +	work->next = work;
>>> +
>>> +	if (p->flags & PF_EXITING)
>>> +		return;
>>> +
>>> +	if (!alloc_cpumask_var(&cpus, GFP_KERNEL))
>>> +		return;
>>> +
>>> +	scoped_guard (cpus_read_lock) {
>>> +		cpumask_copy(cpus, cpu_online_mask);
>>> +
>>> +		for_each_cpu(cpu, cpus) {
>>> +			/* XXX sched_cluster_active */
>>> +			struct sched_domain *sd = per_cpu(sd_llc, cpu);
>>> +			unsigned long occ, m_occ = 0, a_occ = 0;
>>> +			int m_cpu = -1, nr = 0, i;
>>> +
>>> +			for_each_cpu(i, sched_domain_span(sd)) {
>>> +				occ = fraction_mm_sched(cpu_rq(i),
>>> +							per_cpu_ptr(mm->pcpu_sched, i));
>>> +				a_occ += occ;
>>> +				if (occ > m_occ) {
>>> +					m_occ = occ;
>>> +					m_cpu = i;
>>> +				}
>>> +				nr++;
>>> +				trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n",
>>> +					     per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr);
>>> +			}
>>> +
>>> +			a_occ /= nr;
>>
>>
>> In systems with a larger number of CPUs within a single LLC, the division
>> may lose accuracy.
> 
> May, sure, but does it actually matter? We're tracking occupancy in ns
> per EPOCH_PERIOD, which is 1e7. Lets assume 'large' here is something
> daft like 100 CPUs per LLC (lets just really hope nobody actually goes
> and do that, please), then you're still only loosing like 1e2 ns from
> your average.
> 

I see, the 100 ns should not matter much compared to the 10 ms sample period.

>> Can we either use multiplication for comparison, or just use the accumulated
>> total CPU occupancy
>> of that LLC for comparison (by removing a_occ /= nr)?
> 
> You can't remove the devision, you get into trouble when the LLCs do not
> have equal number of CPUs. 

On a hybrid system, suppose there are two LLCs: LLC1 has 8 CPUs with a
sum_occ of 1024, so its avg_occ is 128; LLC2 has 4 CPUs with a sum_occ
of 768, so its avg_occ is 192. If we compare avg_occ, we would choose
LLC2, even though LLC1 might have more idle CPUs and more active
cache-hot threads.

> You can carry that multiplication around ofc,
> but again, why bother?
> 
>>> +			if (a_occ > m_a_occ) {
>>> +				m_a_occ = a_occ;
>>> +				m_a_cpu = m_cpu;
>>> +			}
>>> +
>>> +			trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n",
>>> +				     per_cpu(sd_llc_id, cpu), a_occ, m_a_occ);
>>> +
>>> +			for_each_cpu(i, sched_domain_span(sd)) {
>>> +				/* XXX threshold ? */
>>> +				per_cpu_ptr(mm->pcpu_sched, i)->occ = a_occ;
>>> +			}
>>> +
>>> +			cpumask_andnot(cpus, cpus, sched_domain_span(sd));
>>> +		}
>>> +	}
>>> +
>>> +	/*
>>> +	 * If the max average cache occupancy is 'small' we don't care.
>>> +	 */
>>> +	if (m_a_occ < (NICE_0_LOAD >> EPOCH_OLD))
>>> +		m_a_cpu = -1;
>>> +
>>> +	mm->mm_sched_cpu = m_a_cpu;
>>
>> There might be an issue with mm_sched_cpu bouncing. Perhaps adding a
>> threshold to compare the old a_occ of mm->mm_sched_cpu with the new a_occ of
>> m_a_cpu could help. For example, if the latter (new_a_occ) is twice larger
>> than the former (old a_occ), we can update mm->mm_sched_cpu to the new
>> m_a_cpu. Otherwise, we keep the old value.
> 
> Some hysteresis might make sense, but I don't think waiting for it to
> double makes sense, that might be too agressive.
> 

OK, might need to do some evaluation to figure out a reasonable threshold.

>>> +#ifdef CONFIG_SCHED_CACHE
>>> +static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle);
>>> +
>>> +static int select_cache_cpu(struct task_struct *p, int prev_cpu)
>>> +{
>>> +	struct mm_struct *mm = p->mm;
>>> +	int cpu;
>>> +
>>> +	if (!mm || p->nr_cpus_allowed == 1)
>>> +		return prev_cpu;
>>> +
>>> +	cpu = mm->mm_sched_cpu;
>>> +	if (cpu < 0)
>>> +		return prev_cpu;
>>> +
>>
>>
>> We observed frequent task migrations during some highly context-switch
>> benchmarks, which led to performance regression when the LLC was saturated.
>> Could we avoid task migration in cases where the previous CPU and the
>> preferred CPU are within the same LLC?
>>
>> if (cpus_share_cache(prev_cpu, cpu))
>> 	return prev_cpu;
> 
> Sure.
> 
>>> +
>>> +	if (static_branch_likely(&sched_numa_balancing) &&
>>> +	    __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) {
>>> +		/*
>>> +		 * XXX look for max occupancy inside prev_cpu's node
>>> +		 */
>>> +		return prev_cpu;
>>> +	}
>>> +
>>
>> Tim found that spreading tasks within the preferred LLC might help avoid
>> task stacking:
>>
>> sd = rcu_dereference(per_cpu(sd_llc, cpu));
>> if (likely(sd))
>> 	return cpumask_any(sched_domain_span(sd));
> 
> You know that cpumask_any() is implemented using cpumask_first(), right?
> You're causing stacking if anything with that.
> 

I see, this would still cause a stacking issue. Let me try to find
another method to spread tasks within their preferred LLC.

>>> @@ -9288,6 +9556,17 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>>>    	if (sysctl_sched_migration_cost == 0)
>>>    		return 0;
>>> +#ifdef CONFIG_SCHED_CACHE
>>> +	if (p->mm && p->mm->pcpu_sched) {
>>> +		/*
>>> +		 * XXX things like Skylake have non-inclusive L3 and might not
>>> +		 * like this L3 centric view. What to do about L2 stickyness ?
>>> +		 */
>>> +		return per_cpu_ptr(p->mm->pcpu_sched, env->src_cpu)->occ >
>>> +		       per_cpu_ptr(p->mm->pcpu_sched, env->dst_cpu)->occ;
>>
>> This might encourage more task migration within the preferred LLC, leading
>> to some performance regression. Is it possible to raise the threshold for
>> task migration within the preferred LLC and use the original delta time to
>> determine whether a task is cache-hot?
>>
>> if (p->mm && p->mm->pcpu_sched &&
>>      cpus_share_cache(env->dst_cpu, env->src_cpu))
>> 	return  2*per_cpu_ptr(p->mm->pcpu_sched, env->src_cpu)->occ >
>> 		  per_cpu_ptr(p->mm->pcpu_sched, env->dst_cpu)->occ;
>> }

On second thought, my change might be incorrect, because every CPU in
the same LLC should have the same per-CPU occ.

> Nah, the saner thing to do is to preserve the topology averages and look
> at those instead of the per-cpu values.
> 
> Eg. have task_cache_work() compute and store averages in the
> sched_domain structure and then use those.
Sorry, I did not quite catch this: doesn't
per_cpu_ptr(p->mm->pcpu_sched, cpu)->occ already hold the average
occupancy of the whole LLC domain? Every CPU in the same LLC should
have the same average occ, because task_cache_work() does:
for_each_cpu(i, sched_domain_span(sd)) {
	/* XXX threshold ? */
	per_cpu_ptr(mm->pcpu_sched, i)->occ = a_occ;
}


If the goal is to avoid migrating a task to a non-preferred LLC during
load balance, maybe we can treat p as cache-hot (rough sketch below)
when:
1. env->src_cpu is already in p's preferred LLC, and
2. env->dst_cpu is not in p's preferred LLC, and
3. p's preferred LLC is not overloaded.

The preferred LLC of a task would be defined as something like:
p->mm->mm_preferred_llc = llc_id(p->mm->mm_sched_cpu)
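
Roughly (mm_preferred_llc and llc_overloaded() are hypothetical
helpers, not in the patch):

/* in task_hot() */
if (p->mm && p->mm->mm_preferred_llc >= 0 &&
    per_cpu(sd_llc_id, env->src_cpu) == p->mm->mm_preferred_llc &&
    per_cpu(sd_llc_id, env->dst_cpu) != p->mm->mm_preferred_llc &&
    !llc_overloaded(env->src_cpu))
	return 1;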


I'll add some trace/schedstat on top of your patch to investigate.

thanks,
Chenyu

Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Peter Zijlstra 8 months, 3 weeks ago
On Wed, Mar 26, 2025 at 10:38:41AM +0100, Peter Zijlstra wrote:

> Nah, the saner thing to do is to preserve the topology averages and look
> at those instead of the per-cpu values.
> 
> Eg. have task_cache_work() compute and store averages in the
> sched_domain structure and then use those.

A little something like so perhaps ?

This immediately also gives the information required for clusters and
finding the best LLC of a Node and things like that.

--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -82,6 +82,9 @@ struct sched_domain_shared {
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
 	int		nr_idle_scan;
+
+	unsigned long   sum_occ;
+	unsigned long	avg_occ;
 };
 
 struct sched_domain {
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1286,8 +1286,8 @@ static void task_cache_work(struct callb
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
 	unsigned long m_a_occ = 0;
-	int cpu, m_a_cpu = -1;
-	cpumask_var_t cpus;
+	int m_a_cpu = -1;
+	int cpu;
 
 	WARN_ON_ONCE(work != &p->cache_work);
 
@@ -1296,46 +1296,46 @@ static void task_cache_work(struct callb
 	if (p->flags & PF_EXITING)
 		return;
 
-	if (!alloc_cpumask_var(&cpus, GFP_KERNEL))
-		return;
-
 	scoped_guard (cpus_read_lock) {
-		cpumask_copy(cpus, cpu_online_mask);
 
-		for_each_cpu(cpu, cpus) {
-			/* XXX sched_cluster_active */
-			struct sched_domain *sd = per_cpu(sd_llc, cpu);
-			unsigned long occ, m_occ = 0, a_occ = 0;
-			int m_cpu = -1, nr = 0, i;
+		for_each_online_cpu(cpu) {
+			struct sched_domain *sd;
+			struct sched_domain_shared *sds;
+			unsigned long occ;
+
+			for_each_domain(cpu, sd) {
+				if (!(sd->flags & SD_SHARE_LLC))
+					break;
 
-			for_each_cpu(i, sched_domain_span(sd)) {
+				sds = sd->shared;
 				occ = fraction_mm_sched(cpu_rq(i),
 							per_cpu_ptr(mm->pcpu_sched, i));
-				a_occ += occ;
-				if (occ > m_occ) {
-					m_occ = occ;
-					m_cpu = i;
-				}
-				nr++;
-				trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n",
-					     per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr);
-			}
-
-			a_occ /= nr;
-			if (a_occ > m_a_occ) {
-				m_a_occ = a_occ;
-				m_a_cpu = m_cpu;
+				sds->sum_occ += occ + 1;
 			}
+		}
 
-			trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n",
-				     per_cpu(sd_llc_id, cpu), a_occ, m_a_occ);
+		for_each_online_cpu(cpu) {
+			struct sched_domain *sd;
+			struct sched_domain_shared *sds;
+
+			for_each_domain(cpu, sd) {
+				if (!(sd->flags & SD_SHARE_LLC))
+					break;
+
+				sds = sd->shared;
+				if (sds->agg_occ) {
+					sds->avg_occ = (sds->agg_occ - sd->span_weight) /
+						       sd->span_weight;
+					sds->sum_occ = 0;
+				}
 
-			for_each_cpu(i, sched_domain_span(sd)) {
-				/* XXX threshold ? */
-				per_cpu_ptr(mm->pcpu_sched, i)->occ = a_occ;
+				if (sd == per_cpu(sd_llc, cpu)) {
+					if (sds->avg_occ > m_a_occ) {
+						m_a_occ = sds->avg_occ;
+						m_a_cpu = cpu;
+					}
+				}
 			}
-
-			cpumask_andnot(cpus, cpus, sched_domain_span(sd));
 		}
 	}
 
@@ -1346,8 +1346,6 @@ static void task_cache_work(struct callb
 		m_a_cpu = -1;
 
 	mm->mm_sched_cpu = m_a_cpu;
-
-	free_cpumask_var(cpus);
 }
 
 void init_sched_mm(struct task_struct *p)
Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Peter Zijlstra 8 months, 3 weeks ago
On Wed, Mar 26, 2025 at 11:25:53AM +0100, Peter Zijlstra wrote:
> On Wed, Mar 26, 2025 at 10:38:41AM +0100, Peter Zijlstra wrote:
> 
> > Nah, the saner thing to do is to preserve the topology averages and look
> > at those instead of the per-cpu values.
> > 
> > Eg. have task_cache_work() compute and store averages in the
> > sched_domain structure and then use those.
> 
> A little something like so perhaps ?

Oh urgh, ignore all that, I'm an idiot.

The numbers are per process, can't stick them in global structures.

I remember going through all this before :-/ It's just so very tempting
since it's the right structure and everything, but we need it per
process :-(
Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Peter Zijlstra 8 months, 3 weeks ago
On Wed, Mar 26, 2025 at 11:25:53AM +0100, Peter Zijlstra wrote:
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1286,8 +1286,8 @@ static void task_cache_work(struct callb
>  	struct task_struct *p = current;
>  	struct mm_struct *mm = p->mm;
>  	unsigned long m_a_occ = 0;
> -	int cpu, m_a_cpu = -1;
> -	cpumask_var_t cpus;
> +	int m_a_cpu = -1;
> +	int cpu;
>  
>  	WARN_ON_ONCE(work != &p->cache_work);
>  
> @@ -1296,46 +1296,46 @@ static void task_cache_work(struct callb
>  	if (p->flags & PF_EXITING)
>  		return;
>  
> -	if (!alloc_cpumask_var(&cpus, GFP_KERNEL))
> -		return;
> -
>  	scoped_guard (cpus_read_lock) {
> -		cpumask_copy(cpus, cpu_online_mask);
>  
> -		for_each_cpu(cpu, cpus) {
> -			/* XXX sched_cluster_active */
> -			struct sched_domain *sd = per_cpu(sd_llc, cpu);
> -			unsigned long occ, m_occ = 0, a_occ = 0;
> -			int m_cpu = -1, nr = 0, i;
> +		for_each_online_cpu(cpu) {
> +			struct sched_domain *sd;
> +			struct sched_domain_shared *sds;
> +			unsigned long occ;
> +
> +			for_each_domain(cpu, sd) {
> +				if (!(sd->flags & SD_SHARE_LLC))
> +					break;
>  
> -			for_each_cpu(i, sched_domain_span(sd)) {
> +				sds = sd->shared;
>  				occ = fraction_mm_sched(cpu_rq(i),
>  							per_cpu_ptr(mm->pcpu_sched, i));
> -				a_occ += occ;
> -				if (occ > m_occ) {
> -					m_occ = occ;
> -					m_cpu = i;
> -				}
> -				nr++;
> -				trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n",
> -					     per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr);
> -			}
> -
> -			a_occ /= nr;
> -			if (a_occ > m_a_occ) {
> -				m_a_occ = a_occ;
> -				m_a_cpu = m_cpu;
> +				sds->sum_occ += occ + 1;
>  			}
> +		}
>  
> -			trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n",
> -				     per_cpu(sd_llc_id, cpu), a_occ, m_a_occ);
> +		for_each_online_cpu(cpu) {
> +			struct sched_domain *sd;
> +			struct sched_domain_shared *sds;
> +
> +			for_each_domain(cpu, sd) {
> +				if (!(sd->flags & SD_SHARE_LLC))
> +					break;
> +
> +				sds = sd->shared;
> +				if (sds->agg_occ) {
> +					sds->avg_occ = (sds->agg_occ - sd->span_weight) /
> +						       sd->span_weight;
> +					sds->sum_occ = 0;
> +				}

s/agg_occ/sum_occ/g, stupid last minute renames etc.. :-)

>  
> -			for_each_cpu(i, sched_domain_span(sd)) {
> -				/* XXX threshold ? */
> -				per_cpu_ptr(mm->pcpu_sched, i)->occ = a_occ;
> +				if (sd == per_cpu(sd_llc, cpu)) {
> +					if (sds->avg_occ > m_a_occ) {
> +						m_a_occ = sds->avg_occ;
> +						m_a_cpu = cpu;
> +					}
> +				}
>  			}
> -
> -			cpumask_andnot(cpus, cpus, sched_domain_span(sd));
>  		}
>  	}
>  
> @@ -1346,8 +1346,6 @@ static void task_cache_work(struct callb
>  		m_a_cpu = -1;
>  
>  	mm->mm_sched_cpu = m_a_cpu;
> -
> -	free_cpumask_var(cpus);
>  }
>  
>  void init_sched_mm(struct task_struct *p)
Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Peter Zijlstra 8 months, 3 weeks ago
On Tue, Mar 25, 2025 at 11:19:52PM +0800, Chen, Yu C wrote:
> 
> Hi Peter,
> 
> Thanks for sending this out,
> 
> On 3/25/2025 8:09 PM, Peter Zijlstra wrote:
> > Hi all,
> > 
> > One of the many things on the eternal todo list has been finishing the
> > below hackery.
> > 
> > It is an attempt at modelling cache affinity -- and while the patch
> > really only targets LLC, it could very well be extended to also apply to
> > clusters (L2). Specifically any case of multiple cache domains inside a
> > node.
> > 
> > Anyway, I wrote this about a year ago, and I mentioned this at the
> > recent OSPM conf where Gautham and Prateek expressed interest in playing
> > with this code.
> > 
> > So here goes, very rough and largely unproven code ahead :-)
> > 
> > It applies to current tip/master, but I know it will fail the __percpu
> > validation that sits in -next, although that shouldn't be terribly hard
> > to fix up.
> > 
> > As is, it only computes a CPU inside the LLC that has the highest recent
> > runtime, this CPU is then used in the wake-up path to steer towards this
> > LLC and in task_hot() to limit migrations away from it.
> > 
> > More elaborate things could be done, notably there is an XXX in there
> > somewhere about finding the best LLC inside a NODE (interaction with
> > NUMA_BALANCING).
> > 
> 
> Besides the control provided by CONFIG_SCHED_CACHE, could we also introduce
> sched_feat(SCHED_CACHE) to manage this feature, facilitating dynamic
> adjustments? Similarly we can also introduce other sched_feats for load
> balancing and NUMA balancing for fine-grain control.

We can do all sorts, but the very first thing is determining if this is
worth it at all. Because if we can't make this work at all, all those
things are a waste of time.

This patch is not meant to be merged, it is meant for testing and
development. We need to first make it actually improve workloads. If it
then turns out it regresses workloads (likely, things always do), then
we can look at how to best do that.
Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by K Prateek Nayak 8 months, 3 weeks ago
Hello Peter, Chenyu,

On 3/26/2025 12:14 AM, Peter Zijlstra wrote:
> On Tue, Mar 25, 2025 at 11:19:52PM +0800, Chen, Yu C wrote:
>>
>> Hi Peter,
>>
>> Thanks for sending this out,
>>
>> On 3/25/2025 8:09 PM, Peter Zijlstra wrote:
>>> Hi all,
>>>
>>> One of the many things on the eternal todo list has been finishing the
>>> below hackery.
>>>
>>> It is an attempt at modelling cache affinity -- and while the patch
>>> really only targets LLC, it could very well be extended to also apply to
>>> clusters (L2). Specifically any case of multiple cache domains inside a
>>> node.
>>>
>>> Anyway, I wrote this about a year ago, and I mentioned this at the
>>> recent OSPM conf where Gautham and Prateek expressed interest in playing
>>> with this code.
>>>
>>> So here goes, very rough and largely unproven code ahead :-)
>>>
>>> It applies to current tip/master, but I know it will fail the __percpu
>>> validation that sits in -next, although that shouldn't be terribly hard
>>> to fix up.
>>>
>>> As is, it only computes a CPU inside the LLC that has the highest recent
>>> runtime, this CPU is then used in the wake-up path to steer towards this
>>> LLC and in task_hot() to limit migrations away from it.
>>>
>>> More elaborate things could be done, notably there is an XXX in there
>>> somewhere about finding the best LLC inside a NODE (interaction with
>>> NUMA_BALANCING).
>>>
>>
>> Besides the control provided by CONFIG_SCHED_CACHE, could we also introduce
>> sched_feat(SCHED_CACHE) to manage this feature, facilitating dynamic
>> adjustments? Similarly we can also introduce other sched_feats for load
>> balancing and NUMA balancing for fine-grain control.
> 
> We can do all sorts, but the very first thing is determining if this is
> worth it at all. Because if we can't make this work at all, all those
> things are a waste of time.
> 
> This patch is not meant to be merged, it is meant for testing and
> development. We need to first make it actually improve workloads. If it
> then turns out it regresses workloads (likely, things always do), then
> we can look at how to best do that.
> 

Thank you for sharing the patch, and thanks to Chenyu for the initial
review pointing out issues that need fixing. I'll try to take a good
look at it this week and see if I can improve some trivial benchmarks
that currently regress with the RFC as is.

In its current form I think this suffers from the same problem as
SIS_NODE, where wakeups are redirected to the same set of CPUs and a
good deal of additional work is done without any benefit.

I'll leave the results from my initial testing on the 3rd Generation
EPYC platform below and will evaluate what is making the benchmarks
unhappy. I'll return with more data when some of these benchmarks
are not as unhappy as they are now.

Thank you both for the RFC and the initial feedback. Following are
the initial results for the RFC as is:

   ==================================================================
   Test          : hackbench
   Units         : Normalized time in seconds
   Interpretation: Lower is better
   Statistic     : AMean
   ==================================================================
   Case:           tip[pct imp](CV)      sched_cache[pct imp](CV)
    1-groups     1.00 [ -0.00](10.12)     1.01 [ -0.89]( 2.84)
    2-groups     1.00 [ -0.00]( 6.92)     1.83 [-83.15]( 1.61)
    4-groups     1.00 [ -0.00]( 3.14)     3.00 [-200.21]( 3.13)
    8-groups     1.00 [ -0.00]( 1.35)     3.44 [-243.75]( 2.20)
   16-groups     1.00 [ -0.00]( 1.32)     2.59 [-158.98]( 4.29)


   ==================================================================
   Test          : tbench
   Units         : Normalized throughput
   Interpretation: Higher is better
   Statistic     : AMean
   ==================================================================
   Clients:    tip[pct imp](CV)     sched_cache[pct imp](CV)
       1     1.00 [  0.00]( 0.43)     0.96 [ -3.54]( 0.56)
       2     1.00 [  0.00]( 0.58)     0.99 [ -1.32]( 1.40)
       4     1.00 [  0.00]( 0.54)     0.98 [ -2.34]( 0.78)
       8     1.00 [  0.00]( 0.49)     0.96 [ -3.91]( 0.54)
      16     1.00 [  0.00]( 1.06)     0.97 [ -3.22]( 1.82)
      32     1.00 [  0.00]( 1.27)     0.95 [ -4.74]( 2.05)
      64     1.00 [  0.00]( 1.54)     0.93 [ -6.65]( 0.63)
     128     1.00 [  0.00]( 0.38)     0.93 [ -6.91]( 1.18)
     256     1.00 [  0.00]( 1.85)     0.99 [ -0.50]( 1.34)
     512     1.00 [  0.00]( 0.31)     0.98 [ -2.47]( 0.14)
    1024     1.00 [  0.00]( 0.19)     0.97 [ -3.06]( 0.39)


   ==================================================================
   Test          : stream-10
   Units         : Normalized Bandwidth, MB/s
   Interpretation: Higher is better
   Statistic     : HMean
   ==================================================================
   Test:       tip[pct imp](CV)     sched_cache[pct imp](CV)
    Copy     1.00 [  0.00](11.31)     0.34 [-65.89](72.77)
   Scale     1.00 [  0.00]( 6.62)     0.32 [-68.09](72.49)
     Add     1.00 [  0.00]( 7.06)     0.34 [-65.56](70.56)
   Triad     1.00 [  0.00]( 8.91)     0.34 [-66.47](72.70)


   ==================================================================
   Test          : stream-100
   Units         : Normalized Bandwidth, MB/s
   Interpretation: Higher is better
   Statistic     : HMean
   ==================================================================
   Test:       tip[pct imp](CV)     sched_cache[pct imp](CV)
    Copy     1.00 [  0.00]( 2.01)     0.83 [-16.96](24.55)
   Scale     1.00 [  0.00]( 1.49)     0.79 [-21.40](24.10)
     Add     1.00 [  0.00]( 2.67)     0.79 [-21.33](25.39)
   Triad     1.00 [  0.00]( 2.19)     0.81 [-19.19](25.55)


   ==================================================================
   Test          : netperf
   Units         : Normalized Throughput
   Interpretation: Higher is better
   Statistic     : AMean
   ==================================================================
   Clients:         tip[pct imp](CV)     sched_cache[pct imp](CV)
    1-clients     1.00 [  0.00]( 1.43)     0.98 [ -2.22]( 0.26)
    2-clients     1.00 [  0.00]( 1.02)     0.97 [ -2.55]( 0.89)
    4-clients     1.00 [  0.00]( 0.83)     0.98 [ -2.27]( 0.46)
    8-clients     1.00 [  0.00]( 0.73)     0.98 [ -2.45]( 0.80)
   16-clients     1.00 [  0.00]( 0.97)     0.97 [ -2.90]( 0.88)
   32-clients     1.00 [  0.00]( 0.88)     0.95 [ -5.29]( 1.69)
   64-clients     1.00 [  0.00]( 1.49)     0.91 [ -8.70]( 1.95)
   128-clients    1.00 [  0.00]( 1.05)     0.92 [ -8.39]( 4.25)
   256-clients    1.00 [  0.00]( 3.85)     0.92 [ -8.33]( 2.45)
   512-clients    1.00 [  0.00](59.63)     0.92 [ -7.83](51.19)


   ==================================================================
   Test          : schbench
   Units         : Normalized 99th percentile latency in us
   Interpretation: Lower is better
   Statistic     : Median
   ==================================================================
   #workers: tip[pct imp](CV)       sched_cache[pct imp](CV)
     1     1.00 [ -0.00]( 6.67)      0.38 [ 62.22]    ( 5.88)
     2     1.00 [ -0.00](10.18)      0.43 [ 56.52]    ( 2.94)
     4     1.00 [ -0.00]( 4.49)      0.60 [ 40.43]    ( 5.52)
     8     1.00 [ -0.00]( 6.68)    113.96 [-11296.23] (12.91)
    16     1.00 [ -0.00]( 1.87)    359.34 [-35834.43] (20.02)
    32     1.00 [ -0.00]( 4.01)    217.67 [-21667.03] ( 5.48)
    64     1.00 [ -0.00]( 3.21)     97.43 [-9643.02]  ( 4.61)
   128     1.00 [ -0.00](44.13)     41.36 [-4036.10]  ( 6.92)
   256     1.00 [ -0.00](14.46)      2.69 [-169.31]   ( 1.86)
   512     1.00 [ -0.00]( 1.95)      1.89 [-89.22]    ( 2.24)


   ==================================================================
   Test          : new-schbench-requests-per-second
   Units         : Normalized Requests per second
   Interpretation: Higher is better
   Statistic     : Median
   ==================================================================
   #workers: tip[pct imp](CV)      sched_cache[pct imp](CV)
     1     1.00 [  0.00]( 0.46)     0.96 [ -4.14]( 0.00)
     2     1.00 [  0.00]( 0.15)     0.95 [ -5.27]( 2.29)
     4     1.00 [  0.00]( 0.15)     0.88 [-12.01]( 0.46)
     8     1.00 [  0.00]( 0.15)     0.55 [-45.47]( 1.23)
    16     1.00 [  0.00]( 0.00)     0.54 [-45.62]( 0.50)
    32     1.00 [  0.00]( 3.40)     0.63 [-37.48]( 6.37)
    64     1.00 [  0.00]( 7.09)     0.67 [-32.73]( 0.59)
   128     1.00 [  0.00]( 0.00)     0.99 [ -0.76]( 0.34)
   256     1.00 [  0.00]( 1.12)     1.06 [  6.32]( 1.55)
   512     1.00 [  0.00]( 0.22)     1.06 [  6.08]( 0.92)


   ==================================================================
   Test          : new-schbench-wakeup-latency
   Units         : Normalized 99th percentile latency in us
   Interpretation: Lower is better
   Statistic     : Median
   ==================================================================
   #workers: tip[pct imp](CV)       sched_cache[pct imp](CV)
     1     1.00 [ -0.00](19.72)     0.85  [ 15.38]    ( 8.13)
     2     1.00 [ -0.00](15.96)     1.09  [ -9.09]    (18.20)
     4     1.00 [ -0.00]( 3.87)     1.00  [ -0.00]    ( 0.00)
     8     1.00 [ -0.00]( 8.15)    118.17 [-11716.67] ( 0.58)
    16     1.00 [ -0.00]( 3.87)    146.62 [-14561.54] ( 4.64)
    32     1.00 [ -0.00](12.99)    141.60 [-14060.00] ( 5.64)
    64     1.00 [ -0.00]( 6.20)    78.62  [-7762.50]  ( 1.79)
   128     1.00 [ -0.00]( 0.96)    11.36  [-1036.08]  ( 3.41)
   256     1.00 [ -0.00]( 2.76)     1.11  [-11.22]    ( 3.28)
   512     1.00 [ -0.00]( 0.20)     1.21  [-20.81]    ( 0.91)


   ==================================================================
   Test          : new-schbench-request-latency
   Units         : Normalized 99th percentile latency in us
   Interpretation: Lower is better
   Statistic     : Median
   ==================================================================
   #workers: tip[pct imp](CV)      sched_cache[pct imp](CV)
     1     1.00 [ -0.00]( 1.07)     1.11 [-10.66]  ( 2.76)
     2     1.00 [ -0.00]( 0.14)     1.20 [-20.40]  ( 1.73)
     4     1.00 [ -0.00]( 1.39)     2.04 [-104.20] ( 0.96)
     8     1.00 [ -0.00]( 0.36)     3.94 [-294.20] ( 2.85)
    16     1.00 [ -0.00]( 1.18)     4.56 [-356.16] ( 1.19)
    32     1.00 [ -0.00]( 8.42)     3.02 [-201.67] ( 8.93)
    64     1.00 [ -0.00]( 4.85)     1.51 [-51.38]  ( 0.80)
   128     1.00 [ -0.00]( 0.28)     1.83 [-82.77]  ( 1.21)
   256     1.00 [ -0.00](10.52)     1.43 [-43.11]  (10.67)
   512     1.00 [ -0.00]( 0.69)     1.25 [-24.96]  ( 6.24)


   ==================================================================
   Test          : Various longer running benchmarks
   Units         : %diff in throughput reported
   Interpretation: Higher is better
   Statistic     : Median
   ==================================================================
   Benchmarks:                 %diff
   ycsb-cassandra             -10.70%
   ycsb-mongodb               -13.66%

   deathstarbench-1x           13.87%
   deathstarbench-2x            1.70%
   deathstarbench-3x           -8.44%
   deathstarbench-6x           -3.12%

   hammerdb+mysql 16VU        -33.50%
   hammerdb+mysql 64VU        -33.22%

---

I'm planning on taking hackbench and schbench as the two extreme cases
for throughput and tail latency, and later looking at Stream from a "high
bandwidth, don't consolidate" standpoint. I hope that once those cases
aren't as deep in the red, the larger benchmarks will be happier too.

-- 
Thanks and Regards,
Prateek
Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Chen, Yu C 8 months, 3 weeks ago
Hi Prateek,

On 3/26/2025 2:18 PM, K Prateek Nayak wrote:
> Hello Peter, Chenyu,
> 
> On 3/26/2025 12:14 AM, Peter Zijlstra wrote:
>> On Tue, Mar 25, 2025 at 11:19:52PM +0800, Chen, Yu C wrote:
>>>
>>> Hi Peter,
>>>
>>> Thanks for sending this out,
>>>
>>> On 3/25/2025 8:09 PM, Peter Zijlstra wrote:
>>>> Hi all,
>>>>
>>>> One of the many things on the eternal todo list has been finishing the
>>>> below hackery.
>>>>
>>>> It is an attempt at modelling cache affinity -- and while the patch
>>>> really only targets LLC, it could very well be extended to also 
>>>> apply to
>>>> clusters (L2). Specifically any case of multiple cache domains inside a
>>>> node.
>>>>
>>>> Anyway, I wrote this about a year ago, and I mentioned this at the
>>>> recent OSPM conf where Gautham and Prateek expressed interest in 
>>>> playing
>>>> with this code.
>>>>
>>>> So here goes, very rough and largely unproven code ahead :-)
>>>>
>>>> It applies to current tip/master, but I know it will fail the __percpu
>>>> validation that sits in -next, although that shouldn't be terribly hard
>>>> to fix up.
>>>>
>>>> As is, it only computes a CPU inside the LLC that has the highest 
>>>> recent
>>>> runtime, this CPU is then used in the wake-up path to steer towards 
>>>> this
>>>> LLC and in task_hot() to limit migrations away from it.
>>>>
>>>> More elaborate things could be done, notably there is an XXX in there
>>>> somewhere about finding the best LLC inside a NODE (interaction with
>>>> NUMA_BALANCING).
>>>>
>>>
>>> Besides the control provided by CONFIG_SCHED_CACHE, could we also 
>>> introduce
>>> sched_feat(SCHED_CACHE) to manage this feature, facilitating dynamic
>>> adjustments? Similarly we can also introduce other sched_feats for load
>>> balancing and NUMA balancing for fine-grain control.
>>
>> We can do all sorts, but the very first thing is determining if this is
>> worth it at all. Because if we can't make this work at all, all those
>> things are a waste of time.
>>
>> This patch is not meant to be merged, it is meant for testing and
>> development. We need to first make it actually improve workloads. If it
>> then turns out it regresses workloads (likely, things always do), then
>> we can look at how to best do that.
>>
> 
> Thank you for sharing the patch, and thanks to Chenyu for the initial
> review pointing out issues that need fixing. I'll try to take a good
> look at it this week and see if I can improve some trivial benchmarks
> that currently regress with the RFC as is.
> 
> In its current form I think this suffers from the same problem as
> SIS_NODE, where wakeups are redirected to the same set of CPUs and a
> good deal of additional work is done without any benefit.
> 
> I'll leave the results from my initial testing on the 3rd Generation
> EPYC platform below and will evaluate what is making the benchmarks
> unhappy. I'll return with more data when some of these benchmarks
> are not as unhappy as they are now.
> 
> Thank you both for the RFC and the initial feedback. Following are
> the initial results for the RFC as is:
> 
>    ==================================================================
>    Test          : hackbench
>    Units         : Normalized time in seconds
>    Interpretation: Lower is better
>    Statistic     : AMean
>    ==================================================================
>    Case:           tip[pct imp](CV)      sched_cache[pct imp](CV)
>     1-groups     1.00 [ -0.00](10.12)     1.01 [ -0.89]( 2.84)
>     2-groups     1.00 [ -0.00]( 6.92)     1.83 [-83.15]( 1.61)
>     4-groups     1.00 [ -0.00]( 3.14)     3.00 [-200.21]( 3.13)
>     8-groups     1.00 [ -0.00]( 1.35)     3.44 [-243.75]( 2.20)
>    16-groups     1.00 [ -0.00]( 1.32)     2.59 [-158.98]( 4.29)
> 
> 
>    ==================================================================
>    Test          : tbench
>    Units         : Normalized throughput
>    Interpretation: Higher is better
>    Statistic     : AMean
>    ==================================================================
>    Clients:    tip[pct imp](CV)     sched_cache[pct imp](CV)
>        1     1.00 [  0.00]( 0.43)     0.96 [ -3.54]( 0.56)
>        2     1.00 [  0.00]( 0.58)     0.99 [ -1.32]( 1.40)
>        4     1.00 [  0.00]( 0.54)     0.98 [ -2.34]( 0.78)
>        8     1.00 [  0.00]( 0.49)     0.96 [ -3.91]( 0.54)
>       16     1.00 [  0.00]( 1.06)     0.97 [ -3.22]( 1.82)
>       32     1.00 [  0.00]( 1.27)     0.95 [ -4.74]( 2.05)
>       64     1.00 [  0.00]( 1.54)     0.93 [ -6.65]( 0.63)
>      128     1.00 [  0.00]( 0.38)     0.93 [ -6.91]( 1.18)
>      256     1.00 [  0.00]( 1.85)     0.99 [ -0.50]( 1.34)
>      512     1.00 [  0.00]( 0.31)     0.98 [ -2.47]( 0.14)
>     1024     1.00 [  0.00]( 0.19)     0.97 [ -3.06]( 0.39)
> 
> 
>    ==================================================================
>    Test          : stream-10
>    Units         : Normalized Bandwidth, MB/s
>    Interpretation: Higher is better
>    Statistic     : HMean
>    ==================================================================
>    Test:       tip[pct imp](CV)     sched_cache[pct imp](CV)
>     Copy     1.00 [  0.00](11.31)     0.34 [-65.89](72.77)
>    Scale     1.00 [  0.00]( 6.62)     0.32 [-68.09](72.49)
>      Add     1.00 [  0.00]( 7.06)     0.34 [-65.56](70.56)
>    Triad     1.00 [  0.00]( 8.91)     0.34 [-66.47](72.70)
> 
> 
>    ==================================================================
>    Test          : stream-100
>    Units         : Normalized Bandwidth, MB/s
>    Interpretation: Higher is better
>    Statistic     : HMean
>    ==================================================================
>    Test:       tip[pct imp](CV)     sched_cache[pct imp](CV)
>     Copy     1.00 [  0.00]( 2.01)     0.83 [-16.96](24.55)
>    Scale     1.00 [  0.00]( 1.49)     0.79 [-21.40](24.10)
>      Add     1.00 [  0.00]( 2.67)     0.79 [-21.33](25.39)
>    Triad     1.00 [  0.00]( 2.19)     0.81 [-19.19](25.55)
> 
> 
>    ==================================================================
>    Test          : netperf
>    Units         : Normalized Throughput
>    Interpretation: Higher is better
>    Statistic     : AMean
>    ==================================================================
>    Clients:         tip[pct imp](CV)     sched_cache[pct imp](CV)
>     1-clients     1.00 [  0.00]( 1.43)     0.98 [ -2.22]( 0.26)
>     2-clients     1.00 [  0.00]( 1.02)     0.97 [ -2.55]( 0.89)
>     4-clients     1.00 [  0.00]( 0.83)     0.98 [ -2.27]( 0.46)
>     8-clients     1.00 [  0.00]( 0.73)     0.98 [ -2.45]( 0.80)
>    16-clients     1.00 [  0.00]( 0.97)     0.97 [ -2.90]( 0.88)
>    32-clients     1.00 [  0.00]( 0.88)     0.95 [ -5.29]( 1.69)
>    64-clients     1.00 [  0.00]( 1.49)     0.91 [ -8.70]( 1.95)
>    128-clients    1.00 [  0.00]( 1.05)     0.92 [ -8.39]( 4.25)
>    256-clients    1.00 [  0.00]( 3.85)     0.92 [ -8.33]( 2.45)
>    512-clients    1.00 [  0.00](59.63)     0.92 [ -7.83](51.19)
> 
> 
>    ==================================================================
>    Test          : schbench
>    Units         : Normalized 99th percentile latency in us
>    Interpretation: Lower is better
>    Statistic     : Median
>    ==================================================================
>    #workers: tip[pct imp](CV)       sched_cache[pct imp](CV)
>      1     1.00 [ -0.00]( 6.67)      0.38 [ 62.22]    ( 5.88)
>      2     1.00 [ -0.00](10.18)      0.43 [ 56.52]    ( 2.94)
>      4     1.00 [ -0.00]( 4.49)      0.60 [ 40.43]    ( 5.52)
>      8     1.00 [ -0.00]( 6.68)    113.96 [-11296.23] (12.91)
>     16     1.00 [ -0.00]( 1.87)    359.34 [-35834.43] (20.02)
>     32     1.00 [ -0.00]( 4.01)    217.67 [-21667.03] ( 5.48)
>     64     1.00 [ -0.00]( 3.21)     97.43 [-9643.02]  ( 4.61)
>    128     1.00 [ -0.00](44.13)     41.36 [-4036.10]  ( 6.92)
>    256     1.00 [ -0.00](14.46)      2.69 [-169.31]   ( 1.86)
>    512     1.00 [ -0.00]( 1.95)      1.89 [-89.22]    ( 2.24)
> 
> 
>    ==================================================================
>    Test          : new-schbench-requests-per-second
>    Units         : Normalized Requests per second
>    Interpretation: Higher is better
>    Statistic     : Median
>    ==================================================================
>    #workers: tip[pct imp](CV)      sched_cache[pct imp](CV)
>      1     1.00 [  0.00]( 0.46)     0.96 [ -4.14]( 0.00)
>      2     1.00 [  0.00]( 0.15)     0.95 [ -5.27]( 2.29)
>      4     1.00 [  0.00]( 0.15)     0.88 [-12.01]( 0.46)
>      8     1.00 [  0.00]( 0.15)     0.55 [-45.47]( 1.23)
>     16     1.00 [  0.00]( 0.00)     0.54 [-45.62]( 0.50)
>     32     1.00 [  0.00]( 3.40)     0.63 [-37.48]( 6.37)
>     64     1.00 [  0.00]( 7.09)     0.67 [-32.73]( 0.59)
>    128     1.00 [  0.00]( 0.00)     0.99 [ -0.76]( 0.34)
>    256     1.00 [  0.00]( 1.12)     1.06 [  6.32]( 1.55)
>    512     1.00 [  0.00]( 0.22)     1.06 [  6.08]( 0.92)
> 
> 
>    ==================================================================
>    Test          : new-schbench-wakeup-latency
>    Units         : Normalized 99th percentile latency in us
>    Interpretation: Lower is better
>    Statistic     : Median
>    ==================================================================
>    #workers: tip[pct imp](CV)       sched_cache[pct imp](CV)
>      1     1.00 [ -0.00](19.72)     0.85  [ 15.38]    ( 8.13)
>      2     1.00 [ -0.00](15.96)     1.09  [ -9.09]    (18.20)
>      4     1.00 [ -0.00]( 3.87)     1.00  [ -0.00]    ( 0.00)
>      8     1.00 [ -0.00]( 8.15)    118.17 [-11716.67] ( 0.58)
>     16     1.00 [ -0.00]( 3.87)    146.62 [-14561.54] ( 4.64)
>     32     1.00 [ -0.00](12.99)    141.60 [-14060.00] ( 5.64)
>     64     1.00 [ -0.00]( 6.20)    78.62  [-7762.50]  ( 1.79)
>    128     1.00 [ -0.00]( 0.96)    11.36  [-1036.08]  ( 3.41)
>    256     1.00 [ -0.00]( 2.76)     1.11  [-11.22]    ( 3.28)
>    512     1.00 [ -0.00]( 0.20)     1.21  [-20.81]    ( 0.91)
> 
> 
>    ==================================================================
>    Test          : new-schbench-request-latency
>    Units         : Normalized 99th percentile latency in us
>    Interpretation: Lower is better
>    Statistic     : Median
>    ==================================================================
>    #workers: tip[pct imp](CV)      sched_cache[pct imp](CV)
>      1     1.00 [ -0.00]( 1.07)     1.11 [-10.66]  ( 2.76)
>      2     1.00 [ -0.00]( 0.14)     1.20 [-20.40]  ( 1.73)
>      4     1.00 [ -0.00]( 1.39)     2.04 [-104.20] ( 0.96)
>      8     1.00 [ -0.00]( 0.36)     3.94 [-294.20] ( 2.85)
>     16     1.00 [ -0.00]( 1.18)     4.56 [-356.16] ( 1.19)
>     32     1.00 [ -0.00]( 8.42)     3.02 [-201.67] ( 8.93)
>     64     1.00 [ -0.00]( 4.85)     1.51 [-51.38]  ( 0.80)
>    128     1.00 [ -0.00]( 0.28)     1.83 [-82.77]  ( 1.21)
>    256     1.00 [ -0.00](10.52)     1.43 [-43.11]  (10.67)
>    512     1.00 [ -0.00]( 0.69)     1.25 [-24.96]  ( 6.24)
> 
> 
>    ==================================================================
>    Test          : Various longer running benchmarks
>    Units         : %diff in throughput reported
>    Interpretation: Higher is better
>    Statistic     : Median
>    ==================================================================
>    Benchmarks:                 %diff
>    ycsb-cassandra             -10.70%
>    ycsb-mongodb               -13.66%
> 
>    deathstarbench-1x           13.87%
>    deathstarbench-2x            1.70%
>    deathstarbench-3x           -8.44%
>    deathstarbench-6x           -3.12%
> 
>    hammerdb+mysql 16VU        -33.50%
>    hammerdb+mysql 64VU        -33.22%
> 
> ---
> 
> I'm planning on taking hackbench and schbench as the two extreme cases
> for throughput and tail latency, and later looking at Stream from a
> "high bandwidth, don't consolidate" standpoint. I hope that once those
> cases aren't as deep in the red, the larger benchmarks will be happier
> too.
> 

Thanks for running the test. I think hackbench/schbench would be good
benchmarks to start with. I remember that you and Gautham mentioned at
LPC 2021 or 2022 that schbench prefers to be aggregated in a single LLC.
I ran a schbench test using mmtests on a Xeon server that has 4 NUMA
nodes, each with 80 cores (SMT disabled). The numa=off option was
appended to the boot command line, so the 4 "LLCs" all end up within a
single node.


                                    BASELINE            SCHED_CACHE
Lat 50.0th-qrtle-1          8.00 (   0.00%)        5.00 (  37.50%)
Lat 90.0th-qrtle-1          9.00 (   0.00%)        5.00 (  44.44%)
Lat 99.0th-qrtle-1         13.00 (   0.00%)       10.00 (  23.08%)
Lat 99.9th-qrtle-1         21.00 (   0.00%)       19.00 (   9.52%)*
Lat 20.0th-qrtle-1        404.00 (   0.00%)      411.00 (  -1.73%)
Lat 50.0th-qrtle-2          8.00 (   0.00%)        5.00 (  37.50%)
Lat 90.0th-qrtle-2         11.00 (   0.00%)        8.00 (  27.27%)
Lat 99.0th-qrtle-2         16.00 (   0.00%)       11.00 (  31.25%)
Lat 99.9th-qrtle-2         27.00 (   0.00%)       17.00 (  37.04%)*
Lat 20.0th-qrtle-2        823.00 (   0.00%)      821.00 (   0.24%)
Lat 50.0th-qrtle-4         10.00 (   0.00%)        5.00 (  50.00%)
Lat 90.0th-qrtle-4         12.00 (   0.00%)        6.00 (  50.00%)
Lat 99.0th-qrtle-4         18.00 (   0.00%)        9.00 (  50.00%)
Lat 99.9th-qrtle-4         29.00 (   0.00%)       16.00 (  44.83%)*
Lat 20.0th-qrtle-4       1650.00 (   0.00%)     1598.00 (   3.15%)
Lat 50.0th-qrtle-8          9.00 (   0.00%)        4.00 (  55.56%)
Lat 90.0th-qrtle-8         11.00 (   0.00%)        6.00 (  45.45%)
Lat 99.0th-qrtle-8         16.00 (   0.00%)        9.00 (  43.75%)
Lat 99.9th-qrtle-8         28.00 (   0.00%)      188.00 (-571.43%)*
Lat 20.0th-qrtle-8       3316.00 (   0.00%)     3100.00 (   6.51%)
Lat 50.0th-qrtle-16        10.00 (   0.00%)        5.00 (  50.00%)
Lat 90.0th-qrtle-16        13.00 (   0.00%)        7.00 (  46.15%)
Lat 99.0th-qrtle-16        19.00 (   0.00%)       12.00 (  36.84%)
Lat 99.9th-qrtle-16        28.00 (   0.00%)     2034.00 (-7164.29%)*
Lat 20.0th-qrtle-16      6632.00 (   0.00%)     5800.00 (  12.55%)
Lat 50.0th-qrtle-32         7.00 (   0.00%)       12.00 ( -71.43%)
Lat 90.0th-qrtle-32        10.00 (   0.00%)       62.00 (-520.00%)
Lat 99.0th-qrtle-32        14.00 (   0.00%)      841.00 (-5907.14%)
Lat 99.9th-qrtle-32        23.00 (   0.00%)     1862.00 (-7995.65%)*
Lat 20.0th-qrtle-32     13264.00 (   0.00%)    10608.00 (  20.02%)
Lat 50.0th-qrtle-64         7.00 (   0.00%)       64.00 (-814.29%)
Lat 90.0th-qrtle-64        12.00 (   0.00%)      709.00 (-5808.33%)
Lat 99.0th-qrtle-64        18.00 (   0.00%)     2260.00 (-12455.56%)
Lat 99.9th-qrtle-64        26.00 (   0.00%)     3572.00 (-13638.46%)*
Lat 20.0th-qrtle-64     26528.00 (   0.00%)    14064.00 (  46.98%)
Lat 50.0th-qrtle-128        7.00 (   0.00%)      115.00 (-1542.86%)
Lat 90.0th-qrtle-128       11.00 (   0.00%)     1626.00 (-14681.82%)
Lat 99.0th-qrtle-128       17.00 (   0.00%)     4472.00 (-26205.88%)
Lat 99.9th-qrtle-128       27.00 (   0.00%)     8088.00 (-29855.56%)*
Lat 20.0th-qrtle-128    53184.00 (   0.00%)    17312.00 (  67.45%)
Lat 50.0th-qrtle-256      172.00 (   0.00%)      255.00 ( -48.26%)
Lat 90.0th-qrtle-256     2092.00 (   0.00%)     1482.00 (  29.16%)
Lat 99.0th-qrtle-256     2684.00 (   0.00%)     3148.00 ( -17.29%)
Lat 99.9th-qrtle-256     4504.00 (   0.00%)     6008.00 ( -33.39%)*
Lat 20.0th-qrtle-256    53056.00 (   0.00%)    48064.00 (   9.41%)
Lat 50.0th-qrtle-319      375.00 (   0.00%)      478.00 ( -27.47%)
Lat 90.0th-qrtle-319     2420.00 (   0.00%)     2244.00 (   7.27%)
Lat 99.0th-qrtle-319     4552.00 (   0.00%)     4456.00 (   2.11%)
Lat 99.9th-qrtle-319     6072.00 (   0.00%)     7656.00 ( -26.09%)*
Lat 20.0th-qrtle-319    47936.00 (   0.00%)    47808.00 (   0.27%)

We can see that when the system is under-loaded, the 99.9th percentile
wakeup latency improves. But when the system gets busier, say from 8 to
319 worker threads, the wakeup latency suffers.

The following change could mitigate the issue; it is intended to avoid
needless migration (stay on the previous CPU if it already shares cache
with the preferred one) and task stacking (return some CPU in the
preferred LLC rather than always the single hottest one):

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cddd67100a91..a492463aed71 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8801,6 +8801,7 @@ static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int
  static int select_cache_cpu(struct task_struct *p, int prev_cpu)
  {
         struct mm_struct *mm = p->mm;
+       struct sched_domain *sd;
         int cpu;

         if (!sched_feat(SCHED_CACHE))
@@ -8813,6 +8814,8 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu)
         if (cpu < 0)
                 return prev_cpu;

+       if (cpus_share_cache(prev_cpu, cpu))
+               return prev_cpu;

         if (static_branch_likely(&sched_numa_balancing) &&
             __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) {
@@ -8822,6 +8825,10 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu)
                 return prev_cpu;
         }

+       sd = rcu_dereference(per_cpu(sd_llc, cpu));
+       if (likely(sd))
+               return cpumask_any(sched_domain_span(sd));
+
         return cpu;
  }

                                 BASELINE_sc         SCHED_CACHE_sc
Lat 50.0th-qrtle-1          5.00 (   0.00%)        5.00 (   0.00%)
Lat 90.0th-qrtle-1          8.00 (   0.00%)        5.00 (  37.50%)
Lat 99.0th-qrtle-1         10.00 (   0.00%)       10.00 (   0.00%)
Lat 99.9th-qrtle-1         20.00 (   0.00%)       20.00 (   0.00%)*
Lat 20.0th-qrtle-1        409.00 (   0.00%)      406.00 (   0.73%)
Lat 50.0th-qrtle-2          8.00 (   0.00%)        4.00 (  50.00%)
Lat 90.0th-qrtle-2         11.00 (   0.00%)        5.00 (  54.55%)
Lat 99.0th-qrtle-2         16.00 (   0.00%)       11.00 (  31.25%)
Lat 99.9th-qrtle-2         29.00 (   0.00%)       16.00 (  44.83%)*
Lat 20.0th-qrtle-2        819.00 (   0.00%)      825.00 (  -0.73%)
Lat 50.0th-qrtle-4         10.00 (   0.00%)        4.00 (  60.00%)
Lat 90.0th-qrtle-4         12.00 (   0.00%)        4.00 (  66.67%)
Lat 99.0th-qrtle-4         18.00 (   0.00%)        6.00 (  66.67%)
Lat 99.9th-qrtle-4         30.00 (   0.00%)       15.00 (  50.00%)*
Lat 20.0th-qrtle-4       1658.00 (   0.00%)     1670.00 (  -0.72%)
Lat 50.0th-qrtle-8          9.00 (   0.00%)        3.00 (  66.67%)
Lat 90.0th-qrtle-8         11.00 (   0.00%)        4.00 (  63.64%)
Lat 99.0th-qrtle-8         16.00 (   0.00%)        6.00 (  62.50%)
Lat 99.9th-qrtle-8         29.00 (   0.00%)       13.00 (  55.17%)*
Lat 20.0th-qrtle-8       3308.00 (   0.00%)     3340.00 (  -0.97%)
Lat 50.0th-qrtle-16         9.00 (   0.00%)        4.00 (  55.56%)
Lat 90.0th-qrtle-16        12.00 (   0.00%)        4.00 (  66.67%)
Lat 99.0th-qrtle-16        18.00 (   0.00%)        6.00 (  66.67%)
Lat 99.9th-qrtle-16        31.00 (   0.00%)       12.00 (  61.29%)*
Lat 20.0th-qrtle-16      6616.00 (   0.00%)     6680.00 (  -0.97%)
Lat 50.0th-qrtle-32         8.00 (   0.00%)        4.00 (  50.00%)
Lat 90.0th-qrtle-32        11.00 (   0.00%)        5.00 (  54.55%)
Lat 99.0th-qrtle-32        17.00 (   0.00%)        8.00 (  52.94%)
Lat 99.9th-qrtle-32        27.00 (   0.00%)       11.00 (  59.26%)*
Lat 20.0th-qrtle-32     13296.00 (   0.00%)    13328.00 (  -0.24%)
Lat 50.0th-qrtle-64         9.00 (   0.00%)       46.00 (-411.11%)
Lat 90.0th-qrtle-64        14.00 (   0.00%)     1198.00 (-8457.14%)
Lat 99.0th-qrtle-64        20.00 (   0.00%)     2252.00 (-11160.00%)
Lat 99.9th-qrtle-64        31.00 (   0.00%)     2844.00 (-9074.19%)*
Lat 20.0th-qrtle-64     26528.00 (   0.00%)    15504.00 (  41.56%)
Lat 50.0th-qrtle-128        7.00 (   0.00%)       26.00 (-271.43%)
Lat 90.0th-qrtle-128       11.00 (   0.00%)     2244.00 (-20300.00%)
Lat 99.0th-qrtle-128       17.00 (   0.00%)     4488.00 (-26300.00%)
Lat 99.9th-qrtle-128       27.00 (   0.00%)     5752.00 (-21203.70%)*
Lat 20.0th-qrtle-128    53184.00 (   0.00%)    24544.00 (  53.85%)
Lat 50.0th-qrtle-256      172.00 (   0.00%)      135.00 (  21.51%)
Lat 90.0th-qrtle-256     2084.00 (   0.00%)     2022.00 (   2.98%)
Lat 99.0th-qrtle-256     2780.00 (   0.00%)     3908.00 ( -40.58%)
Lat 99.9th-qrtle-256     4536.00 (   0.00%)     5832.00 ( -28.57%)*
Lat 20.0th-qrtle-256    53568.00 (   0.00%)    51904.00 (   3.11%)
Lat 50.0th-qrtle-319      369.00 (   0.00%)      358.00 (   2.98%)
Lat 90.0th-qrtle-319     2428.00 (   0.00%)     2436.00 (  -0.33%)
Lat 99.0th-qrtle-319     4552.00 (   0.00%)     4664.00 (  -2.46%)
Lat 99.9th-qrtle-319     6104.00 (   0.00%)     6632.00 (  -8.65%)*
Lat 20.0th-qrtle-319    48192.00 (   0.00%)    48832.00 (  -1.33%)


With this change we can see the wakeup latency improve across a wider
range of thread counts. But there is still a regression starting from 64
threads - maybe the benefit of LLC locality is offset by task stacking
within one LLC. One possible direction I'm thinking of is to take a
snapshot of the LLC status in the load balancer (which runs much less
frequently than the wakeup path), check whether the LLC is overloaded,
and if it is, skip the LLC aggregation during task wakeup.
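
Something like the below is what I have in mind. It is a completely
untested sketch: llc_overloaded and update_llc_overloaded() are made-up
names, and the flag would probably need to live in struct
sched_domain_shared so the wakeup path can read it locklessly:

/* load-balance side: refresh the snapshot, e.g. from update_sd_lb_stats() */
static void update_llc_overloaded(struct sched_domain_shared *sds,
                                  unsigned int sum_nr_running,
                                  unsigned int llc_weight)
{
        /* "overloaded" here: more runnable tasks than CPUs in this LLC */
        WRITE_ONCE(sds->llc_overloaded, sum_nr_running > llc_weight);
}

/* wakeup side: skip the LLC aggregation when the preferred LLC is busy */
static bool llc_aggregation_allowed(int cpu)
{
        struct sched_domain_shared *sds;

        sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
        if (!sds)
                return true;

        return !READ_ONCE(sds->llc_overloaded);
}

with select_cache_cpu() doing something like:

        if (!llc_aggregation_allowed(cpu))
                return prev_cpu;

before returning a CPU in the preferred LLC.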

thanks,
Chenyu

Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Peter Zijlstra 8 months, 3 weeks ago
On Wed, Mar 26, 2025 at 05:15:24PM +0800, Chen, Yu C wrote:
> Thanks for running the test. I think hackbench/schbench would be good
> benchmarks to start with. I remember that you and Gautham mentioned at
> LPC 2021 or 2022 that schbench prefers to be aggregated in a single LLC.
> I ran a schbench test using mmtests on a Xeon server that has 4 NUMA
> nodes, each with 80 cores (SMT disabled). The numa=off option was
> appended to the boot command line, so the 4 "LLCs" all end up within a
> single node.

We really should look at getting the SnC topology even without SnC being
in use.

The sheer size of these LLCs is untenable.
Re: [RFC][PATCH] sched: Cache aware load-balancing
Posted by Chen, Yu C 8 months, 3 weeks ago
On 3/26/2025 5:42 PM, Peter Zijlstra wrote:
> On Wed, Mar 26, 2025 at 05:15:24PM +0800, Chen, Yu C wrote:
>> Thanks for running the test. I think hackbench/schbench would be good
>> benchmarks to start with. I remember that you and Gautham mentioned at
>> LPC 2021 or 2022 that schbench prefers to be aggregated in a single LLC.
>> I ran a schbench test using mmtests on a Xeon server that has 4 NUMA
>> nodes, each with 80 cores (SMT disabled). The numa=off option was
>> appended to the boot command line, so the 4 "LLCs" all end up within a
>> single node.
> 
> We really should look at getting the SnC topology even without SnC being
> in use.
> 


I agree. Unfortunately, it seems that with SNC disabled there is no
information exposed to the OS that could be used to divide the large LLC
into smaller pieces.

thanks,
Chenyu

> The sheer size of these LLCs is untenable.