sched/cache: Avoid cache-aware scheduling for memory-heavy processes

[tip: sched/core] sched/cache: Avoid cache-aware scheduling for memory-heavy processes
Posted by tip-bot2 for Chen Yu 3 weeks, 4 days ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     808915f982c2a52f5d148510ecfab52284de67cf
Gitweb:        https://git.kernel.org/tip/808915f982c2a52f5d148510ecfab52284de67cf
Author:        Chen Yu <yu.c.chen@intel.com>
AuthorDate:    Wed, 13 May 2026 13:39:16 -07:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 18 May 2026 21:33:15 +02:00

sched/cache: Avoid cache-aware scheduling for memory-heavy processes

Prateek and Tingyin reported that memory-intensive workloads (such as
stream) can saturate memory bandwidth and caches on the preferred LLC
when sched_cache aggregates too many threads.

To mitigate this, estimate a process's memory footprint by comparing
its NUMA balancing fault statistics to the size of the LLC. If the
footprint exceeds the LLC size, skip cache-aware scheduling.

Note that this footprint is only an approximation of the real memory
footprint, since the kernel lacks suitable metrics to estimate the
actual working set. A user-provided hint would be more accurate, if
one becomes available in the future. A later patch will allow users
to provide a hint to adjust this threshold.

Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: Vern Hao <vernhao@tencent.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Tingyin Duan <tingyin.duan@gmail.com>
Link: https://patch.msgid.link/95cf64a385bcc12f18dcebe9d59e8d3ba8bb318f.1778703694.git.tim.c.chen@linux.intel.com
---
 include/linux/sched.h |  1 +-
 kernel/exit.c         | 29 ++++++++++++++++++++-
 kernel/sched/fair.c   | 62 +++++++++++++++++++++++++++++++++++++++---
 3 files changed, 89 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6701911..9572967 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2425,6 +2425,7 @@ struct sched_cache_stat {
 	unsigned long epoch;
 	u64 nr_running_avg;
 	unsigned long next_scan;
+	unsigned long footprint;
 	int cpu;
 } ____cacheline_aligned_in_smp;
 
diff --git a/kernel/exit.c b/kernel/exit.c
index ede3117..77275c2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -543,6 +543,32 @@ void mm_update_next_owner(struct mm_struct *mm)
 }
 #endif /* CONFIG_MEMCG */
 
+#if defined(CONFIG_SCHED_CACHE) && defined(CONFIG_NUMA_BALANCING)
+/*
+ * Remove the exiting task's NUMA fault count (current->total_numa_faults)
+ * from mm->sc_stat.footprint. Does nothing if the task recorded no faults.
+ */
+static void exit_mm_sched_cache(struct mm_struct *mm)
+{
+	unsigned long fp, sub;
+
+	if (!current->total_numa_faults)
+		return;
+	/*
+	 * Unlocked read-modify-write, for performance reasons. Clamp the
+	 * amount subtracted to the current value so that the unsigned
+	 * mm->sc_stat.footprint cannot wrap below zero.
+	 */
+	fp = READ_ONCE(mm->sc_stat.footprint);
+	sub = min(fp, current->total_numa_faults);
+	WRITE_ONCE(mm->sc_stat.footprint, fp - sub);
+}
+#else
+static inline void exit_mm_sched_cache(struct mm_struct *mm)
+{
+}
+#endif /* CONFIG_SCHED_CACHE && CONFIG_NUMA_BALANCING */
+
 /*
  * Turn us into a lazy TLB process if we
  * aren't already..
@@ -554,6 +580,9 @@ static void exit_mm(void)
 	exit_mm_release(current, mm);
 	if (!mm)
 		return;
+
+	exit_mm_sched_cache(mm);
+
 	mmap_read_lock(mm);
 	mmgrab_lazy_tlb(mm);
 	BUG_ON(mm != current->active_mm);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df21366..a10116f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1384,6 +1384,32 @@ static int llc_id(int cpu)
 	return per_cpu(sd_llc_id, cpu);
 }
 
+static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
+{
+#ifdef CONFIG_NUMA_BALANCING
+	unsigned long llc, footprint;
+	struct sched_domain *sd;
+
+	guard(rcu)();
+
+	sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd);
+	if (!sd)
+		return true;
+
+	if (static_branch_likely(&sched_numa_balancing)) {
+		/*
+		 * Scale the fault-based footprint by PAGE_SIZE to compare it
+		 * with sd->llc_bytes. TBD: exclude reserved RDT exclusive ways.
+		 */
+		llc = sd->llc_bytes;
+		footprint = READ_ONCE(mm->sc_stat.footprint);
+
+		return (llc < (footprint * PAGE_SIZE));
+	}
+#endif
+	return false;
+}
+
 static bool invalid_llc_nr(struct mm_struct *mm, struct task_struct *p,
 			   int cpu)
 {
@@ -1463,6 +1489,7 @@ void mm_init_sched(struct mm_struct *mm,
 	mm->sc_stat.cpu = -1;
 	mm->sc_stat.next_scan = jiffies;
 	mm->sc_stat.nr_running_avg = 0;
+	mm->sc_stat.footprint = 0;
 	/*
 	 * The update to mm->sc_stat should not be reordered
 	 * before initialization to mm's other fields, in case
@@ -1585,7 +1612,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
 	 * its preferred state.
 	 */
 	if (epoch - READ_ONCE(mm->sc_stat.epoch) > EPOCH_LLC_AFFINITY_TIMEOUT ||
-	    invalid_llc_nr(mm, p, cpu_of(rq))) {
+	    invalid_llc_nr(mm, p, cpu_of(rq)) ||
+	    exceed_llc_capacity(mm, cpu_of(rq))) {
 		if (mm->sc_stat.cpu != -1)
 			mm->sc_stat.cpu = -1;
 	}
@@ -1716,7 +1744,8 @@ static void task_cache_work(struct callback_head *work)
 		return;
 
 	curr_cpu = task_cpu(p);
-	if (invalid_llc_nr(mm, p, curr_cpu)) {
+	if (invalid_llc_nr(mm, p, curr_cpu) ||
+	    exceed_llc_capacity(mm, curr_cpu)) {
 		if (mm->sc_stat.cpu != -1)
 			mm->sc_stat.cpu = -1;
 
@@ -3515,6 +3544,7 @@ static void task_numa_placement(struct task_struct *p)
 	unsigned long total_faults;
 	u64 runtime, period;
 	spinlock_t *group_lock = NULL;
+	long __maybe_unused new_fp;
 	struct numa_group *ng;
 
 	/*
@@ -3589,6 +3619,31 @@ static void task_numa_placement(struct task_struct *p)
 				ng->total_faults += diff;
 				group_faults += ng->faults[mem_idx];
 			}
+#ifdef CONFIG_SCHED_CACHE
+			/*
+			 * Per task p->numa_faults[mem_idx] converges,
+			 * so the accumulation of each task's faults
+			 * converges too - Given the number of threads,
+			 * it cannot overflow an unsigned long.
+			 * Racy with concurrent updates from other threads
+			 * sharing this mm. Acceptable since footprint is a
+			 * heuristic and occasional lost updates are tolerable.
+			 *
+			 * If a task exits, its corresponding footprint must
+			 * be subtracted from the mm->sc_stat.footprint, otherwise
+			 * the mm->sc_stat.footprint will not converge:
+			 * the exiting thread's footprint remains unchanged/undecayed
+			 * in mm->sc_stat.footprint. See exit_mm().
+			 *
+			 * Lost updates and unsynchronized subtraction
+			 * in exit_mm() can cause footprint + diff to
+			 * go negative. Clamp to zero to prevent the
+			 * unsigned footprint from wrapping.
+			 */
+			new_fp = (long)READ_ONCE(p->mm->sc_stat.footprint) + diff;
+			WRITE_ONCE(p->mm->sc_stat.footprint,
+				   max(new_fp, 0L));
+#endif
 		}
 
 		if (!ng) {
@@ -10338,7 +10393,8 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
 		return mig_unrestricted;
 
 	/* skip cache aware load balance for too many threads */
-	if (invalid_llc_nr(mm, p, dst_cpu)) {
+	if (invalid_llc_nr(mm, p, dst_cpu) ||
+	    exceed_llc_capacity(mm, dst_cpu)) {
 		if (mm->sc_stat.cpu != -1)
 			mm->sc_stat.cpu = -1;
 		return mig_unrestricted;