From: Chen Yu <yu.c.chen@intel.com>
Introduce a set of debugfs knobs to enable/disable cache-aware
load balancing and to tune its parameters.
(1) llc_enabled
llc_enabled acts as the primary switch: users can toggle it to
enable or disable cache-aware load balancing.
(2) llc_aggr_tolerance
With sched_cache enabled, the scheduler uses a process's RSS as a
proxy for its LLC footprint to determine if aggregating tasks on the
preferred LLC could cause cache contention. If RSS exceeds the LLC
size, aggregation is skipped. Some workloads with large RSS but small
actual memory footprints may still benefit from aggregation. Since
the kernel cannot efficiently track per-task cache usage (resctrl is
user-space only), userspace can provide a more accurate hint.
Introduce /sys/kernel/debug/sched/llc_aggr_tolerance to let
users control how strictly RSS limits aggregation. Values range from
0 to 100:
- 0: Cache-aware scheduling is disabled.
- 1: Strict; tasks with RSS larger than LLC size are skipped.
- 100: Aggressive; tasks are aggregated regardless of RSS.
For example, with a 32MB L3 cache:
- llc_aggr_tolerance=1 -> tasks with RSS > 32MB are skipped.
- llc_aggr_tolerance=99 -> tasks with RSS > 784GB are skipped
(784GB = (1 + (99 - 1) * 256) * 32MB).
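To make the scaling concrete, here is a minimal user-space model of
the RSS check described above (purely illustrative: simplified names
and 64-bit math, not the kernel implementation):

#include <stdbool.h>
#include <stdint.h>

/* Modelled on get_sched_cache_scale(): 0 disables, 100 means "no limit". */
static uint64_t rss_scale(unsigned int tolerance)
{
	if (!tolerance)
		return 0;
	if (tolerance == 100)
		return UINT64_MAX;
	return 1 + (uint64_t)(tolerance - 1) * 256;
}

/* true: the process is considered too big to aggregate on one LLC */
static bool exceeds_llc_capacity(uint64_t rss_bytes, uint64_t llc_bytes,
				 unsigned int tolerance)
{
	uint64_t scale = rss_scale(tolerance);

	if (scale == UINT64_MAX)
		return false;	/* aggregate regardless of RSS */
	return llc_bytes * scale <= rss_bytes;
}

With a 32MB LLC this gives a 32MB cutoff for llc_aggr_tolerance=1 and a
(1 + 98 * 256) * 32MB, i.e. ~784GB, cutoff for llc_aggr_tolerance=99,
matching the numbers above.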
Similarly, /sys/kernel/debug/sched/llc_aggr_tolerance also controls
how strictly the number of active threads is considered when doing
cache-aware load balancing. The SMT count is taken into account as
well: high SMT counts reduce the aggregation capacity, preventing
excessive task aggregation on SMT-heavy systems like Power10/Power11.
For example, with 8 cores/16 CPUs in an L3:
- llc_aggr_tolerance=1 -> tasks with nr_running > 8 are skipped.
- llc_aggr_tolerance=99 -> tasks with nr_running > 785 are skipped
785 = (1 + (99 - 1) * 8).
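The thread-count side of the check can be modelled the same way
(again only a sketch; llc_cpus stands in for sd_llc_size, the number
of logical CPUs sharing the LLC):

#include <stdbool.h>
#include <stdint.h>

/* true: too many active threads for one LLC at this tolerance */
static bool exceeds_llc_nr(uint64_t nr_running_avg, unsigned int smt_nr,
			   unsigned int llc_cpus, unsigned int tolerance)
{
	uint64_t scale;

	if (tolerance == 100)
		return false;	/* aggregate regardless of thread count */
	scale = tolerance ? 1 + (uint64_t)(tolerance - 1) : 0;
	/* tolerance == 0: scale is 0, so any running thread "exceeds" */
	return nr_running_avg * smt_nr > scale * llc_cpus;
}

For the 8-core/16-CPU SMT2 example, llc_aggr_tolerance=1 makes the
check trip once more than 8 threads are active (nr_running * 2 > 16).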
(3) llc_epoch_period/llc_epoch_affinity_timeout
In addition, llc_epoch_period and llc_epoch_affinity_timeout are also
made tunable.
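The knobs live under /sys/kernel/debug/sched/ and take plain decimal
values, assuming debugfs is mounted at the usual location, e.g.:

  echo 0  > /sys/kernel/debug/sched/llc_enabled         # switch the feature off
  echo 1  > /sys/kernel/debug/sched/llc_enabled         # switch it back on
  echo 50 > /sys/kernel/debug/sched/llc_aggr_tolerance  # loosen the RSS/thread limits
  cat /sys/kernel/debug/sched/llc_epoch_period          # read the current epoch period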
Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Suggested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Suggested-by: Tingyin Duan <tingyin.duan@gmail.com>
Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
---
Notes:
v1->v2: Remove the smt_nr check in fits_llc_capacity().
(Aaron Lu)
include/linux/sched.h | 4 ++-
kernel/sched/debug.c | 62 ++++++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 63 ++++++++++++++++++++++++++++++++++++-----
kernel/sched/sched.h | 5 ++++
kernel/sched/topology.c | 54 +++++++++++++++++++++++++++++++++--
5 files changed, 178 insertions(+), 10 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 466ba8b7398c..95bf080bbbf0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2436,9 +2436,11 @@ extern void migrate_enable(void);
DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
#ifdef CONFIG_SCHED_CACHE
+DECLARE_STATIC_KEY_FALSE(sched_cache_on);
+
static inline bool sched_cache_enabled(void)
{
- return false;
+ return static_branch_unlikely(&sched_cache_on);
}
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 02e16b70a790..cde324672103 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -169,6 +169,53 @@ static const struct file_operations sched_feat_fops = {
.release = single_release,
};
+#ifdef CONFIG_SCHED_CACHE
+#define SCHED_CACHE_CREATE_CONTROL(name, max) \
+static ssize_t sched_cache_write_##name(struct file *filp, \
+ const char __user *ubuf, \
+ size_t cnt, loff_t *ppos) \
+{ \
+ char buf[16]; \
+ unsigned int val; \
+ if (cnt > 15) \
+ cnt = 15; \
+ if (copy_from_user(&buf, ubuf, cnt)) \
+ return -EFAULT; \
+ buf[cnt] = '\0'; \
+ if (kstrtouint(buf, 10, &val)) \
+ return -EINVAL; \
+ if (val > (max)) \
+ return -EINVAL; \
+ llc_##name = val; \
+ if (!strcmp(#name, "enabled")) \
+ sched_cache_set(false); \
+ *ppos += cnt; \
+ return cnt; \
+} \
+static int sched_cache_show_##name(struct seq_file *m, void *v) \
+{ \
+ seq_printf(m, "%d\n", llc_##name); \
+ return 0; \
+} \
+static int sched_cache_open_##name(struct inode *inode, \
+ struct file *filp) \
+{ \
+ return single_open(filp, sched_cache_show_##name, NULL); \
+} \
+static const struct file_operations sched_cache_fops_##name = { \
+ .open = sched_cache_open_##name, \
+ .write = sched_cache_write_##name, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+}
+
+SCHED_CACHE_CREATE_CONTROL(overload_pct, 100);
+SCHED_CACHE_CREATE_CONTROL(imb_pct, 100);
+SCHED_CACHE_CREATE_CONTROL(aggr_tolerance, 100);
+SCHED_CACHE_CREATE_CONTROL(enabled, 1);
+#endif /* SCHED_CACHE */
+
static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -523,6 +570,21 @@ static __init int sched_init_debug(void)
debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_SCHED_CACHE
+ debugfs_create_file("llc_overload_pct", 0644, debugfs_sched, NULL,
+ &sched_cache_fops_overload_pct);
+ debugfs_create_file("llc_imb_pct", 0644, debugfs_sched, NULL,
+ &sched_cache_fops_imb_pct);
+ debugfs_create_file("llc_aggr_tolerance", 0644, debugfs_sched, NULL,
+ &sched_cache_fops_aggr_tolerance);
+ debugfs_create_file("llc_enabled", 0644, debugfs_sched, NULL,
+ &sched_cache_fops_enabled);
+ debugfs_create_u32("llc_epoch_period", 0644, debugfs_sched,
+ &llc_epoch_period);
+ debugfs_create_u32("llc_epoch_affinity_timeout", 0644, debugfs_sched,
+ &llc_epoch_affinity_timeout);
+#endif
+
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
debugfs_fair_server_init();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 424ec601cfdf..a2e2d6742481 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1207,6 +1207,9 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
__read_mostly unsigned int llc_overload_pct = 50;
__read_mostly unsigned int llc_imb_pct = 20;
+__read_mostly unsigned int llc_aggr_tolerance = 1;
+__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD;
+__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
static int llc_id(int cpu)
{
@@ -1223,11 +1226,22 @@ static int llc_id(int cpu)
return llc;
}
+static inline int get_sched_cache_scale(int mul)
+{
+ if (!llc_aggr_tolerance)
+ return 0;
+
+ if (llc_aggr_tolerance == 100)
+ return INT_MAX;
+
+ return (1 + (llc_aggr_tolerance - 1) * mul);
+}
+
static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
{
+ unsigned int llc, scale;
struct cacheinfo *ci;
unsigned long rss;
- unsigned int llc;
/*
* get_cpu_cacheinfo_level() can not be used
@@ -1252,19 +1266,54 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
rss = get_mm_counter(mm, MM_ANONPAGES) +
get_mm_counter(mm, MM_SHMEMPAGES);
- return (llc <= (rss * PAGE_SIZE));
+ /*
+ * Scale the LLC size by 256*llc_aggr_tolerance
+ * and compare it to the task's RSS size.
+ *
+ * Suppose the L3 size is 32MB. If the
+ * llc_aggr_tolerance is 1:
+ * When the RSS is larger than 32MB, the process
+ * is regarded as exceeding the LLC capacity. If
+ * the llc_aggr_tolerance is 99:
+ * When the RSS is larger than 784GB, the process
+ * is regarded as exceeding the LLC capacity because:
+ * 784GB = (1 + (99 - 1) * 256) * 32MB
+ */
+ scale = get_sched_cache_scale(256);
+ if (scale == INT_MAX)
+ return false;
+
+ return ((llc * scale) <= (rss * PAGE_SIZE));
}
static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
{
- int smt_nr = 1;
+ int smt_nr = 1, scale;
#ifdef CONFIG_SCHED_SMT
if (sched_smt_active())
smt_nr = cpumask_weight(cpu_smt_mask(cpu));
#endif
+ /*
+ * Scale the Core number in a LLC by llc_aggr_tolerance
+ * and compare it to the task's active threads.
+ *
+ * Suppose the number of Cores in LLC is 8.
+ * Every core has 2 SMTs.
+ * If the llc_aggr_tolerance is 1: When the
+ * nr_running is larger than 8, the process
+ * is regarded as exceeding the LLC capacity.
+ * If the llc_aggr_tolerance is 99:
+ * When the nr_running is larger than 785,
+ * the process is regarded as exceeding
+ * the LLC capacity:
+ * 785 = 1 + (99 - 1) * 8
+ */
+ scale = get_sched_cache_scale(1);
+ if (scale == INT_MAX)
+ return false;
- return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu));
+ return ((mm->nr_running_avg * smt_nr) > (scale * per_cpu(sd_llc_size, cpu)));
}
static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
@@ -1350,9 +1399,9 @@ static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
long delta = now - rq->cpu_epoch_next;
if (delta > 0) {
- n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD;
+ n = (delta + llc_epoch_period - 1) / llc_epoch_period;
rq->cpu_epoch += n;
- rq->cpu_epoch_next += n * EPOCH_PERIOD;
+ rq->cpu_epoch_next += n * llc_epoch_period;
__shr_u64(&rq->cpu_runtime, n);
}
@@ -1412,7 +1461,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
* has only 1 thread, or has too many active threads, invalidate
* its preferred state.
*/
- if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT ||
+ if (epoch - READ_ONCE(mm->mm_sched_epoch) > llc_epoch_affinity_timeout ||
get_nr_threads(p) <= 1 ||
exceed_llc_nr(mm, cpu_of(rq)) ||
exceed_llc_capacity(mm, cpu_of(rq))) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 40798a06e058..15d126bd3728 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2852,6 +2852,11 @@ extern unsigned int sysctl_numa_balancing_hot_threshold;
#ifdef CONFIG_SCHED_CACHE
extern unsigned int llc_overload_pct;
extern unsigned int llc_imb_pct;
+extern unsigned int llc_aggr_tolerance;
+extern unsigned int llc_epoch_period;
+extern unsigned int llc_epoch_affinity_timeout;
+extern unsigned int llc_enabled;
+void sched_cache_set(bool locked);
#endif
#ifdef CONFIG_SCHED_HRTICK
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9799e3a9a609..818599ddaaef 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -26,6 +26,49 @@ int max_llcs;
static bool sched_cache_present;
+unsigned int llc_enabled = 1;
+DEFINE_STATIC_KEY_FALSE(sched_cache_on);
+
+/*
+ * Enable/disable cache aware scheduling according to
+ * user input and the presence of hardware support.
+ */
+static void _sched_cache_set(bool enable, bool locked)
+{
+ if (enable) {
+ if (locked)
+ static_branch_enable_cpuslocked(&sched_cache_on);
+ else
+ static_branch_enable(&sched_cache_on);
+ } else {
+ if (locked)
+ static_branch_disable_cpuslocked(&sched_cache_on);
+ else
+ static_branch_disable(&sched_cache_on);
+ }
+}
+
+void sched_cache_set(bool locked)
+{
+ /* hardware does not support it */
+ if (!sched_cache_present) {
+ if (static_branch_likely(&sched_cache_on))
+ _sched_cache_set(false, locked);
+
+ return;
+ }
+
+ /* does the user want it or not? */
+ if (llc_enabled) {
+ if (!static_branch_likely(&sched_cache_on))
+ _sched_cache_set(true, locked);
+
+ } else {
+ if (static_branch_likely(&sched_cache_on))
+ _sched_cache_set(false, locked);
+ }
+}
+
static unsigned int *alloc_new_pref_llcs(unsigned int *old, unsigned int **gc)
{
unsigned int *new = NULL;
@@ -70,8 +113,12 @@ static int resize_llc_pref(bool has_multi_llcs)
* new buffer.
*/
tmp_llc_pref = alloc_percpu_noprof(unsigned int *);
- if (!tmp_llc_pref)
- return -ENOMEM;
+ if (!tmp_llc_pref) {
+ sched_cache_present = false;
+ ret = -ENOMEM;
+
+ goto out;
+ }
for_each_present_cpu(i)
*per_cpu_ptr(tmp_llc_pref, i) = NULL;
@@ -89,6 +136,7 @@ static int resize_llc_pref(bool has_multi_llcs)
new = alloc_new_pref_llcs(rq->nr_pref_llc, per_cpu_ptr(tmp_llc_pref, i));
if (!new) {
ret = -ENOMEM;
+ sched_cache_present = false;
goto release_old;
}
@@ -126,6 +174,8 @@ static int resize_llc_pref(bool has_multi_llcs)
if (!ret)
max_llcs = new_max_llcs;
+out:
+ sched_cache_set(true);
return ret;
}
--
2.32.0
On 4/12/2025 07:07, Tim Chen wrote:
> [ ... ]
> + scale = get_sched_cache_scale(256);
Hi Tim Chen and Chen Yu,
There's an integer overflow here. Since the unit of LLC size is bytes
and the scale factor grows in steps of 256, for a typical LLC size of
32M you end up multiplying 32M by 256, which equals 8GB. This value
exceeds the maximum integer value (2GB, INT_MAX), resulting in an
integer overflow.
I think such functions should use u64. Below is my patch:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 205208f061bb..bcafb3c2b369 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1226,20 +1226,20 @@ static int llc_id(int cpu)
return llc;
}
-static inline int get_sched_cache_scale(int mul)
+static inline u64 get_sched_cache_scale(int mul)
{
if (!llc_aggr_tolerance)
return 0;
if (llc_aggr_tolerance == 100)
- return INT_MAX;
+ return ULLONG_MAX;
return (1 + (llc_aggr_tolerance - 1) * mul);
}
static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
{
- unsigned int llc, scale;
+ unsigned long long llc, scale;
struct cacheinfo *ci;
unsigned long rss;
@@ -1280,7 +1280,7 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
* 784GB = (1 + (99 - 1) * 256) * 32MB
*/
scale = get_sched_cache_scale(256);
- if (scale == INT_MAX)
+ if (scale == ULLONG_MAX)
return false;
return ((llc * scale) <= (rss * PAGE_SIZE));
@@ -1288,7 +1288,7 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
{
- int smt_nr = 1, scale;
+ unsigned long long smt_nr = 1, scale;
#ifdef CONFIG_SCHED_SMT
if (sched_smt_active())
@@ -1310,7 +1310,7 @@ static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
* 785 = 1 + (99 - 1) * 8
*/
scale = get_sched_cache_scale(1);
- if (scale == INT_MAX)
+ if (scale == ULLONG_MAX)
return false;
return ((mm->nr_running_avg * smt_nr) > (scale * per_cpu(sd_llc_size, cpu)));
Thanks,
Yangyu Chen
Hi Yangyu,
On 1/21/2026 11:21 PM, Yangyu Chen wrote:
>
>
> On 4/12/2025 07:07, Tim Chen wrote:
[ ... ]
>> + scale = get_sched_cache_scale(256);
>
> Hi Tim Chen and Chen Yu,
>
> There's an integer overflow here. Since the unit of LLC size is bytes
> and the scale factor grows in steps of 256, for a typical LLC size of
> 32M you end up multiplying 32M by 256, which equals 8GB. This value
> exceeds the maximum integer value (2GB, INT_MAX), resulting in an
> integer overflow.
>
> I think such functions should use u64. Below is my patch:
>
Thanks very much for the investigation. Jianyong previously also
mentioned this issue:
https://lore.kernel.org/all/SI2PR04MB49317BA503E9C8A381956D6AE38CA@SI2PR04MB4931.apcprd04.prod.outlook.com/
We will fix the issue accordingly.
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 205208f061bb..bcafb3c2b369 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1226,20 +1226,20 @@ static int llc_id(int cpu)
> return llc;
> }
>
> -static inline int get_sched_cache_scale(int mul)
> +static inline u64 get_sched_cache_scale(int mul)
> {
> if (!llc_aggr_tolerance)
> return 0;
>
> if (llc_aggr_tolerance == 100)
> - return INT_MAX;
> + return ULLONG_MAX;
>
> return (1 + (llc_aggr_tolerance - 1) * mul);
> }
>
> static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
> {
> - unsigned int llc, scale;
> + unsigned long long llc, scale;
I suppose we only need to change llc to u64, and not change
scale, because (llc * scale) would be converted to u64 anyway.
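For reference, a tiny standalone illustration of the C conversion rule
being relied on here; this is not part of any patch:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t llc = 32ULL << 20;			/* 32MB LLC size held in a u64 */
	unsigned int scale = 1 + (99 - 1) * 256;	/* 25089, fits easily in 32 bits */

	/* scale is converted to u64 before the multiply, so no 32-bit wrap */
	printf("%llu bytes (~784GB)\n", (unsigned long long)(llc * scale));
	return 0;
}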
thanks,
Chenyu
> On 4 Dec 2025, at 07:07, Tim Chen <tim.c.chen@linux.intel.com> wrote:
>
> [ ... ]
> (2) llc_aggr_tolerance
> With sched_cache enabled, the scheduler uses a process's RSS as a
> proxy for its LLC footprint to determine if aggregating tasks on the
> preferred LLC could cause cache contention. If RSS exceeds the LLC
> size, aggregation is skipped. Some workloads with large RSS but small
> actual memory footprints may still benefit from aggregation. Since
> the kernel cannot efficiently track per-task cache usage (resctrl is
> user-space only), userspace can provide a more accurate hint.
>
> Introduce /sys/kernel/debug/sched/llc_aggr_tolerance to let
> users control how strictly RSS limits aggregation. Values range from
> 0 to 100:
>
> - 0: Cache-aware scheduling is disabled.
> - 1: Strict; tasks with RSS larger than LLC size are skipped.
> - 100: Aggressive; tasks are aggregated regardless of RSS.
>
Hi Chen Yu and Tim Chen,
Maybe we should have something like prctl(PR_LLC_AGGR_TOLERANCE, 100).
I have tested this version of the patch on my EPYC Milan 7V13 (7763
variant) physical machine, with a 32M LLC for each 8-core CCX. I found
that I need to set "llc_aggr_tolerance" to 100, otherwise I can't get
cache-aware scheduling to work on the Verilated [1] XiangShan [2] model
running chacha20 [3], as I mentioned before [4].
But if I set it to 100, I lose some performance on stream copy
benchmarks, since the bandwidth is limited per CCX. Thus, I think we
should have a new prctl to let userspace hint to the kernel that a task
is bound by inter-core latency and should use this feature regardless
of whether its RSS exceeds the LLC size.
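A rough sketch of what such a hint could look like from the application
side (PR_LLC_AGGR_TOLERANCE is hypothetical and does not exist in any
kernel; the constant and helper below are placeholders for illustration
only):

#include <sys/prctl.h>

#ifndef PR_LLC_AGGR_TOLERANCE
#define PR_LLC_AGGR_TOLERANCE	0x1000	/* placeholder value, illustrative only */
#endif

/*
 * Hint that this process is latency-bound between cores and should be
 * aggregated on its preferred LLC regardless of its RSS.
 */
static inline int hint_llc_aggr(unsigned int tolerance)
{
	return prctl(PR_LLC_AGGR_TOLERANCE, (unsigned long)tolerance, 0, 0, 0);
}

A Verilator harness could then call hint_llc_aggr(100) before spawning
its worker threads, while leaving the system-wide debugfs default alone.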
I finally have an EPYC Milan at home now, so I'm glad to test this
patch further, and I would very much like to have this so we can get a
fast Verilator [1] run without manual numactl.
[1] https://github.com/verilator/verilator
[2] https://github.com/OpenXiangShan/Xiangshan
[3] https://github.com/cyyself/chacha20-xiangshan
[4] https://lore.kernel.org/lkml/tencent_6E51A3175F8AE0A7F684A319EE63CC56C806@qq.com/
Thanks,
Yangyu Chen
> On 23 Dec 2025, at 20:12, Yangyu Chen <cyy@cyyself.name> wrote:
>
>> On 4 Dec 2025, at 07:07, Tim Chen <tim.c.chen@linux.intel.com> wrote:
>>
>> [ ... ]
>>
>
> Hi Chen Yu and Tim Chen,
>
> Maybe we should have something like prctl(PR_LLC_AGGR_TOLERANCE, 100).
>
> I have tested this version of the patch on my EPYC Milan 7V13 (7763 variant) physical machine, with 32M LLC for each 8-core CCX. I found that I need to tune "llc_aggr_tolerance" to 100, else I can't get cache-aware scheduling to work on Verilated [1] XiangShan [2] running the chacha20 [3] as I mentioned before [4].
>
In addition, I have investigated why this happens. It turns out the
workload shows 35596 kB RssAnon on my EPYC Milan machine, slightly
exceeding the LLC size (32M). I have also tested it on an EPYC Genoa
cloud server with the correct core/cache hierarchy in the ACPI table,
which shows 31700 kB RssAnon and thus fits in the LLC. I have no idea
why my machine shows higher RssAnon, since both run Debian Trixie with
the exact same kernel and the same executable. But it reminds me that
we should have a userspace API for this.
Thanks,
Yangyu Chen
> But if I set it to 100, I will lose some performance on stream copy benchmarks since the bandwidth is limited per CCX. Thus, I think we should have a new prctl to let userspace software hint the kernel that this task can be bound by latency between cores, and should use this feature no matter the RSS exceed the LLC size.
>
> I finally have an EPYC Milan at home today. I'm glad to test this patch further. And I'm very willing to have this thus we can have a fast verilator[1] without numactl manually.
>
> [1] https://github.com/verilator/verilator
> [2] https://github.com/OpenXiangShan/Xiangshan
> [3] https://github.com/cyyself/chacha20-xiangshan
> [4] https://lore.kernel.org/lkml/tencent_6E51A3175F8AE0A7F684A319EE63CC56C806@qq.com/
>
> Thanks,
> Yangyu Chen
>
>> For exaImple, with a 32MB L3 cache:
>>
>> - llc_aggr_tolerance=1 -> tasks with RSS > 32MB are skipped.
>> - llc_aggr_tolerance=99 -> tasks with RSS > 784GB are skipped
>> (784GB = (1 + (99 - 1) * 256) * 32MB).
>>
>> Similarly, /sys/kernel/debug/sched/llc_aggr_tolerance also controls
>> how strictly the number of active threads is considered when doing
>> cache aware load balance. The number of SMTs is also considered.
>> High SMT counts reduce the aggregation capacity, preventing excessive
>> task aggregation on SMT-heavy systems like Power10/Power11.
>>
>> For example, with 8 Cores/16 CPUs in a L3:
>>
>> - llc_aggr_tolerance=1 -> tasks with nr_running > 8 are skipped.
>> - llc_aggr_tolerance=99 -> tasks with nr_running > 785 are skipped
>> 785 = (1 + (99 - 1) * 8).
>>
>> (3) llc_epoch_period/llc_epoch_affinity_timeout
>> Besides, llc_epoch_period and llc_epoch_affinity_timeout are also turned
>> into tunable.
>>
>> Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
>> Suggested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
>> Suggested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
>> Suggested-by: Tingyin Duan <tingyin.duan@gmail.com>
>> Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
>> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
>> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
>> ---
>>
>> Notes:
>> v1->v2: Remove the smt_nr check in fits_llc_capacity().
>> (Aaron Lu)
>>
>> include/linux/sched.h | 4 ++-
>> kernel/sched/debug.c | 62 ++++++++++++++++++++++++++++++++++++++++
>> kernel/sched/fair.c | 63 ++++++++++++++++++++++++++++++++++++-----
>> kernel/sched/sched.h | 5 ++++
>> kernel/sched/topology.c | 54 +++++++++++++++++++++++++++++++++--
>> 5 files changed, 178 insertions(+), 10 deletions(-)
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 466ba8b7398c..95bf080bbbf0 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -2436,9 +2436,11 @@ extern void migrate_enable(void);
>> DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
>>
>> #ifdef CONFIG_SCHED_CACHE
>> +DECLARE_STATIC_KEY_FALSE(sched_cache_on);
>> +
>> static inline bool sched_cache_enabled(void)
>> {
>> - return false;
>> + return static_branch_unlikely(&sched_cache_on);
>> }
>> #endif
>>
>> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
>> index 02e16b70a790..cde324672103 100644
>> --- a/kernel/sched/debug.c
>> +++ b/kernel/sched/debug.c
>> @@ -169,6 +169,53 @@ static const struct file_operations sched_feat_fops = {
>> .release = single_release,
>> };
>>
>> +#ifdef CONFIG_SCHED_CACHE
>> +#define SCHED_CACHE_CREATE_CONTROL(name, max) \
>> +static ssize_t sched_cache_write_##name(struct file *filp, \
>> + const char __user *ubuf, \
>> + size_t cnt, loff_t *ppos) \
>> +{ \
>> + char buf[16]; \
>> + unsigned int val; \
>> + if (cnt > 15) \
>> + cnt = 15; \
>> + if (copy_from_user(&buf, ubuf, cnt)) \
>> + return -EFAULT; \
>> + buf[cnt] = '\0'; \
>> + if (kstrtouint(buf, 10, &val)) \
>> + return -EINVAL; \
>> + if (val > (max)) \
>> + return -EINVAL; \
>> + llc_##name = val; \
>> + if (!strcmp(#name, "enabled")) \
>> + sched_cache_set(false); \
>> + *ppos += cnt; \
>> + return cnt; \
>> +} \
>> +static int sched_cache_show_##name(struct seq_file *m, void *v) \
>> +{ \
>> + seq_printf(m, "%d\n", llc_##name); \
>> + return 0; \
>> +} \
>> +static int sched_cache_open_##name(struct inode *inode, \
>> + struct file *filp) \
>> +{ \
>> + return single_open(filp, sched_cache_show_##name, NULL); \
>> +} \
>> +static const struct file_operations sched_cache_fops_##name = { \
>> + .open = sched_cache_open_##name, \
>> + .write = sched_cache_write_##name, \
>> + .read = seq_read, \
>> + .llseek = seq_lseek, \
>> + .release = single_release, \
>> +}
>> +
>> +SCHED_CACHE_CREATE_CONTROL(overload_pct, 100);
>> +SCHED_CACHE_CREATE_CONTROL(imb_pct, 100);
>> +SCHED_CACHE_CREATE_CONTROL(aggr_tolerance, 100);
>> +SCHED_CACHE_CREATE_CONTROL(enabled, 1);
>> +#endif /* SCHED_CACHE */
>> +
>> static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
>> size_t cnt, loff_t *ppos)
>> {
>> @@ -523,6 +570,21 @@ static __init int sched_init_debug(void)
>> debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
>> #endif /* CONFIG_NUMA_BALANCING */
>>
>> +#ifdef CONFIG_SCHED_CACHE
>> + debugfs_create_file("llc_overload_pct", 0644, debugfs_sched, NULL,
>> + &sched_cache_fops_overload_pct);
>> + debugfs_create_file("llc_imb_pct", 0644, debugfs_sched, NULL,
>> + &sched_cache_fops_imb_pct);
>> + debugfs_create_file("llc_aggr_tolerance", 0644, debugfs_sched, NULL,
>> + &sched_cache_fops_aggr_tolerance);
>> + debugfs_create_file("llc_enabled", 0644, debugfs_sched, NULL,
>> + &sched_cache_fops_enabled);
>> + debugfs_create_u32("llc_epoch_period", 0644, debugfs_sched,
>> + &llc_epoch_period);
>> + debugfs_create_u32("llc_epoch_affinity_timeout", 0644, debugfs_sched,
>> + &llc_epoch_affinity_timeout);
>> +#endif
>> +
>> debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
>>
>> debugfs_fair_server_init();
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 424ec601cfdf..a2e2d6742481 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -1207,6 +1207,9 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
>>
>> __read_mostly unsigned int llc_overload_pct = 50;
>> __read_mostly unsigned int llc_imb_pct = 20;
>> +__read_mostly unsigned int llc_aggr_tolerance = 1;
>> +__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD;
>> +__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
>>
>> static int llc_id(int cpu)
>> {
>> @@ -1223,11 +1226,22 @@ static int llc_id(int cpu)
>> return llc;
>> }
>>
>> +static inline int get_sched_cache_scale(int mul)
>> +{
>> + if (!llc_aggr_tolerance)
>> + return 0;
>> +
>> + if (llc_aggr_tolerance == 100)
>> + return INT_MAX;
>> +
>> + return (1 + (llc_aggr_tolerance - 1) * mul);
>> +}
>> +
>> static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
>> {
>> + unsigned int llc, scale;
>> struct cacheinfo *ci;
>> unsigned long rss;
>> - unsigned int llc;
>>
>> /*
>> * get_cpu_cacheinfo_level() can not be used
>> @@ -1252,19 +1266,54 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
>> rss = get_mm_counter(mm, MM_ANONPAGES) +
>> get_mm_counter(mm, MM_SHMEMPAGES);
>>
>> - return (llc <= (rss * PAGE_SIZE));
>> + /*
>> + * Scale the LLC size by 256*llc_aggr_tolerance
>> + * and compare it to the task's RSS size.
>> + *
>> + * Suppose the L3 size is 32MB. If the
>> + * llc_aggr_tolerance is 1:
>> + * When the RSS is larger than 32MB, the process
>> + * is regarded as exceeding the LLC capacity. If
>> + * the llc_aggr_tolerance is 99:
>> + * When the RSS is larger than 784GB, the process
>> + * is regarded as exceeding the LLC capacity because:
>> + * 784GB = (1 + (99 - 1) * 256) * 32MB
>> + */
>> + scale = get_sched_cache_scale(256);
>> + if (scale == INT_MAX)
>> + return false;
>> +
>> + return ((llc * scale) <= (rss * PAGE_SIZE));
>> }
>>
>> static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
>> {
>> - int smt_nr = 1;
>> + int smt_nr = 1, scale;
>>
>> #ifdef CONFIG_SCHED_SMT
>> if (sched_smt_active())
>> smt_nr = cpumask_weight(cpu_smt_mask(cpu));
>> #endif
>> + /*
>> + * Scale the Core number in a LLC by llc_aggr_tolerance
>> + * and compare it to the task's active threads.
>> + *
>> + * Suppose the number of Cores in LLC is 8.
>> + * Every core has 2 SMTs.
>> + * If the llc_aggr_tolerance is 1: When the
>> + * nr_running is larger than 8, the process
>> + * is regarded as exceeding the LLC capacity.
>> + * If the llc_aggr_tolerance is 99:
>> + * When the nr_running is larger than 785,
>> + * the process is regarded as exceeding
>> + * the LLC capacity:
>> + * 785 = 1 + (99 - 1) * 8
>> + */
>> + scale = get_sched_cache_scale(1);
>> + if (scale == INT_MAX)
>> + return false;
>>
>> - return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu));
>> + return ((mm->nr_running_avg * smt_nr) > (scale * per_cpu(sd_llc_size, cpu)));
>> }
>>
>> static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
>> @@ -1350,9 +1399,9 @@ static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
>> long delta = now - rq->cpu_epoch_next;
>>
>> if (delta > 0) {
>> - n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD;
>> + n = (delta + llc_epoch_period - 1) / llc_epoch_period;
>> rq->cpu_epoch += n;
>> - rq->cpu_epoch_next += n * EPOCH_PERIOD;
>> + rq->cpu_epoch_next += n * llc_epoch_period;
>> __shr_u64(&rq->cpu_runtime, n);
>> }
>>
>> @@ -1412,7 +1461,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>> * has only 1 thread, or has too many active threads, invalidate
>> * its preferred state.
>> */
>> - if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT ||
>> + if (epoch - READ_ONCE(mm->mm_sched_epoch) > llc_epoch_affinity_timeout ||
>> get_nr_threads(p) <= 1 ||
>> exceed_llc_nr(mm, cpu_of(rq)) ||
>> exceed_llc_capacity(mm, cpu_of(rq))) {
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index 40798a06e058..15d126bd3728 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -2852,6 +2852,11 @@ extern unsigned int sysctl_numa_balancing_hot_threshold;
>> #ifdef CONFIG_SCHED_CACHE
>> extern unsigned int llc_overload_pct;
>> extern unsigned int llc_imb_pct;
>> +extern unsigned int llc_aggr_tolerance;
>> +extern unsigned int llc_epoch_period;
>> +extern unsigned int llc_epoch_affinity_timeout;
>> +extern unsigned int llc_enabled;
>> +void sched_cache_set(bool locked);
>> #endif
>>
>> #ifdef CONFIG_SCHED_HRTICK
>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>> index 9799e3a9a609..818599ddaaef 100644
>> --- a/kernel/sched/topology.c
>> +++ b/kernel/sched/topology.c
>> @@ -26,6 +26,49 @@ int max_llcs;
>>
>> static bool sched_cache_present;
>>
>> +unsigned int llc_enabled = 1;
>> +DEFINE_STATIC_KEY_FALSE(sched_cache_on);
>> +
>> +/*
>> + * Enable/disable cache aware scheduling according to
>> + * user input and the presence of hardware support.
>> + */
>> +static void _sched_cache_set(bool enable, bool locked)
>> +{
>> + if (enable) {
>> + if (locked)
>> + static_branch_enable_cpuslocked(&sched_cache_on);
>> + else
>> + static_branch_enable(&sched_cache_on);
>> + } else {
>> + if (locked)
>> + static_branch_disable_cpuslocked(&sched_cache_on);
>> + else
>> + static_branch_disable(&sched_cache_on);
>> + }
>> +}
>> +
>> +void sched_cache_set(bool locked)
>> +{
>> + /* hardware does not support */
>> + if (!sched_cache_present) {
>> + if (static_branch_likely(&sched_cache_on))
>> + _sched_cache_set(false, locked);
>> +
>> + return;
>> + }
>> +
>> + /* user wants it or not ?*/
>> + if (llc_enabled) {
>> + if (!static_branch_likely(&sched_cache_on))
>> + _sched_cache_set(true, locked);
>> +
>> + } else {
>> + if (static_branch_likely(&sched_cache_on))
>> + _sched_cache_set(false, locked);
>> + }
>> +}
>> +
>> static unsigned int *alloc_new_pref_llcs(unsigned int *old, unsigned int **gc)
>> {
>> unsigned int *new = NULL;
>> @@ -70,8 +113,12 @@ static int resize_llc_pref(bool has_multi_llcs)
>> * new buffer.
>> */
>> tmp_llc_pref = alloc_percpu_noprof(unsigned int *);
>> - if (!tmp_llc_pref)
>> - return -ENOMEM;
>> + if (!tmp_llc_pref) {
>> + sched_cache_present = false;
>> + ret = -ENOMEM;
>> +
>> + goto out;
>> + }
>>
>> for_each_present_cpu(i)
>> *per_cpu_ptr(tmp_llc_pref, i) = NULL;
>> @@ -89,6 +136,7 @@ static int resize_llc_pref(bool has_multi_llcs)
>> new = alloc_new_pref_llcs(rq->nr_pref_llc, per_cpu_ptr(tmp_llc_pref, i));
>> if (!new) {
>> ret = -ENOMEM;
>> + sched_cache_present = false;
>>
>> goto release_old;
>> }
>> @@ -126,6 +174,8 @@ static int resize_llc_pref(bool has_multi_llcs)
>> if (!ret)
>> max_llcs = new_max_llcs;
>>
>> +out:
>> + sched_cache_set(true);
>> return ret;
>> }
>>
>> --
>> 2.32.0
>
>> On 4 Dec 2025, at 07:07, Tim Chen <tim.c.chen@linux.intel.com> wrote:
>>
>> From: Chen Yu <yu.c.chen@intel.com>
>>
>> Introduce a set of debugfs knobs to control the enabling of
>> and parameters for cache-aware load balancing.
>>
>> (1) llc_enabled
>> llc_enabled acts as the primary switch - users can toggle it to
>> enable or disable cache aware load balancing.
>>
>> (2) llc_aggr_tolerance
>> With sched_cache enabled, the scheduler uses a process's RSS as a
>> proxy for its LLC footprint to determine if aggregating tasks on the
>> preferred LLC could cause cache contention. If RSS exceeds the LLC
>> size, aggregation is skipped. Some workloads with large RSS but small
>> actual memory footprints may still benefit from aggregation. Since
>> the kernel cannot efficiently track per-task cache usage (resctrl is
>> user-space only), userspace can provide a more accurate hint.
>>
>> Introduce /sys/kernel/debug/sched/llc_aggr_tolerance to let
>> users control how strictly RSS limits aggregation. Values range from
>> 0 to 100:
>>
>> - 0: Cache-aware scheduling is disabled.
>> - 1: Strict; tasks with RSS larger than LLC size are skipped.
>> - 100: Aggressive; tasks are aggregated regardless of RSS.
>>
>
> Hi Chen Yu and Tim Chen,
>
> Maybe we should have something like prctl(PR_LLC_AGGR_TOLERANCE, 100).
>
> I have tested this version of the patch on my EPYC Milan 7V13 (7763 variant) physical machine, which has a 32MB LLC per 8-core CCX. I found that I need to set "llc_aggr_tolerance" to 100; otherwise cache-aware scheduling does not kick in for Verilated [1] XiangShan [2] running chacha20 [3], as I mentioned before [4].
>
> But if I set it to 100, I lose some performance on stream copy benchmarks, since memory bandwidth is limited per CCX. Thus, I think we should have a new prctl that lets userspace hint to the kernel that a task is bound by inter-core latency and should use this feature regardless of whether its RSS exceeds the LLC size.
>
> I finally have an EPYC Milan at home today and am glad to test this patch further. I would very much like to have this feature so that we can get fast Verilator [1] runs without manually using numactl.
>
> [1] https://github.com/verilator/verilator
> [2] https://github.com/OpenXiangShan/Xiangshan
> [3] https://github.com/cyyself/chacha20-xiangshan
> [4] https://lore.kernel.org/lkml/tencent_6E51A3175F8AE0A7F684A319EE63CC56C806@qq.com/
>
> Thanks,
> Yangyu Chen
>
>> For example, with a 32MB L3 cache:
>>
>> - llc_aggr_tolerance=1 -> tasks with RSS > 32MB are skipped.
>> - llc_aggr_tolerance=99 -> tasks with RSS > 784GB are skipped
>> (784GB = (1 + (99 - 1) * 256) * 32MB).
>>
>> Similarly, /sys/kernel/debug/sched/llc_aggr_tolerance also controls
>> how strictly the number of active threads is considered when doing
>> cache aware load balance. The number of SMTs is also considered.
>> High SMT counts reduce the aggregation capacity, preventing excessive
>> task aggregation on SMT-heavy systems like Power10/Power11.
>>
>> For example, with 8 Cores/16 CPUs in a L3:
>>
>> - llc_aggr_tolerance=1 -> tasks with nr_running > 8 are skipped.
>> - llc_aggr_tolerance=99 -> tasks with nr_running > 785 are skipped
>> 785 = (1 + (99 - 1) * 8).
>>
>> (3) llc_epoch_period/llc_epoch_affinity_timeout
>> Besides, llc_epoch_period and llc_epoch_affinity_timeout are also turned
>> into tunable.
>>
>> Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
>> Suggested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
>> Suggested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
>> Suggested-by: Tingyin Duan <tingyin.duan@gmail.com>
>> Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
>> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
>> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
>> ---
>>
>> Notes:
>> v1->v2: Remove the smt_nr check in fits_llc_capacity().
>> (Aaron Lu)
>>
>> include/linux/sched.h | 4 ++-
>> kernel/sched/debug.c | 62 ++++++++++++++++++++++++++++++++++++++++
>> kernel/sched/fair.c | 63 ++++++++++++++++++++++++++++++++++++-----
>> kernel/sched/sched.h | 5 ++++
>> kernel/sched/topology.c | 54 +++++++++++++++++++++++++++++++++--
>> 5 files changed, 178 insertions(+), 10 deletions(-)
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 466ba8b7398c..95bf080bbbf0 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -2436,9 +2436,11 @@ extern void migrate_enable(void);
>> DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
>>
>> #ifdef CONFIG_SCHED_CACHE
>> +DECLARE_STATIC_KEY_FALSE(sched_cache_on);
>> +
>> static inline bool sched_cache_enabled(void)
>> {
>> - return false;
>> + return static_branch_unlikely(&sched_cache_on);
>> }
>> #endif
>>
>> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
>> index 02e16b70a790..cde324672103 100644
>> --- a/kernel/sched/debug.c
>> +++ b/kernel/sched/debug.c
>> @@ -169,6 +169,53 @@ static const struct file_operations sched_feat_fops = {
>> .release = single_release,
>> };
>>
>> +#ifdef CONFIG_SCHED_CACHE
>> +#define SCHED_CACHE_CREATE_CONTROL(name, max) \
>> +static ssize_t sched_cache_write_##name(struct file *filp, \
>> + const char __user *ubuf, \
>> + size_t cnt, loff_t *ppos) \
>> +{ \
>> + char buf[16]; \
>> + unsigned int val; \
>> + if (cnt > 15) \
>> + cnt = 15; \
>> + if (copy_from_user(&buf, ubuf, cnt)) \
>> + return -EFAULT; \
>> + buf[cnt] = '\0'; \
>> + if (kstrtouint(buf, 10, &val)) \
>> + return -EINVAL; \
>> + if (val > (max)) \
>> + return -EINVAL; \
>> + llc_##name = val; \
>> + if (!strcmp(#name, "enabled")) \
>> + sched_cache_set(false); \
>> + *ppos += cnt; \
>> + return cnt; \
>> +} \
>> +static int sched_cache_show_##name(struct seq_file *m, void *v) \
>> +{ \
>> + seq_printf(m, "%d\n", llc_##name); \
>> + return 0; \
>> +} \
>> +static int sched_cache_open_##name(struct inode *inode, \
>> + struct file *filp) \
>> +{ \
>> + return single_open(filp, sched_cache_show_##name, NULL); \
>> +} \
>> +static const struct file_operations sched_cache_fops_##name = { \
>> + .open = sched_cache_open_##name, \
>> + .write = sched_cache_write_##name, \
>> + .read = seq_read, \
>> + .llseek = seq_lseek, \
>> + .release = single_release, \
>> +}
>> +
>> +SCHED_CACHE_CREATE_CONTROL(overload_pct, 100);
>> +SCHED_CACHE_CREATE_CONTROL(imb_pct, 100);
>> +SCHED_CACHE_CREATE_CONTROL(aggr_tolerance, 100);
>> +SCHED_CACHE_CREATE_CONTROL(enabled, 1);
>> +#endif /* SCHED_CACHE */
>> +
>> static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
>> size_t cnt, loff_t *ppos)
>> {
>> @@ -523,6 +570,21 @@ static __init int sched_init_debug(void)
>> debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
>> #endif /* CONFIG_NUMA_BALANCING */
>>
>> +#ifdef CONFIG_SCHED_CACHE
>> + debugfs_create_file("llc_overload_pct", 0644, debugfs_sched, NULL,
>> + &sched_cache_fops_overload_pct);
>> + debugfs_create_file("llc_imb_pct", 0644, debugfs_sched, NULL,
>> + &sched_cache_fops_imb_pct);
>> + debugfs_create_file("llc_aggr_tolerance", 0644, debugfs_sched, NULL,
>> + &sched_cache_fops_aggr_tolerance);
>> + debugfs_create_file("llc_enabled", 0644, debugfs_sched, NULL,
>> + &sched_cache_fops_enabled);
>> + debugfs_create_u32("llc_epoch_period", 0644, debugfs_sched,
>> + &llc_epoch_period);
>> + debugfs_create_u32("llc_epoch_affinity_timeout", 0644, debugfs_sched,
>> + &llc_epoch_affinity_timeout);
>> +#endif
>> +
>> debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
>>
>> debugfs_fair_server_init();
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 424ec601cfdf..a2e2d6742481 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -1207,6 +1207,9 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
>>
>> __read_mostly unsigned int llc_overload_pct = 50;
>> __read_mostly unsigned int llc_imb_pct = 20;
>> +__read_mostly unsigned int llc_aggr_tolerance = 1;
>> +__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD;
>> +__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
>>
>> static int llc_id(int cpu)
>> {
>> @@ -1223,11 +1226,22 @@ static int llc_id(int cpu)
>> return llc;
>> }
>>
>> +static inline int get_sched_cache_scale(int mul)
>> +{
>> + if (!llc_aggr_tolerance)
>> + return 0;
>> +
>> + if (llc_aggr_tolerance == 100)
>> + return INT_MAX;
>> +
>> + return (1 + (llc_aggr_tolerance - 1) * mul);
>> +}
>> +
>> static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
>> {
>> + unsigned int llc, scale;
>> struct cacheinfo *ci;
>> unsigned long rss;
>> - unsigned int llc;
>>
>> /*
>> * get_cpu_cacheinfo_level() can not be used
>> @@ -1252,19 +1266,54 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
>> rss = get_mm_counter(mm, MM_ANONPAGES) +
>> get_mm_counter(mm, MM_SHMEMPAGES);
>>
>> - return (llc <= (rss * PAGE_SIZE));
>> + /*
>> + * Scale the LLC size by 256*llc_aggr_tolerance
>> + * and compare it to the task's RSS size.
>> + *
>> + * Suppose the L3 size is 32MB. If the
>> + * llc_aggr_tolerance is 1:
>> + * When the RSS is larger than 32MB, the process
>> + * is regarded as exceeding the LLC capacity. If
>> + * the llc_aggr_tolerance is 99:
>> + * When the RSS is larger than 784GB, the process
>> + * is regarded as exceeding the LLC capacity because:
>> + * 784GB = (1 + (99 - 1) * 256) * 32MB
>> + */
>> + scale = get_sched_cache_scale(256);
>> + if (scale == INT_MAX)
>> + return false;
>> +
>> + return ((llc * scale) <= (rss * PAGE_SIZE));
>> }
>>
>> static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
>> {
>> - int smt_nr = 1;
>> + int smt_nr = 1, scale;
>>
>> #ifdef CONFIG_SCHED_SMT
>> if (sched_smt_active())
>> smt_nr = cpumask_weight(cpu_smt_mask(cpu));
>> #endif
>> + /*
>> + * Scale the Core number in a LLC by llc_aggr_tolerance
>> + * and compare it to the task's active threads.
>> + *
>> + * Suppose the number of Cores in LLC is 8.
>> + * Every core has 2 SMTs.
>> + * If the llc_aggr_tolerance is 1: When the
>> + * nr_running is larger than 8, the process
>> + * is regarded as exceeding the LLC capacity.
>> + * If the llc_aggr_tolerance is 99:
>> + * When the nr_running is larger than 785,
>> + * the process is regarded as exceeding
>> + * the LLC capacity:
>> + * 785 = 1 + (99 - 1) * 8
>> + */
>> + scale = get_sched_cache_scale(1);
>> + if (scale == INT_MAX)
>> + return false;
>>
>> - return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu));
>> + return ((mm->nr_running_avg * smt_nr) > (scale * per_cpu(sd_llc_size, cpu)));
>> }
>>
>> static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
>> @@ -1350,9 +1399,9 @@ static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
>> long delta = now - rq->cpu_epoch_next;
>>
>> if (delta > 0) {
>> - n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD;
>> + n = (delta + llc_epoch_period - 1) / llc_epoch_period;
>> rq->cpu_epoch += n;
>> - rq->cpu_epoch_next += n * EPOCH_PERIOD;
>> + rq->cpu_epoch_next += n * llc_epoch_period;
>> __shr_u64(&rq->cpu_runtime, n);
>> }
>>
>> @@ -1412,7 +1461,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>> * has only 1 thread, or has too many active threads, invalidate
>> * its preferred state.
>> */
>> - if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT ||
>> + if (epoch - READ_ONCE(mm->mm_sched_epoch) > llc_epoch_affinity_timeout ||
>> get_nr_threads(p) <= 1 ||
>> exceed_llc_nr(mm, cpu_of(rq)) ||
>> exceed_llc_capacity(mm, cpu_of(rq))) {
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index 40798a06e058..15d126bd3728 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -2852,6 +2852,11 @@ extern unsigned int sysctl_numa_balancing_hot_threshold;
>> #ifdef CONFIG_SCHED_CACHE
>> extern unsigned int llc_overload_pct;
>> extern unsigned int llc_imb_pct;
>> +extern unsigned int llc_aggr_tolerance;
>> +extern unsigned int llc_epoch_period;
>> +extern unsigned int llc_epoch_affinity_timeout;
>> +extern unsigned int llc_enabled;
>> +void sched_cache_set(bool locked);
>> #endif
>>
>> #ifdef CONFIG_SCHED_HRTICK
>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>> index 9799e3a9a609..818599ddaaef 100644
>> --- a/kernel/sched/topology.c
>> +++ b/kernel/sched/topology.c
>> @@ -26,6 +26,49 @@ int max_llcs;
>>
>> static bool sched_cache_present;
>>
>> +unsigned int llc_enabled = 1;
>> +DEFINE_STATIC_KEY_FALSE(sched_cache_on);
>> +
>> +/*
>> + * Enable/disable cache aware scheduling according to
>> + * user input and the presence of hardware support.
>> + */
>> +static void _sched_cache_set(bool enable, bool locked)
>> +{
>> + if (enable) {
>> + if (locked)
>> + static_branch_enable_cpuslocked(&sched_cache_on);
>> + else
>> + static_branch_enable(&sched_cache_on);
>> + } else {
>> + if (locked)
>> + static_branch_disable_cpuslocked(&sched_cache_on);
>> + else
>> + static_branch_disable(&sched_cache_on);
>> + }
>> +}
>> +
>> +void sched_cache_set(bool locked)
>> +{
>> + /* hardware does not support */
>> + if (!sched_cache_present) {
>> + if (static_branch_likely(&sched_cache_on))
>> + _sched_cache_set(false, locked);
>> +
>> + return;
>> + }
>> +
>> + /* user wants it or not ?*/
>> + if (llc_enabled) {
>> + if (!static_branch_likely(&sched_cache_on))
>> + _sched_cache_set(true, locked);
>> +
>> + } else {
>> + if (static_branch_likely(&sched_cache_on))
>> + _sched_cache_set(false, locked);
>> + }
>> +}
>> +
>> static unsigned int *alloc_new_pref_llcs(unsigned int *old, unsigned int **gc)
>> {
>> unsigned int *new = NULL;
>> @@ -70,8 +113,12 @@ static int resize_llc_pref(bool has_multi_llcs)
>> * new buffer.
>> */
>> tmp_llc_pref = alloc_percpu_noprof(unsigned int *);
>> - if (!tmp_llc_pref)
>> - return -ENOMEM;
>> + if (!tmp_llc_pref) {
>> + sched_cache_present = false;
>> + ret = -ENOMEM;
>> +
>> + goto out;
>> + }
>>
>> for_each_present_cpu(i)
>> *per_cpu_ptr(tmp_llc_pref, i) = NULL;
>> @@ -89,6 +136,7 @@ static int resize_llc_pref(bool has_multi_llcs)
>> new = alloc_new_pref_llcs(rq->nr_pref_llc, per_cpu_ptr(tmp_llc_pref, i));
>> if (!new) {
>> ret = -ENOMEM;
>> + sched_cache_present = false;
>>
>> goto release_old;
>> }
>> @@ -126,6 +174,8 @@ static int resize_llc_pref(bool has_multi_llcs)
>> if (!ret)
>> max_llcs = new_max_llcs;
>>
>> +out:
>> + sched_cache_set(true);
>> return ret;
>> }
>>
>> --
>> 2.32.0
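For quick reference when experimenting with the patch quoted above: the knobs it adds live under /sys/kernel/debug/sched/, so with debugfs mounted and root privileges they can be flipped from a small C helper along these lines (an illustrative sketch only, not part of the patch):

#include <stdio.h>

/* Write one integer value to a sched_cache debugfs knob. */
static int write_knob(const char *path, unsigned int val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%u\n", val);
	return fclose(f);
}

int main(void)
{
	/* Master switch (0/1) and RSS tolerance (0..100), per the changelog. */
	write_knob("/sys/kernel/debug/sched/llc_enabled", 1);
	write_knob("/sys/kernel/debug/sched/llc_aggr_tolerance", 100);
	return 0;
}

The same paths also accept a plain shell redirection; the helper only mirrors what the debugfs write handlers in the patch expect, namely a single decimal value within the allowed range.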
> On 24 Dec 2025, at 00:44, Yangyu Chen <cyy@cyyself.name> wrote:
>
>> On 23 Dec 2025, at 20:12, Yangyu Chen <cyy@cyyself.name> wrote:
>>
>>> On 4 Dec 2025, at 07:07, Tim Chen <tim.c.chen@linux.intel.com> wrote:
>>>
>>> From: Chen Yu <yu.c.chen@intel.com>
>>>
>>> Introduce a set of debugfs knobs to control the enabling of
>>> and parameters for cache-aware load balancing.
>>>
>>> (1) llc_enabled
>>> llc_enabled acts as the primary switch - users can toggle it to
>>> enable or disable cache aware load balancing.
>>>
>>> (2) llc_aggr_tolerance
>>> With sched_cache enabled, the scheduler uses a process's RSS as a
>>> proxy for its LLC footprint to determine if aggregating tasks on the
>>> preferred LLC could cause cache contention. If RSS exceeds the LLC
>>> size, aggregation is skipped. Some workloads with large RSS but small
>>> actual memory footprints may still benefit from aggregation. Since
>>> the kernel cannot efficiently track per-task cache usage (resctrl is
>>> user-space only), userspace can provide a more accurate hint.
>>>
>>> Introduce /sys/kernel/debug/sched/llc_aggr_tolerance to let
>>> users control how strictly RSS limits aggregation. Values range from
>>> 0 to 100:
>>>
>>> - 0: Cache-aware scheduling is disabled.
>>> - 1: Strict; tasks with RSS larger than LLC size are skipped.
>>> - 100: Aggressive; tasks are aggregated regardless of RSS.
>>>
>>
>> Hi Chen Yu and Tim Chen,
>>
>> Maybe we should have something like prctl(PR_LLC_AGGR_TOLERANCE, 100).
>>
>> I have tested this version of the patch on my EPYC Milan 7V13 (7763 variant) physical machine, which has a 32MB LLC per 8-core CCX. I found that I need to set "llc_aggr_tolerance" to 100; otherwise cache-aware scheduling does not kick in for Verilated [1] XiangShan [2] running chacha20 [3], as I mentioned before [4].
>>
>
> In addition, I have investigated why this happens, and I realized it
> is because the workload shows 35596 kB of RssAnon on my EPYC Milan
> machine, slightly exceeding the 32MB LLC size. I tested it on an EPYC
> Genoa cloud server with the correct core/cache hierarchy in its ACPI
> table, and there it shows 31700 kB of RssAnon, which fits in the LLC.
> I have no idea why my machine shows the higher RssAnon, since both run
> Debian Trixie with exactly the same kernel and the same executable.
> But it reminds me that we should have a userspace API for this.
>
> Thanks,
> Yangyu Chen
>
In addition, while profiling the Verilator workload, I found that
scheduling it onto SMT siblings results in poor performance. Thus, I
think we should separate the control for the RSS size check from the
SMT scaling.

It is worth noting that the RSS size is not the actual memory
footprint. It would be better if we could use the l2_miss or l3_miss
events to estimate the L3 hit rate; that is left for future work.

I'm willing to provide a patch for such a prctl, but I'm busy these
days; I may have time to do that in about a week.

Thanks,
Yangyu Chen
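As a rough userspace sketch of the prctl idea discussed in this thread: PR_SET_LLC_AGGR_TOLERANCE and its numeric value below are hypothetical placeholders, not something defined by this patch or by current kernels, so on today's kernels the call simply fails with EINVAL:

#include <stdio.h>
#include <sys/prctl.h>

/*
 * Hypothetical per-process counterpart of the global
 * /sys/kernel/debug/sched/llc_aggr_tolerance knob (0..100).
 * The option number is a placeholder for illustration only.
 */
#ifndef PR_SET_LLC_AGGR_TOLERANCE
#define PR_SET_LLC_AGGR_TOLERANCE	0x53430001
#endif

int main(void)
{
	/* Request aggressive aggregation for this process only. */
	if (prctl(PR_SET_LLC_AGGR_TOLERANCE, 100, 0, 0, 0) == -1)
		perror("prctl(PR_SET_LLC_AGGR_TOLERANCE)");
	return 0;
}

Such a per-process hint would let the Verilator case above opt in to aggregation without raising the global llc_aggr_tolerance and hurting the bandwidth-bound stream copy workloads.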
>> But if I set it to 100, I lose some performance on stream copy benchmarks, since memory bandwidth is limited per CCX. Thus, I think we should have a new prctl that lets userspace hint to the kernel that a task is bound by inter-core latency and should use this feature regardless of whether its RSS exceeds the LLC size.
>>
>> I finally have an EPYC Milan at home today and am glad to test this patch further. I would very much like to have this feature so that we can get fast Verilator [1] runs without manually using numactl.
>>
>> [1] https://github.com/verilator/verilator
>> [2] https://github.com/OpenXiangShan/Xiangshan
>> [3] https://github.com/cyyself/chacha20-xiangshan
>> [4] https://lore.kernel.org/lkml/tencent_6E51A3175F8AE0A7F684A319EE63CC56C806@qq.com/
>>
>> Thanks,
>> Yangyu Chen
>>
>>> if (!ret)
>>> max_llcs = new_max_llcs;
>>>
>>> +out:
>>> + sched_cache_set(true);
>>> return ret;
>>> }
>>>
>>> --
>>> 2.32.0
>>
>>> On 4 Dec 2025, at 07:07, Tim Chen <tim.c.chen@linux.intel.com> wrote:
>>>
>>> From: Chen Yu <yu.c.chen@intel.com>
>>>
>>> Introduce a set of debugfs knobs to control the enabling of
>>> and parameters for cache-aware load balancing.
>>>
>>> (1) llc_enabled
>>> llc_enabled acts as the primary switch - users can toggle it to
>>> enable or disable cache aware load balancing.
>>>
>>> (2) llc_aggr_tolerance
>>> With sched_cache enabled, the scheduler uses a process's RSS as a
>>> proxy for its LLC footprint to determine if aggregating tasks on the
>>> preferred LLC could cause cache contention. If RSS exceeds the LLC
>>> size, aggregation is skipped. Some workloads with large RSS but small
>>> actual memory footprints may still benefit from aggregation. Since
>>> the kernel cannot efficiently track per-task cache usage (resctrl is
>>> user-space only), userspace can provide a more accurate hint.
>>>
>>> Introduce /sys/kernel/debug/sched/llc_aggr_tolerance to let
>>> users control how strictly RSS limits aggregation. Values range from
>>> 0 to 100:
>>>
>>> - 0: Cache-aware scheduling is disabled.
>>> - 1: Strict; tasks with RSS larger than LLC size are skipped.
>>> - 100: Aggressive; tasks are aggregated regardless of RSS.
>>>
>>
>> Hi Chen Yu and Tim Chen,
>>
>> Maybe we should have something like prctl(PR_LLC_AGGR_TOLERANCE, 100).
>>
>> I have tested this version of the patch on my EPYC Milan 7V13 (7763 variant) physical machine, with 32M LLC for each 8-core CCX. I found that I need to tune "llc_aggr_tolerance" to 100, else I can't get cache-aware scheduling to work on Verilated [1] XiangShan [2] running the chacha20 [3] as I mentioned before [4].
>>
>> But if I set it to 100, I will lose some performance on stream copy benchmarks since the bandwidth is limited per CCX. Thus, I think we should have a new prctl to let userspace software hint to the kernel that this task can be bound by latency between cores and should use this feature even if its RSS exceeds the LLC size.
>>
>> I finally have an EPYC Milan at home today. I'm glad to test this patch further. And I'm very willing to have this feature so we can have a fast Verilator [1] without running numactl manually.
>>
>> [1] https://github.com/verilator/verilator
>> [2] https://github.com/OpenXiangShan/Xiangshan
>> [3] https://github.com/cyyself/chacha20-xiangshan
>> [4] https://lore.kernel.org/lkml/tencent_6E51A3175F8AE0A7F684A319EE63CC56C806@qq.com/
>>
>> Thanks,
>> Yangyu Chen
>>
>>> For example, with a 32MB L3 cache:
>>>
>>> - llc_aggr_tolerance=1 -> tasks with RSS > 32MB are skipped.
>>> - llc_aggr_tolerance=99 -> tasks with RSS > 784GB are skipped
>>> (784GB = (1 + (99 - 1) * 256) * 32MB).
>>>
>>> Similarly, /sys/kernel/debug/sched/llc_aggr_tolerance also controls
>>> how strictly the number of active threads is considered when doing
>>> cache aware load balance. The number of SMTs is also considered.
>>> High SMT counts reduce the aggregation capacity, preventing excessive
>>> task aggregation on SMT-heavy systems like Power10/Power11.
>>>
>>> For example, with 8 Cores/16 CPUs in a L3:
>>>
>>> - llc_aggr_tolerance=1 -> tasks with nr_running > 8 are skipped.
>>> - llc_aggr_tolerance=99 -> tasks with nr_running > 785 are skipped
>>> 785 = (1 + (99 - 1) * 8).
>>>
>>> (3) llc_epoch_period/llc_epoch_affinity_timeout
>>> Besides, llc_epoch_period and llc_epoch_affinity_timeout are also turned
>>> into tunable.
>>>
>>> Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
>>> Suggested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
>>> Suggested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
>>> Suggested-by: Tingyin Duan <tingyin.duan@gmail.com>
>>> Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
>>> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
>>> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
>>> ---
>>>
>>> Notes:
>>> v1->v2: Remove the smt_nr check in fits_llc_capacity().
>>> (Aaron Lu)
>>>
>>> include/linux/sched.h | 4 ++-
>>> kernel/sched/debug.c | 62 ++++++++++++++++++++++++++++++++++++++++
>>> kernel/sched/fair.c | 63 ++++++++++++++++++++++++++++++++++++-----
>>> kernel/sched/sched.h | 5 ++++
>>> kernel/sched/topology.c | 54 +++++++++++++++++++++++++++++++++--
>>> 5 files changed, 178 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>>> index 466ba8b7398c..95bf080bbbf0 100644
>>> --- a/include/linux/sched.h
>>> +++ b/include/linux/sched.h
>>> @@ -2436,9 +2436,11 @@ extern void migrate_enable(void);
>>> DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
>>>
>>> #ifdef CONFIG_SCHED_CACHE
>>> +DECLARE_STATIC_KEY_FALSE(sched_cache_on);
>>> +
>>> static inline bool sched_cache_enabled(void)
>>> {
>>> - return false;
>>> + return static_branch_unlikely(&sched_cache_on);
>>> }
>>> #endif
>>>
>>> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
>>> index 02e16b70a790..cde324672103 100644
>>> --- a/kernel/sched/debug.c
>>> +++ b/kernel/sched/debug.c
>>> @@ -169,6 +169,53 @@ static const struct file_operations sched_feat_fops = {
>>> .release = single_release,
>>> };
>>>
>>> +#ifdef CONFIG_SCHED_CACHE
>>> +#define SCHED_CACHE_CREATE_CONTROL(name, max) \
>>> +static ssize_t sched_cache_write_##name(struct file *filp, \
>>> + const char __user *ubuf, \
>>> + size_t cnt, loff_t *ppos) \
>>> +{ \
>>> + char buf[16]; \
>>> + unsigned int val; \
>>> + if (cnt > 15) \
>>> + cnt = 15; \
>>> + if (copy_from_user(&buf, ubuf, cnt)) \
>>> + return -EFAULT; \
>>> + buf[cnt] = '\0'; \
>>> + if (kstrtouint(buf, 10, &val)) \
>>> + return -EINVAL; \
>>> + if (val > (max)) \
>>> + return -EINVAL; \
>>> + llc_##name = val; \
>>> + if (!strcmp(#name, "enabled")) \
>>> + sched_cache_set(false); \
>>> + *ppos += cnt; \
>>> + return cnt; \
>>> +} \
>>> +static int sched_cache_show_##name(struct seq_file *m, void *v) \
>>> +{ \
>>> + seq_printf(m, "%d\n", llc_##name); \
>>> + return 0; \
>>> +} \
>>> +static int sched_cache_open_##name(struct inode *inode, \
>>> + struct file *filp) \
>>> +{ \
>>> + return single_open(filp, sched_cache_show_##name, NULL); \
>>> +} \
>>> +static const struct file_operations sched_cache_fops_##name = { \
>>> + .open = sched_cache_open_##name, \
>>> + .write = sched_cache_write_##name, \
>>> + .read = seq_read, \
>>> + .llseek = seq_lseek, \
>>> + .release = single_release, \
>>> +}
>>> +
>>> +SCHED_CACHE_CREATE_CONTROL(overload_pct, 100);
>>> +SCHED_CACHE_CREATE_CONTROL(imb_pct, 100);
>>> +SCHED_CACHE_CREATE_CONTROL(aggr_tolerance, 100);
>>> +SCHED_CACHE_CREATE_CONTROL(enabled, 1);
>>> +#endif /* SCHED_CACHE */
>>> +
>>> static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
>>> size_t cnt, loff_t *ppos)
>>> {
>>> @@ -523,6 +570,21 @@ static __init int sched_init_debug(void)
>>> debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
>>> #endif /* CONFIG_NUMA_BALANCING */
>>>
>>> +#ifdef CONFIG_SCHED_CACHE
>>> + debugfs_create_file("llc_overload_pct", 0644, debugfs_sched, NULL,
>>> + &sched_cache_fops_overload_pct);
>>> + debugfs_create_file("llc_imb_pct", 0644, debugfs_sched, NULL,
>>> + &sched_cache_fops_imb_pct);
>>> + debugfs_create_file("llc_aggr_tolerance", 0644, debugfs_sched, NULL,
>>> + &sched_cache_fops_aggr_tolerance);
>>> + debugfs_create_file("llc_enabled", 0644, debugfs_sched, NULL,
>>> + &sched_cache_fops_enabled);
>>> + debugfs_create_u32("llc_epoch_period", 0644, debugfs_sched,
>>> + &llc_epoch_period);
>>> + debugfs_create_u32("llc_epoch_affinity_timeout", 0644, debugfs_sched,
>>> + &llc_epoch_affinity_timeout);
>>> +#endif
>>> +
>>> debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
>>>
>>> debugfs_fair_server_init();
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 424ec601cfdf..a2e2d6742481 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -1207,6 +1207,9 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
>>>
>>> __read_mostly unsigned int llc_overload_pct = 50;
>>> __read_mostly unsigned int llc_imb_pct = 20;
>>> +__read_mostly unsigned int llc_aggr_tolerance = 1;
>>> +__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD;
>>> +__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
>>>
>>> static int llc_id(int cpu)
>>> {
>>> @@ -1223,11 +1226,22 @@ static int llc_id(int cpu)
>>> return llc;
>>> }
>>>
>>> +static inline int get_sched_cache_scale(int mul)
>>> +{
>>> + if (!llc_aggr_tolerance)
>>> + return 0;
>>> +
>>> + if (llc_aggr_tolerance == 100)
>>> + return INT_MAX;
>>> +
>>> + return (1 + (llc_aggr_tolerance - 1) * mul);
>>> +}
>>> +
>>> static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
>>> {
>>> + unsigned int llc, scale;
>>> struct cacheinfo *ci;
>>> unsigned long rss;
>>> - unsigned int llc;
>>>
>>> /*
>>> * get_cpu_cacheinfo_level() can not be used
>>> @@ -1252,19 +1266,54 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
>>> rss = get_mm_counter(mm, MM_ANONPAGES) +
>>> get_mm_counter(mm, MM_SHMEMPAGES);
>>>
>>> - return (llc <= (rss * PAGE_SIZE));
>>> + /*
>>> + * Scale the LLC size by 256*llc_aggr_tolerance
>>> + * and compare it to the task's RSS size.
>>> + *
>>> + * Suppose the L3 size is 32MB. If the
>>> + * llc_aggr_tolerance is 1:
>>> + * When the RSS is larger than 32MB, the process
>>> + * is regarded as exceeding the LLC capacity. If
>>> + * the llc_aggr_tolerance is 99:
>>> + * When the RSS is larger than 784GB, the process
>>> + * is regarded as exceeding the LLC capacity because:
>>> + * 784GB = (1 + (99 - 1) * 256) * 32MB
>>> + */
>>> + scale = get_sched_cache_scale(256);
>>> + if (scale == INT_MAX)
>>> + return false;
>>> +
>>> + return ((llc * scale) <= (rss * PAGE_SIZE));
>>> }
>>>
>>> static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
>>> {
>>> - int smt_nr = 1;
>>> + int smt_nr = 1, scale;
>>>
>>> #ifdef CONFIG_SCHED_SMT
>>> if (sched_smt_active())
>>> smt_nr = cpumask_weight(cpu_smt_mask(cpu));
>>> #endif
>>> + /*
>>> + * Scale the Core number in a LLC by llc_aggr_tolerance
>>> + * and compare it to the task's active threads.
>>> + *
>>> + * Suppose the number of Cores in LLC is 8.
>>> + * Every core has 2 SMTs.
>>> + * If the llc_aggr_tolerance is 1: When the
>>> + * nr_running is larger than 8, the process
>>> + * is regarded as exceeding the LLC capacity.
>>> + * If the llc_aggr_tolerance is 99:
>>> + * When the nr_running is larger than 785,
>>> + * the process is regarded as exceeding
>>> + * the LLC capacity:
>>> + * 785 = 1 + (99 - 1) * 8
>>> + */
>>> + scale = get_sched_cache_scale(1);
>>> + if (scale == INT_MAX)
>>> + return false;
>>>
>>> - return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu));
>>> + return ((mm->nr_running_avg * smt_nr) > (scale * per_cpu(sd_llc_size, cpu)));
>>> }
>>>
>>> static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
>>> @@ -1350,9 +1399,9 @@ static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
>>> long delta = now - rq->cpu_epoch_next;
>>>
>>> if (delta > 0) {
>>> - n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD;
>>> + n = (delta + llc_epoch_period - 1) / llc_epoch_period;
>>> rq->cpu_epoch += n;
>>> - rq->cpu_epoch_next += n * EPOCH_PERIOD;
>>> + rq->cpu_epoch_next += n * llc_epoch_period;
>>> __shr_u64(&rq->cpu_runtime, n);
>>> }
>>>
>>> @@ -1412,7 +1461,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>>> * has only 1 thread, or has too many active threads, invalidate
>>> * its preferred state.
>>> */
>>> - if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT ||
>>> + if (epoch - READ_ONCE(mm->mm_sched_epoch) > llc_epoch_affinity_timeout ||
>>> get_nr_threads(p) <= 1 ||
>>> exceed_llc_nr(mm, cpu_of(rq)) ||
>>> exceed_llc_capacity(mm, cpu_of(rq))) {
>>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>>> index 40798a06e058..15d126bd3728 100644
>>> --- a/kernel/sched/sched.h
>>> +++ b/kernel/sched/sched.h
>>> @@ -2852,6 +2852,11 @@ extern unsigned int sysctl_numa_balancing_hot_threshold;
>>> #ifdef CONFIG_SCHED_CACHE
>>> extern unsigned int llc_overload_pct;
>>> extern unsigned int llc_imb_pct;
>>> +extern unsigned int llc_aggr_tolerance;
>>> +extern unsigned int llc_epoch_period;
>>> +extern unsigned int llc_epoch_affinity_timeout;
>>> +extern unsigned int llc_enabled;
>>> +void sched_cache_set(bool locked);
>>> #endif
>>>
>>> #ifdef CONFIG_SCHED_HRTICK
>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>>> index 9799e3a9a609..818599ddaaef 100644
>>> --- a/kernel/sched/topology.c
>>> +++ b/kernel/sched/topology.c
>>> @@ -26,6 +26,49 @@ int max_llcs;
>>>
>>> static bool sched_cache_present;
>>>
>>> +unsigned int llc_enabled = 1;
>>> +DEFINE_STATIC_KEY_FALSE(sched_cache_on);
>>> +
>>> +/*
>>> + * Enable/disable cache aware scheduling according to
>>> + * user input and the presence of hardware support.
>>> + */
>>> +static void _sched_cache_set(bool enable, bool locked)
>>> +{
>>> + if (enable) {
>>> + if (locked)
>>> + static_branch_enable_cpuslocked(&sched_cache_on);
>>> + else
>>> + static_branch_enable(&sched_cache_on);
>>> + } else {
>>> + if (locked)
>>> + static_branch_disable_cpuslocked(&sched_cache_on);
>>> + else
>>> + static_branch_disable(&sched_cache_on);
>>> + }
>>> +}
>>> +
>>> +void sched_cache_set(bool locked)
>>> +{
>>> + /* hardware does not support */
>>> + if (!sched_cache_present) {
>>> + if (static_branch_likely(&sched_cache_on))
>>> + _sched_cache_set(false, locked);
>>> +
>>> + return;
>>> + }
>>> +
>>> + /* user wants it or not ?*/
>>> + if (llc_enabled) {
>>> + if (!static_branch_likely(&sched_cache_on))
>>> + _sched_cache_set(true, locked);
>>> +
>>> + } else {
>>> + if (static_branch_likely(&sched_cache_on))
>>> + _sched_cache_set(false, locked);
>>> + }
>>> +}
>>> +
>>> static unsigned int *alloc_new_pref_llcs(unsigned int *old, unsigned int **gc)
>>> {
>>> unsigned int *new = NULL;
>>> @@ -70,8 +113,12 @@ static int resize_llc_pref(bool has_multi_llcs)
>>> * new buffer.
>>> */
>>> tmp_llc_pref = alloc_percpu_noprof(unsigned int *);
>>> - if (!tmp_llc_pref)
>>> - return -ENOMEM;
>>> + if (!tmp_llc_pref) {
>>> + sched_cache_present = false;
>>> + ret = -ENOMEM;
>>> +
>>> + goto out;
>>> + }
>>>
>>> for_each_present_cpu(i)
>>> *per_cpu_ptr(tmp_llc_pref, i) = NULL;
>>> @@ -89,6 +136,7 @@ static int resize_llc_pref(bool has_multi_llcs)
>>> new = alloc_new_pref_llcs(rq->nr_pref_llc, per_cpu_ptr(tmp_llc_pref, i));
>>> if (!new) {
>>> ret = -ENOMEM;
>>> + sched_cache_present = false;
>>>
>>> goto release_old;
>>> }
>>> @@ -126,6 +174,8 @@ static int resize_llc_pref(bool has_multi_llcs)
>>> if (!ret)
>>> max_llcs = new_max_llcs;
>>>
>>> +out:
>>> + sched_cache_set(true);
>>> return ret;
>>> }
>>>
>>> --
>>> 2.32.0
On 12/24/2025 11:28 AM, Yangyu Chen wrote:
>
>> On 24 Dec 2025, at 00:44, Yangyu Chen <cyy@cyyself.name> wrote:
>>
>>> On 23 Dec 2025, at 20:12, Yangyu Chen <cyy@cyyself.name> wrote:
>>>
>>>> On 4 Dec 2025, at 07:07, Tim Chen <tim.c.chen@linux.intel.com> wrote:
>>>>
>>>> From: Chen Yu <yu.c.chen@intel.com>
>>>>
>>>> Introduce a set of debugfs knobs to control the enabling of
>>>> and parameters for cache-aware load balancing.
>>>>
>>>> (1) llc_enabled
>>>> llc_enabled acts as the primary switch - users can toggle it to
>>>> enable or disable cache aware load balancing.
>>>>
>>>> (2) llc_aggr_tolerance
>>>> With sched_cache enabled, the scheduler uses a process's RSS as a
>>>> proxy for its LLC footprint to determine if aggregating tasks on the
>>>> preferred LLC could cause cache contention. If RSS exceeds the LLC
>>>> size, aggregation is skipped. Some workloads with large RSS but small
>>>> actual memory footprints may still benefit from aggregation. Since
>>>> the kernel cannot efficiently track per-task cache usage (resctrl is
>>>> user-space only), userspace can provide a more accurate hint.
>>>>
>>>> Introduce /sys/kernel/debug/sched/llc_aggr_tolerance to let
>>>> users control how strictly RSS limits aggregation. Values range from
>>>> 0 to 100:
>>>>
>>>> - 0: Cache-aware scheduling is disabled.
>>>> - 1: Strict; tasks with RSS larger than LLC size are skipped.
>>>> - 100: Aggressive; tasks are aggregated regardless of RSS.
>>>>
>>>
>>> Hi Chen Yu and Tim Chen,
>>>
>>> Maybe we should have something like prctl(PR_LLC_AGGR_TOLERANCE, 100).
>>>
>>> I have tested this version of the patch on my EPYC Milan 7V13 (7763 variant) physical machine, with 32M LLC for each 8-core CCX. I found that I need to tune "llc_aggr_tolerance" to 100, else I can't get cache-aware scheduling to work on Verilated [1] XiangShan [2] running the chacha20 [3] as I mentioned before [4].
>>>
>>
>> In addition, I have investigated why this happens. And finally I
>> realized that's because that workload observed 35596 kB RssAnon on
>> my EPYC Milan Machine, slightly exceeding the LLC size (32M). I
>> have tested it on an EPYC Genoa cloud server with the correct core
>> / cache hierarchy in ACPI table, that shows 31700 kB RssAnon, thus
>> fitting in LLC. I have no idea why my result shows higher RssAnon,
>> since they both run Debian Trixie with the exact same kernel and
>> same executable. But it reminds me we should have a userspace API
>> for that.
>>
>
> In addition, during profiling the verilator, I found that if scheduled
> to SMTs, it will result in poor performance. Thus, I think we should
> separate the control for rss size with the SMT scale.
>

Thanks for the investigation. Could you elaborate a little more about
scheduled to SMTs? Do you mean, if every CPU(SMT) in the LLC has 1 running
task, then the performance is impacted? I thought we have
exceed_llc_nr() to check the smt to avoid this?

> It's notable that rss size is not the actual memory footprint. It
> would be better if we could measure the l2_miss event or l3_miss
> event to measure the l3 hit rate. Just for future work.
>

Yes, in user space, we can collect PMUs events/memory bandwidth via
resctrl to decide whether to set task attributes.

> I'm willing to provide a patch for such a prctl. But I'm busy these
> days, maybe I can have the time to do that after one week.
>

Sure. We haven't yet decided which interface we can leverage.
Also, Qais is working on QOS interface[1] - maybe we can build
on his work.

[1] https://lore.kernel.org/all/20240820163512.1096301-11-qyousef@layalina.io/

thanks,
Chenyu
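To make the proposed hint concrete, here is a minimal userspace sketch of the kind of per-task prctl() call Yangyu suggests above. PR_LLC_AGGR_TOLERANCE does not exist in any kernel; both the option name and its numeric value below are hypothetical placeholders, not something defined by this patch series.

/*
 * Illustrative sketch only: PR_LLC_AGGR_TOLERANCE is a proposed, not yet
 * existing prctl option; the constant below is a made-up placeholder.
 */
#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_LLC_AGGR_TOLERANCE
#define PR_LLC_AGGR_TOLERANCE	0x53434143	/* hypothetical option number */
#endif

int main(void)
{
	/*
	 * Ask the kernel to aggregate this task on its preferred LLC
	 * regardless of RSS - the effect of llc_aggr_tolerance=100, but
	 * scoped to this task instead of system-wide.
	 */
	if (prctl(PR_LLC_AGGR_TOLERANCE, 100, 0, 0, 0))
		perror("prctl(PR_LLC_AGGR_TOLERANCE)");

	return 0;
}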
> On 24 Dec 2025, at 15:51, Chen, Yu C <yu.c.chen@intel.com> wrote:
>
> On 12/24/2025 11:28 AM, Yangyu Chen wrote:
>>> On 24 Dec 2025, at 00:44, Yangyu Chen <cyy@cyyself.name> wrote:
>>>
>>>> On 23 Dec 2025, at 20:12, Yangyu Chen <cyy@cyyself.name> wrote:
>>>>
>>>>> On 4 Dec 2025, at 07:07, Tim Chen <tim.c.chen@linux.intel.com> wrote:
>>>>>
>>>>> From: Chen Yu <yu.c.chen@intel.com>
>>>>>
>>>>> Introduce a set of debugfs knobs to control the enabling of
>>>>> and parameters for cache-aware load balancing.
>>>>>
>>>>> (1) llc_enabled
>>>>> llc_enabled acts as the primary switch - users can toggle it to
>>>>> enable or disable cache aware load balancing.
>>>>>
>>>>> (2) llc_aggr_tolerance
>>>>> With sched_cache enabled, the scheduler uses a process's RSS as a
>>>>> proxy for its LLC footprint to determine if aggregating tasks on the
>>>>> preferred LLC could cause cache contention. If RSS exceeds the LLC
>>>>> size, aggregation is skipped. Some workloads with large RSS but small
>>>>> actual memory footprints may still benefit from aggregation. Since
>>>>> the kernel cannot efficiently track per-task cache usage (resctrl is
>>>>> user-space only), userspace can provide a more accurate hint.
>>>>>
>>>>> Introduce /sys/kernel/debug/sched/llc_aggr_tolerance to let
>>>>> users control how strictly RSS limits aggregation. Values range from
>>>>> 0 to 100:
>>>>>
>>>>> - 0: Cache-aware scheduling is disabled.
>>>>> - 1: Strict; tasks with RSS larger than LLC size are skipped.
>>>>> - 100: Aggressive; tasks are aggregated regardless of RSS.
>>>>>
>>>>
>>>> Hi Chen Yu and Tim Chen,
>>>>
>>>> Maybe we should have something like prctl(PR_LLC_AGGR_TOLERANCE, 100).
>>>>
>>>> I have tested this version of the patch on my EPYC Milan 7V13 (7763 variant) physical machine, with 32M LLC for each 8-core CCX. I found that I need to tune "llc_aggr_tolerance" to 100, else I can't get cache-aware scheduling to work on Verilated [1] XiangShan [2] running the chacha20 [3] as I mentioned before [4].
>>>>
>>>
>>> In addition, I have investigated why this happens. And finally I
>>> realized that's because that workload observed 35596 kB RssAnon on
>>> my EPYC Milan Machine, slightly exceeding the LLC size (32M). I
>>> have tested it on an EPYC Genoa cloud server with the correct core
>>> / cache hierarchy in ACPI table, that shows 31700 kB RssAnon, thus
>>> fitting in LLC. I have no idea why my result shows higher RssAnon,
>>> since they both run Debian Trixie with the exact same kernel and
>>> same executable. But it reminds me we should have a userspace API
>>> for that.
>>>
>> In addition, during profiling the verilator, I found that if scheduled
>> to SMTs, it will result in poor performance. Thus, I think we should
>> separate the control for rss size with the SMT scale.
>
> Thanks for the investigation. Could you elaborate a little more about
> scheduled to SMTs? Do you mean, if every CPU(SMT) in the LLC has 1 running
> task, then the performance is impacted? I thought we have
> exceed_llc_nr() to check the smt to avoid this?

The verilator can specify the number of threads being used for the RTL simulator during compilation. And it cannot be changed at runtime since it will do static partitioning. Thus, I didn't mean if there is another thread being scheduled to a SMT in the LLC and we got poor performance. I mean that the users can allow the verilator to use more threads larger than the LLC capacity.

But I have tested your case, on my observation with the recent version of XiangShan + Verilator + LLVM21 with an 8-thread emulator, it shows 41% (30% for 1-thread) and 62% (39% for 1-thread) performance degradation on Raptor Lake and EPYC Milan if another 8 threads are running with a simple loop. But I think that's only a datapoint. Since both Raptor Lake and Zen 5 will statically partition the ROB in the CPU backend, and such workloads will suffer a lot of data cache misses since they have a very huge instruction footprint. I think SMT performance is not easy to characterize across different microarchitectures and workloads, but one thing for sure is that I didn't come across a situation where a 16-thread emulator on an EPYC machine scheduled to 1-CCX with 2-SMT is better than 2-CCX with only 1-SMT. That's why I think we should split this two user controls, one for RSS and one for the number of threads.

Thanks,
Yangyu Chen

>
>> It's notable that rss size is not the actual memory footprint. It
>> would be better if we could measure the l2_miss event or l3_miss
>> event to measure the l3 hit rate. Just for future work.
>
> Yes, in user space, we can collect PMUs events/memory bandwidth via
> resctrl to decide whether to set task attributes.
>
>> I'm willing to provide a patch for such a prctl. But I'm busy these
>> days, maybe I can have the time to do that after one week.
>
> Sure. We haven't yet decided which interface we can leverage.
> Also, Qais is working on QOS interface[1] - maybe we can build
> on his work.
>
> [1] https://lore.kernel.org/all/20240820163512.1096301-11-qyousef@layalina.io/
>
> thanks,
> Chenyu
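For illustration, a rough sketch of how the split Yangyu asks for could look if the single tolerance knob were divided in two. llc_rss_tolerance and llc_nr_tolerance are hypothetical names; this is not part of the posted series, only a fragment mirroring the patch's own get_sched_cache_scale() shape.

/*
 * Hypothetical sketch only: split llc_aggr_tolerance into two knobs so the
 * RSS check and the thread-count check can be relaxed independently.
 */
__read_mostly unsigned int llc_rss_tolerance = 1;	/* scales exceed_llc_capacity() */
__read_mostly unsigned int llc_nr_tolerance = 1;	/* scales exceed_llc_nr() */

static inline int sched_cache_scale(unsigned int tolerance, int mul)
{
	/* Same shape as get_sched_cache_scale() in the patch. */
	if (!tolerance)
		return 0;

	if (tolerance == 100)
		return INT_MAX;

	return 1 + (tolerance - 1) * mul;
}

/*
 * exceed_llc_capacity() would then use sched_cache_scale(llc_rss_tolerance, 256)
 * and exceed_llc_nr() would use sched_cache_scale(llc_nr_tolerance, 1), so a
 * Verilator-style workload could relax the RSS limit while keeping the
 * thread-count limit strict.
 */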
On 2025/12/4 07:07, Tim Chen wrote:
> From: Chen Yu <yu.c.chen@intel.com>
>
> Introduce a set of debugfs knobs to control the enabling of
> and parameters for cache-aware load balancing.
>
> (1) llc_enabled
> llc_enabled acts as the primary switch - users can toggle it to
> enable or disable cache aware load balancing.
>
> (2) llc_aggr_tolerance
> With sched_cache enabled, the scheduler uses a process's RSS as a
> proxy for its LLC footprint to determine if aggregating tasks on the
> preferred LLC could cause cache contention. If RSS exceeds the LLC
> size, aggregation is skipped. Some workloads with large RSS but small
> actual memory footprints may still benefit from aggregation. Since
> the kernel cannot efficiently track per-task cache usage (resctrl is
> user-space only), userspace can provide a more accurate hint.
>
> Introduce /sys/kernel/debug/sched/llc_aggr_tolerance to let
> users control how strictly RSS limits aggregation. Values range from
> 0 to 100:
>
> - 0: Cache-aware scheduling is disabled.
> - 1: Strict; tasks with RSS larger than LLC size are skipped.
> - 100: Aggressive; tasks are aggregated regardless of RSS.
>
> For example, with a 32MB L3 cache:
>
> - llc_aggr_tolerance=1 -> tasks with RSS > 32MB are skipped.
> - llc_aggr_tolerance=99 -> tasks with RSS > 784GB are skipped
> (784GB = (1 + (99 - 1) * 256) * 32MB).
>
> Similarly, /sys/kernel/debug/sched/llc_aggr_tolerance also controls
> how strictly the number of active threads is considered when doing
> cache aware load balance. The number of SMTs is also considered.
> High SMT counts reduce the aggregation capacity, preventing excessive
> task aggregation on SMT-heavy systems like Power10/Power11.
>
> For example, with 8 Cores/16 CPUs in a L3:
>
> - llc_aggr_tolerance=1 -> tasks with nr_running > 8 are skipped.
> - llc_aggr_tolerance=99 -> tasks with nr_running > 785 are skipped
> 785 = (1 + (99 - 1) * 8).
>
> (3) llc_epoch_period/llc_epoch_affinity_timeout
> Besides, llc_epoch_period and llc_epoch_affinity_timeout are also turned
> into tunable.
>
> Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> Suggested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
> Suggested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
> Suggested-by: Tingyin Duan <tingyin.duan@gmail.com>
> Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
> ---
>
> Notes:
> v1->v2: Remove the smt_nr check in fits_llc_capacity().
> (Aaron Lu)
>
> include/linux/sched.h | 4 ++-
> kernel/sched/debug.c | 62 ++++++++++++++++++++++++++++++++++++++++
> kernel/sched/fair.c | 63 ++++++++++++++++++++++++++++++++++++-----
> kernel/sched/sched.h | 5 ++++
> kernel/sched/topology.c | 54 +++++++++++++++++++++++++++++++++--
> 5 files changed, 178 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 466ba8b7398c..95bf080bbbf0 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2436,9 +2436,11 @@ extern void migrate_enable(void);
> DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
>
> #ifdef CONFIG_SCHED_CACHE
> +DECLARE_STATIC_KEY_FALSE(sched_cache_on);
> +
> static inline bool sched_cache_enabled(void)
> {
> - return false;
> + return static_branch_unlikely(&sched_cache_on);
> }
> #endif
>
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 02e16b70a790..cde324672103 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -169,6 +169,53 @@ static const struct file_operations sched_feat_fops = {
> .release = single_release,
> };
>
> +#ifdef CONFIG_SCHED_CACHE
> +#define SCHED_CACHE_CREATE_CONTROL(name, max) \
> +static ssize_t sched_cache_write_##name(struct file *filp, \
> + const char __user *ubuf, \
> + size_t cnt, loff_t *ppos) \
> +{ \
> + char buf[16]; \
> + unsigned int val; \
> + if (cnt > 15) \
> + cnt = 15; \
> + if (copy_from_user(&buf, ubuf, cnt)) \
> + return -EFAULT; \
> + buf[cnt] = '\0'; \
> + if (kstrtouint(buf, 10, &val)) \
> + return -EINVAL; \
> + if (val > (max)) \
> + return -EINVAL; \
> + llc_##name = val; \
> + if (!strcmp(#name, "enabled")) \
> + sched_cache_set(false); \
> + *ppos += cnt; \
> + return cnt; \
> +} \
> +static int sched_cache_show_##name(struct seq_file *m, void *v) \
> +{ \
> + seq_printf(m, "%d\n", llc_##name); \
> + return 0; \
> +} \
> +static int sched_cache_open_##name(struct inode *inode, \
> + struct file *filp) \
> +{ \
> + return single_open(filp, sched_cache_show_##name, NULL); \
> +} \
> +static const struct file_operations sched_cache_fops_##name = { \
> + .open = sched_cache_open_##name, \
> + .write = sched_cache_write_##name, \
> + .read = seq_read, \
> + .llseek = seq_lseek, \
> + .release = single_release, \
> +}
> +
> +SCHED_CACHE_CREATE_CONTROL(overload_pct, 100);
> +SCHED_CACHE_CREATE_CONTROL(imb_pct, 100);
> +SCHED_CACHE_CREATE_CONTROL(aggr_tolerance, 100);
> +SCHED_CACHE_CREATE_CONTROL(enabled, 1);
> +#endif /* SCHED_CACHE */
> +
> static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
> size_t cnt, loff_t *ppos)
> {
> @@ -523,6 +570,21 @@ static __init int sched_init_debug(void)
> debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
> #endif /* CONFIG_NUMA_BALANCING */
>
> +#ifdef CONFIG_SCHED_CACHE
> + debugfs_create_file("llc_overload_pct", 0644, debugfs_sched, NULL,
> + &sched_cache_fops_overload_pct);
> + debugfs_create_file("llc_imb_pct", 0644, debugfs_sched, NULL,
> + &sched_cache_fops_imb_pct);
> + debugfs_create_file("llc_aggr_tolerance", 0644, debugfs_sched, NULL,
> + &sched_cache_fops_aggr_tolerance);
> + debugfs_create_file("llc_enabled", 0644, debugfs_sched, NULL,
> + &sched_cache_fops_enabled);
> + debugfs_create_u32("llc_epoch_period", 0644, debugfs_sched,
> + &llc_epoch_period);
> + debugfs_create_u32("llc_epoch_affinity_timeout", 0644, debugfs_sched,
> + &llc_epoch_affinity_timeout);
> +#endif
> +
> debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
>
> debugfs_fair_server_init();
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 424ec601cfdf..a2e2d6742481 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1207,6 +1207,9 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
>
> __read_mostly unsigned int llc_overload_pct = 50;
> __read_mostly unsigned int llc_imb_pct = 20;
> +__read_mostly unsigned int llc_aggr_tolerance = 1;
> +__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD;
> +__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
>
> static int llc_id(int cpu)
> {
> @@ -1223,11 +1226,22 @@ static int llc_id(int cpu)
> return llc;
> }
>
> +static inline int get_sched_cache_scale(int mul)
> +{
> + if (!llc_aggr_tolerance)
> + return 0;
> +
> + if (llc_aggr_tolerance == 100)
the range of llc_aggr_tolerance is [0, 100], so a little bug here? Maybe
check if (llc_aggr_tolerance >= 100).
And if llc_aggr_tolerance = 0, the function returns 0, which means
exceed_llc_capacity & exceed_llc_nr are always true; it may be
inconsistent to have this value set while llc_enable=1 is set.
> + return INT_MAX;
> +
> + return (1 + (llc_aggr_tolerance - 1) * mul);
> +}
> +
> static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
> {
> + unsigned int llc, scale;
> struct cacheinfo *ci;
> unsigned long rss;
> - unsigned int llc;
>
> /*
> * get_cpu_cacheinfo_level() can not be used
> @@ -1252,19 +1266,54 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
> rss = get_mm_counter(mm, MM_ANONPAGES) +
> get_mm_counter(mm, MM_SHMEMPAGES);
>
> - return (llc <= (rss * PAGE_SIZE));
> + /*
> + * Scale the LLC size by 256*llc_aggr_tolerance
> + * and compare it to the task's RSS size.
> + *
> + * Suppose the L3 size is 32MB. If the
> + * llc_aggr_tolerance is 1:
> + * When the RSS is larger than 32MB, the process
> + * is regarded as exceeding the LLC capacity. If
> + * the llc_aggr_tolerance is 99:
> + * When the RSS is larger than 784GB, the process
> + * is regarded as exceeding the LLC capacity because:
> + * 784GB = (1 + (99 - 1) * 256) * 32MB
> + */
> + scale = get_sched_cache_scale(256);
> + if (scale == INT_MAX)
> + return false;
> +
> + return ((llc * scale) <= (rss * PAGE_SIZE));
> }
>
> static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
> {
> - int smt_nr = 1;
> + int smt_nr = 1, scale;
>
> #ifdef CONFIG_SCHED_SMT
> if (sched_smt_active())
> smt_nr = cpumask_weight(cpu_smt_mask(cpu));
> #endif
> + /*
> + * Scale the Core number in a LLC by llc_aggr_tolerance
> + * and compare it to the task's active threads.
> + *
> + * Suppose the number of Cores in LLC is 8.
> + * Every core has 2 SMTs.
> + * If the llc_aggr_tolerance is 1: When the
> + * nr_running is larger than 8, the process
> + * is regarded as exceeding the LLC capacity.
> + * If the llc_aggr_tolerance is 99:
> + * When the nr_running is larger than 785,
> + * the process is regarded as exceeding
> + * the LLC capacity:
> + * 785 = 1 + (99 - 1) * 8
> + */
> + scale = get_sched_cache_scale(1);
> + if (scale == INT_MAX)
> + return false;
>
> - return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu));
> + return ((mm->nr_running_avg * smt_nr) > (scale * per_cpu(sd_llc_size, cpu)));
> }
>
> static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
> @@ -1350,9 +1399,9 @@ static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
> long delta = now - rq->cpu_epoch_next;
>
> if (delta > 0) {
> - n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD;
> + n = (delta + llc_epoch_period - 1) / llc_epoch_period;
> rq->cpu_epoch += n;
> - rq->cpu_epoch_next += n * EPOCH_PERIOD;
> + rq->cpu_epoch_next += n * llc_epoch_period;
> __shr_u64(&rq->cpu_runtime, n);
> }
>
> @@ -1412,7 +1461,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
> * has only 1 thread, or has too many active threads, invalidate
> * its preferred state.
> */
> - if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT ||
> + if (epoch - READ_ONCE(mm->mm_sched_epoch) > llc_epoch_affinity_timeout ||
> get_nr_threads(p) <= 1 ||
> exceed_llc_nr(mm, cpu_of(rq)) ||
> exceed_llc_capacity(mm, cpu_of(rq))) {
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 40798a06e058..15d126bd3728 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2852,6 +2852,11 @@ extern unsigned int sysctl_numa_balancing_hot_threshold;
> #ifdef CONFIG_SCHED_CACHE
> extern unsigned int llc_overload_pct;
> extern unsigned int llc_imb_pct;
> +extern unsigned int llc_aggr_tolerance;
> +extern unsigned int llc_epoch_period;
> +extern unsigned int llc_epoch_affinity_timeout;
> +extern unsigned int llc_enabled;
> +void sched_cache_set(bool locked);
> #endif
>
> #ifdef CONFIG_SCHED_HRTICK
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 9799e3a9a609..818599ddaaef 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -26,6 +26,49 @@ int max_llcs;
>
> static bool sched_cache_present;
>
> +unsigned int llc_enabled = 1;
> +DEFINE_STATIC_KEY_FALSE(sched_cache_on);
> +
> +/*
> + * Enable/disable cache aware scheduling according to
> + * user input and the presence of hardware support.
> + */
> +static void _sched_cache_set(bool enable, bool locked)
> +{
> + if (enable) {
> + if (locked)
> + static_branch_enable_cpuslocked(&sched_cache_on);
> + else
> + static_branch_enable(&sched_cache_on);
> + } else {
> + if (locked)
> + static_branch_disable_cpuslocked(&sched_cache_on);
> + else
> + static_branch_disable(&sched_cache_on);
> + }
> +}
> +
> +void sched_cache_set(bool locked)
> +{
> + /* hardware does not support */
> + if (!sched_cache_present) {
> + if (static_branch_likely(&sched_cache_on))
> + _sched_cache_set(false, locked);
> +
> + return;
> + }
> +
> + /* user wants it or not ?*/
> + if (llc_enabled) {
> + if (!static_branch_likely(&sched_cache_on))
> + _sched_cache_set(true, locked);
> +
> + } else {
> + if (static_branch_likely(&sched_cache_on))
> + _sched_cache_set(false, locked);
> + }
> +}
> +
> static unsigned int *alloc_new_pref_llcs(unsigned int *old, unsigned int **gc)
> {
> unsigned int *new = NULL;
> @@ -70,8 +113,12 @@ static int resize_llc_pref(bool has_multi_llcs)
> * new buffer.
> */
> tmp_llc_pref = alloc_percpu_noprof(unsigned int *);
> - if (!tmp_llc_pref)
> - return -ENOMEM;
> + if (!tmp_llc_pref) {
> + sched_cache_present = false;
> + ret = -ENOMEM;
> +
> + goto out;
> + }
>
> for_each_present_cpu(i)
> *per_cpu_ptr(tmp_llc_pref, i) = NULL;
> @@ -89,6 +136,7 @@ static int resize_llc_pref(bool has_multi_llcs)
> new = alloc_new_pref_llcs(rq->nr_pref_llc, per_cpu_ptr(tmp_llc_pref, i));
> if (!new) {
> ret = -ENOMEM;
> + sched_cache_present = false;
>
> goto release_old;
> }
> @@ -126,6 +174,8 @@ static int resize_llc_pref(bool has_multi_llcs)
> if (!ret)
> max_llcs = new_max_llcs;
>
> +out:
> + sched_cache_set(true);
> return ret;
> }
>
On 12/19/2025 12:14 PM, Vern Hao wrote:
>
> On 2025/12/4 07:07, Tim Chen wrote:
>> From: Chen Yu <yu.c.chen@intel.com>
> the range of llc_aggr_tolerance is [0, 100], so a little bug here? Maybe
> check if (llc_aggr_tolerance >= 100).
>
> And if llc_aggr_tolerance = 0, the function returns 0, which means
> exceed_llc_capacity & exceed_llc_nr are always true; it may be
> inconsistent to have this value set while llc_enable=1 is set.
>

I see your point. The original idea was that llc_aggr_tolerance and
llc_enable work together (independently) to determine whether
cache-aware scheduling should be enabled. That is to say, llc_enable
was not supposed to be used as an indicator for users to query whether
the actual cache-aware scheduling is enabled. Let me check if we can
reset llc_enable if llc_aggr_tolerance is 0.

thanks,
Chenyu
On 12/19/2025 12:14 PM, Vern Hao wrote:
>
> On 2025/12/4 07:07, Tim Chen wrote:
>> From: Chen Yu <yu.c.chen@intel.com>
>>
>> Introduce a set of debugfs knobs to control the enabling of
>> and parameters for cache-aware load balancing.
>>
>> (1) llc_enabled
>> llc_enabled acts as the primary switch - users can toggle it to
>> enable or disable cache aware load balancing.
>>
>> (2) llc_aggr_tolerance
>> With sched_cache enabled, the scheduler uses a process's RSS as a
>> proxy for its LLC footprint to determine if aggregating tasks on the
>> preferred LLC could cause cache contention. If RSS exceeds the LLC
>> size, aggregation is skipped. Some workloads with large RSS but small
>> actual memory footprints may still benefit from aggregation. Since
>> the kernel cannot efficiently track per-task cache usage (resctrl is
>> user-space only), userspace can provide a more accurate hint.
>>
>> Introduce /sys/kernel/debug/sched/llc_aggr_tolerance to let
>> users control how strictly RSS limits aggregation. Values range from
>> 0 to 100:
>>
>> - 0: Cache-aware scheduling is disabled.
>> - 1: Strict; tasks with RSS larger than LLC size are skipped.
>> - 100: Aggressive; tasks are aggregated regardless of RSS.
>>
>> For example, with a 32MB L3 cache:
>>
>> - llc_aggr_tolerance=1 -> tasks with RSS > 32MB are skipped.
>> - llc_aggr_tolerance=99 -> tasks with RSS > 784GB are skipped
>> (784GB = (1 + (99 - 1) * 256) * 32MB).
>>
>> Similarly, /sys/kernel/debug/sched/llc_aggr_tolerance also controls
>> how strictly the number of active threads is considered when doing
>> cache aware load balance. The number of SMTs is also considered.
>> High SMT counts reduce the aggregation capacity, preventing excessive
>> task aggregation on SMT-heavy systems like Power10/Power11.
>>
>> For example, with 8 Cores/16 CPUs in a L3:
>>
>> - llc_aggr_tolerance=1 -> tasks with nr_running > 8 are skipped.
>> - llc_aggr_tolerance=99 -> tasks with nr_running > 785 are skipped
>> 785 = (1 + (99 - 1) * 8).
>>
>> (3) llc_epoch_period/llc_epoch_affinity_timeout
>> Besides, llc_epoch_period and llc_epoch_affinity_timeout are also turned
>> into tunable.
>>
>> Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
>> Suggested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
>> Suggested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
>> Suggested-by: Tingyin Duan <tingyin.duan@gmail.com>
>> Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
>> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
>> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
>> ---
>>
>> Notes:
>> v1->v2: Remove the smt_nr check in fits_llc_capacity().
>> (Aaron Lu)
>>
>> include/linux/sched.h | 4 ++-
>> kernel/sched/debug.c | 62 ++++++++++++++++++++++++++++++++++++++++
>> kernel/sched/fair.c | 63 ++++++++++++++++++++++++++++++++++++-----
>> kernel/sched/sched.h | 5 ++++
>> kernel/sched/topology.c | 54 +++++++++++++++++++++++++++++++++--
>> 5 files changed, 178 insertions(+), 10 deletions(-)
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 466ba8b7398c..95bf080bbbf0 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -2436,9 +2436,11 @@ extern void migrate_enable(void);
>> DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
>> #ifdef CONFIG_SCHED_CACHE
>> +DECLARE_STATIC_KEY_FALSE(sched_cache_on);
>> +
>> static inline bool sched_cache_enabled(void)
>> {
>> - return false;
>> + return static_branch_unlikely(&sched_cache_on);
>> }
>> #endif
>> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
>> index 02e16b70a790..cde324672103 100644
>> --- a/kernel/sched/debug.c
>> +++ b/kernel/sched/debug.c
>> @@ -169,6 +169,53 @@ static const struct file_operations
>> sched_feat_fops = {
>> .release = single_release,
>> };
>> +#ifdef CONFIG_SCHED_CACHE
>> +#define SCHED_CACHE_CREATE_CONTROL(name, max) \
>> +static ssize_t sched_cache_write_##name(struct file *filp, \
>> + const char __user *ubuf, \
>> + size_t cnt, loff_t *ppos) \
>> +{ \
>> + char buf[16]; \
>> + unsigned int val; \
>> + if (cnt > 15) \
>> + cnt = 15; \
>> + if (copy_from_user(&buf, ubuf, cnt)) \
>> + return -EFAULT; \
>> + buf[cnt] = '\0'; \
>> + if (kstrtouint(buf, 10, &val)) \
>> + return -EINVAL; \
>> + if (val > (max)) \
>> + return -EINVAL; \
>> + llc_##name = val; \
>> + if (!strcmp(#name, "enabled")) \
>> + sched_cache_set(false); \
>> + *ppos += cnt; \
>> + return cnt; \
>> +} \
>> +static int sched_cache_show_##name(struct seq_file *m, void *v) \
>> +{ \
>> + seq_printf(m, "%d\n", llc_##name); \
>> + return 0; \
>> +} \
>> +static int sched_cache_open_##name(struct inode *inode, \
>> + struct file *filp) \
>> +{ \
>> + return single_open(filp, sched_cache_show_##name, NULL); \
>> +} \
>> +static const struct file_operations sched_cache_fops_##name = { \
>> + .open = sched_cache_open_##name, \
>> + .write = sched_cache_write_##name, \
>> + .read = seq_read, \
>> + .llseek = seq_lseek, \
>> + .release = single_release, \
>> +}
>> +
>> +SCHED_CACHE_CREATE_CONTROL(overload_pct, 100);
>> +SCHED_CACHE_CREATE_CONTROL(imb_pct, 100);
>> +SCHED_CACHE_CREATE_CONTROL(aggr_tolerance, 100);
>> +SCHED_CACHE_CREATE_CONTROL(enabled, 1);
>> +#endif /* SCHED_CACHE */
>> +
>> static ssize_t sched_scaling_write(struct file *filp, const char
>> __user *ubuf,
>> size_t cnt, loff_t *ppos)
>> {
>> @@ -523,6 +570,21 @@ static __init int sched_init_debug(void)
>> debugfs_create_u32("hot_threshold_ms", 0644, numa,
>> &sysctl_numa_balancing_hot_threshold);
>> #endif /* CONFIG_NUMA_BALANCING */
>> +#ifdef CONFIG_SCHED_CACHE
>> + debugfs_create_file("llc_overload_pct", 0644, debugfs_sched, NULL,
>> + &sched_cache_fops_overload_pct);
>> + debugfs_create_file("llc_imb_pct", 0644, debugfs_sched, NULL,
>> + &sched_cache_fops_imb_pct);
>> + debugfs_create_file("llc_aggr_tolerance", 0644, debugfs_sched, NULL,
>> + &sched_cache_fops_aggr_tolerance);
>> + debugfs_create_file("llc_enabled", 0644, debugfs_sched, NULL,
>> + &sched_cache_fops_enabled);
>> + debugfs_create_u32("llc_epoch_period", 0644, debugfs_sched,
>> + &llc_epoch_period);
>> + debugfs_create_u32("llc_epoch_affinity_timeout", 0644,
>> debugfs_sched,
>> + &llc_epoch_affinity_timeout);
>> +#endif
>> +
>> debugfs_create_file("debug", 0444, debugfs_sched, NULL,
>> &sched_debug_fops);
>> debugfs_fair_server_init();
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 424ec601cfdf..a2e2d6742481 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -1207,6 +1207,9 @@ static s64 update_se(struct rq *rq, struct
>> sched_entity *se)
>> __read_mostly unsigned int llc_overload_pct = 50;
>> __read_mostly unsigned int llc_imb_pct = 20;
>> +__read_mostly unsigned int llc_aggr_tolerance = 1;
>> +__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD;
>> +__read_mostly unsigned int llc_epoch_affinity_timeout =
>> EPOCH_LLC_AFFINITY_TIMEOUT;
>> static int llc_id(int cpu)
>> {
>> @@ -1223,11 +1226,22 @@ static int llc_id(int cpu)
>> return llc;
>> }
>> +static inline int get_sched_cache_scale(int mul)
>> +{
>> + if (!llc_aggr_tolerance)
>> + return 0;
>> +
>> + if (llc_aggr_tolerance == 100)
> the range of llc_aggr_tolerance is [0, 100], so a little bug here? Maybe
> check if (llc_aggr_tolerance >= 100).
I thought llc_aggr_tolerance should not exceed 100: in
sched_cache_write_aggr_tolerance(), if the input value is
higher than max, the write is rejected with
return -EINVAL;
I did a double check on this:
root@vm:/sys/kernel/debug/sched# echo 100 > llc_aggr_tolerance
root@vm:/sys/kernel/debug/sched# echo 101 > llc_aggr_tolerance
bash: echo: write error: Invalid argument
>
> And if llc_aggr_tolerance = 0, the function returns 0, which means
> exceed_llc_capacity & exceed_llc_nr are always true; it may be
> inconsistent to have this value set while llc_enable=1 is set.
>
If llc_aggr_tolerance is 0, cache-aware scheduling is supposed to be
disabled - that is, exceed_llc_capacity() always returns true, so the
process is not eligible for cache-aware scheduling.
thanks,
Chenyu
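For reference, the values the posted get_sched_cache_scale() produces at a few settings, and the resulting RSS cutoff for the 32MB-LLC example from the changelog; these follow directly from the code above (the nr_running cutoff scales the same way, as scale * sd_llc_size / smt_nr).

  llc_aggr_tolerance   get_sched_cache_scale(256)   RSS cutoff (32MB LLC)
  0                    0                            always treated as exceeding
  1                    1                            32MB
  2                    257                          ~8GB
  99                   25089                        ~784GB
  100                  INT_MAX                      never treated as exceeding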
On Wed, Dec 03, 2025 at 03:07:39PM -0800, Tim Chen wrote:
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 466ba8b7398c..95bf080bbbf0 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2436,9 +2436,11 @@ extern void migrate_enable(void);
> DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
>
> #ifdef CONFIG_SCHED_CACHE
> +DECLARE_STATIC_KEY_FALSE(sched_cache_on);
> +
> static inline bool sched_cache_enabled(void)
> {
> - return false;
> + return static_branch_unlikely(&sched_cache_on);
> }
> #endif
>
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 02e16b70a790..cde324672103 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -169,6 +169,53 @@ static const struct file_operations sched_feat_fops = {
> .release = single_release,
> };
>
> +#ifdef CONFIG_SCHED_CACHE
> +#define SCHED_CACHE_CREATE_CONTROL(name, max) \
> +static ssize_t sched_cache_write_##name(struct file *filp, \
> + const char __user *ubuf, \
> + size_t cnt, loff_t *ppos) \
> +{ \
> + char buf[16]; \
> + unsigned int val; \
> + if (cnt > 15) \
> + cnt = 15; \
> + if (copy_from_user(&buf, ubuf, cnt)) \
> + return -EFAULT; \
> + buf[cnt] = '\0'; \
> + if (kstrtouint(buf, 10, &val)) \
> + return -EINVAL; \
> + if (val > (max)) \
> + return -EINVAL; \
> + llc_##name = val; \
> + if (!strcmp(#name, "enabled")) \
> + sched_cache_set(false); \
Oh gawd :-(
Please just write out all the various write methods and use
kstrtoul_from_user() and kstrtobool_from_user() where applicable.
> + *ppos += cnt; \
> + return cnt; \
> +} \
> +static int sched_cache_show_##name(struct seq_file *m, void *v) \
> +{ \
> + seq_printf(m, "%d\n", llc_##name); \
> + return 0; \
> +} \
> +static int sched_cache_open_##name(struct inode *inode, \
> + struct file *filp) \
> +{ \
> + return single_open(filp, sched_cache_show_##name, NULL); \
> +} \
> +static const struct file_operations sched_cache_fops_##name = { \
> + .open = sched_cache_open_##name, \
> + .write = sched_cache_write_##name, \
> + .read = seq_read, \
> + .llseek = seq_lseek, \
> + .release = single_release, \
> +}
> +
> +SCHED_CACHE_CREATE_CONTROL(overload_pct, 100);
> +SCHED_CACHE_CREATE_CONTROL(imb_pct, 100);
> +SCHED_CACHE_CREATE_CONTROL(aggr_tolerance, 100);
> +SCHED_CACHE_CREATE_CONTROL(enabled, 1);
On 12/11/2025 1:02 AM, Peter Zijlstra wrote:
> On Wed, Dec 03, 2025 at 03:07:39PM -0800, Tim Chen wrote:
>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 466ba8b7398c..95bf080bbbf0 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -2436,9 +2436,11 @@ extern void migrate_enable(void);
>> DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
>>
>> #ifdef CONFIG_SCHED_CACHE
>> +DECLARE_STATIC_KEY_FALSE(sched_cache_on);
>> +
>> static inline bool sched_cache_enabled(void)
>> {
>> - return false;
>> + return static_branch_unlikely(&sched_cache_on);
>> }
>> #endif
>>
>> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
>> index 02e16b70a790..cde324672103 100644
>> --- a/kernel/sched/debug.c
>> +++ b/kernel/sched/debug.c
>> @@ -169,6 +169,53 @@ static const struct file_operations sched_feat_fops = {
>> .release = single_release,
>> };
>>
>> +#ifdef CONFIG_SCHED_CACHE
>> +#define SCHED_CACHE_CREATE_CONTROL(name, max) \
>> +static ssize_t sched_cache_write_##name(struct file *filp, \
>> + const char __user *ubuf, \
>> + size_t cnt, loff_t *ppos) \
>> +{ \
>> + char buf[16]; \
>> + unsigned int val; \
>> + if (cnt > 15) \
>> + cnt = 15; \
>> + if (copy_from_user(&buf, ubuf, cnt)) \
>> + return -EFAULT; \
>> + buf[cnt] = '\0'; \
>
>
>> + if (kstrtouint(buf, 10, &val)) \
>> + return -EINVAL; \
>> + if (val > (max)) \
>> + return -EINVAL; \
>> + llc_##name = val; \
>> + if (!strcmp(#name, "enabled")) \
>> + sched_cache_set(false); \
>
> Oh gawd :-(
>
> Please just write out all the various write methods and use
> kstrtoul_from_user() and kstrtobool_from_user() where applicable.
>
OK, will do.
thanks,
Chenyu
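As a rough sketch of what that rewrite could look like (not the actual follow-up patch): kstrtouint_from_user() parses the value straight from the user buffer, and the llc_enabled file can use kstrtobool_from_user(), as Peter suggests. The handler names mirror the macro-generated ones above and would slot into kernel/sched/debug.c.

/* Sketch of an open-coded write handler for llc_aggr_tolerance. */
static ssize_t sched_cache_write_aggr_tolerance(struct file *filp,
						const char __user *ubuf,
						size_t cnt, loff_t *ppos)
{
	unsigned int val;
	int ret;

	ret = kstrtouint_from_user(ubuf, cnt, 10, &val);
	if (ret)
		return ret;

	if (val > 100)
		return -EINVAL;

	llc_aggr_tolerance = val;
	*ppos += cnt;

	return cnt;
}

/* Sketch of the llc_enabled handler using kstrtobool_from_user(). */
static ssize_t sched_cache_write_enabled(struct file *filp,
					 const char __user *ubuf,
					 size_t cnt, loff_t *ppos)
{
	bool val;
	int ret;

	ret = kstrtobool_from_user(ubuf, cnt, &val);
	if (ret)
		return ret;

	llc_enabled = val;
	sched_cache_set(false);
	*ppos += cnt;

	return cnt;
}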