From: Chen Yu <yu.c.chen@intel.com>
A performance regression was observed by Prateek when running hackbench
with many threads per process (high fd count). To avoid this, processes
with a large number of active threads are excluded from cache-aware
scheduling.
With sched_cache enabled, record the number of active threads in each
process during the periodic task_cache_work(). While iterating over
CPUs, if the currently running task belongs to the same process as the
task that launched task_cache_work(), increment the active thread count.
If the number of active threads within the process exceeds the number
of cores in the LLC (the LLC CPU count divided by the number of SMT
siblings), do not enable cache-aware scheduling. For users who wish to
perform task aggregation regardless, a debugfs knob is provided for
tuning in a subsequent patch.
Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: Aaron Lu <ziqianlu@bytedance.com>
Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
---
Notes:
v2->v3:
Put the calculation of nr_running_avg and its use into one patch.
(Peter Zijlstra)
Use guard(rcu)() when calculating the number of active threads of the
process.
(Peter Zijlstra)
Introduce update_avg_scale() rather than using update_avg() to fit
system with small LLC.
(Aaron Lu)
include/linux/sched.h | 1 +
kernel/sched/fair.c | 59 ++++++++++++++++++++++++++++++++++++++++---
2 files changed, 57 insertions(+), 3 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c98bd1c46088..511c9b263386 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2346,6 +2346,7 @@ struct sched_cache_stat {
struct sched_cache_time __percpu *pcpu_sched;
raw_spinlock_t lock;
unsigned long epoch;
+ u64 nr_running_avg;
int cpu;
} ____cacheline_aligned_in_smp;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d1145997b88d..86b6b08e7e1e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1223,6 +1223,19 @@ static inline bool valid_llc_buf(struct sched_domain *sd,
return valid_llc_id(id);
}
+static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
+{
+ int smt_nr = 1;
+
+#ifdef CONFIG_SCHED_SMT
+ if (sched_smt_active())
+ smt_nr = cpumask_weight(cpu_smt_mask(cpu));
+#endif
+
+ return !fits_capacity((mm->sc_stat.nr_running_avg * smt_nr),
+ per_cpu(sd_llc_size, cpu));
+}
+
static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
{
struct sched_domain *sd;
@@ -1417,7 +1430,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
*/
if (time_after(epoch,
READ_ONCE(mm->sc_stat.epoch) + EPOCH_LLC_AFFINITY_TIMEOUT) ||
- get_nr_threads(p) <= 1) {
+ get_nr_threads(p) <= 1 ||
+ exceed_llc_nr(mm, cpu_of(rq))) {
if (mm->sc_stat.cpu != -1)
mm->sc_stat.cpu = -1;
}
@@ -1458,13 +1472,31 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p)
}
}
+static inline void update_avg_scale(u64 *avg, u64 sample)
+{
+ int factor = per_cpu(sd_llc_size, raw_smp_processor_id());
+ s64 diff = sample - *avg;
+ u32 divisor;
+
+ /*
+ * Scale the divisor based on the number of CPUs contained
+ * in the LLC. This scaling ensures smaller LLC domains use
+ * a smaller divisor to achieve more precise sensitivity to
+ * changes in nr_running, while larger LLC domains are capped
+ * at a maximum divisor of 8 which is the default smoothing
+ * factor of EWMA in update_avg().
+ */
+ divisor = clamp_t(u32, (factor >> 2), 2, 8);
+ *avg += div64_s64(diff, divisor);
+}
+
static void task_cache_work(struct callback_head *work)
{
- struct task_struct *p = current;
+ struct task_struct *p = current, *cur;
struct mm_struct *mm = p->mm;
unsigned long m_a_occ = 0;
unsigned long curr_m_a_occ = 0;
- int cpu, m_a_cpu = -1;
+ int cpu, m_a_cpu = -1, nr_running = 0;
cpumask_var_t cpus;
WARN_ON_ONCE(work != &p->cache_work);
@@ -1474,6 +1506,13 @@ static void task_cache_work(struct callback_head *work)
if (p->flags & PF_EXITING)
return;
+ if (get_nr_threads(p) <= 1) {
+ if (mm->sc_stat.cpu != -1)
+ mm->sc_stat.cpu = -1;
+
+ return;
+ }
+
if (!zalloc_cpumask_var(&cpus, GFP_KERNEL))
return;
@@ -1497,6 +1536,12 @@ static void task_cache_work(struct callback_head *work)
m_occ = occ;
m_cpu = i;
}
+ scoped_guard (rcu) {
+ cur = rcu_dereference(cpu_rq(i)->curr);
+ if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) &&
+ cur->mm == mm)
+ nr_running++;
+ }
}
/*
@@ -1540,6 +1585,7 @@ static void task_cache_work(struct callback_head *work)
mm->sc_stat.cpu = m_a_cpu;
}
+ update_avg_scale(&mm->sc_stat.nr_running_avg, nr_running);
free_cpumask_var(cpus);
}
@@ -9988,6 +10034,13 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu))
return mig_unrestricted;
+ /* skip cache aware load balance for single/too many threads */
+ if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu)) {
+ if (mm->sc_stat.cpu != -1)
+ mm->sc_stat.cpu = -1;
+ return mig_unrestricted;
+ }
+
if (cpus_share_cache(dst_cpu, cpu))
to_pref = true;
else if (cpus_share_cache(src_cpu, cpu))
--
2.32.0
On Tue, Feb 10, 2026 at 02:18:55PM -0800, Tim Chen wrote:
> +static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
> +{
> + int smt_nr = 1;
> +
> +#ifdef CONFIG_SCHED_SMT
> + if (sched_smt_active())
> + smt_nr = cpumask_weight(cpu_smt_mask(cpu));
cpu_smt_num_threads ?
> +#endif
> +
> + return !fits_capacity((mm->sc_stat.nr_running_avg * smt_nr),
> + per_cpu(sd_llc_size, cpu));
> +}
On Thu, 2026-02-19 at 17:50 +0100, Peter Zijlstra wrote:
> On Tue, Feb 10, 2026 at 02:18:55PM -0800, Tim Chen wrote:
>
> > +static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
> > +{
> > + int smt_nr = 1;
> > +
> > +#ifdef CONFIG_SCHED_SMT
> > + if (sched_smt_active())
> > + smt_nr = cpumask_weight(cpu_smt_mask(cpu));
>
> cpu_smt_num_threads ?
Yes, cpu_smt_num_threads should work.
Tim
>
> > +#endif
> > +
> > + return !fits_capacity((mm->sc_stat.nr_running_avg * smt_nr),
> > + per_cpu(sd_llc_size, cpu));
> > +}
On 11/02/26 03:48, Tim Chen wrote:
> From: Chen Yu <yu.c.chen@intel.com>
>
> A performance regression was observed by Prateek when running hackbench
> with many threads per process (high fd count). To avoid this, processes
> with a large number of active threads are excluded from cache-aware
> scheduling.
>
> With sched_cache enabled, record the number of active threads in each
> process during the periodic task_cache_work(). While iterating over
> CPUs, if the currently running task belongs to the same process as the
> task that launched task_cache_work(), increment the active thread count.
>
> If the number of active threads within the process exceeds the number
> of Cores(divided by SMTs number) in the LLC, do not enable cache-aware
> scheduling. For users who wish to perform task aggregation regardless,
> a debugfs knob is provided for tuning in a subsequent patch.
>
> Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> Suggested-by: Aaron Lu <ziqianlu@bytedance.com>
> Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
> ---
>
> Notes:
> v2->v3:
> Put the calculating of nr_running_avg and the use of it into 1 patch.
> (Peter Zijlstra)
>
> Use guard(rcu)() when calculating the number of active threads of the
> process.
> (Peter Zijlstra)
>
> Introduce update_avg_scale() rather than using update_avg() to fit
> system with small LLC.
> (Aaron Lu)
>
> include/linux/sched.h | 1 +
> kernel/sched/fair.c | 59 ++++++++++++++++++++++++++++++++++++++++---
> 2 files changed, 57 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index c98bd1c46088..511c9b263386 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2346,6 +2346,7 @@ struct sched_cache_stat {
> struct sched_cache_time __percpu *pcpu_sched;
> raw_spinlock_t lock;
> unsigned long epoch;
> + u64 nr_running_avg;
> int cpu;
> } ____cacheline_aligned_in_smp;
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index d1145997b88d..86b6b08e7e1e 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1223,6 +1223,19 @@ static inline bool valid_llc_buf(struct sched_domain *sd,
> return valid_llc_id(id);
> }
>
> +static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
> +{
> + int smt_nr = 1;
> +
> +#ifdef CONFIG_SCHED_SMT
> + if (sched_smt_active())
> + smt_nr = cpumask_weight(cpu_smt_mask(cpu));
> +#endif
> +
> + return !fits_capacity((mm->sc_stat.nr_running_avg * smt_nr),
> + per_cpu(sd_llc_size, cpu));
On Power10/Power11 with SMT4 and LLC size of 4, this check
effectively disables cache-aware scheduling for any process.
I raised this point in v1 as well. Increasing the threshold
doesn't seem like a viable solution either, as that would regress
hackbench/ebizzy.
Is there a way to make this useful for architectures with small LLC
sizes? One possible approach we were exploring is to have LLC at a
hemisphere level that comprises multiple SMT4 cores.
Thanks,
Vineeth
> +}
> +
> static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
> {
> struct sched_domain *sd;
> @@ -1417,7 +1430,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
> */
> if (time_after(epoch,
> READ_ONCE(mm->sc_stat.epoch) + EPOCH_LLC_AFFINITY_TIMEOUT) ||
> - get_nr_threads(p) <= 1) {
> + get_nr_threads(p) <= 1 ||
> + exceed_llc_nr(mm, cpu_of(rq))) {
> if (mm->sc_stat.cpu != -1)
> mm->sc_stat.cpu = -1;
> }
> @@ -1458,13 +1472,31 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p)
> }
> }
>
> +static inline void update_avg_scale(u64 *avg, u64 sample)
> +{
> + int factor = per_cpu(sd_llc_size, raw_smp_processor_id());
> + s64 diff = sample - *avg;
> + u32 divisor;
> +
> + /*
> + * Scale the divisor based on the number of CPUs contained
> + * in the LLC. This scaling ensures smaller LLC domains use
> + * a smaller divisor to achieve more precise sensitivity to
> + * changes in nr_running, while larger LLC domains are capped
> + * at a maximum divisor of 8 which is the default smoothing
> + * factor of EWMA in update_avg().
> + */
> + divisor = clamp_t(u32, (factor >> 2), 2, 8);
> + *avg += div64_s64(diff, divisor);
> +}
> +
> static void task_cache_work(struct callback_head *work)
> {
> - struct task_struct *p = current;
> + struct task_struct *p = current, *cur;
> struct mm_struct *mm = p->mm;
> unsigned long m_a_occ = 0;
> unsigned long curr_m_a_occ = 0;
> - int cpu, m_a_cpu = -1;
> + int cpu, m_a_cpu = -1, nr_running = 0;
> cpumask_var_t cpus;
>
> WARN_ON_ONCE(work != &p->cache_work);
> @@ -1474,6 +1506,13 @@ static void task_cache_work(struct callback_head *work)
> if (p->flags & PF_EXITING)
> return;
>
> + if (get_nr_threads(p) <= 1) {
> + if (mm->sc_stat.cpu != -1)
> + mm->sc_stat.cpu = -1;
> +
> + return;
> + }
> +
> if (!zalloc_cpumask_var(&cpus, GFP_KERNEL))
> return;
>
> @@ -1497,6 +1536,12 @@ static void task_cache_work(struct callback_head *work)
> m_occ = occ;
> m_cpu = i;
> }
> + scoped_guard (rcu) {
> + cur = rcu_dereference(cpu_rq(i)->curr);
> + if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) &&
> + cur->mm == mm)
> + nr_running++;
> + }
> }
>
> /*
> @@ -1540,6 +1585,7 @@ static void task_cache_work(struct callback_head *work)
> mm->sc_stat.cpu = m_a_cpu;
> }
>
> + update_avg_scale(&mm->sc_stat.nr_running_avg, nr_running);
> free_cpumask_var(cpus);
> }
>
> @@ -9988,6 +10034,13 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
> if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu))
> return mig_unrestricted;
>
> + /* skip cache aware load balance for single/too many threads */
> + if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu)) {
> + if (mm->sc_stat.cpu != -1)
> + mm->sc_stat.cpu = -1;
> + return mig_unrestricted;
> + }
> +
> if (cpus_share_cache(dst_cpu, cpu))
> to_pref = true;
> else if (cpus_share_cache(src_cpu, cpu))
On Wed, Feb 18, 2026 at 11:24:05PM +0530, Madadi Vineeth Reddy wrote:
> Is there a way to make this useful for architectures with small LLC
> sizes? One possible approach we were exploring is to have LLC at a
> hemisphere level that comprise multiple SMT4 cores.
Is this hemisphere an actual physical cache level, or would that be
artificial?
Hi Peter,

On 19/02/26 22:25, Peter Zijlstra wrote:
> On Wed, Feb 18, 2026 at 11:24:05PM +0530, Madadi Vineeth Reddy wrote:
>> Is there a way to make this useful for architectures with small LLC
>> sizes? One possible approach we were exploring is to have LLC at a
>> hemisphere level that comprise multiple SMT4 cores.
>
> Is this hemisphere an actual physical cache level, or would that be
> artificial?

It's artificial. There is no cache being shared at this level but this is
still the level where some amount of cache-snooping takes place and it is
relatively faster to access the data from the caches of the cores
within this domain.

We verified with this producer consumer workload where the producer
and consumer threads placed in the same hemisphere showed measurably
better latency compared to cross-hemisphere placement.

Thanks,
Vineeth
On Fri, Feb 20, 2026 at 12:10:21PM +0530, Madadi Vineeth Reddy wrote:
> Hi Peter,
>
> On 19/02/26 22:25, Peter Zijlstra wrote:
> > On Wed, Feb 18, 2026 at 11:24:05PM +0530, Madadi Vineeth Reddy wrote:
> >> Is there a way to make this useful for architectures with small LLC
> >> sizes? One possible approach we were exploring is to have LLC at a
> >> hemisphere level that comprise multiple SMT4 cores.
> >
> > Is this hemisphere an actual physical cache level, or would that be
> > artificial?
>
> It's artificial. There is no cache being shared at this level but this is
> still the level where some amount of cache-snooping takes place and it is
> relatively faster to access the data from the caches of the cores
> within this domain.
>
> We verified with this producer consumer workload where the producer
> and consumer threads placed in the same hemisphere showed measurably
> better latency compared to cross-hemisphere placement.

So I just read the Power10 Wikipedia entry; that seems to suggest there
actually is a significant L3 at the hemisphere level.

That thing states that Power10 has:

 - 16 cores in two hemispheres of 8 cores each.
 - each core has 2M L2 cache
 - each hemi has 64M of L3 cache

Then there appears to be a 'funny' in that there's always one 'dead'
core, so you end up with 8+7, and the small hemi looses an 8M L3 slice
due to that.

Now, I'm just reading a Wiki pages written by a random person on the
interweb, so perhaps this is wrong (in which case I would suggest you
get someone from IBM to go and edit that page and provide references),
or there has been a miscommunication somewhere else, and perhaps there
really is L3 at the hemi level, and arch/powerpc/ 'forgot' to expose
that :-)
Hi Peter,

Sorry for the delayed response. Wanted to be sure before responding.

On 20/02/26 15:23, Peter Zijlstra wrote:
> On Fri, Feb 20, 2026 at 12:10:21PM +0530, Madadi Vineeth Reddy wrote:
>> Hi Peter,
>>
>> On 19/02/26 22:25, Peter Zijlstra wrote:
>>> On Wed, Feb 18, 2026 at 11:24:05PM +0530, Madadi Vineeth Reddy wrote:
>>>> Is there a way to make this useful for architectures with small LLC
>>>> sizes? One possible approach we were exploring is to have LLC at a
>>>> hemisphere level that comprise multiple SMT4 cores.
>>>
>>> Is this hemisphere an actual physical cache level, or would that be
>>> artificial?
>>
>> It's artificial. There is no cache being shared at this level but this is
>> still the level where some amount of cache-snooping takes place and it is
>> relatively faster to access the data from the caches of the cores
>> within this domain.
>>
>> We verified with this producer consumer workload where the producer
>> and consumer threads placed in the same hemisphere showed measurably
>> better latency compared to cross-hemisphere placement.
>
> So I just read the Power10 Wikipedia entry; that seems to suggest there
> actually is a significant L3 at the hemisphere level.
>
> That thing states that Power10 has:
>
>  - 16 cores in two hemispheres of 8 cores each.
>  - each core has 2M L2 cache
>  - each hemi has 64M of L3 cache

The Wikipedia entry is incorrect. On Power10, L3 is at the SMT4 small
core level (4M per core), not at the hemisphere level. This is
documented in the Power10 user manual [1] (Page 175). L3 is also a
victim cache on Power10.

>
> Then there appears to be a 'funny' in that there's always one 'dead'
> core, so you end up with 8+7, and the small hemi looses an 8M L3 slice
> due to that.
>
> Now, I'm just reading a Wiki pages written by a random person on the
> interweb, so perhaps this is wrong (in which case I would suggest you
> get someone from IBM to go and edit that page and provide references),
> or there has been a miscommunication somewhere else, and perhaps there
> really is L3 at the hemi level, and arch/powerpc/ 'forgot' to expose
> that :-)

Yes, the Wikipedia page is wrong on this. We will get it corrected with
proper references.

[1] https://files.openpower.foundation/s/EgCy7C43p2NSRfR

Thanks,
Vineeth
On Wed, Feb 18, 2026 at 11:24:05PM +0530, Madadi Vineeth Reddy wrote:
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index d1145997b88d..86b6b08e7e1e 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -1223,6 +1223,19 @@ static inline bool valid_llc_buf(struct sched_domain *sd,
> > return valid_llc_id(id);
> > }
> >
> > +static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
> > +{
> > + int smt_nr = 1;
> > +
> > +#ifdef CONFIG_SCHED_SMT
> > + if (sched_smt_active())
> > + smt_nr = cpumask_weight(cpu_smt_mask(cpu));
> > +#endif
> > +
> > + return !fits_capacity((mm->sc_stat.nr_running_avg * smt_nr),
> > + per_cpu(sd_llc_size, cpu));
>
>
> On Power10/Power11 with SMT4 and LLC size of 4, this check
> effectively disables cache-aware scheduling for any process.
>
> I raised this point in v1 as well. Increasing the threshold
> doesn't seem like a viable solution either, as that would regress
> hackbench/ebizzy.
>
> Is there a way to make this useful for architectures with small LLC
> sizes? One possible approach we were exploring is to have LLC at a
> hemisphere level that comprise multiple SMT4 cores.
One way forward would be to use a llc-mask instead of a single llc value
for preference. I think this got mentioned before, and I think it makes
sense to do this later.
But once you can have a 'few' LLCs as preference, this constraint
becomes a little easier.
On 19/02/26 22:22, Peter Zijlstra wrote:
> On Wed, Feb 18, 2026 at 11:24:05PM +0530, Madadi Vineeth Reddy wrote:
>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index d1145997b88d..86b6b08e7e1e 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -1223,6 +1223,19 @@ static inline bool valid_llc_buf(struct sched_domain *sd,
>>> return valid_llc_id(id);
>>> }
>>>
>>> +static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
>>> +{
>>> + int smt_nr = 1;
>>> +
>>> +#ifdef CONFIG_SCHED_SMT
>>> + if (sched_smt_active())
>>> + smt_nr = cpumask_weight(cpu_smt_mask(cpu));
>>> +#endif
>>> +
>>> + return !fits_capacity((mm->sc_stat.nr_running_avg * smt_nr),
>>> + per_cpu(sd_llc_size, cpu));
>>
>>
>> On Power10/Power11 with SMT4 and LLC size of 4, this check
>> effectively disables cache-aware scheduling for any process.
>>
>> I raised this point in v1 as well. Increasing the threshold
>> doesn't seem like a viable solution either, as that would regress
>> hackbench/ebizzy.
>>
>> Is there a way to make this useful for architectures with small LLC
>> sizes? One possible approach we were exploring is to have LLC at a
>> hemisphere level that comprise multiple SMT4 cores.
>
> One way forward would be to use a llc-mask instead of a single llc value
> for preference. I think this got mentioned before, and I think it makes
> sense to do this later.
>
> But once you can have a 'few' LLCs as preference, this constraint
> becomes a little easier.
Yes, that makes sense. Spanning the llc-mask across multiple cores
in a hemisphere for preference would relax this condition.
We will explore how this can be incorporated. Thanks for taking a
look.
Thanks,
Vineeth
On Wed, 2026-02-18 at 23:24 +0530, Madadi Vineeth Reddy wrote:
> On 11/02/26 03:48, Tim Chen wrote:
> > From: Chen Yu <yu.c.chen@intel.com>
> >
> >
[ .. snip ..]
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index d1145997b88d..86b6b08e7e1e 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -1223,6 +1223,19 @@ static inline bool valid_llc_buf(struct sched_domain *sd,
> > return valid_llc_id(id);
> > }
> >
> > +static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
> > +{
> > + int smt_nr = 1;
> > +
> > +#ifdef CONFIG_SCHED_SMT
> > + if (sched_smt_active())
> > + smt_nr = cpumask_weight(cpu_smt_mask(cpu));
> > +#endif
> > +
> > + return !fits_capacity((mm->sc_stat.nr_running_avg * smt_nr),
> > + per_cpu(sd_llc_size, cpu));
>
>
> On Power10/Power11 with SMT4 and LLC size of 4, this check
> effectively disables cache-aware scheduling for any process.
There are 4 cores per LLC, with 4 SMT per core? In that case, once we have more than
4 running threads and there's another idle LLC available, seems
like putting the additional thread on a different LLC is the
right thing to do as threads sharing a core will usually be much
slower.
But when number of threads are under 4, we should still be
doing aggregation.
Perhaps I am misunderstanding your topology.
Tim
>
> I raised this point in v1 as well. Increasing the threshold
> doesn't seem like a viable solution either, as that would regress
> hackbench/ebizzy.
>
> Is there a way to make this useful for architectures with small LLC
> sizes? One possible approach we were exploring is to have LLC at a
> hemisphere level that comprise multiple SMT4 cores.
>
> Thanks,
> Vineeth
On 19/02/26 03:14, Tim Chen wrote:
> On Wed, 2026-02-18 at 23:24 +0530, Madadi Vineeth Reddy wrote:
>> On 11/02/26 03:48, Tim Chen wrote:
>>> From: Chen Yu <yu.c.chen@intel.com>
>>>
>>>
> [ .. snip ..]
>
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index d1145997b88d..86b6b08e7e1e 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -1223,6 +1223,19 @@ static inline bool valid_llc_buf(struct sched_domain *sd,
>>> return valid_llc_id(id);
>>> }
>>>
>>> +static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
>>> +{
>>> + int smt_nr = 1;
>>> +
>>> +#ifdef CONFIG_SCHED_SMT
>>> + if (sched_smt_active())
>>> + smt_nr = cpumask_weight(cpu_smt_mask(cpu));
>>> +#endif
>>> +
>>> + return !fits_capacity((mm->sc_stat.nr_running_avg * smt_nr),
>>> + per_cpu(sd_llc_size, cpu));
>>
>>
>> On Power10/Power11 with SMT4 and LLC size of 4, this check
>> effectively disables cache-aware scheduling for any process.
>
> There are 4 cores per LLC, with 4 SMT per core? In that case, once we have more than
> 4 running threads and there's another idle LLC available, seems
> like putting the additional thread on a different LLC is the
> right thing to do as threads sharing a core will usually be much
> slower.
>
> But when number of threads are under 4, we should still be
> doing aggregation.
>
> Perhaps I am misunderstanding your topology.
There is only one core per LLC whose size is 4 CPUs.
So, mm->sc_stat.nr_running_avg can't be >= 1 for
cache aware scheduling to be enabled.
Thanks,
Vineeth
>
> Tim
>
>>
>> I raised this point in v1 as well. Increasing the threshold
>> doesn't seem like a viable solution either, as that would regress
>> hackbench/ebizzy.
>>
>> Is there a way to make this useful for architectures with small LLC
>> sizes? One possible approach we were exploring is to have LLC at a
>> hemisphere level that comprise multiple SMT4 cores.
>>
>> Thanks,
>> Vineeth
On Thu, 2026-02-19 at 07:58 +0530, Madadi Vineeth Reddy wrote:
> On 19/02/26 03:14, Tim Chen wrote:
> > On Wed, 2026-02-18 at 23:24 +0530, Madadi Vineeth Reddy wrote:
> > > On 11/02/26 03:48, Tim Chen wrote:
> > > > From: Chen Yu <yu.c.chen@intel.com>
> > > >
> > > >
> > [ .. snip ..]
> >
> > > >
> > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > index d1145997b88d..86b6b08e7e1e 100644
> > > > --- a/kernel/sched/fair.c
> > > > +++ b/kernel/sched/fair.c
> > > > @@ -1223,6 +1223,19 @@ static inline bool valid_llc_buf(struct sched_domain *sd,
> > > > return valid_llc_id(id);
> > > > }
> > > >
> > > > +static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
> > > > +{
> > > > + int smt_nr = 1;
> > > > +
> > > > +#ifdef CONFIG_SCHED_SMT
> > > > + if (sched_smt_active())
> > > > + smt_nr = cpumask_weight(cpu_smt_mask(cpu));
> > > > +#endif
> > > > +
> > > > + return !fits_capacity((mm->sc_stat.nr_running_avg * smt_nr),
> > > > + per_cpu(sd_llc_size, cpu));
> > >
> > >
> > > On Power10/Power11 with SMT4 and LLC size of 4, this check
> > > effectively disables cache-aware scheduling for any process.
> >
> > There are 4 cores per LLC, with 4 SMT per core? In that case, once we have more than
> > 4 running threads and there's another idle LLC available, seems
> > like putting the additional thread on a different LLC is the
> > right thing to do as threads sharing a core will usually be much
> > slower.
> >
> > But when number of threads are under 4, we should still be
> > doing aggregation.
> >
> > Perhaps I am misunderstanding your topology.
>
> There is only one core per LLC whose size is 4 CPUs.
> So, mm->sc_stat.nr_running_avg can't be >= 1 for
> cache aware scheduling to be enabled.
If there is only 1 core, and mm->sc_stat.nr_running_avg > 1,
wouldn't it be better to spread the tasks among the cores with
normal load balance, instead of having threads aggregated
fighting for the resource of a single core, i.e. run without
cache aware scheduling?
Tim
>
> Thanks,
> Vineeth
>
> >
> > Tim
> >
> > >
> > > I raised this point in v1 as well. Increasing the threshold
> > > doesn't seem like a viable solution either, as that would regress
> > > hackbench/ebizzy.
> > >
> > > Is there a way to make this useful for architectures with small LLC
> > > sizes? One possible approach we were exploring is to have LLC at a
> > > hemisphere level that comprise multiple SMT4 cores.
> > >
> > > Thanks,
> > > Vineeth
Hi Vineeth,
On 2/19/2026 10:28 AM, Madadi Vineeth Reddy wrote:
> On 19/02/26 03:14, Tim Chen wrote:
>> On Wed, 2026-02-18 at 23:24 +0530, Madadi Vineeth Reddy wrote:
>>> On 11/02/26 03:48, Tim Chen wrote:
>>>> From: Chen Yu <yu.c.chen@intel.com>
>>>>
>>>>
>> [ .. snip ..]
>>
>>>>
>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>> index d1145997b88d..86b6b08e7e1e 100644
>>>> --- a/kernel/sched/fair.c
>>>> +++ b/kernel/sched/fair.c
>>>> @@ -1223,6 +1223,19 @@ static inline bool valid_llc_buf(struct sched_domain *sd,
>>>> return valid_llc_id(id);
>>>> }
>>>>
>>>> +static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
>>>> +{
>>>> + int smt_nr = 1;
>>>> +
>>>> +#ifdef CONFIG_SCHED_SMT
>>>> + if (sched_smt_active())
>>>> + smt_nr = cpumask_weight(cpu_smt_mask(cpu));
>>>> +#endif
>>>> +
>>>> + return !fits_capacity((mm->sc_stat.nr_running_avg * smt_nr),
>>>> + per_cpu(sd_llc_size, cpu));
>>>
>>>
>>> On Power10/Power11 with SMT4 and LLC size of 4, this check
>>> effectively disables cache-aware scheduling for any process.
>>
>> There are 4 cores per LLC, with 4 SMT per core? In that case, once we have more than
>> 4 running threads and there's another idle LLC available, seems
>> like putting the additional thread on a different LLC is the
>> right thing to do as threads sharing a core will usually be much
>> slower.
>>
>> But when number of threads are under 4, we should still be
>> doing aggregation.
>>
>> Perhaps I am misunderstanding your topology.
>
> There is only one core per LLC whose size is 4 CPUs.
> So, mm->sc_stat.nr_running_avg can't be >= 1 for
> cache aware scheduling to be enabled.
>
There is a scale factor in the final step that can be tuned by
the user space:
exceeded = !fits_capacity((mm->sc_stat.nr_running_avg * smt_nr),
(scale * per_cpu(sd_llc_size, cpu)));
So if the user increases the llc_aggr_tolerance via debugfs,
the cache aware aggregation is still enabled. Or do you suggest
to tune the nr_running check and the RSS check via different
debugfs knobs?
thanks,
Chenyu
© 2016 - 2026 Red Hat, Inc.