[PATCH v2 04/23] sched/cache: Make LLC id continuous

Tim Chen posted 23 patches 2 weeks, 1 day ago
There is a newer version of this series
[PATCH v2 04/23] sched/cache: Make LLC id continuous
Posted by Tim Chen 2 weeks, 1 day ago
Introduce an index mapping between CPUs and their LLCs. This provides
a continuous per LLC index needed for cache-aware load balancing in
later patches.

The existing per-CPU sd_llc_id usually holds the number of the first CPU
of the LLC domain, so its values are sparse and unsuitable as an array
index. Using sd_llc_id directly would waste memory.

With the new mapping, all CPUs in the same LLC share one id, and the ids
are assigned consecutively:

  per_cpu(sd_llc_id, CPU=0...15)  = 0
  per_cpu(sd_llc_id, CPU=16...31) = 1
  per_cpu(sd_llc_id, CPU=32...47) = 2
  ...

Co-developed-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---

Notes:
    v1->v2:
       Convert the static LLC id to be allocated sequentially as LLCs are
       discovered, and replace the old sd_llc_id. (Peter Zijlstra)

 kernel/sched/fair.c     |  9 ++++++-
 kernel/sched/sched.h    |  1 +
 kernel/sched/topology.c | 60 +++++++++++++++++++++++++++++++++++++++--
 3 files changed, 67 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 710ed9943d27..0a3918269906 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct            = 20;
 
 static int llc_id(int cpu)
 {
+	int llc;
+
 	if (cpu < 0)
 		return -1;
 
-	return per_cpu(sd_llc_id, cpu);
+	llc = per_cpu(sd_llc_id, cpu);
+	/* avoid race with cpu hotplug */
+	if (unlikely(llc >= max_llcs))
+		return -1;
+
+	return llc;
 }
 
 void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bf72c5bab506..728737641847 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2075,6 +2075,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 
 extern struct static_key_false sched_asym_cpucapacity;
 extern struct static_key_false sched_cluster_active;
+extern int max_llcs;
 
 static __always_inline bool sched_asym_cpucap_active(void)
 {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 444bdfdab731..f25d950ab015 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -17,6 +17,8 @@ void sched_domains_mutex_unlock(void)
 	mutex_unlock(&sched_domains_mutex);
 }
 
+int max_llcs;
+
 /* Protected by sched_domains_mutex: */
 static cpumask_var_t sched_domains_tmpmask;
 static cpumask_var_t sched_domains_tmpmask2;
@@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
 DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
 
+/*
+ * Assign continuous llc id for the CPU, and return
+ * the assigned llc id.
+ */
+static int update_llc_id(struct sched_domain *sd,
+			 int cpu)
+{
+	int id = per_cpu(sd_llc_id, cpu), i;
+
+	if (id >= 0)
+		return id;
+
+	if (sd) {
+		/* Look for any assigned id and reuse it.*/
+		for_each_cpu(i, sched_domain_span(sd)) {
+			id = per_cpu(sd_llc_id, i);
+
+			if (id >= 0) {
+				per_cpu(sd_llc_id, cpu) = id;
+				return id;
+			}
+		}
+	}
+
+	/*
+	 * When 1. there is no id assigned to this LLC domain,
+	 * or 2. the sd is NULL, we reach here.
+	 * Consider the following scenario,
+	 * CPU0~CPU95 are in the node0, CPU96~CPU191 are
+	 * in the node1. During bootup, maxcpus=96 is
+	 * appended.
+	 * case 1: When running cpu_attach_domain(CPU24)
+	 * during boot up, CPU24 is the first CPU in its
+	 * non-NULL LLC domain. However,
+	 * its corresponding llc id has not been assigned yet.
+	 *
+	 * case 2: After boot up, the CPU100 is brought up
+	 * via sysfs manually. As a result, CPU100 has only a
+	 * Numa domain attached, because CPU100 is the only CPU
+	 * of a sched domain, all its bottom domains are degenerated.
+	 * The LLC domain pointer sd is NULL for CPU100.
+	 *
+	 * For both cases, we want to increase the number of LLCs.
+	 */
+	per_cpu(sd_llc_id, cpu) = max_llcs++;
+
+	return per_cpu(sd_llc_id, cpu);
+}
+
 static void update_top_cache_domain(int cpu)
 {
 	struct sched_domain_shared *sds = NULL;
@@ -677,14 +728,13 @@ static void update_top_cache_domain(int cpu)
 
 	sd = highest_flag_domain(cpu, SD_SHARE_LLC);
 	if (sd) {
-		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
 		sds = sd->shared;
 	}
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_size, cpu) = size;
-	per_cpu(sd_llc_id, cpu) = id;
+	id = update_llc_id(sd, cpu);
 	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
 
 	sd = lowest_flag_domain(cpu, SD_CLUSTER);
@@ -2488,6 +2538,12 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	bool has_asym = false;
 	bool has_cluster = false;
 
+	/* first scan of LLCs */
+	if (!max_llcs) {
+		for_each_possible_cpu(i)
+			per_cpu(sd_llc_id, i) = -1;
+	}
+
 	if (WARN_ON(cpumask_empty(cpu_map)))
 		goto error;
 
-- 
2.32.0
Re: [PATCH v2 04/23] sched/cache: Make LLC id continuous
Posted by Peter Zijlstra 1 week, 3 days ago
On Wed, Dec 03, 2025 at 03:07:23PM -0800, Tim Chen wrote:

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 710ed9943d27..0a3918269906 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct            = 20;
>  
>  static int llc_id(int cpu)
>  {
> +	int llc;
> +
>  	if (cpu < 0)
>  		return -1;
>  
> +	llc = per_cpu(sd_llc_id, cpu);
> +	/* avoid race with cpu hotplug */
> +	if (unlikely(llc >= max_llcs))
> +		return -1;
> +
> +	return llc;
>  }
>  
>  void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)

> @@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
>  DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
>  DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
>  
> +/*
> + * Assign continuous llc id for the CPU, and return
> + * the assigned llc id.
> + */
> +static int update_llc_id(struct sched_domain *sd,
> +			 int cpu)
> +{
> +	int id = per_cpu(sd_llc_id, cpu), i;
> +
> +	if (id >= 0)
> +		return id;
> +
> +	if (sd) {
> +		/* Look for any assigned id and reuse it.*/
> +		for_each_cpu(i, sched_domain_span(sd)) {
> +			id = per_cpu(sd_llc_id, i);
> +
> +			if (id >= 0) {
> +				per_cpu(sd_llc_id, cpu) = id;
> +				return id;
> +			}
> +		}
> +	}
> +
> +	/*
> +	 * When 1. there is no id assigned to this LLC domain,
> +	 * or 2. the sd is NULL, we reach here.
> +	 * Consider the following scenario,
> +	 * CPU0~CPU95 are in the node0, CPU96~CPU191 are
> +	 * in the node1. During bootup, maxcpus=96 is
> +	 * appended.
> +	 * case 1: When running cpu_attach_domain(CPU24)
> +	 * during boot up, CPU24 is the first CPU in its
> +	 * non-NULL LLC domain. However,
> +	 * its corresponding llc id has not been assigned yet.
> +	 *
> +	 * case 2: After boot up, the CPU100 is brought up
> +	 * via sysfs manually. As a result, CPU100 has only a
> +	 * Numa domain attached, because CPU100 is the only CPU
> +	 * of a sched domain, all its bottom domains are degenerated.
> +	 * The LLC domain pointer sd is NULL for CPU100.
> +	 *
> +	 * For both cases, we want to increase the number of LLCs.
> +	 */
> +	per_cpu(sd_llc_id, cpu) = max_llcs++;
> +
> +	return per_cpu(sd_llc_id, cpu);
> +}

I'm not sure I follow. So partition_sched_domains() first calls
detach_destroy_domains() on the old set, and then build_sched_domains()
on the new set.

So detach_destroy_domains() will do:

  cpu_attach_domain(NULL,..);

That is, it will explicitly attach the NULL sched_domain to a CPU. At
which point I feel update_llc_id() should be returning -1, no?

Then later, build_sched_domains() will set a !NULL sched_domain, at
which point update_llc_id() can set a real value.

This should then also get rid of that weird max_llcs check in llc_id(),
right?
Re: [PATCH v2 04/23] sched/cache: Make LLC id continuous
Posted by Tim Chen 3 days, 22 hours ago
On Tue, 2025-12-09 at 12:58 +0100, Peter Zijlstra wrote:
> On Wed, Dec 03, 2025 at 03:07:23PM -0800, Tim Chen wrote:
> 
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 710ed9943d27..0a3918269906 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct            = 20;
> >  
> >  static int llc_id(int cpu)
> >  {
> > +	int llc;
> > +
> >  	if (cpu < 0)
> >  		return -1;
> >  
> > +	llc = per_cpu(sd_llc_id, cpu);
> > +	/* avoid race with cpu hotplug */
> > +	if (unlikely(llc >= max_llcs))
> > +		return -1;
> > +
> > +	return llc;
> >  }
> >  
> >  void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
> 
> > @@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> >  DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
> >  DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
> >  
> > +/*
> > + * Assign continuous llc id for the CPU, and return
> > + * the assigned llc id.
> > + */
> > +static int update_llc_id(struct sched_domain *sd,
> > +			 int cpu)
> > +{
> > +	int id = per_cpu(sd_llc_id, cpu), i;
> > +
> > +	if (id >= 0)
> > +		return id;
> > +
> > +	if (sd) {
> > +		/* Look for any assigned id and reuse it.*/
> > +		for_each_cpu(i, sched_domain_span(sd)) {
> > +			id = per_cpu(sd_llc_id, i);
> > +
> > +			if (id >= 0) {
> > +				per_cpu(sd_llc_id, cpu) = id;
> > +				return id;
> > +			}
> > +		}
> > +	}
> > +
> > +	/*
> > +	 * When 1. there is no id assigned to this LLC domain,
> > +	 * or 2. the sd is NULL, we reach here.
> > +	 * Consider the following scenario,
> > +	 * CPU0~CPU95 are in the node0, CPU96~CPU191 are
> > +	 * in the node1. During bootup, maxcpus=96 is
> > +	 * appended.
> > +	 * case 1: When running cpu_attach_domain(CPU24)
> > +	 * during boot up, CPU24 is the first CPU in its
> > +	 * non-NULL LLC domain. However,
> > +	 * its corresponding llc id has not been assigned yet.
> > +	 *
> > +	 * case 2: After boot up, the CPU100 is brought up
> > +	 * via sysfs manually. As a result, CPU100 has only a
> > +	 * Numa domain attached, because CPU100 is the only CPU
> > +	 * of a sched domain, all its bottom domains are degenerated.
> > +	 * The LLC domain pointer sd is NULL for CPU100.
> > +	 *
> > +	 * For both cases, we want to increase the number of LLCs.
> > +	 */
> > +	per_cpu(sd_llc_id, cpu) = max_llcs++;
> > +
> > +	return per_cpu(sd_llc_id, cpu);
> > +}
> 
> I'm not sure I follow. So partition_sched_domains() first calls
> detach_destroy_domains() on the old set, and then build_sched_domains()
> on the new set.
> 
> Do detach_destroy_domain() will do:
> 
>   cpu_attach_domain(NULL,..);
> 
> That is, it will explicitly attach the NULL sched_domain to a CPU. At
> which point I feel update_llc_id() should be returning -1, no?
> 
> Then later, build_sched_domains() will set a !NULL sched_domain, at
> which point update_llc_id() can set a real value.
> 
> This should then also get rid of that weird max_llcs check in llc_id(),
> right?

Thanks for pointing this out.  Yes, we should take care of the
attachment of NULL sd. Will update the code accordingly.

Tim
Re: [PATCH v2 04/23] sched/cache: Make LLC id continuous
Posted by Chen, Yu C 3 days, 14 hours ago
On 12/16/2025 4:49 AM, Tim Chen wrote:
> On Tue, 2025-12-09 at 12:58 +0100, Peter Zijlstra wrote:
>> On Wed, Dec 03, 2025 at 03:07:23PM -0800, Tim Chen wrote:
>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 710ed9943d27..0a3918269906 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct            = 20;
>>>   
>>>   static int llc_id(int cpu)
>>>   {
>>> +	int llc;
>>> +
>>>   	if (cpu < 0)
>>>   		return -1;
>>>   
>>> +	llc = per_cpu(sd_llc_id, cpu);
>>> +	/* avoid race with cpu hotplug */
>>> +	if (unlikely(llc >= max_llcs))
>>> +		return -1;
>>> +
>>> +	return llc;
>>>   }
>>>   
>>>   void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
>>
>>> @@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
>>>   DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
>>>   DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
>>>   
>>> +/*
>>> + * Assign continuous llc id for the CPU, and return
>>> + * the assigned llc id.
>>> + */
>>> +static int update_llc_id(struct sched_domain *sd,
>>> +			 int cpu)
>>> +{
>>> +	int id = per_cpu(sd_llc_id, cpu), i;
>>> +
>>> +	if (id >= 0)
>>> +		return id;
>>> +
>>> +	if (sd) {
>>> +		/* Look for any assigned id and reuse it.*/
>>> +		for_each_cpu(i, sched_domain_span(sd)) {
>>> +			id = per_cpu(sd_llc_id, i);
>>> +
>>> +			if (id >= 0) {
>>> +				per_cpu(sd_llc_id, cpu) = id;
>>> +				return id;
>>> +			}
>>> +		}
>>> +	}
>>> +
>>> +	/*
>>> +	 * When 1. there is no id assigned to this LLC domain,
>>> +	 * or 2. the sd is NULL, we reach here.
>>> +	 * Consider the following scenario,
>>> +	 * CPU0~CPU95 are in the node0, CPU96~CPU191 are
>>> +	 * in the node1. During bootup, maxcpus=96 is
>>> +	 * appended.
>>> +	 * case 1: When running cpu_attach_domain(CPU24)
>>> +	 * during boot up, CPU24 is the first CPU in its
>>> +	 * non-NULL LLC domain. However,
>>> +	 * its corresponding llc id has not been assigned yet.
>>> +	 *
>>> +	 * case 2: After boot up, the CPU100 is brought up
>>> +	 * via sysfs manually. As a result, CPU100 has only a
>>> +	 * Numa domain attached, because CPU100 is the only CPU
>>> +	 * of a sched domain, all its bottom domains are degenerated.
>>> +	 * The LLC domain pointer sd is NULL for CPU100.
>>> +	 *
>>> +	 * For both cases, we want to increase the number of LLCs.
>>> +	 */
>>> +	per_cpu(sd_llc_id, cpu) = max_llcs++;
>>> +
>>> +	return per_cpu(sd_llc_id, cpu);
>>> +}
>>
>> I'm not sure I follow. So partition_sched_domains() first calls
>> detach_destroy_domains() on the old set, and then build_sched_domains()
>> on the new set.
>>
>> Do detach_destroy_domain() will do:
>>
>>    cpu_attach_domain(NULL,..);
>>
>> That is, it will explicitly attach the NULL sched_domain to a CPU. At
>> which point I feel update_llc_id() should be returning -1, no?
>>
>> Then later, build_sched_domains() will set a !NULL sched_domain, at
>> which point update_llc_id() can set a real value.
>>
>> This should then also get rid of that weird max_llcs check in llc_id(),
>> right?

The check for max_llcs was intended to prevent out-of-bounds access
to rq->nr_pref_llc[] at multiple points in the code. For example, with
dst_llc = llc_id(env->dst_cpu): the LLC id of the CPU is updated in
update_llc_id(), but that update happens before we reallocate the
nr_pref_llc buffer, so dst_llc may end up exceeding the bounds of the
original nr_pref_llc buffer.

For this reason, we added the check if (llc >= max_llcs) in llc_id(),
which guards the accesses to rq->nr_pref_llc[dst_llc].

However, I agree that the max_llcs check is not properly integrated
into the current patch: it should instead be placed in the 7th patch,
which would better illustrate the rationale for the check:
sched/cache: Introduce per runqueue task LLC preference counter

In the 7th patch, we actually increment new_max_llcs rather than
max_llcs — meaning max_llcs always represents the "old" number of LLCs.
As a result, there is a race window between extending the rq->nr_pref_llc
buffer and updating max_llcs.


@@ -714,7 +827,7 @@ static int update_llc_id(struct sched_domain *sd,
  	 *
  	 * For both cases, we want to increase the number of LLCs.
  	 */
-	per_cpu(sd_llc_id, cpu) = max_llcs++;
+	per_cpu(sd_llc_id, cpu) = new_max_llcs++;

  	return per_cpu(sd_llc_id, cpu);
  }


> Thanks for pointing this out.  Yes, we should take care of the
> attachment of NULL sd. Will update the code accordingly.
> 

My understanding is that, if the sd is NULL, it is either because
update_llc_id() was invoked via detach_destroy_domains() for the old set,
or because of case 2 mentioned in the comment above:
Say CPU0-CPU95 are online during bootup and the boot command line has
maxcpus=96. Later, after bootup, the user wants to bring up CPU100; the
LLC domain for CPU100 is NULL in this case (due to sd degeneration), and
a new LLC should be detected.

That is to say, when we reach update_llc_id(), there could be 2 reasons
for a NULL sd. In the detach_destroy_domains() case, update_llc_id()
should return a valid id without increasing max_llcs, because of
     if (id >= 0)
         return id;
And in the latter case, max_llcs should be increased.
Let me double check on this.

thanks,
Chenyu


> Tim
Re: [PATCH v2 04/23] sched/cache: Make LLC id continuous
Posted by Tim Chen 2 days, 23 hours ago
On Tue, 2025-12-16 at 13:31 +0800, Chen, Yu C wrote:
> On 12/16/2025 4:49 AM, Tim Chen wrote:
> > On Tue, 2025-12-09 at 12:58 +0100, Peter Zijlstra wrote:
> > > On Wed, Dec 03, 2025 at 03:07:23PM -0800, Tim Chen wrote:
> > > 
> > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > index 710ed9943d27..0a3918269906 100644
> > > > --- a/kernel/sched/fair.c
> > > > +++ b/kernel/sched/fair.c
> > > > @@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct            = 20;
> > > >   
> > > >   static int llc_id(int cpu)
> > > >   {
> > > > +	int llc;
> > > > +
> > > >   	if (cpu < 0)
> > > >   		return -1;
> > > >   
> > > > +	llc = per_cpu(sd_llc_id, cpu);
> > > > +	/* avoid race with cpu hotplug */
> > > > +	if (unlikely(llc >= max_llcs))
> > > > +		return -1;
> > > > +
> > > > +	return llc;
> > > >   }
> > > >   
> > > >   void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
> > > 
> > > > @@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> > > >   DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
> > > >   DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
> > > >   
> > > > +/*
> > > > + * Assign continuous llc id for the CPU, and return
> > > > + * the assigned llc id.
> > > > + */
> > > > +static int update_llc_id(struct sched_domain *sd,
> > > > +			 int cpu)
> > > > +{
> > > > +	int id = per_cpu(sd_llc_id, cpu), i;
> > > > +
> > > > +	if (id >= 0)
> > > > +		return id;
> > > > +
> > > > +	if (sd) {
> > > > +		/* Look for any assigned id and reuse it.*/
> > > > +		for_each_cpu(i, sched_domain_span(sd)) {
> > > > +			id = per_cpu(sd_llc_id, i);
> > > > +
> > > > +			if (id >= 0) {
> > > > +				per_cpu(sd_llc_id, cpu) = id;
> > > > +				return id;
> > > > +			}
> > > > +		}
> > > > +	}
> > > > +
> > > > +	/*
> > > > +	 * When 1. there is no id assigned to this LLC domain,
> > > > +	 * or 2. the sd is NULL, we reach here.
> > > > +	 * Consider the following scenario,
> > > > +	 * CPU0~CPU95 are in the node0, CPU96~CPU191 are
> > > > +	 * in the node1. During bootup, maxcpus=96 is
> > > > +	 * appended.
> > > > +	 * case 1: When running cpu_attach_domain(CPU24)
> > > > +	 * during boot up, CPU24 is the first CPU in its
> > > > +	 * non-NULL LLC domain. However,
> > > > +	 * its corresponding llc id has not been assigned yet.
> > > > +	 *
> > > > +	 * case 2: After boot up, the CPU100 is brought up
> > > > +	 * via sysfs manually. As a result, CPU100 has only a
> > > > +	 * Numa domain attached, because CPU100 is the only CPU
> > > > +	 * of a sched domain, all its bottom domains are degenerated.
> > > > +	 * The LLC domain pointer sd is NULL for CPU100.
> > > > +	 *
> > > > +	 * For both cases, we want to increase the number of LLCs.
> > > > +	 */
> > > > +	per_cpu(sd_llc_id, cpu) = max_llcs++;
> > > > +
> > > > +	return per_cpu(sd_llc_id, cpu);
> > > > +}
> > > 
> > > I'm not sure I follow. So partition_sched_domains() first calls
> > > detach_destroy_domains() on the old set, and then build_sched_domains()
> > > on the new set.
> > > 
> > > Do detach_destroy_domain() will do:
> > > 
> > >    cpu_attach_domain(NULL,..);
> > > 
> > > That is, it will explicitly attach the NULL sched_domain to a CPU. At
> > > which point I feel update_llc_id() should be returning -1, no?
> > > 
> > > Then later, build_sched_domains() will set a !NULL sched_domain, at
> > > which point update_llc_id() can set a real value.
> > > 
> > > This should then also get rid of that weird max_llcs check in llc_id(),
> > > right?
> 
> The check for max_llcs was intended to prevent out-of-bounds access
> to rq->nr_pref_llc[] at multiple points in the code.
> Since dst_llc = llc_id(env->dst_cpu); — and while the LLC ID for the
>   CPU is updated in update_llc_id(), this update occurs before we reallocate
>   the nr_pref_llc buffer — dst_llc may end up exceeding the bounds of the
> original nr_pref_llc buffer.
> 
> For this reason, we added a check if (dst_llc > max_llc) in llc_id()
> when attempting to access rq->nr_pref_llc[dst_llc].
> 
> However, I agree that the max_llc check seems to not properly integrated
> into  the current patch: it should instead be placed in the 7th patch, as
> this would better illustrate the rationale for the max_llc check here:
> sched/cache: Introduce per runqueue task LLC preference counter
> 
> In the 7th patch, we actually increment new_max_llcs rather than
> max_llcs — meaning max_llcs always represents the "old" number of LLCs.
> As a result, there is a race window between extending the rq->nr_pref_llc
> buffer and updating max_llcs.
> 
> 
> @@ -714,7 +827,7 @@ static int update_llc_id(struct sched_domain *sd,
>   	 *
>   	 * For both cases, we want to increase the number of LLCs.
>   	 */
> -	per_cpu(sd_llc_id, cpu) = max_llcs++;
> +	per_cpu(sd_llc_id, cpu) = new_max_llcs++;
> 
>   	return per_cpu(sd_llc_id, cpu);
>   }
> 
> 
> > Thanks for pointing this out.  Yes, we should take care of the
> > attachment of NULL sd. Will update the code accordingly.
> > 
> 
> My understanding is that, if the sd is NULL, it is either because invoked
> by detach_destroy_domain() for the old set, or by case 2 mentioned in 
> above comments:
> Say, CPU0-CPU95 are online during bootup, the boot command line is 
> maxcpus=96.
> Later after bootup, the user wants to bring up CPU100, the LLC domain for
> CPU100 is NULL in this case(due to sd generation), and a new LLC should be
> detected.
> 
> That is to say, when we reach update_llc_id(), there could be 2 reasons
> for NULL sd. For the detach_destroy_domain() case, update_llc_id()
> should return a valid id without increasing the max_llcs, because of
>      if (id >= 0)
>          return id;
> And for the latter, the max_llcs should be increased.
> Let me double check on this.

The issue is we could offline all CPUs in an LLC and online them later.
In the current code, we will assign their ids all to -1. So on attach
of CPUs again, we'll be assigning a new LLC.  I think the proper thing
to do is to not reassign the llc id of the offlined cpu (the case where
sd == NULL) and keep the originally assigned llc id.  Then we should be
okay and not increase max_llcs.

Tim

> 
> thanks,
> Chenyu
> 
> 
> > Tim
Re: [PATCH v2 04/23] sched/cache: Make LLC id continuous
Posted by Chen, Yu C 2 days, 14 hours ago
On 12/17/2025 3:53 AM, Tim Chen wrote:
> On Tue, 2025-12-16 at 13:31 +0800, Chen, Yu C wrote:
>> On 12/16/2025 4:49 AM, Tim Chen wrote:
>>> On Tue, 2025-12-09 at 12:58 +0100, Peter Zijlstra wrote:
>>>> On Wed, Dec 03, 2025 at 03:07:23PM -0800, Tim Chen wrote:
>>>>
>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>> index 710ed9943d27..0a3918269906 100644
>>>>> --- a/kernel/sched/fair.c
>>>>> +++ b/kernel/sched/fair.c
>>>>> @@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct            = 20;
>>>>>    
>>>>>    static int llc_id(int cpu)
>>>>>    {
>>>>> +	int llc;
>>>>> +
>>>>>    	if (cpu < 0)
>>>>>    		return -1;
>>>>>    
>>>>> +	llc = per_cpu(sd_llc_id, cpu);
>>>>> +	/* avoid race with cpu hotplug */
>>>>> +	if (unlikely(llc >= max_llcs))
>>>>> +		return -1;
>>>>> +
>>>>> +	return llc;
>>>>>    }
>>>>>    
>>>>>    void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
>>>>
>>>>> @@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
>>>>>    DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
>>>>>    DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
>>>>>    
>>>>> +/*
>>>>> + * Assign continuous llc id for the CPU, and return
>>>>> + * the assigned llc id.
>>>>> + */
>>>>> +static int update_llc_id(struct sched_domain *sd,
>>>>> +			 int cpu)
>>>>> +{
>>>>> +	int id = per_cpu(sd_llc_id, cpu), i;
>>>>> +
>>>>> +	if (id >= 0)
>>>>> +		return id;
>>>>> +
>>>>> +	if (sd) {
>>>>> +		/* Look for any assigned id and reuse it.*/
>>>>> +		for_each_cpu(i, sched_domain_span(sd)) {
>>>>> +			id = per_cpu(sd_llc_id, i);
>>>>> +
>>>>> +			if (id >= 0) {
>>>>> +				per_cpu(sd_llc_id, cpu) = id;
>>>>> +				return id;
>>>>> +			}
>>>>> +		}
>>>>> +	}
>>>>> +
>>>>> +	/*
>>>>> +	 * When 1. there is no id assigned to this LLC domain,
>>>>> +	 * or 2. the sd is NULL, we reach here.
>>>>> +	 * Consider the following scenario,
>>>>> +	 * CPU0~CPU95 are in the node0, CPU96~CPU191 are
>>>>> +	 * in the node1. During bootup, maxcpus=96 is
>>>>> +	 * appended.
>>>>> +	 * case 1: When running cpu_attach_domain(CPU24)
>>>>> +	 * during boot up, CPU24 is the first CPU in its
>>>>> +	 * non-NULL LLC domain. However,
>>>>> +	 * its corresponding llc id has not been assigned yet.
>>>>> +	 *
>>>>> +	 * case 2: After boot up, the CPU100 is brought up
>>>>> +	 * via sysfs manually. As a result, CPU100 has only a
>>>>> +	 * Numa domain attached, because CPU100 is the only CPU
>>>>> +	 * of a sched domain, all its bottom domains are degenerated.
>>>>> +	 * The LLC domain pointer sd is NULL for CPU100.
>>>>> +	 *
>>>>> +	 * For both cases, we want to increase the number of LLCs.
>>>>> +	 */
>>>>> +	per_cpu(sd_llc_id, cpu) = max_llcs++;
>>>>> +
>>>>> +	return per_cpu(sd_llc_id, cpu);
>>>>> +}
>>>>
>>>> I'm not sure I follow. So partition_sched_domains() first calls
>>>> detach_destroy_domains() on the old set, and then build_sched_domains()
>>>> on the new set.
>>>>
>>>> Do detach_destroy_domain() will do:
>>>>
>>>>     cpu_attach_domain(NULL,..);
>>>>
>>>> That is, it will explicitly attach the NULL sched_domain to a CPU. At
>>>> which point I feel update_llc_id() should be returning -1, no?
>>>>
>>>> Then later, build_sched_domains() will set a !NULL sched_domain, at
>>>> which point update_llc_id() can set a real value.
>>>>
>>>> This should then also get rid of that weird max_llcs check in llc_id(),
>>>> right?
>>
>> The check for max_llcs was intended to prevent out-of-bounds access
>> to rq->nr_pref_llc[] at multiple points in the code.
>> Since dst_llc = llc_id(env->dst_cpu); — and while the LLC ID for the
>>    CPU is updated in update_llc_id(), this update occurs before we reallocate
>>    the nr_pref_llc buffer — dst_llc may end up exceeding the bounds of the
>> original nr_pref_llc buffer.
>>
>> For this reason, we added a check if (dst_llc > max_llc) in llc_id()
>> when attempting to access rq->nr_pref_llc[dst_llc].
>>
>> However, I agree that the max_llc check seems to not properly integrated
>> into  the current patch: it should instead be placed in the 7th patch, as
>> this would better illustrate the rationale for the max_llc check here:
>> sched/cache: Introduce per runqueue task LLC preference counter
>>
>> In the 7th patch, we actually increment new_max_llcs rather than
>> max_llcs — meaning max_llcs always represents the "old" number of LLCs.
>> As a result, there is a race window between extending the rq->nr_pref_llc
>> buffer and updating max_llcs.
>>
>>
>> @@ -714,7 +827,7 @@ static int update_llc_id(struct sched_domain *sd,
>>    	 *
>>    	 * For both cases, we want to increase the number of LLCs.
>>    	 */
>> -	per_cpu(sd_llc_id, cpu) = max_llcs++;
>> +	per_cpu(sd_llc_id, cpu) = new_max_llcs++;
>>
>>    	return per_cpu(sd_llc_id, cpu);
>>    }
>>
>>
>>> Thanks for pointing this out.  Yes, we should take care of the
>>> attachment of NULL sd. Will update the code accordingly.
>>>
>>
>> My understanding is that, if the sd is NULL, it is either because invoked
>> by detach_destroy_domain() for the old set, or by case 2 mentioned in
>> above comments:
>> Say, CPU0-CPU95 are online during bootup, the boot command line is
>> maxcpus=96.
>> Later after bootup, the user wants to bring up CPU100, the LLC domain for
>> CPU100 is NULL in this case(due to sd generation), and a new LLC should be
>> detected.
>>
>> That is to say, when we reach update_llc_id(), there could be 2 reasons
>> for NULL sd. For the detach_destroy_domain() case, update_llc_id()
>> should return a valid id without increasing the max_llcs, because of
>>       if (id >= 0)
>>           return id;
>> And for the latter, the max_llcs should be increased.
>> Let me double check on this.
> 
> The issue is we could offline all CPUs in a LLC and online them later.
> In the current code, we will assign their ids all to -1.

I suppose we don't reset the ids in the current implementation; only
the first scan of LLCs resets/initializes the ids to -1 in
build_sched_domains()?
         if (!max_llcs) { //max_llcs is initialized to 0 during bootup
                 for_each_possible_cpu(i)
                         per_cpu(sd_llc_id, i) = -1;
         }

> So on attach
> of CPUs again, we'll be assigning a new LLC.  I think the proper thing
> to do is not to assign llc id of the offlined cpu (the case where sd == NULL)
> and keep the original llc id assigned.  Then we should be okay and not
> increase max_llcs.
> 

This is already what the current implementation does, because we don't
assign new ids to CPUs that already have an id (no matter whether they
are offline or online).

thanks,
Chenyu