[PATCH v2 07/23] sched/cache: Introduce per runqueue task LLC preference counter
Posted by Tim Chen 2 weeks, 1 day ago
Each runqueue is assigned an array where each element tracks
the number of tasks preferring a given LLC, indexed from 0 to
max_llcs - 1.

For example, rq->nr_pref_llc[3] = 2 signifies that there are 2 tasks on
this runqueue which prefer to run within LLC3.

The load balancer can use this information to identify busy
runqueues and migrate tasks to their preferred LLC domains.
This array is reallocated at runtime if the number of LLCs
increases due to CPU hotplug. Only extending the buffer (rather
than shrinking it) is supported, to keep the implementation simple.

Introduce the buffer allocation mechanism here; the statistics
themselves are calculated in the subsequent patch.
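
As a rough sketch of how these counters are meant to be used once the
accounting lands (the helpers and the p->preferred_llc field below are
illustrative only, not code from this patch):

	/*
	 * Illustrative sketch: adjust the preference counter when a task
	 * that prefers a given LLC is enqueued on / dequeued from this rq.
	 * The names are hypothetical; the real accounting is added in the
	 * next patch.
	 */
	static inline void account_llc_enqueue(struct rq *rq, struct task_struct *p)
	{
		if (rq->nr_pref_llc && p->preferred_llc >= 0)
			rq->nr_pref_llc[p->preferred_llc]++;
	}

	static inline void account_llc_dequeue(struct rq *rq, struct task_struct *p)
	{
		if (rq->nr_pref_llc && p->preferred_llc >= 0)
			rq->nr_pref_llc[p->preferred_llc]--;
	}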

Co-developed-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---

Notes:
    v1->v2:
        Remove static allocation of per runqueue LLC preference arrays.
        Size the array to the actual number of LLCs online. (Peter Zijlstra, Madadi Vineeth Reddy)

 kernel/sched/core.c     |   1 +
 kernel/sched/sched.h    |   1 +
 kernel/sched/topology.c | 117 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 118 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 48626c81ba8e..ce533dc485f5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8800,6 +8800,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SCHED_CACHE
 		raw_spin_lock_init(&rq->cpu_epoch_lock);
 		rq->cpu_epoch_next = jiffies;
+		rq->nr_pref_llc = NULL;
 #endif
 
 		zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ee8b70647835..8f2a779825e4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1129,6 +1129,7 @@ struct rq {
 #ifdef CONFIG_SCHED_CACHE
 	unsigned int		nr_pref_llc_running;
 	unsigned int		nr_llc_running;
+	unsigned int		*nr_pref_llc;
 #endif
 #ifdef CONFIG_NO_HZ_COMMON
 	unsigned long		last_blocked_load_update_tick;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f25d950ab015..d583399fc6a1 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -17,8 +17,121 @@ void sched_domains_mutex_unlock(void)
 	mutex_unlock(&sched_domains_mutex);
 }
 
+/* the number of max LLCs being detected */
+static int new_max_llcs;
+/* the current number of max LLCs */
 int max_llcs;
 
+#ifdef CONFIG_SCHED_CACHE
+
+static unsigned int *alloc_new_pref_llcs(unsigned int *old, unsigned int **gc)
+{
+	unsigned int *new = NULL;
+
+	new = kcalloc(new_max_llcs, sizeof(unsigned int),
+		      GFP_KERNEL | __GFP_NOWARN);
+
+	if (!new) {
+		*gc = NULL;
+	} else {
+		/*
+		 * Place old entry in garbage collector
+		 * for later disposal.
+		 */
+		*gc = old;
+	}
+	return new;
+}
+
+static void populate_new_pref_llcs(unsigned int *old, unsigned int *new)
+{
+	int i;
+
+	if (!old)
+		return;
+
+	for (i = 0; i < max_llcs; i++)
+		new[i] = old[i];
+}
+
+static int resize_llc_pref(void)
+{
+	unsigned int *__percpu *tmp_llc_pref;
+	int i, ret = 0;
+
+	if (new_max_llcs <= max_llcs)
+		return 0;
+
+	/*
+	 * Allocate temp percpu pointer for old llc_pref,
+	 * which will be released after switching to the
+	 * new buffer.
+	 */
+	tmp_llc_pref = alloc_percpu_noprof(unsigned int *);
+	if (!tmp_llc_pref)
+		return -ENOMEM;
+
+	for_each_present_cpu(i)
+		*per_cpu_ptr(tmp_llc_pref, i) = NULL;
+
+	/*
+	 * Resize the per rq nr_pref_llc buffer and
+	 * switch to this new buffer.
+	 */
+	for_each_present_cpu(i) {
+		struct rq_flags rf;
+		unsigned int *new;
+		struct rq *rq;
+
+		rq = cpu_rq(i);
+		new = alloc_new_pref_llcs(rq->nr_pref_llc, per_cpu_ptr(tmp_llc_pref, i));
+		if (!new) {
+			ret = -ENOMEM;
+
+			goto release_old;
+		}
+
+		/*
+		 * Locking rq ensures that rq->nr_pref_llc values
+		 * don't change with new task enqueue/dequeue
+		 * when we repopulate the newly enlarged array.
+		 */
+		rq_lock_irqsave(rq, &rf);
+		populate_new_pref_llcs(rq->nr_pref_llc, new);
+		rq->nr_pref_llc = new;
+		rq_unlock_irqrestore(rq, &rf);
+	}
+
+release_old:
+	/*
+	 * Load balance is done under rcu_lock.
+	 * Wait for load balance before and during resizing to
+	 * be done. They may refer to old nr_pref_llc[]
+	 * that hasn't been resized.
+	 */
+	synchronize_rcu();
+	for_each_present_cpu(i)
+		kfree(*per_cpu_ptr(tmp_llc_pref, i));
+
+	free_percpu(tmp_llc_pref);
+
+	/* succeed and update */
+	if (!ret)
+		max_llcs = new_max_llcs;
+
+	return ret;
+}
+
+#else
+
+static int resize_llc_pref(void)
+{
+	max_llcs = new_max_llcs;
+	return 0;
+}
+
+#endif
+
 /* Protected by sched_domains_mutex: */
 static cpumask_var_t sched_domains_tmpmask;
 static cpumask_var_t sched_domains_tmpmask2;
@@ -714,7 +827,7 @@ static int update_llc_id(struct sched_domain *sd,
 	 *
 	 * For both cases, we want to increase the number of LLCs.
 	 */
-	per_cpu(sd_llc_id, cpu) = max_llcs++;
+	per_cpu(sd_llc_id, cpu) = new_max_llcs++;
 
 	return per_cpu(sd_llc_id, cpu);
 }
@@ -2674,6 +2787,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	if (has_cluster)
 		static_branch_inc_cpuslocked(&sched_cluster_active);
 
+	resize_llc_pref();
+
 	if (rq && sched_debug_verbose)
 		pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map));
 
-- 
2.32.0
Re: [PATCH v2 07/23] sched/cache: Introduce per runqueue task LLC preference counter
Posted by Peter Zijlstra 1 week, 2 days ago
On Wed, Dec 03, 2025 at 03:07:26PM -0800, Tim Chen wrote:

> +static int resize_llc_pref(void)
> +{
> +	unsigned int *__percpu *tmp_llc_pref;
> +	int i, ret = 0;
> +
> +	if (new_max_llcs <= max_llcs)
> +		return 0;
> +
> +	/*
> +	 * Allocate temp percpu pointer for old llc_pref,
> +	 * which will be released after switching to the
> +	 * new buffer.
> +	 */
> +	tmp_llc_pref = alloc_percpu_noprof(unsigned int *);
> +	if (!tmp_llc_pref)
> +		return -ENOMEM;
> +
> +	for_each_present_cpu(i)
> +		*per_cpu_ptr(tmp_llc_pref, i) = NULL;
> +
> +	/*
> +	 * Resize the per rq nr_pref_llc buffer and
> +	 * switch to this new buffer.
> +	 */
> +	for_each_present_cpu(i) {
> +		struct rq_flags rf;
> +		unsigned int *new;
> +		struct rq *rq;
> +
> +		rq = cpu_rq(i);
> +		new = alloc_new_pref_llcs(rq->nr_pref_llc, per_cpu_ptr(tmp_llc_pref, i));
> +		if (!new) {
> +			ret = -ENOMEM;
> +
> +			goto release_old;
> +		}
> +
> +		/*
> +		 * Locking rq ensures that rq->nr_pref_llc values
> +		 * don't change with new task enqueue/dequeue
> +		 * when we repopulate the newly enlarged array.
> +		 */
> +		rq_lock_irqsave(rq, &rf);
> +		populate_new_pref_llcs(rq->nr_pref_llc, new);
> +		rq->nr_pref_llc = new;
> +		rq_unlock_irqrestore(rq, &rf);
> +	}
> +
> +release_old:
> +	/*
> +	 * Load balance is done under rcu_lock.
> +	 * Wait for load balance before and during resizing to
> +	 * be done. They may refer to old nr_pref_llc[]
> +	 * that hasn't been resized.
> +	 */
> +	synchronize_rcu();
> +	for_each_present_cpu(i)
> +		kfree(*per_cpu_ptr(tmp_llc_pref, i));
> +
> +	free_percpu(tmp_llc_pref);
> +
> +	/* succeed and update */
> +	if (!ret)
> +		max_llcs = new_max_llcs;
> +
> +	return ret;
> +}

Would it perhaps be easier to stick this thing in rq->sd rather than in
rq->nr_pref_llc. That way it automagically switches with the 'new'
domain. And then, with a bit of care, a single load-balance pass should
see a consistent view (there should not be reloads of rq->sd -- which
will be a bit of an audit I suppose).
Re: [PATCH v2 07/23] sched/cache: Introduce per runqueue task LLC preference counter
Posted by Tim Chen 1 week, 2 days ago
On Wed, 2025-12-10 at 13:51 +0100, Peter Zijlstra wrote:
> On Wed, Dec 03, 2025 at 03:07:26PM -0800, Tim Chen wrote:
> 
> > +static int resize_llc_pref(void)
> > +{
> > +	unsigned int *__percpu *tmp_llc_pref;
> > +	int i, ret = 0;
> > +
> > +	if (new_max_llcs <= max_llcs)
> > +		return 0;
> > +
> > +	/*
> > +	 * Allocate temp percpu pointer for old llc_pref,
> > +	 * which will be released after switching to the
> > +	 * new buffer.
> > +	 */
> > +	tmp_llc_pref = alloc_percpu_noprof(unsigned int *);
> > +	if (!tmp_llc_pref)
> > +		return -ENOMEM;
> > +
> > +	for_each_present_cpu(i)
> > +		*per_cpu_ptr(tmp_llc_pref, i) = NULL;
> > +
> > +	/*
> > +	 * Resize the per rq nr_pref_llc buffer and
> > +	 * switch to this new buffer.
> > +	 */
> > +	for_each_present_cpu(i) {
> > +		struct rq_flags rf;
> > +		unsigned int *new;
> > +		struct rq *rq;
> > +
> > +		rq = cpu_rq(i);
> > +		new = alloc_new_pref_llcs(rq->nr_pref_llc, per_cpu_ptr(tmp_llc_pref, i));
> > +		if (!new) {
> > +			ret = -ENOMEM;
> > +
> > +			goto release_old;
> > +		}
> > +
> > +		/*
> > +		 * Locking rq ensures that rq->nr_pref_llc values
> > +		 * don't change with new task enqueue/dequeue
> > +		 * when we repopulate the newly enlarged array.
> > +		 */
> > +		rq_lock_irqsave(rq, &rf);
> > +		populate_new_pref_llcs(rq->nr_pref_llc, new);
> > +		rq->nr_pref_llc = new;
> > +		rq_unlock_irqrestore(rq, &rf);
> > +	}
> > +
> > +release_old:
> > +	/*
> > +	 * Load balance is done under rcu_lock.
> > +	 * Wait for load balance before and during resizing to
> > +	 * be done. They may refer to old nr_pref_llc[]
> > +	 * that hasn't been resized.
> > +	 */
> > +	synchronize_rcu();
> > +	for_each_present_cpu(i)
> > +		kfree(*per_cpu_ptr(tmp_llc_pref, i));
> > +
> > +	free_percpu(tmp_llc_pref);
> > +
> > +	/* succeed and update */
> > +	if (!ret)
> > +		max_llcs = new_max_llcs;
> > +
> > +	return ret;
> > +}
> 
> Would it perhaps be easier to stick this thing in rq->sd rather than in
> rq->nr_pref_llc. That way it automagically switches with the 'new'
> domain. And then, with a bit of care, a single load-balance pass should
> see a consistent view (there should not be reloads of rq->sd -- which
> will be a bit of an audit I suppose).

We need nr_pref_llc information at the runqueue level because the load balancer 
must identify which specific rq has the largest number of tasks that 
prefer a given destination LLC. If we move the counter to the LLC’s sd 
level, we would only know the aggregate number of tasks in the entire LLC 
that prefer that destination—not which rq they reside on. Without per-rq 
counts, we would not be able to select the correct source rq to pull tasks from.

The only way this could work at the LLC-sd level is if all CPUs within 
the LLC shared a single runqueue, which is not the case today.
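
Roughly, the kind of per-rq lookup this enables looks like the following
(sketch only; the helper name is made up and not from this series):

	/*
	 * Sketch: within a source LLC, find the rq with the most tasks
	 * preferring the destination LLC dst_llc.  Illustrative only.
	 */
	static struct rq *busiest_pref_rq(const struct cpumask *src_llc_mask, int dst_llc)
	{
		unsigned int best_nr = 0;
		struct rq *best = NULL;
		int cpu;

		for_each_cpu(cpu, src_llc_mask) {
			struct rq *rq = cpu_rq(cpu);

			if (rq->nr_pref_llc && rq->nr_pref_llc[dst_llc] > best_nr) {
				best_nr = rq->nr_pref_llc[dst_llc];
				best = rq;
			}
		}
		return best;
	}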

Let me know if I understand your comments correctly.

Tim
Re: [PATCH v2 07/23] sched/cache: Introduce per runqueue task LLC preference counter
Posted by Peter Zijlstra 1 week, 1 day ago
On Wed, Dec 10, 2025 at 10:49:14AM -0800, Tim Chen wrote:
> On Wed, 2025-12-10 at 13:51 +0100, Peter Zijlstra wrote:
> > On Wed, Dec 03, 2025 at 03:07:26PM -0800, Tim Chen wrote:

> > Would it perhaps be easier to stick this thing in rq->sd rather than in
> > rq->nr_pref_llc. That way it automagically switches with the 'new'
> > domain. And then, with a bit of care, a single load-balance pass should
> > see a consistent view (there should not be reloads of rq->sd -- which
> > will be a bit of an audit I suppose).
> 
> We need nr_pref_llc information at the runqueue level because the load balancer 
> must identify which specific rq has the largest number of tasks that 
> prefer a given destination LLC. If we move the counter to the LLC’s sd 
> level, we would only know the aggregate number of tasks in the entire LLC 
> that prefer that destination—not which rq they reside on. Without per-rq 
> counts, we would not be able to select the correct source rq to pull tasks from.
> 
> The only way this could work at the LLC-sd level is if all CPUs within 
> the LLC shared a single runqueue, which is not the case today.
> 
> Let me know if I understand your comments correctly.

So the sched_domain instances are per-cpu (hence the need for
sched_domain_shared). So irrespective of what level you stick them at (I
was thinking the bottom most, but it really doesn't matter) they will be
per CPU.

Re: [PATCH v2 07/23] sched/cache: Introduce per runqueue task LLC preference counter
Posted by Tim Chen 2 days, 20 hours ago
On Thu, 2025-12-11 at 11:31 +0100, Peter Zijlstra wrote:
> On Wed, Dec 10, 2025 at 10:49:14AM -0800, Tim Chen wrote:
> > On Wed, 2025-12-10 at 13:51 +0100, Peter Zijlstra wrote:
> > > On Wed, Dec 03, 2025 at 03:07:26PM -0800, Tim Chen wrote:
> 
> > > Would it perhaps be easier to stick this thing in rq->sd rather than in
> > > rq->nr_pref_llc. That way it automagically switches with the 'new'
> > > domain. And then, with a bit of care, a single load-balance pass should
> > > see a consistent view (there should not be reloads of rq->sd -- which
> > > will be a bit of an audit I suppose).
> > 
> > We need nr_pref_llc information at the runqueue level because the load balancer 
> > must identify which specific rq has the largest number of tasks that 
> > prefer a given destination LLC. If we move the counter to the LLC’s sd 
> > level, we would only know the aggregate number of tasks in the entire LLC 
> > that prefer that destination—not which rq they reside on. Without per-rq 
> > counts, we would not be able to select the correct source rq to pull tasks from.
> > 
> > The only way this could work at the LLC-sd level is if all CPUs within 
> > the LLC shared a single runqueue, which is not the case today.
> > 
> > Let me know if I understand your comments correctly.
> 
> So the sched_domain instances are per-cpu (hence the need for
> sched_domain_shared). So irrespective of what level you stick them at (I
> was thinking the bottom most, but it really doesn't matter) they will be
> per CPU.

One side effect of that is that when rebuild_sched_domains() gets triggered,
all rq->sd instances get reallocated, so we'll lose the old LLC preferences
until we have had time to re-sample process occupancy. I think that is okay
as long as rebuild_sched_domains() is not called too frequently.  Is this
assumption correct?

Tim 
Re: [PATCH v2 07/23] sched/cache: Introduce per runqueue task LLC preference counter
Posted by Tim Chen 4 days ago
On Thu, 2025-12-11 at 11:31 +0100, Peter Zijlstra wrote:
> On Wed, Dec 10, 2025 at 10:49:14AM -0800, Tim Chen wrote:
> > On Wed, 2025-12-10 at 13:51 +0100, Peter Zijlstra wrote:
> > > On Wed, Dec 03, 2025 at 03:07:26PM -0800, Tim Chen wrote:
> 
> > > Would it perhaps be easier to stick this thing in rq->sd rather than in
> > > rq->nr_pref_llc. That way it automagically switches with the 'new'
> > > domain. And then, with a bit of care, a single load-balance pass should
> > > see a consistent view (there should not be reloads of rq->sd -- which
> > > will be a bit of an audit I suppose).
> > 
> > We need nr_pref_llc information at the runqueue level because the load balancer 
> > must identify which specific rq has the largest number of tasks that 
> > prefer a given destination LLC. If we move the counter to the LLC’s sd 
> > level, we would only know the aggregate number of tasks in the entire LLC 
> > that prefer that destination—not which rq they reside on. Without per-rq 
> > counts, we would not be able to select the correct source rq to pull tasks from.
> > 
> > The only way this could work at the LLC-sd level is if all CPUs within 
> > the LLC shared a single runqueue, which is not the case today.
> > 
> > Let me know if I understand your comments correctly.
> 
> So the sched_domain instances are per-cpu (hence the need for
> sched_domain_shared). So irrespective of what level you stick them at (I
> was thinking the bottom most, but it really doesn't matter) they will be
> per CPU.
> 

Okay, I see what you're saying.  Will update code accordingly.

Tim
Re: [PATCH v2 07/23] sched/cache: Introduce per runqueue task LLC preference counter
Posted by Peter Zijlstra 1 week, 2 days ago
On Wed, Dec 03, 2025 at 03:07:26PM -0800, Tim Chen wrote:

> +static int resize_llc_pref(void)
> +{
> +	unsigned int *__percpu *tmp_llc_pref;
> +	int i, ret = 0;
> +
> +	if (new_max_llcs <= max_llcs)
> +		return 0;
> +
> +	/*
> +	 * Allocate temp percpu pointer for old llc_pref,
> +	 * which will be released after switching to the
> +	 * new buffer.
> +	 */
> +	tmp_llc_pref = alloc_percpu_noprof(unsigned int *);
> +	if (!tmp_llc_pref)
> +		return -ENOMEM;
> +
> +	for_each_present_cpu(i)
> +		*per_cpu_ptr(tmp_llc_pref, i) = NULL;
> +
> +	/*
> +	 * Resize the per rq nr_pref_llc buffer and
> +	 * switch to this new buffer.
> +	 */
> +	for_each_present_cpu(i) {
> +		struct rq_flags rf;
> +		unsigned int *new;
> +		struct rq *rq;
> +
> +		rq = cpu_rq(i);
> +		new = alloc_new_pref_llcs(rq->nr_pref_llc, per_cpu_ptr(tmp_llc_pref, i));
> +		if (!new) {
> +			ret = -ENOMEM;
> +
> +			goto release_old;
> +		}
> +
> +		/*
> +		 * Locking rq ensures that rq->nr_pref_llc values
> +		 * don't change with new task enqueue/dequeue
> +		 * when we repopulate the newly enlarged array.
> +		 */
> +		rq_lock_irqsave(rq, &rf);
> +		populate_new_pref_llcs(rq->nr_pref_llc, new);
> +		rq->nr_pref_llc = new;
> +		rq_unlock_irqrestore(rq, &rf);
> +	}
> +
> +release_old:
> +	/*
> +	 * Load balance is done under rcu_lock.
> +	 * Wait for load balance before and during resizing to
> +	 * be done. They may refer to old nr_pref_llc[]
> +	 * that hasn't been resized.
> +	 */
> +	synchronize_rcu();
> +	for_each_present_cpu(i)
> +		kfree(*per_cpu_ptr(tmp_llc_pref, i));
> +
> +	free_percpu(tmp_llc_pref);
> +
> +	/* succeed and update */
> +	if (!ret)
> +		max_llcs = new_max_llcs;
> +
> +	return ret;
> +}

> @@ -2674,6 +2787,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>  	if (has_cluster)
>  		static_branch_inc_cpuslocked(&sched_cluster_active);
>  
> +	resize_llc_pref();
> +
>  	if (rq && sched_debug_verbose)
>  		pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map));

I suspect people will hate on you for that synchronize_rcu() in there.

Specifically, we do build_sched_domain() for every CPU brought online,
which means booting 512 CPUs now includes 512 sync_rcu()s.

Worse, IIRC sync_rcu() is O(n) (or worse -- could be n*ln(n)) in number
of CPUs, so the total thing will be O(n^2) (or worse) for bringing CPUs
online.
Re: [PATCH v2 07/23] sched/cache: Introduce per runqueue task LLC preference counter
Posted by Tim Chen 1 week, 2 days ago
On Wed, 2025-12-10 at 13:43 +0100, Peter Zijlstra wrote:
> On Wed, Dec 03, 2025 at 03:07:26PM -0800, Tim Chen wrote:
> 
> > +static int resize_llc_pref(void)
> > +{
> > +	unsigned int *__percpu *tmp_llc_pref;
> > +	int i, ret = 0;
> > +
> > +	if (new_max_llcs <= max_llcs)
> > +		return 0;
> > +
> > +	/*
> > +	 * Allocate temp percpu pointer for old llc_pref,
> > +	 * which will be released after switching to the
> > +	 * new buffer.
> > +	 */
> > +	tmp_llc_pref = alloc_percpu_noprof(unsigned int *);
> > +	if (!tmp_llc_pref)
> > +		return -ENOMEM;
> > +
> > +	for_each_present_cpu(i)
> > +		*per_cpu_ptr(tmp_llc_pref, i) = NULL;
> > +
> > +	/*
> > +	 * Resize the per rq nr_pref_llc buffer and
> > +	 * switch to this new buffer.
> > +	 */
> > +	for_each_present_cpu(i) {
> > +		struct rq_flags rf;
> > +		unsigned int *new;
> > +		struct rq *rq;
> > +
> > +		rq = cpu_rq(i);
> > +		new = alloc_new_pref_llcs(rq->nr_pref_llc, per_cpu_ptr(tmp_llc_pref, i));
> > +		if (!new) {
> > +			ret = -ENOMEM;
> > +
> > +			goto release_old;
> > +		}
> > +
> > +		/*
> > +		 * Locking rq ensures that rq->nr_pref_llc values
> > +		 * don't change with new task enqueue/dequeue
> > +		 * when we repopulate the newly enlarged array.
> > +		 */
> > +		rq_lock_irqsave(rq, &rf);
> > +		populate_new_pref_llcs(rq->nr_pref_llc, new);
> > +		rq->nr_pref_llc = new;
> > +		rq_unlock_irqrestore(rq, &rf);
> > +	}
> > +
> > +release_old:
> > +	/*
> > +	 * Load balance is done under rcu_lock.
> > +	 * Wait for load balance before and during resizing to
> > +	 * be done. They may refer to old nr_pref_llc[]
> > +	 * that hasn't been resized.
> > +	 */
> > +	synchronize_rcu();
> > +	for_each_present_cpu(i)
> > +		kfree(*per_cpu_ptr(tmp_llc_pref, i));
> > +
> > +	free_percpu(tmp_llc_pref);
> > +
> > +	/* succeed and update */
> > +	if (!ret)
> > +		max_llcs = new_max_llcs;
> > +
> > +	return ret;
> > +}
> 
> > @@ -2674,6 +2787,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> >  	if (has_cluster)
> >  		static_branch_inc_cpuslocked(&sched_cluster_active);
> >  
> > +	resize_llc_pref();
> > +
> >  	if (rq && sched_debug_verbose)
> >  		pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map));
> 
> I suspect people will hate on you for that synchronize_rcu() in there.
> 
> Specifically, we do build_sched_domain() for every CPU brought online,
> which means booting 512 CPUs now includes 512 sync_rcu()s.
> Worse, IIRC sync_rcu() is O(n) (or worse -- could be n*ln(n)) in number
> of CPUs, so the total thing will be O(n^2) (or worse) for bringing CPUs
> online.
> 
> 

Though we only do synchronize_rcu() in resize_llc_pref() when we encounter
a new LLC and need a larger array of LLCs, not on every CPU brought online.
That said, I agree the free is better done in an RCU callback to avoid the
synchronize_rcu() overhead.
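
Something along these lines, for example (the wrapper struct and the
function names are made up for illustration):

	/*
	 * Sketch of a call_rcu() based deferred free, instead of blocking
	 * in synchronize_rcu().  Names are illustrative.
	 */
	struct stale_pref_llc {
		struct rcu_head rcu;
		unsigned int *buf;
	};

	static void free_stale_pref_llc(struct rcu_head *head)
	{
		struct stale_pref_llc *stale = container_of(head, struct stale_pref_llc, rcu);

		kfree(stale->buf);	/* the old rq->nr_pref_llc array */
		kfree(stale);
	}

	static int defer_free_pref_llc(unsigned int *old_buf)
	{
		struct stale_pref_llc *stale;

		if (!old_buf)
			return 0;

		stale = kmalloc(sizeof(*stale), GFP_KERNEL);
		if (!stale)
			return -ENOMEM;

		stale->buf = old_buf;
		call_rcu(&stale->rcu, free_stale_pref_llc);
		return 0;
	}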

Tim
Re: [PATCH v2 07/23] sched/cache: Introduce per runqueue task LLC preference counter
Posted by Peter Zijlstra 1 week, 3 days ago
On Wed, Dec 03, 2025 at 03:07:26PM -0800, Tim Chen wrote:

> +#ifdef CONFIG_SCHED_CACHE
> +
> +static unsigned int *alloc_new_pref_llcs(unsigned int *old, unsigned int **gc)
> +{
> +	unsigned int *new = NULL;
> +
> +	new = kcalloc(new_max_llcs, sizeof(unsigned int),
> +		      GFP_KERNEL | __GFP_NOWARN);
> +
> +	if (!new) {
> +		*gc = NULL;
> +	} else {
> +		/*
> +		 * Place old entry in garbage collector
> +		 * for later disposal.
> +		 */
> +		*gc = old;
> +	}
> +	return new;
> +}
> +
> +static void populate_new_pref_llcs(unsigned int *old, unsigned int *new)
> +{
> +	int i;
> +
> +	if (!old)
> +		return;
> +
> +	for (i = 0; i < max_llcs; i++)
> +		new[i] = old[i];
> +}
> +
> +static int resize_llc_pref(void)
> +{
> +	unsigned int *__percpu *tmp_llc_pref;
> +	int i, ret = 0;
> +
> +	if (new_max_llcs <= max_llcs)
> +		return 0;
> +
> +	/*
> +	 * Allocate temp percpu pointer for old llc_pref,
> +	 * which will be released after switching to the
> +	 * new buffer.
> +	 */
> +	tmp_llc_pref = alloc_percpu_noprof(unsigned int *);
> +	if (!tmp_llc_pref)
> +		return -ENOMEM;
> +
> +	for_each_present_cpu(i)
> +		*per_cpu_ptr(tmp_llc_pref, i) = NULL;
> +
> +	/*
> +	 * Resize the per rq nr_pref_llc buffer and
> +	 * switch to this new buffer.
> +	 */
> +	for_each_present_cpu(i) {
> +		struct rq_flags rf;
> +		unsigned int *new;
> +		struct rq *rq;
> +
> +		rq = cpu_rq(i);
> +		new = alloc_new_pref_llcs(rq->nr_pref_llc, per_cpu_ptr(tmp_llc_pref, i));
> +		if (!new) {
> +			ret = -ENOMEM;
> +
> +			goto release_old;
> +		}
> +
> +		/*
> +		 * Locking rq ensures that rq->nr_pref_llc values
> +		 * don't change with new task enqueue/dequeue
> +		 * when we repopulate the newly enlarged array.
> +		 */

		guard(rq_lock_irq)(rq);

Notably, this cannot be with IRQs disabled, as you're doing allocations.

> +		rq_lock_irqsave(rq, &rf);
> +		populate_new_pref_llcs(rq->nr_pref_llc, new);
> +		rq->nr_pref_llc = new;
> +		rq_unlock_irqrestore(rq, &rf);
> +	}
> +
> +release_old:
> +	/*
> +	 * Load balance is done under rcu_lock.
> +	 * Wait for load balance before and during resizing to
> +	 * be done. They may refer to old nr_pref_llc[]
> +	 * that hasn't been resized.
> +	 */
> +	synchronize_rcu();
> +	for_each_present_cpu(i)
> +		kfree(*per_cpu_ptr(tmp_llc_pref, i));
> +
> +	free_percpu(tmp_llc_pref);
> +
> +	/* succeed and update */
> +	if (!ret)
> +		max_llcs = new_max_llcs;
> +
> +	return ret;
> +}

I think you need at least cpus_read_lock(), because present_cpu is
dynamic -- but I'm not quite sure what lock is used to serialize it.
Re: [PATCH v2 07/23] sched/cache: Introduce per runqueue task LLC preference counter
Posted by Tim Chen 1 week, 2 days ago
On Tue, 2025-12-09 at 14:06 +0100, Peter Zijlstra wrote:
> On Wed, Dec 03, 2025 at 03:07:26PM -0800, Tim Chen wrote:
> 
> > +#ifdef CONFIG_SCHED_CACHE
> > +
> > +static unsigned int *alloc_new_pref_llcs(unsigned int *old, unsigned int **gc)
> > +{
> > +	unsigned int *new = NULL;
> > +
> > +	new = kcalloc(new_max_llcs, sizeof(unsigned int),
> > +		      GFP_KERNEL | __GFP_NOWARN);
> > +
> > +	if (!new) {
> > +		*gc = NULL;
> > +	} else {
> > +		/*
> > +		 * Place old entry in garbage collector
> > +		 * for later disposal.
> > +		 */
> > +		*gc = old;
> > +	}
> > +	return new;
> > +}
> > +
> > +static void populate_new_pref_llcs(unsigned int *old, unsigned int *new)
> > +{
> > +	int i;
> > +
> > +	if (!old)
> > +		return;
> > +
> > +	for (i = 0; i < max_llcs; i++)
> > +		new[i] = old[i];
> > +}
> > +
> > +static int resize_llc_pref(void)
> > +{
> > +	unsigned int *__percpu *tmp_llc_pref;
> > +	int i, ret = 0;
> > +
> > +	if (new_max_llcs <= max_llcs)
> > +		return 0;
> > +
> > +	/*
> > +	 * Allocate temp percpu pointer for old llc_pref,
> > +	 * which will be released after switching to the
> > +	 * new buffer.
> > +	 */
> > +	tmp_llc_pref = alloc_percpu_noprof(unsigned int *);
> > +	if (!tmp_llc_pref)
> > +		return -ENOMEM;
> > +
> > +	for_each_present_cpu(i)
> > +		*per_cpu_ptr(tmp_llc_pref, i) = NULL;
> > +
> > +	/*
> > +	 * Resize the per rq nr_pref_llc buffer and
> > +	 * switch to this new buffer.
> > +	 */
> > +	for_each_present_cpu(i) {
> > +		struct rq_flags rf;
> > +		unsigned int *new;
> > +		struct rq *rq;
> > +
> > +		rq = cpu_rq(i);
> > +		new = alloc_new_pref_llcs(rq->nr_pref_llc, per_cpu_ptr(tmp_llc_pref, i));
> > +		if (!new) {
> > +			ret = -ENOMEM;
> > +
> > +			goto release_old;
> > +		}
> > +
> > +		/*
> > +		 * Locking rq ensures that rq->nr_pref_llc values
> > +		 * don't change with new task enqueue/dequeue
> > +		 * when we repopulate the newly enlarged array.
> > +		 */
> 
> 		guard(rq_lock_irq)(rq);
> 
> Notably, this cannot be with IRQs disabled, as you're doing allocations.

Okay.
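
I.e. the loop body would become roughly (sketch only):

	/*
	 * Sketch of the guard based form: the rq lock (taken with IRQs
	 * disabled) is dropped automatically at the end of each loop
	 * iteration, so struct rq_flags and the explicit unlock go away.
	 */
	for_each_present_cpu(i) {
		struct rq *rq = cpu_rq(i);
		unsigned int *new;

		new = alloc_new_pref_llcs(rq->nr_pref_llc, per_cpu_ptr(tmp_llc_pref, i));
		if (!new) {
			ret = -ENOMEM;
			goto release_old;
		}

		guard(rq_lock_irq)(rq);
		populate_new_pref_llcs(rq->nr_pref_llc, new);
		rq->nr_pref_llc = new;
	}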

> 
> > +		rq_lock_irqsave(rq, &rf);
> > +		populate_new_pref_llcs(rq->nr_pref_llc, new);
> > +		rq->nr_pref_llc = new;
> > +		rq_unlock_irqrestore(rq, &rf);
> > +	}
> > +
> > +release_old:
> > +	/*
> > +	 * Load balance is done under rcu_lock.
> > +	 * Wait for load balance before and during resizing to
> > +	 * be done. They may refer to old nr_pref_llc[]
> > +	 * that hasn't been resized.
> > +	 */
> > +	synchronize_rcu();
> > +	for_each_present_cpu(i)
> > +		kfree(*per_cpu_ptr(tmp_llc_pref, i));
> > +
> > +	free_percpu(tmp_llc_pref);
> > +
> > +	/* succeed and update */
> > +	if (!ret)
> > +		max_llcs = new_max_llcs;
> > +
> > +	return ret;
> > +}
> 
> I think you need at least cpus_read_lock(), because present_cpu is
> dynamic -- but I'm not quite sure what lock is used to serialize it.

Let me check what the right lock is for making sure present_cpu
is not changed.  Thanks.
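
If it turns out that every path into build_sched_domains() already holds
the CPU hotplug lock, perhaps it is enough to document that requirement,
e.g. (sketch only, assuming the callers really do hold it):

	/*
	 * Sketch: assert (rather than take) the hotplug lock at the top of
	 * resize_llc_pref(), so lockdep flags any path that walks the
	 * present mask without it.  Only valid if every caller holds it.
	 */
	lockdep_assert_cpus_held();

The alternative would be taking cpus_read_lock() around the walk, but
that only works if the hotplug write side is not already held on the
paths that rebuild the domains.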

Tim