[patch V3 13/20] sched/mmcid: Provide precomputed maximal value

Posted by Thomas Gleixner 3 months, 1 week ago
Reading mm::mm_users and mm::mm_cid::nr_cpus_allowed every time to compute
the maximal CID value is just wasteful as that value only changes on
fork(), exit() and possibly when the affinity changes.

So it can be easily precomputed at those points and provided in mm::mm_cid
for consumption in the hot path.

But there is an issue with using mm::mm_users for accounting: it does not
necessarily reflect the number of user space tasks, as other kernel code
can take temporary references on the MM, which skews the picture.

Solve that by adding a users counter to struct mm_mm_cid, which is modified
by fork() and exit() and used for the precomputation under mm_mm_cid::lock.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/rseq_types.h |    6 ++++
 kernel/fork.c              |    1 
 kernel/sched/core.c        |   59 ++++++++++++++++++++++++++++++++-------------
 kernel/sched/sched.h       |    3 --
 4 files changed, 50 insertions(+), 19 deletions(-)

--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -117,14 +117,20 @@ struct mm_cid_pcpu {
 /**
  * struct mm_mm_cid - Storage for per MM CID data
  * @pcpu:		Per CPU storage for CIDs associated to a CPU
+ * @max_cids:		The exclusive maximum CID value for allocation and convergence
  * @nr_cpus_allowed:	The number of CPUs in the per MM allowed CPUs map. The map
  *			is growth only.
+ * @users:		The number of tasks sharing this MM. Separate from mm::mm_users
+ *			as that is modified via mmget()/mmput() by other entities which
+ *			do not actually share the MM.
  * @lock:		Spinlock to protect all fields except @pcpu. It also protects
  *			the MM cid cpumask and the MM cidmask bitmap.
  */
 struct mm_mm_cid {
 	struct mm_cid_pcpu	__percpu *pcpu;
+	unsigned int		max_cids;
 	unsigned int		nr_cpus_allowed;
+	unsigned int		users;
 	raw_spinlock_t		lock;
 }____cacheline_aligned_in_smp;
 #else /* CONFIG_SCHED_MM_CID */
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2455,6 +2455,7 @@ static bool need_futex_hash_allocate_def
 	exit_task_namespaces(p);
 bad_fork_cleanup_mm:
 	if (p->mm) {
+		sched_mm_cid_exit(p);
 		mm_clear_owner(p->mm, p);
 		mmput(p->mm);
 	}
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4485,7 +4485,6 @@ static void __sched_fork(u64 clone_flags
 	init_numa_balancing(clone_flags, p);
 	p->wake_entry.u_flags = CSD_TYPE_TTWU;
 	p->migration_pending = NULL;
-	init_sched_mm_cid(p);
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -10369,15 +10368,27 @@ void call_trace_sched_update_nr_running(
 
 #ifdef CONFIG_SCHED_MM_CID
 /*
- * When a task exits, the MM CID held by the task is not longer required as
- * the task cannot return to user space.
+ * Update the CID range properties when the constraints change. Invoked via
+ * fork(), exit() and affinity changes
  */
+static void mm_update_max_cids(struct mm_struct *mm)
+{
+	struct mm_mm_cid *mc = &mm->mm_cid;
+	unsigned int max_cids;
+
+	lockdep_assert_held(&mm->mm_cid.lock);
+
+	/* Calculate the new maximum constraint */
+	max_cids = min(mc->nr_cpus_allowed, mc->users);
+	WRITE_ONCE(mc->max_cids, max_cids);
+}
+
 static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk)
 {
 	struct cpumask *mm_allowed;
 	unsigned int weight;
 
-	if (!mm)
+	if (!mm || !READ_ONCE(mm->mm_cid.users))
 		return;
 
 	/*
@@ -10387,9 +10398,30 @@ static inline void mm_update_cpus_allowe
 	guard(raw_spinlock)(&mm->mm_cid.lock);
 	mm_allowed = mm_cpus_allowed(mm);
 	weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk);
+	if (weight == mm->mm_cid.nr_cpus_allowed)
+		return;
 	WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, weight);
+	mm_update_max_cids(mm);
+}
+
+void sched_mm_cid_fork(struct task_struct *t)
+{
+	struct mm_struct *mm = t->mm;
+
+	WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
+
+	guard(raw_spinlock)(&mm->mm_cid.lock);
+	t->mm_cid.active = 1;
+	mm->mm_cid.users++;
+	/* Preset last_cid for mm_cid_select() */
+	t->mm_cid.last_cid = READ_ONCE(mm->mm_cid.max_cids) - 1;
+	mm_update_max_cids(mm);
 }
 
+/*
+ * When a task exits, the MM CID held by the task is no longer required as
+ * the task cannot return to user space.
+ */
 void sched_mm_cid_exit(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
@@ -10397,12 +10429,14 @@ void sched_mm_cid_exit(struct task_struc
 	if (!mm || !t->mm_cid.active)
 		return;
 
-	guard(preempt)();
+	guard(raw_spinlock)(&mm->mm_cid.lock);
 	t->mm_cid.active = 0;
+	mm->mm_cid.users--;
 	if (t->mm_cid.cid != MM_CID_UNSET) {
 		clear_bit(t->mm_cid.cid, mm_cidmask(mm));
 		t->mm_cid.cid = MM_CID_UNSET;
 	}
+	mm_update_max_cids(mm);
 }
 
 /* Deactivate MM CID allocation across execve() */
@@ -10414,22 +10448,11 @@ void sched_mm_cid_before_execve(struct t
 /* Reactivate MM CID after successful execve() */
 void sched_mm_cid_after_execve(struct task_struct *t)
 {
-	struct mm_struct *mm = t->mm;
-
-	if (!mm)
-		return;
-
+	sched_mm_cid_fork(t);
 	guard(preempt)();
-	t->mm_cid.active = 1;
 	mm_cid_select(t);
 }
 
-void sched_mm_cid_fork(struct task_struct *t)
-{
-	WARN_ON_ONCE(!t->mm || t->mm_cid.cid != MM_CID_UNSET);
-	t->mm_cid.active = 1;
-}
-
 void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
 {
 	struct mm_cid_pcpu __percpu *pcpu = mm->mm_cid.pcpu;
@@ -10438,7 +10461,9 @@ void mm_init_cid(struct mm_struct *mm, s
 	for_each_possible_cpu(cpu)
 		per_cpu_ptr(pcpu, cpu)->cid = MM_CID_UNSET;
 
+	mm->mm_cid.max_cids = 0;
 	mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
+	mm->mm_cid.users = 0;
 	raw_spin_lock_init(&mm->mm_cid.lock);
 	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
 	bitmap_zero(mm_cidmask(mm), bitmap_size(num_possible_cpus()));
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3570,7 +3570,7 @@ static inline bool mm_cid_get(struct tas
 	struct mm_struct *mm = t->mm;
 	unsigned int max_cids;
 
-	max_cids = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users));
+	max_cids = READ_ONCE(mm->mm_cid.max_cids);
 
 	/* Try to reuse the last CID of this task */
 	if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids))
@@ -3613,7 +3613,6 @@ static inline void switch_mm_cid(struct
 }
 
 #else /* !CONFIG_SCHED_MM_CID: */
-static inline void init_sched_mm_cid(struct task_struct *t) { }
 static inline void mm_cid_select(struct task_struct *t) { }
 static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
 #endif /* !CONFIG_SCHED_MM_CID */
Re: [patch V3 13/20] sched/mmcid: Provide precomputed maximal value
Posted by Mathieu Desnoyers 3 months, 1 week ago
On 2025-10-29 09:09, Thomas Gleixner wrote:
> Reading mm::mm_users and mm:::mm_cid::nr_cpus_allowed everytime to compute

every time

> + * @max_cids:		The exclusive maximum CID value for allocation and convergance

convergence

> + * @users:		The number of tasks sharing this MM. Seperate from mm::mm_users

Separate

> + *			as that is modified by mmget()/mm_put() by other entities which
> + *			do not actually share the MM.
>    * @lock:		Spinlock to protect all fields except @pcpu. It also protects
>    *			the MM cid cpumask and the MM cidmask bitmap.
>    */
>   struct mm_mm_cid {
>   	struct mm_cid_pcpu	__percpu *pcpu;
> +	unsigned int		max_cids;
>   	unsigned int		nr_cpus_allowed;
> +	unsigned int		users;

I suspect this reintroduces false-sharing between the "users"
and "lock" fields (updated every time a thread is forked/exits)
and load of the pcpu pointer which is pretty much immutable.
This will slow down accesses to the percpu data in the scheduler
fast path.

>   	raw_spinlock_t		lock;
>   }____cacheline_aligned_in_smp;
>   #else /* CONFIG_SCHED_MM_CID */

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
Re: [patch V3 13/20] sched/mmcid: Provide precomputed maximal value
Posted by Thomas Gleixner 3 months, 1 week ago
On Thu, Oct 30 2025 at 10:23, Mathieu Desnoyers wrote:
> On 2025-10-29 09:09, Thomas Gleixner wrote:
>>   struct mm_mm_cid {
>>   	struct mm_cid_pcpu	__percpu *pcpu;
>> +	unsigned int		max_cids;
>>   	unsigned int		nr_cpus_allowed;
>> +	unsigned int		users;
>
> I suspect this reintroduces false-sharing between the "users"
> and "lock" fields (updated every time a thread is forked/exits)
> and load of the pcpu pointer which is pretty much immutable.
> This will slow down accesses to the percpu data in the scheduler
> fast path.

At this point yes, but when all bits are in place then the lock fields
end up in a different cache line.

The false sharing issue vs. *pcpu and max_cids is minor, but I can move
the low-frequency modified members past the work, so it does not matter
at all. The work stuff is rarely used, so there is no point in worrying
about the occasional cache line contention on that.

Thanks,

        tglx
Re: [patch V3 13/20] sched/mmcid: Provide precomputed maximal value
Posted by Mathieu Desnoyers 3 months, 1 week ago
On 2025-10-31 11:06, Thomas Gleixner wrote:
> On Thu, Oct 30 2025 at 10:23, Mathieu Desnoyers wrote:
>> On 2025-10-29 09:09, Thomas Gleixner wrote:
>>>    struct mm_mm_cid {
>>>    	struct mm_cid_pcpu	__percpu *pcpu;
>>> +	unsigned int		max_cids;
>>>    	unsigned int		nr_cpus_allowed;
>>> +	unsigned int		users;
>>
>> I suspect this reintroduces false-sharing between the "users"
>> and "lock" fields (updated every time a thread is forked/exits)
>> and load of the pcpu pointer which is pretty much immutable.
>> This will slow down accesses to the percpu data in the scheduler
>> fast path.
> 
> At this point yes, but when all bits are in place then the lock fields
> end up in a different cache line.
> 
> The false sharing issue vs. *pcpu and max_cids is minor, but I can move
> the low frequency modified members past the work, so it does not matter
> at all. The work stuff is rarely used, so there is no point to worry
> about the occasional cache line contention on that.

If we have alignment requirements on fields that matter for performance,
I recommend using __cacheline_group_{begin,end}_aligned() to make this
explicit. See include/linux/cache.h.
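
For illustration, a usage sketch (the group name and the exact split of the
members are made up here, not taken from the series):

	struct mm_mm_cid {
		/* Hot, mostly read-only data for the scheduler fast path */
		__cacheline_group_begin_aligned(mm_cid_fastpath);
		struct mm_cid_pcpu	__percpu *pcpu;
		unsigned int		max_cids;
		unsigned int		nr_cpus_allowed;
		__cacheline_group_end_aligned(mm_cid_fastpath);

		/* Only modified on fork(), exit() and affinity changes */
		unsigned int		users;
		raw_spinlock_t		lock;
	};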

Thanks,

Mathieu



-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com