This is a follow-up to the V1 submission:
https://lore.kernel.org/20260129210219.452851594@kernel.org
Ihor and Shrikanth reported hard lockups which can be traced back to the
recent rewrite of the MM_CID management code.
1) The task to CPU ownership transition lacks the intermediate
transition mode, which can lead to CID pool exhaustion and a
subsequent live lock. That intermediate mode was already implemented
for the reverse operation but omitted for this transition as the
original analysis missed a few possible scheduling scenarios.
2) Weakly ordered architectures can observe inconsistent state which
causes them to make the wrong decision. That leads to the same problem
as with #1.
The following series addresses these issues and fixes another, albeit
harmless, inconsistent state hiccup which was found when analysing the
above issues.
With these issues addressed the last change optimizes the bitmap
utilization in the transition modes.
The series applies on top of Linus' tree and passes the selftests as well
as a thread pool emulator which stress tests the ownership transitions.
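For illustration only (this is a simplified sketch, not the actual
emulator code): the kind of load which exercises the transitions is to
repeatedly cross the user count threshold so the MM flips between per
task and per CPU ownership mode, e.g.:

#include <pthread.h>
#include <sched.h>
#include <unistd.h>

static void *worker(void *arg)
{
	(void)arg;
	/* Schedule in and out a few times while holding a CID */
	for (int i = 0; i < 1000; i++)
		sched_yield();
	return NULL;
}

int main(void)
{
	long nthreads = sysconf(_SC_NPROCESSORS_ONLN) + 2;
	pthread_t tids[nthreads];

	for (int iter = 0; iter < 10000; iter++) {
		/* Going above the CID limit switches to per CPU mode ... */
		for (long i = 0; i < nthreads; i++)
			pthread_create(&tids[i], NULL, worker, NULL);
		/* ... and dropping below it switches back to per task mode */
		for (long i = 0; i < nthreads; i++)
			pthread_join(tids[i], NULL);
	}
	return 0;
}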
Changes vs. V1:
- Move the mm_cid_fixup_tasks_to_cpus() wrapping where it belongs (patch 1)
- Add barriers before and after the fixup functions to prevent CPU
reordering of the mode stores - Mathieu
- Update change logs - Mathieu
Delta patch against V1 is below
Thanks,
tglx
---
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -133,7 +133,6 @@ struct mm_cid_pcpu {
* as that is modified by mmget()/mm_put() by other entities which
* do not actually share the MM.
* @pcpu_thrs: Threshold for switching back from per CPU mode
- * @mode_change: Mode change in progress
* @update_deferred: A deferred switch back to per task mode is pending.
*/
struct mm_mm_cid {
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10445,6 +10445,12 @@ static bool mm_update_max_cids(struct mm
/* Flip the mode and set the transition flag to bridge the transfer */
WRITE_ONCE(mc->mode, mc->mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU));
+ /*
+ * Order the store against the subsequent fixups so that
+ * acquire(rq::lock) cannot be reordered by the CPU before the
+ * store.
+ */
+ smp_mb();
return true;
}
@@ -10487,6 +10493,16 @@ static inline void mm_update_cpus_allowe
irq_work_queue(&mc->irq_work);
}
+static inline void mm_cid_complete_transit(struct mm_struct *mm, unsigned int mode)
+{
+ /*
+ * Ensure that the store removing the TRANSIT bit cannot be
+ * reordered by the CPU before the fixups have been completed.
+ */
+ smp_mb();
+ WRITE_ONCE(mm->mm_cid.mode, mode);
+}
+
static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
{
if (cid_on_cpu(t->mm_cid.cid)) {
@@ -10530,8 +10546,7 @@ static void mm_cid_fixup_cpus_to_tasks(s
}
}
}
- /* Clear the transition bit in the mode */
- WRITE_ONCE(mm->mm_cid.mode, 0);
+ mm_cid_complete_transit(mm, 0);
}
static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
@@ -10603,8 +10618,7 @@ static void mm_cid_fixup_tasks_to_cpus(v
struct mm_struct *mm = current->mm;
mm_cid_do_fixup_tasks_to_cpus(mm);
- /* Clear the transition bit in the mode */
- WRITE_ONCE(mm->mm_cid.mode, MM_CID_ONCPU);
+ mm_cid_complete_transit(mm, MM_CID_ONCPU);
}
static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3914,8 +3914,7 @@ static __always_inline void mm_cid_sched
/*
* If transition mode is done, transfer ownership when the CID is
- * within the convergion range. Otherwise the next schedule in will
- * have to allocate or converge
+ * within the convergence range to optimize the next schedule in.
*/
if (!cid_in_transit(mode) && cid < READ_ONCE(mm->mm_cid.max_cids)) {
if (cid_on_cpu(mode))
On Mon, Feb 02, 2026 at 10:39:35AM +0100, Thomas Gleixner wrote:
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -10445,6 +10445,12 @@ static bool mm_update_max_cids(struct mm
>
> /* Flip the mode and set the transition flag to bridge the transfer */
> WRITE_ONCE(mc->mode, mc->mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU));
> + /*
> + * Order the store against the subsequent fixups so that
> + * acquire(rq::lock) cannot be reordered by the CPU before the
> + * store.
> + */
> + smp_mb();
> return true;
> }
>
> @@ -10487,6 +10493,16 @@ static inline void mm_update_cpus_allowe
> irq_work_queue(&mc->irq_work);
> }
>
> +static inline void mm_cid_complete_transit(struct mm_struct *mm, unsigned int mode)
> +{
> + /*
> + * Ensure that the store removing the TRANSIT bit cannot be
> + * reordered by the CPU before the fixups have been completed.
> + */
> + smp_mb();
> + WRITE_ONCE(mm->mm_cid.mode, mode);
> +}
I think this could've been smp_store_release(), but this is the slow
path so nobody cares and this is nicely symmetric.
On 2026-02-02 05:14, Peter Zijlstra wrote:
> On Mon, Feb 02, 2026 at 10:39:35AM +0100, Thomas Gleixner wrote:
>
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -10445,6 +10445,12 @@ static bool mm_update_max_cids(struct mm
>>
>> /* Flip the mode and set the transition flag to bridge the transfer */
>> WRITE_ONCE(mc->mode, mc->mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU));
>> + /*
>> + * Order the store against the subsequent fixups so that
>> + * acquire(rq::lock) cannot be reordered by the CPU before the
>> + * store.
>> + */
>> + smp_mb();
>> return true;
>> }
>>
>> @@ -10487,6 +10493,16 @@ static inline void mm_update_cpus_allowe
>> irq_work_queue(&mc->irq_work);
>> }
>>
>> +static inline void mm_cid_complete_transit(struct mm_struct *mm, unsigned int mode)
>> +{
>> + /*
>> + * Ensure that the store removing the TRANSIT bit cannot be
>> + * reordered by the CPU before the fixups have been completed.
>> + */
>> + smp_mb();
>> + WRITE_ONCE(mm->mm_cid.mode, mode);
>> +}
>
> I think this could've been smp_store_release(), but this is the slow
> path so nobody cares and this is nicely symmetric.
I'm not sure the store-release would work here. What load-acquire
would it pair with ?
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On Mon, Feb 02, 2026 at 06:46:34AM -0500, Mathieu Desnoyers wrote:
> On 2026-02-02 05:14, Peter Zijlstra wrote:
> > On Mon, Feb 02, 2026 at 10:39:35AM +0100, Thomas Gleixner wrote:
> >
> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -10445,6 +10445,12 @@ static bool mm_update_max_cids(struct mm
> > > /* Flip the mode and set the transition flag to bridge the transfer */
> > > WRITE_ONCE(mc->mode, mc->mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU));
> > > + /*
> > > + * Order the store against the subsequent fixups so that
> > > + * acquire(rq::lock) cannot be reordered by the CPU before the
> > > + * store.
> > > + */
> > > + smp_mb();
> > > return true;
> > > }
> > > @@ -10487,6 +10493,16 @@ static inline void mm_update_cpus_allowe
> > > irq_work_queue(&mc->irq_work);
> > > }
> > > +static inline void mm_cid_complete_transit(struct mm_struct *mm, unsigned int mode)
> > > +{
> > > + /*
> > > + * Ensure that the store removing the TRANSIT bit cannot be
> > > + * reordered by the CPU before the fixups have been completed.
> > > + */
> > > + smp_mb();
> > > + WRITE_ONCE(mm->mm_cid.mode, mode);
> > > +}
> >
> > I think this could've been smp_store_release(), but this is the slow
> > path so nobody cares and this is nicely symmetric.
>
> I'm not sure the store-release would work here. What load-acquire
> would it pair with ?
The purpose here -- per the comment is to ensure the fixup stuff is
visible before the TRANSIT bit goes 0, store-release ensures that.
That pairs with whatever cares about this barrier now.
On Mon, Feb 02 2026 at 13:54, Peter Zijlstra wrote:
> On Mon, Feb 02, 2026 at 06:46:34AM -0500, Mathieu Desnoyers wrote:
>> On 2026-02-02 05:14, Peter Zijlstra wrote:
>> > On Mon, Feb 02, 2026 at 10:39:35AM +0100, Thomas Gleixner wrote:
>> >
>> > > --- a/kernel/sched/core.c
>> > > +++ b/kernel/sched/core.c
>> > > @@ -10445,6 +10445,12 @@ static bool mm_update_max_cids(struct mm
>> > > /* Flip the mode and set the transition flag to bridge the transfer */
>> > > WRITE_ONCE(mc->mode, mc->mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU));
>> > > + /*
>> > > + * Order the store against the subsequent fixups so that
>> > > + * acquire(rq::lock) cannot be reordered by the CPU before the
>> > > + * store.
>> > > + */
>> > > + smp_mb();
>> > > return true;
>> > > }
>> > > @@ -10487,6 +10493,16 @@ static inline void mm_update_cpus_allowe
>> > > irq_work_queue(&mc->irq_work);
>> > > }
>> > > +static inline void mm_cid_complete_transit(struct mm_struct *mm, unsigned int mode)
>> > > +{
>> > > + /*
>> > > + * Ensure that the store removing the TRANSIT bit cannot be
>> > > + * reordered by the CPU before the fixups have been completed.
>> > > + */
>> > > + smp_mb();
>> > > + WRITE_ONCE(mm->mm_cid.mode, mode);
>> > > +}
>> >
>> > I think this could've been smp_store_release(), but this is the slow
>> > path so nobody cares and this is nicely symmetric.
>>
>> I'm not sure the store-release would work here. What load-acquire
>> would it pair with ?
>
> The purpose here -- per the comment is to ensure the fixup stuff is
> visible before the TRANSIT bit goes 0, store-release ensures that.
>
> That pairs with whatever cares about this barrier now.
I thought about this and stopped reading memory-barriers.txt after my
brain started to hurt.
acquire A
store B
release A
acquire C
store D
release C
A and C are independent of each other, as are B and D. So according to
the docs acquire C can be reordered before release A. So far so
good. But what's unclear to me is whether this scenario is possible:
acquire A
acquire C
store D
release C
store B
release A
because that would screw up stuff badly.
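Roughly as a litmus test (variable names purely illustrative, and this
only formalizes the question, it does not answer it):

C unlock-lock-store-order

{}

P0(spinlock_t *a, spinlock_t *c, int *b, int *d)
{
	spin_lock(a);
	WRITE_ONCE(*b, 1);
	spin_unlock(a);
	spin_lock(c);
	WRITE_ONCE(*d, 1);
	spin_unlock(c);
}

P1(int *b, int *d)
{
	int rd;
	int rb;

	rd = READ_ONCE(*d);
	smp_rmb();
	rb = READ_ONCE(*b);
}

exists (1:rd=1 /\ 1:rb=0)

If that exists clause can be satisfied, the store to B is effectively
delayed past the second critical section, which is the scenario above.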
Thanks,
tglx
On 2026-02-02 07:54, Peter Zijlstra wrote:
> On Mon, Feb 02, 2026 at 06:46:34AM -0500, Mathieu Desnoyers wrote:
>> On 2026-02-02 05:14, Peter Zijlstra wrote:
>>> On Mon, Feb 02, 2026 at 10:39:35AM +0100, Thomas Gleixner wrote:
>>>
>>>> --- a/kernel/sched/core.c
>>>> +++ b/kernel/sched/core.c
>>>> @@ -10445,6 +10445,12 @@ static bool mm_update_max_cids(struct mm
>>>> /* Flip the mode and set the transition flag to bridge the transfer */
>>>> WRITE_ONCE(mc->mode, mc->mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU));
>>>> + /*
>>>> + * Order the store against the subsequent fixups so that
>>>> + * acquire(rq::lock) cannot be reordered by the CPU before the
>>>> + * store.
>>>> + */
>>>> + smp_mb();
>>>> return true;
>>>> }
>>>> @@ -10487,6 +10493,16 @@ static inline void mm_update_cpus_allowe
>>>> irq_work_queue(&mc->irq_work);
>>>> }
>>>> +static inline void mm_cid_complete_transit(struct mm_struct *mm, unsigned int mode)
>>>> +{
>>>> + /*
>>>> + * Ensure that the store removing the TRANSIT bit cannot be
>>>> + * reordered by the CPU before the fixups have been completed.
>>>> + */
>>>> + smp_mb();
>>>> + WRITE_ONCE(mm->mm_cid.mode, mode);
>>>> +}
>>>
>>> I think this could've been smp_store_release(), but this is the slow
>>> path so nobody cares and this is nicely symmetric.
>>
>> I'm not sure the store-release would work here. What load-acquire
>> would it pair with ?
>
> The purpose here -- per the comment is to ensure the fixup stuff is
> visible before the TRANSIT bit goes 0, store-release ensures that.
>
> That pairs with whatever cares about this barrier now.
Now that I think about it some more, I think my advice about adding
smp_mb() before rq lock/after rq unlock was wrong.
The store setting transit will be ordered by rq _unlock_ which acts
as a release barrier, and store clearing transit will be ordered by
rq _lock_ acting as an acquire barrier. Those pair with the respective
rq unlock/lock in the scheduler.
So AFAIU we can remove those two useless smp_mb().
Thoughts ?
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
Ihor reported a BPF CI failure which turned out to be a live lock in the
MM_CID management. The scenario is:
A test program creates the 5th thread, which means the MM_CID users become
more than the number of CPUs (four in this example), so it switches to per
CPU ownership mode.
At this point each live task of the program has a CID associated. Assume
thread creation order assignment for simplicity.
T0 CID0 runs fork() and creates T4
T1 CID1
T2 CID2
T3 CID3
T4 --- not visible yet
T0 sets mm_cid::percpu = true, transfers its own CID to CPU0, on which it
runs, and then starts the fixup, which walks through the threads and either
transfers each per task CID to the CPU the task is running on or drops it
back into the pool if the task is not on a CPU.
During that time T1 - T3 are free to schedule in and out before the fixup
has caught up with them. Going through all possible permutations with a
Python script revealed a few problematic cases. The most trivial one is:
T1 schedules in on CPU1 and observes percpu == true, so it transfers
its CID to CPU1
T1 is migrated to CPU2 and on schedule in observes percpu == true, but
CPU2 does not have a CID associated and T1 transferred its own to
CPU1
So it has to allocate one with the CPU2 runqueue lock held, but the
pool is empty, so it keeps looping in mm_get_cid().
Now T0 reaches T1 in the thread walk and tries to lock the corresponding
runqueue lock, which is held, causing a full live lock.
There is a similar scenario in the reverse direction of switching from per
CPU to task mode, which is way more obvious and was therefore addressed by
an intermediate mode. In this mode the CIDs are marked with MM_CID_TRANSIT,
which means that they are neither owned by the CPU nor by the task. When a
task schedules out with a transit CID it drops the CID back into the pool,
making it available for others to use temporarily. Once the task which
initiated the mode switch has finished the fixup, it clears the transit
mode and the process goes back into per task ownership mode.
Unfortunately this insight was not mapped back to the task to CPU mode
switch as the above described scenario was not considered in the analysis.
Apply the same transit mechanism to the task to CPU mode switch to handle
these problematic cases correctly.
As with the CPU to task transition this results in potential temporary
contention on the CID bitmap, but that's only for the time it takes to
complete the transition. After that it stays in steady mode, which does not
touch the bitmap at all.
Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions")
Reported-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Closes: https://lore.kernel.org/2b7463d7-0f58-4e34-9775-6e2115cfb971@linux.dev
---
V2: Massage change log - Mathieu
Move mm_cid_fixup_tasks_to_cpus() wrapping from patch 2
---
kernel/sched/core.c | 128 +++++++++++++++++++++++++++++++++------------------
kernel/sched/sched.h | 4 +
2 files changed, 88 insertions(+), 44 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10269,7 +10269,8 @@ void call_trace_sched_update_nr_running(
* Serialization rules:
*
* mm::mm_cid::mutex: Serializes fork() and exit() and therefore
- * protects mm::mm_cid::users.
+ * protects mm::mm_cid::users and mode switch
+ * transitions
*
* mm::mm_cid::lock: Serializes mm_update_max_cids() and
* mm_update_cpus_allowed(). Nests in mm_cid::mutex
@@ -10285,14 +10286,61 @@ void call_trace_sched_update_nr_running(
*
* A CID is either owned by a task (stored in task_struct::mm_cid.cid) or
* by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the
- * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode,
- * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the
- * task needs to drop the CID into the pool when scheduling out. Both bits
- * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is
- * actually handed over to user space in the RSEQ memory.
+ * MM_CID_ONCPU bit set.
+ *
+ * During the transition of ownership mode, the MM_CID_TRANSIT bit is set
+ * on the CIDs. When this bit is set the tasks drop the CID back into the
+ * pool when scheduling out.
+ *
+ * Both bits (ONCPU and TRANSIT) are filtered out by task_cid() when the
+ * CID is actually handed over to user space in the RSEQ memory.
*
* Mode switching:
*
+ * All transitions of ownership mode happen in two phases:
+ *
+ * 1) mm:mm_cid.transit contains MM_CID_TRANSIT. This is OR'ed on the CIDs
+ * and denotes that the CID is only temporarily owned by a task. When
+ * the task schedules out it drops the CID back into the pool if this
+ * bit is set.
+ *
+ * 2) The initiating context walks the per CPU space or the tasks to fixup
+ * or drop the CIDs and after completion it clears mm:mm_cid.transit.
+ * After that point the CIDs are strictly task or CPU owned again.
+ *
+ * This two phase transition is required to prevent CID space exhaustion
+ * during the transition as a direct transfer of ownership would fail:
+ *
+ * - On task to CPU mode switch if a task is scheduled in on one CPU and
+ * then migrated to another CPU before the fixup freed enough per task
+ * CIDs.
+ *
+ * - On CPU to task mode switch if two tasks are scheduled in on the same
+ * CPU before the fixup freed per CPU CIDs.
+ *
+ * Both scenarios can result in a live lock because sched_in() is invoked
+ * with runqueue lock held and loops in search of a CID and the fixup
+ * thread can't make progress freeing them up because it is stuck on the
+ * same runqueue lock.
+ *
+ * While MM_CID_TRANSIT is active during the transition phase the MM_CID
+ * bitmap can be contended, but that's a temporary contention bound to the
+ * transition period. After that everything goes back into steady state and
+ * nothing except fork() and exit() will touch the bitmap. This is an
+ * acceptable tradeoff as it completely avoids complex serialization,
+ * memory barriers and atomic operations for the common case.
+ *
+ * Aside from that this mechanism also ensures RT compatibility:
+ *
+ * - The task which runs the fixup is fully preemptible except for the
+ * short runqueue lock held sections.
+ *
+ * - The transient impact of the bitmap contention is only problematic
+ * when there is a thundering herd scenario of tasks scheduling in and
+ * out concurrently. There is not much which can be done about that
+ * except for avoiding mode switching by a proper overall system
+ * configuration.
+ *
* Switching to per CPU mode happens when the user count becomes greater
* than the maximum number of CIDs, which is calculated by:
*
@@ -10306,12 +10354,13 @@ void call_trace_sched_update_nr_running(
*
* At the point of switching to per CPU mode the new user is not yet
* visible in the system, so the task which initiated the fork() runs the
- * fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and
- * either transfers each tasks owned CID to the CPU the task runs on or
- * drops it into the CID pool if a task is not on a CPU at that point in
- * time. Tasks which schedule in before the task walk reaches them do the
- * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes
- * it's guaranteed that no task related to that MM owns a CID anymore.
+ * fixup function. mm_cid_fixup_tasks_to_cpus() walks the thread list and
+ * either marks each task owned CID with MM_CID_TRANSIT if the task is
+ * running on a CPU or drops it into the CID pool if a task is not on a
+ * CPU. Tasks which schedule in before the task walk reaches them do the
+ * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus()
+ * completes it is guaranteed that no task related to that MM owns a CID
+ * anymore.
*
* Switching back to task mode happens when the user count goes below the
* threshold which was recorded on the per CPU mode switch:
@@ -10327,28 +10376,11 @@ void call_trace_sched_update_nr_running(
* run either in the deferred update function in context of a workqueue or
* by a task which forks a new one or by a task which exits. Whatever
* happens first. mm_cid_fixup_cpus_to_task() walks through the possible
- * CPUs and either transfers the CPU owned CIDs to a related task which
- * runs on the CPU or drops it into the pool. Tasks which schedule in on a
- * CPU which the walk did not cover yet do the handover themself.
- *
- * This transition from CPU to per task ownership happens in two phases:
- *
- * 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task
- * CID and denotes that the CID is only temporarily owned by the
- * task. When it schedules out the task drops the CID back into the
- * pool if this bit is set.
- *
- * 2) The initiating context walks the per CPU space and after completion
- * clears mm:mm_cid.transit. So after that point the CIDs are strictly
- * task owned again.
- *
- * This two phase transition is required to prevent CID space exhaustion
- * during the transition as a direct transfer of ownership would fail if
- * two tasks are scheduled in on the same CPU before the fixup freed per
- * CPU CIDs.
- *
- * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID
- * related to that MM is owned by a CPU anymore.
+ * CPUs and either marks the CPU owned CIDs with MM_CID_TRANSIT if a
+ * related task is running on the CPU or drops it into the pool. Tasks
+ * which are scheduled in before the fixup covered them do the handover
+ * themself. When mm_cid_fixup_cpus_to_tasks() completes it is guaranteed
+ * that no CID related to that MM is owned by a CPU anymore.
*/
/*
@@ -10400,9 +10432,9 @@ static bool mm_update_max_cids(struct mm
/* Mode change required? */
if (!!mc->percpu == !!mc->pcpu_thrs)
return false;
- /* When switching back to per TASK mode, set the transition flag */
- if (!mc->pcpu_thrs)
- WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
+
+ /* Set the transition flag to bridge the transfer */
+ WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs);
return true;
}
@@ -10493,10 +10525,10 @@ static void mm_cid_fixup_cpus_to_tasks(s
WRITE_ONCE(mm->mm_cid.transit, 0);
}
-static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
+static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
{
if (cid_on_task(t->mm_cid.cid)) {
- t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid);
+ t->mm_cid.cid = cid_to_transit_cid(t->mm_cid.cid);
pcp->cid = t->mm_cid.cid;
}
}
@@ -10509,18 +10541,17 @@ static bool mm_cid_fixup_task_to_cpu(str
if (!t->mm_cid.active)
return false;
if (cid_on_task(t->mm_cid.cid)) {
- /* If running on the CPU, transfer the CID, otherwise drop it */
+ /* If running on the CPU, put the CID in transit mode, otherwise drop it */
if (task_rq(t)->curr == t)
- mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
+ mm_cid_transit_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
else
mm_unset_cid_on_task(t);
}
return true;
}
-static void mm_cid_fixup_tasks_to_cpus(void)
+static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
{
- struct mm_struct *mm = current->mm;
struct task_struct *p, *t;
unsigned int users;
@@ -10558,6 +10589,15 @@ static void mm_cid_fixup_tasks_to_cpus(v
}
}
+static void mm_cid_fixup_tasks_to_cpus(void)
+{
+ struct mm_struct *mm = current->mm;
+
+ mm_cid_do_fixup_tasks_to_cpus(mm);
+ /* Clear the transition bit */
+ WRITE_ONCE(mm->mm_cid.transit, 0);
+}
+
static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
{
t->mm_cid.active = 1;
@@ -10596,7 +10636,7 @@ void sched_mm_cid_fork(struct task_struc
if (!percpu)
mm_cid_transit_to_task(current, pcp);
else
- mm_cid_transfer_to_cpu(current, pcp);
+ mm_cid_transit_to_cpu(current, pcp);
}
if (percpu) {
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3841,6 +3841,10 @@ static __always_inline void mm_cid_from_
/* Still nothing, allocate a new one */
if (!cid_on_cpu(cpu_cid))
cpu_cid = cid_to_cpu_cid(mm_get_cid(mm));
+
+ /* Set the transition mode flag if required */
+ if (READ_ONCE(mm->mm_cid.transit))
+ cpu_cid = cpu_cid_to_cid(cpu_cid) | MM_CID_TRANSIT;
}
mm_cid_update_pcpu_cid(mm, cpu_cid);
mm_cid_update_task_cid(t, cpu_cid);
On 2026-02-02 04:39, Thomas Gleixner wrote:
[...]
>
> Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions")
> Reported-by: Ihor Solodrai <ihor.solodrai@linux.dev>
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
> Closes: https://lore.kernel.org/2b7463d7-0f58-4e34-9775-6e2115cfb971@linux.dev
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: 4327fb13fa47770183c4850c35382c30ba5f939d
Gitweb: https://git.kernel.org/tip/4327fb13fa47770183c4850c35382c30ba5f939d
Author: Thomas Gleixner <tglx@kernel.org>
AuthorDate: Mon, 02 Feb 2026 10:39:40 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 04 Feb 2026 12:21:11 +01:00
sched/mmcid: Prevent live lock on task to CPU mode transition
Ihor reported a BPF CI failure which turned out to be a live lock in the
MM_CID management. The scenario is:
A test program creates the 5th thread, which means the MM_CID users become
more than the number of CPUs (four in this example), so it switches to per
CPU ownership mode.
At this point each live task of the program has a CID associated. Assume
thread creation order assignment for simplicity.
T0 CID0 runs fork() and creates T4
T1 CID1
T2 CID2
T3 CID3
T4 --- not visible yet
T0 sets mm_cid::percpu = true, transfers its own CID to CPU0, on which it
runs, and then starts the fixup, which walks through the threads and either
transfers each per task CID to the CPU the task is running on or drops it
back into the pool if the task is not on a CPU.
During that time T1 - T3 are free to schedule in and out before the fixup
has caught up with them. Going through all possible permutations with a
Python script revealed a few problematic cases. The most trivial one is:
T1 schedules in on CPU1 and observes percpu == true, so it transfers
its CID to CPU1
T1 is migrated to CPU2 and on schedule in observes percpu == true, but
CPU2 does not have a CID associated and T1 transferred its own to
CPU1
So it has to allocate one with the CPU2 runqueue lock held, but the
pool is empty, so it keeps looping in mm_get_cid().
Now T0 reaches T1 in the thread walk and tries to lock the corresponding
runqueue lock, which is held, causing a full live lock.
There is a similar scenario in the reverse direction of switching from per
CPU to task mode, which is way more obvious and was therefore addressed by
an intermediate mode. In this mode the CIDs are marked with MM_CID_TRANSIT,
which means that they are neither owned by the CPU nor by the task. When a
task schedules out with a transit CID it drops the CID back into the pool,
making it available for others to use temporarily. Once the task which
initiated the mode switch has finished the fixup, it clears the transit
mode and the process goes back into per task ownership mode.
Unfortunately this insight was not mapped back to the task to CPU mode
switch as the above described scenario was not considered in the analysis.
Apply the same transit mechanism to the task to CPU mode switch to handle
these problematic cases correctly.
As with the CPU to task transition this results in potential temporary
contention on the CID bitmap, but that's only for the time it takes to
complete the transition. After that it stays in steady mode, which does not
touch the bitmap at all.
Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions")
Closes: https://lore.kernel.org/2b7463d7-0f58-4e34-9775-6e2115cfb971@linux.dev
Reported-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20260201192834.897115238@kernel.org
---
kernel/sched/core.c | 128 +++++++++++++++++++++++++++---------------
kernel/sched/sched.h | 4 +-
2 files changed, 88 insertions(+), 44 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 045f83a..1e790f2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10269,7 +10269,8 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
* Serialization rules:
*
* mm::mm_cid::mutex: Serializes fork() and exit() and therefore
- * protects mm::mm_cid::users.
+ * protects mm::mm_cid::users and mode switch
+ * transitions
*
* mm::mm_cid::lock: Serializes mm_update_max_cids() and
* mm_update_cpus_allowed(). Nests in mm_cid::mutex
@@ -10285,14 +10286,61 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
*
* A CID is either owned by a task (stored in task_struct::mm_cid.cid) or
* by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the
- * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode,
- * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the
- * task needs to drop the CID into the pool when scheduling out. Both bits
- * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is
- * actually handed over to user space in the RSEQ memory.
+ * MM_CID_ONCPU bit set.
+ *
+ * During the transition of ownership mode, the MM_CID_TRANSIT bit is set
+ * on the CIDs. When this bit is set the tasks drop the CID back into the
+ * pool when scheduling out.
+ *
+ * Both bits (ONCPU and TRANSIT) are filtered out by task_cid() when the
+ * CID is actually handed over to user space in the RSEQ memory.
*
* Mode switching:
*
+ * All transitions of ownership mode happen in two phases:
+ *
+ * 1) mm:mm_cid.transit contains MM_CID_TRANSIT. This is OR'ed on the CIDs
+ * and denotes that the CID is only temporarily owned by a task. When
+ * the task schedules out it drops the CID back into the pool if this
+ * bit is set.
+ *
+ * 2) The initiating context walks the per CPU space or the tasks to fixup
+ * or drop the CIDs and after completion it clears mm:mm_cid.transit.
+ * After that point the CIDs are strictly task or CPU owned again.
+ *
+ * This two phase transition is required to prevent CID space exhaustion
+ * during the transition as a direct transfer of ownership would fail:
+ *
+ * - On task to CPU mode switch if a task is scheduled in on one CPU and
+ * then migrated to another CPU before the fixup freed enough per task
+ * CIDs.
+ *
+ * - On CPU to task mode switch if two tasks are scheduled in on the same
+ * CPU before the fixup freed per CPU CIDs.
+ *
+ * Both scenarios can result in a live lock because sched_in() is invoked
+ * with runqueue lock held and loops in search of a CID and the fixup
+ * thread can't make progress freeing them up because it is stuck on the
+ * same runqueue lock.
+ *
+ * While MM_CID_TRANSIT is active during the transition phase the MM_CID
+ * bitmap can be contended, but that's a temporary contention bound to the
+ * transition period. After that everything goes back into steady state and
+ * nothing except fork() and exit() will touch the bitmap. This is an
+ * acceptable tradeoff as it completely avoids complex serialization,
+ * memory barriers and atomic operations for the common case.
+ *
+ * Aside from that this mechanism also ensures RT compatibility:
+ *
+ * - The task which runs the fixup is fully preemptible except for the
+ * short runqueue lock held sections.
+ *
+ * - The transient impact of the bitmap contention is only problematic
+ * when there is a thundering herd scenario of tasks scheduling in and
+ * out concurrently. There is not much which can be done about that
+ * except for avoiding mode switching by a proper overall system
+ * configuration.
+ *
* Switching to per CPU mode happens when the user count becomes greater
* than the maximum number of CIDs, which is calculated by:
*
@@ -10306,12 +10354,13 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
*
* At the point of switching to per CPU mode the new user is not yet
* visible in the system, so the task which initiated the fork() runs the
- * fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and
- * either transfers each tasks owned CID to the CPU the task runs on or
- * drops it into the CID pool if a task is not on a CPU at that point in
- * time. Tasks which schedule in before the task walk reaches them do the
- * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes
- * it's guaranteed that no task related to that MM owns a CID anymore.
+ * fixup function. mm_cid_fixup_tasks_to_cpus() walks the thread list and
+ * either marks each task owned CID with MM_CID_TRANSIT if the task is
+ * running on a CPU or drops it into the CID pool if a task is not on a
+ * CPU. Tasks which schedule in before the task walk reaches them do the
+ * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus()
+ * completes it is guaranteed that no task related to that MM owns a CID
+ * anymore.
*
* Switching back to task mode happens when the user count goes below the
* threshold which was recorded on the per CPU mode switch:
@@ -10327,28 +10376,11 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
* run either in the deferred update function in context of a workqueue or
* by a task which forks a new one or by a task which exits. Whatever
* happens first. mm_cid_fixup_cpus_to_task() walks through the possible
- * CPUs and either transfers the CPU owned CIDs to a related task which
- * runs on the CPU or drops it into the pool. Tasks which schedule in on a
- * CPU which the walk did not cover yet do the handover themself.
- *
- * This transition from CPU to per task ownership happens in two phases:
- *
- * 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task
- * CID and denotes that the CID is only temporarily owned by the
- * task. When it schedules out the task drops the CID back into the
- * pool if this bit is set.
- *
- * 2) The initiating context walks the per CPU space and after completion
- * clears mm:mm_cid.transit. So after that point the CIDs are strictly
- * task owned again.
- *
- * This two phase transition is required to prevent CID space exhaustion
- * during the transition as a direct transfer of ownership would fail if
- * two tasks are scheduled in on the same CPU before the fixup freed per
- * CPU CIDs.
- *
- * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID
- * related to that MM is owned by a CPU anymore.
+ * CPUs and either marks the CPU owned CIDs with MM_CID_TRANSIT if a
+ * related task is running on the CPU or drops it into the pool. Tasks
+ * which are scheduled in before the fixup covered them do the handover
+ * themself. When mm_cid_fixup_cpus_to_tasks() completes it is guaranteed
+ * that no CID related to that MM is owned by a CPU anymore.
*/
/*
@@ -10400,9 +10432,9 @@ static bool mm_update_max_cids(struct mm_struct *mm)
/* Mode change required? */
if (!!mc->percpu == !!mc->pcpu_thrs)
return false;
- /* When switching back to per TASK mode, set the transition flag */
- if (!mc->pcpu_thrs)
- WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
+
+ /* Set the transition flag to bridge the transfer */
+ WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs);
return true;
}
@@ -10493,10 +10525,10 @@ static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
WRITE_ONCE(mm->mm_cid.transit, 0);
}
-static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
+static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
{
if (cid_on_task(t->mm_cid.cid)) {
- t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid);
+ t->mm_cid.cid = cid_to_transit_cid(t->mm_cid.cid);
pcp->cid = t->mm_cid.cid;
}
}
@@ -10509,18 +10541,17 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm
if (!t->mm_cid.active)
return false;
if (cid_on_task(t->mm_cid.cid)) {
- /* If running on the CPU, transfer the CID, otherwise drop it */
+ /* If running on the CPU, put the CID in transit mode, otherwise drop it */
if (task_rq(t)->curr == t)
- mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
+ mm_cid_transit_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
else
mm_unset_cid_on_task(t);
}
return true;
}
-static void mm_cid_fixup_tasks_to_cpus(void)
+static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
{
- struct mm_struct *mm = current->mm;
struct task_struct *p, *t;
unsigned int users;
@@ -10558,6 +10589,15 @@ static void mm_cid_fixup_tasks_to_cpus(void)
}
}
+static void mm_cid_fixup_tasks_to_cpus(void)
+{
+ struct mm_struct *mm = current->mm;
+
+ mm_cid_do_fixup_tasks_to_cpus(mm);
+ /* Clear the transition bit */
+ WRITE_ONCE(mm->mm_cid.transit, 0);
+}
+
static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
{
t->mm_cid.active = 1;
@@ -10596,7 +10636,7 @@ void sched_mm_cid_fork(struct task_struct *t)
if (!percpu)
mm_cid_transit_to_task(current, pcp);
else
- mm_cid_transfer_to_cpu(current, pcp);
+ mm_cid_transit_to_cpu(current, pcp);
}
if (percpu) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 93fce4b..eff2073 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3841,6 +3841,10 @@ static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int
/* Still nothing, allocate a new one */
if (!cid_on_cpu(cpu_cid))
cpu_cid = cid_to_cpu_cid(mm_get_cid(mm));
+
+ /* Set the transition mode flag if required */
+ if (READ_ONCE(mm->mm_cid.transit))
+ cpu_cid = cpu_cid_to_cid(cpu_cid) | MM_CID_TRANSIT;
}
mm_cid_update_pcpu_cid(mm, cpu_cid);
mm_cid_update_task_cid(t, cpu_cid);
Shrikanth reported a hard lockup which he observed once. The stack trace
shows the following CID related participants:
watchdog: CPU 23 self-detected hard LOCKUP @ mm_get_cid+0xe8/0x188
NIP: mm_get_cid+0xe8/0x188
LR: mm_get_cid+0x108/0x188
mm_cid_switch_to+0x3c4/0x52c
__schedule+0x47c/0x700
schedule_idle+0x3c/0x64
do_idle+0x160/0x1b0
cpu_startup_entry+0x48/0x50
start_secondary+0x284/0x288
start_secondary_prolog+0x10/0x14
watchdog: CPU 11 self-detected hard LOCKUP @ plpar_hcall_norets_notrace+0x18/0x2c
NIP: plpar_hcall_norets_notrace+0x18/0x2c
LR: queued_spin_lock_slowpath+0xd88/0x15d0
_raw_spin_lock+0x80/0xa0
raw_spin_rq_lock_nested+0x3c/0xf8
mm_cid_fixup_cpus_to_tasks+0xc8/0x28c
sched_mm_cid_exit+0x108/0x22c
do_exit+0xf4/0x5d0
make_task_dead+0x0/0x178
system_call_exception+0x128/0x390
system_call_vectored_common+0x15c/0x2ec
The task on CPU11 is running the CID ownership mode change fixup function
and is stuck on a runqueue lock. The task on CPU23 is trying to get a CID
from the pool with the same runqueue lock held, but the pool is empty.
After decoding a similar issue in the opposite direction, switching from
per task to per CPU mode, the tool which models the possible scenarios
failed to come up with a similar loophole.
This showed up only once, was not reproducible and, according to the
tooling, not related to an overlooked scheduling scenario permutation. But
the fact that it was observed on a PowerPC system gave the right hint:
PowerPC is a weakly ordered architecture.
The transition mechanism does:
WRITE_ONCE(mm->mm_cid.transit, MM_CID_TRANSIT);
WRITE_ONCE(mm->mm_cid.percpu, new_mode);
fixup()
WRITE_ONCE(mm->mm_cid.transit, 0);
mm_cid_schedin() does:
if (!READ_ONCE(mm->mm_cid.percpu))
...
cid |= READ_ONCE(mm->mm_cid.transit);
so weakly ordered systems can observe percpu == false and transit == 0 even
if the fixup function has not yet completed. As a consequence the task will
not drop the CID when scheduling out before the fixup is completed, which
means the CID space can be exhausted and the next task scheduling in will
loop in mm_get_cid() and the fixup thread can livelock on the held runqueue
lock as above.
This could obviously be solved by using:
smp_store_release(&mm->mm_cid.percpu, true);
and
smp_load_acquire(&mm->mm_cid.percpu);
but that brings a memory barrier back into the scheduler hotpath, which was
just designed out by the CID rewrite.
That can be completely avoided by combining the per CPU mode and the
transit storage into a single mm_cid::mode member and ordering the stores
against the fixup functions to prevent the CPU from reordering them.
That makes the update of both states atomic and a concurrent read always
observes consistent state.
The price is an additional AND operation in mm_cid_schedin() to evaluate
the per CPU or the per task path, but that's in the noise even on strongly
ordered architectures as the actual load can be significantly more
expensive and the conditional branch evaluation is there anyway.
Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions")
Reported-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Closes: https://lore.kernel.org/bdfea828-4585-40e8-8835-247c6a8a76b0@linux.ibm.com
---
V2: Add barriers and massage change log - Mathieu
---
include/linux/rseq_types.h | 6 +---
kernel/sched/core.c | 66 ++++++++++++++++++++++++++++++---------------
kernel/sched/sched.h | 21 ++++++++------
3 files changed, 58 insertions(+), 35 deletions(-)
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -121,8 +121,7 @@ struct mm_cid_pcpu {
/**
* struct mm_mm_cid - Storage for per MM CID data
* @pcpu: Per CPU storage for CIDs associated to a CPU
- * @percpu: Set, when CIDs are in per CPU mode
- * @transit: Set to MM_CID_TRANSIT during a mode change transition phase
+ * @mode: Indicates per CPU and transition mode
* @max_cids: The exclusive maximum CID value for allocation and convergence
* @irq_work: irq_work to handle the affinity mode change case
* @work: Regular work to handle the affinity mode change case
@@ -139,8 +138,7 @@ struct mm_cid_pcpu {
struct mm_mm_cid {
/* Hotpath read mostly members */
struct mm_cid_pcpu __percpu *pcpu;
- unsigned int percpu;
- unsigned int transit;
+ unsigned int mode;
unsigned int max_cids;
/* Rarely used. Moves @lock and @mutex into the second cacheline */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10297,16 +10297,25 @@ void call_trace_sched_update_nr_running(
*
* Mode switching:
*
+ * The ownership mode is per process and stored in mm:mm_cid::mode with the
+ * following possible states:
+ *
+ * 0: Per task ownership
+ * 0 | MM_CID_TRANSIT: Transition from per CPU to per task
+ * MM_CID_ONCPU: Per CPU ownership
+ * MM_CID_ONCPU | MM_CID_TRANSIT: Transition from per task to per CPU
+ *
* All transitions of ownership mode happen in two phases:
*
- * 1) mm:mm_cid.transit contains MM_CID_TRANSIT. This is OR'ed on the CIDs
- * and denotes that the CID is only temporarily owned by a task. When
- * the task schedules out it drops the CID back into the pool if this
- * bit is set.
+ * 1) mm:mm_cid::mode has the MM_CID_TRANSIT bit set. This is OR'ed on the
+ * CIDs and denotes that the CID is only temporarily owned by a
+ * task. When the task schedules out it drops the CID back into the
+ * pool if this bit is set.
*
* 2) The initiating context walks the per CPU space or the tasks to fixup
- * or drop the CIDs and after completion it clears mm:mm_cid.transit.
- * After that point the CIDs are strictly task or CPU owned again.
+ * or drop the CIDs and after completion it clears MM_CID_TRANSIT in
+ * mm:mm_cid::mode. After that point the CIDs are strictly task or CPU
+ * owned again.
*
* This two phase transition is required to prevent CID space exhaustion
* during the transition as a direct transfer of ownership would fail:
@@ -10411,6 +10420,7 @@ static inline unsigned int mm_cid_calc_p
static bool mm_update_max_cids(struct mm_struct *mm)
{
struct mm_mm_cid *mc = &mm->mm_cid;
+ bool percpu = cid_on_cpu(mc->mode);
lockdep_assert_held(&mm->mm_cid.lock);
@@ -10419,7 +10429,7 @@ static bool mm_update_max_cids(struct mm
__mm_update_max_cids(mc);
/* Check whether owner mode must be changed */
- if (!mc->percpu) {
+ if (!percpu) {
/* Enable per CPU mode when the number of users is above max_cids */
if (mc->users > mc->max_cids)
mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
@@ -10430,12 +10440,17 @@ static bool mm_update_max_cids(struct mm
}
/* Mode change required? */
- if (!!mc->percpu == !!mc->pcpu_thrs)
+ if (percpu == !!mc->pcpu_thrs)
return false;
- /* Set the transition flag to bridge the transfer */
- WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
- WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs);
+ /* Flip the mode and set the transition flag to bridge the transfer */
+ WRITE_ONCE(mc->mode, mc->mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU));
+ /*
+ * Order the store against the subsequent fixups so that
+ * acquire(rq::lock) cannot be reordered by the CPU before the
+ * store.
+ */
+ smp_mb();
return true;
}
@@ -10460,7 +10475,7 @@ static inline void mm_update_cpus_allowe
WRITE_ONCE(mc->nr_cpus_allowed, weight);
__mm_update_max_cids(mc);
- if (!mc->percpu)
+ if (!cid_on_cpu(mc->mode))
return;
/* Adjust the threshold to the wider set */
@@ -10478,6 +10493,16 @@ static inline void mm_update_cpus_allowe
irq_work_queue(&mc->irq_work);
}
+static inline void mm_cid_complete_transit(struct mm_struct *mm, unsigned int mode)
+{
+ /*
+ * Ensure that the store removing the TRANSIT bit cannot be
+ * reordered by the CPU before the fixups have been completed.
+ */
+ smp_mb();
+ WRITE_ONCE(mm->mm_cid.mode, mode);
+}
+
static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
{
if (cid_on_cpu(t->mm_cid.cid)) {
@@ -10521,8 +10546,7 @@ static void mm_cid_fixup_cpus_to_tasks(s
}
}
}
- /* Clear the transition bit */
- WRITE_ONCE(mm->mm_cid.transit, 0);
+ mm_cid_complete_transit(mm, 0);
}
static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
@@ -10594,8 +10618,7 @@ static void mm_cid_fixup_tasks_to_cpus(v
struct mm_struct *mm = current->mm;
mm_cid_do_fixup_tasks_to_cpus(mm);
- /* Clear the transition bit */
- WRITE_ONCE(mm->mm_cid.transit, 0);
+ mm_cid_complete_transit(mm, MM_CID_ONCPU);
}
static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
@@ -10626,13 +10649,13 @@ void sched_mm_cid_fork(struct task_struc
}
if (!sched_mm_cid_add_user(t, mm)) {
- if (!mm->mm_cid.percpu)
+ if (!cid_on_cpu(mm->mm_cid.mode))
t->mm_cid.cid = mm_get_cid(mm);
return;
}
/* Handle the mode change and transfer current's CID */
- percpu = !!mm->mm_cid.percpu;
+ percpu = cid_on_cpu(mm->mm_cid.mode);
if (!percpu)
mm_cid_transit_to_task(current, pcp);
else
@@ -10671,7 +10694,7 @@ static bool __sched_mm_cid_exit(struct t
* affinity change increased the number of allowed CPUs and the
* deferred fixup did not run yet.
*/
- if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode)))
return false;
/*
* A failed fork(2) cleanup never gets here, so @current must have
@@ -10762,7 +10785,7 @@ static void mm_cid_work_fn(struct work_s
if (!mm_update_max_cids(mm))
return;
/* Affinity changes can only switch back to task mode */
- if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode)))
return;
}
mm_cid_fixup_cpus_to_tasks(mm);
@@ -10783,8 +10806,7 @@ static void mm_cid_irq_work(struct irq_w
void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
{
mm->mm_cid.max_cids = 0;
- mm->mm_cid.percpu = 0;
- mm->mm_cid.transit = 0;
+ mm->mm_cid.mode = 0;
mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
mm->mm_cid.users = 0;
mm->mm_cid.pcpu_thrs = 0;
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3816,7 +3816,8 @@ static __always_inline void mm_cid_updat
__this_cpu_write(mm->mm_cid.pcpu->cid, cid);
}
-static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid)
+static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid,
+ unsigned int mode)
{
unsigned int max_cids, tcid = t->mm_cid.cid;
struct mm_struct *mm = t->mm;
@@ -3842,15 +3843,16 @@ static __always_inline void mm_cid_from_
if (!cid_on_cpu(cpu_cid))
cpu_cid = cid_to_cpu_cid(mm_get_cid(mm));
- /* Set the transition mode flag if required */
- if (READ_ONCE(mm->mm_cid.transit))
+ /* Handle the transition mode flag if required */
+ if (mode & MM_CID_TRANSIT)
cpu_cid = cpu_cid_to_cid(cpu_cid) | MM_CID_TRANSIT;
}
mm_cid_update_pcpu_cid(mm, cpu_cid);
mm_cid_update_task_cid(t, cpu_cid);
}
-static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid)
+static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid,
+ unsigned int mode)
{
unsigned int max_cids, tcid = t->mm_cid.cid;
struct mm_struct *mm = t->mm;
@@ -3876,7 +3878,7 @@ static __always_inline void mm_cid_from_
if (!cid_on_task(tcid))
tcid = mm_get_cid(mm);
/* Set the transition mode flag if required */
- tcid |= READ_ONCE(mm->mm_cid.transit);
+ tcid |= mode & MM_CID_TRANSIT;
}
mm_cid_update_pcpu_cid(mm, tcid);
mm_cid_update_task_cid(t, tcid);
@@ -3885,16 +3887,17 @@ static __always_inline void mm_cid_from_
static __always_inline void mm_cid_schedin(struct task_struct *next)
{
struct mm_struct *mm = next->mm;
- unsigned int cpu_cid;
+ unsigned int cpu_cid, mode;
if (!next->mm_cid.active)
return;
cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid);
- if (likely(!READ_ONCE(mm->mm_cid.percpu)))
- mm_cid_from_task(next, cpu_cid);
+ mode = READ_ONCE(mm->mm_cid.mode);
+ if (likely(!cid_on_cpu(mode)))
+ mm_cid_from_task(next, cpu_cid, mode);
else
- mm_cid_from_cpu(next, cpu_cid);
+ mm_cid_from_cpu(next, cpu_cid, mode);
}
static __always_inline void mm_cid_schedout(struct task_struct *prev)
On 2026-02-02 04:39, Thomas Gleixner wrote:
[...]
> Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions")
> Reported-by: Shrikanth Hegde <sshegde@linux.ibm.com>
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
> Closes: https://lore.kernel.org/bdfea828-4585-40e8-8835-247c6a8a76b0@linux.ibm.com
Please keep it as two full barriers as it is here. I'm not sure the
store-release proposed by PeterZ works. I'll have to find time to do
a litmus test, but I'm busy at the moment.
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: 47ee94efccf6732e4ef1a815c451aacaf1464757
Gitweb: https://git.kernel.org/tip/47ee94efccf6732e4ef1a815c451aacaf1464757
Author: Thomas Gleixner <tglx@kernel.org>
AuthorDate: Mon, 02 Feb 2026 10:39:45 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 04 Feb 2026 12:21:12 +01:00
sched/mmcid: Protect transition on weakly ordered systems
Shrikanth reported a hard lockup which he observed once. The stack trace
shows the following CID related participants:
watchdog: CPU 23 self-detected hard LOCKUP @ mm_get_cid+0xe8/0x188
NIP: mm_get_cid+0xe8/0x188
LR: mm_get_cid+0x108/0x188
mm_cid_switch_to+0x3c4/0x52c
__schedule+0x47c/0x700
schedule_idle+0x3c/0x64
do_idle+0x160/0x1b0
cpu_startup_entry+0x48/0x50
start_secondary+0x284/0x288
start_secondary_prolog+0x10/0x14
watchdog: CPU 11 self-detected hard LOCKUP @ plpar_hcall_norets_notrace+0x18/0x2c
NIP: plpar_hcall_norets_notrace+0x18/0x2c
LR: queued_spin_lock_slowpath+0xd88/0x15d0
_raw_spin_lock+0x80/0xa0
raw_spin_rq_lock_nested+0x3c/0xf8
mm_cid_fixup_cpus_to_tasks+0xc8/0x28c
sched_mm_cid_exit+0x108/0x22c
do_exit+0xf4/0x5d0
make_task_dead+0x0/0x178
system_call_exception+0x128/0x390
system_call_vectored_common+0x15c/0x2ec
The task on CPU11 is running the CID ownership mode change fixup function
and is stuck on a runqueue lock. The task on CPU23 is trying to get a CID
from the pool with the same runqueue lock held, but the pool is empty.
After decoding a similar issue in the opposite direction, switching from
per task to per CPU mode, the tool which models the possible scenarios
failed to come up with a similar loophole.
This showed up only once, was not reproducible and, according to the
tooling, not related to an overlooked scheduling scenario permutation. But
the fact that it was observed on a PowerPC system gave the right hint:
PowerPC is a weakly ordered architecture.
The transition mechanism does:
WRITE_ONCE(mm->mm_cid.transit, MM_CID_TRANSIT);
WRITE_ONCE(mm->mm_cid.percpu, new_mode);
fixup()
WRITE_ONCE(mm->mm_cid.transit, 0);
mm_cid_schedin() does:
if (!READ_ONCE(mm->mm_cid.percpu))
...
cid |= READ_ONCE(mm->mm_cid.transit);
so weakly ordered systems can observe percpu == false and transit == 0 even
if the fixup function has not yet completed. As a consequence the task will
not drop the CID when scheduling out before the fixup is completed, which
means the CID space can be exhausted and the next task scheduling in will
loop in mm_get_cid() and the fixup thread can livelock on the held runqueue
lock as above.
This could obviously be solved by using:
smp_store_release(&mm->mm_cid.percpu, true);
and
smp_load_acquire(&mm->mm_cid.percpu);
but that brings a memory barrier back into the scheduler hotpath, which was
just designed out by the CID rewrite.
That can be completely avoided by combining the per CPU mode and the
transit storage into a single mm_cid::mode member and ordering the stores
against the fixup functions to prevent the CPU from reordering them.
That makes the update of both states atomic and a concurrent read always
observes consistent state.
The price is an additional AND operation in mm_cid_schedin() to evaluate
the per CPU or the per task path, but that's in the noise even on strongly
ordered architectures as the actual load can be significantly more
expensive and the conditional branch evaluation is there anyway.
Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions")
Closes: https://lore.kernel.org/bdfea828-4585-40e8-8835-247c6a8a76b0@linux.ibm.com
Reported-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20260201192834.965217106@kernel.org
---
include/linux/rseq_types.h | 6 +--
kernel/sched/core.c | 66 ++++++++++++++++++++++++-------------
kernel/sched/sched.h | 21 ++++++------
3 files changed, 58 insertions(+), 35 deletions(-)
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 332dc14..ef08113 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -121,8 +121,7 @@ struct mm_cid_pcpu {
/**
* struct mm_mm_cid - Storage for per MM CID data
* @pcpu: Per CPU storage for CIDs associated to a CPU
- * @percpu: Set, when CIDs are in per CPU mode
- * @transit: Set to MM_CID_TRANSIT during a mode change transition phase
+ * @mode: Indicates per CPU and transition mode
* @max_cids: The exclusive maximum CID value for allocation and convergence
* @irq_work: irq_work to handle the affinity mode change case
* @work: Regular work to handle the affinity mode change case
@@ -139,8 +138,7 @@ struct mm_cid_pcpu {
struct mm_mm_cid {
/* Hotpath read mostly members */
struct mm_cid_pcpu __percpu *pcpu;
- unsigned int percpu;
- unsigned int transit;
+ unsigned int mode;
unsigned int max_cids;
/* Rarely used. Moves @lock and @mutex into the second cacheline */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1e790f2..8580283 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10297,16 +10297,25 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
*
* Mode switching:
*
+ * The ownership mode is per process and stored in mm:mm_cid::mode with the
+ * following possible states:
+ *
+ * 0: Per task ownership
+ * 0 | MM_CID_TRANSIT: Transition from per CPU to per task
+ * MM_CID_ONCPU: Per CPU ownership
+ * MM_CID_ONCPU | MM_CID_TRANSIT: Transition from per task to per CPU
+ *
* All transitions of ownership mode happen in two phases:
*
- * 1) mm:mm_cid.transit contains MM_CID_TRANSIT. This is OR'ed on the CIDs
- * and denotes that the CID is only temporarily owned by a task. When
- * the task schedules out it drops the CID back into the pool if this
- * bit is set.
+ * 1) mm:mm_cid::mode has the MM_CID_TRANSIT bit set. This is OR'ed on the
+ * CIDs and denotes that the CID is only temporarily owned by a
+ * task. When the task schedules out it drops the CID back into the
+ * pool if this bit is set.
*
* 2) The initiating context walks the per CPU space or the tasks to fixup
- * or drop the CIDs and after completion it clears mm:mm_cid.transit.
- * After that point the CIDs are strictly task or CPU owned again.
+ * or drop the CIDs and after completion it clears MM_CID_TRANSIT in
+ * mm:mm_cid::mode. After that point the CIDs are strictly task or CPU
+ * owned again.
*
* This two phase transition is required to prevent CID space exhaustion
* during the transition as a direct transfer of ownership would fail:
@@ -10411,6 +10420,7 @@ static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc)
static bool mm_update_max_cids(struct mm_struct *mm)
{
struct mm_mm_cid *mc = &mm->mm_cid;
+ bool percpu = cid_on_cpu(mc->mode);
lockdep_assert_held(&mm->mm_cid.lock);
@@ -10419,7 +10429,7 @@ static bool mm_update_max_cids(struct mm_struct *mm)
__mm_update_max_cids(mc);
/* Check whether owner mode must be changed */
- if (!mc->percpu) {
+ if (!percpu) {
/* Enable per CPU mode when the number of users is above max_cids */
if (mc->users > mc->max_cids)
mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
@@ -10430,12 +10440,17 @@ static bool mm_update_max_cids(struct mm_struct *mm)
}
/* Mode change required? */
- if (!!mc->percpu == !!mc->pcpu_thrs)
+ if (percpu == !!mc->pcpu_thrs)
return false;
- /* Set the transition flag to bridge the transfer */
- WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
- WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs);
+ /* Flip the mode and set the transition flag to bridge the transfer */
+ WRITE_ONCE(mc->mode, mc->mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU));
+ /*
+ * Order the store against the subsequent fixups so that
+ * acquire(rq::lock) cannot be reordered by the CPU before the
+ * store.
+ */
+ smp_mb();
return true;
}
@@ -10460,7 +10475,7 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu
WRITE_ONCE(mc->nr_cpus_allowed, weight);
__mm_update_max_cids(mc);
- if (!mc->percpu)
+ if (!cid_on_cpu(mc->mode))
return;
/* Adjust the threshold to the wider set */
@@ -10478,6 +10493,16 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu
irq_work_queue(&mc->irq_work);
}
+static inline void mm_cid_complete_transit(struct mm_struct *mm, unsigned int mode)
+{
+ /*
+ * Ensure that the store removing the TRANSIT bit cannot be
+ * reordered by the CPU before the fixups have been completed.
+ */
+ smp_mb();
+ WRITE_ONCE(mm->mm_cid.mode, mode);
+}
+
static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
{
if (cid_on_cpu(t->mm_cid.cid)) {
@@ -10521,8 +10546,7 @@ static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
}
}
}
- /* Clear the transition bit */
- WRITE_ONCE(mm->mm_cid.transit, 0);
+ mm_cid_complete_transit(mm, 0);
}
static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
@@ -10594,8 +10618,7 @@ static void mm_cid_fixup_tasks_to_cpus(void)
struct mm_struct *mm = current->mm;
mm_cid_do_fixup_tasks_to_cpus(mm);
- /* Clear the transition bit */
- WRITE_ONCE(mm->mm_cid.transit, 0);
+ mm_cid_complete_transit(mm, MM_CID_ONCPU);
}
static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
@@ -10626,13 +10649,13 @@ void sched_mm_cid_fork(struct task_struct *t)
}
if (!sched_mm_cid_add_user(t, mm)) {
- if (!mm->mm_cid.percpu)
+ if (!cid_on_cpu(mm->mm_cid.mode))
t->mm_cid.cid = mm_get_cid(mm);
return;
}
/* Handle the mode change and transfer current's CID */
- percpu = !!mm->mm_cid.percpu;
+ percpu = cid_on_cpu(mm->mm_cid.mode);
if (!percpu)
mm_cid_transit_to_task(current, pcp);
else
@@ -10671,7 +10694,7 @@ static bool __sched_mm_cid_exit(struct task_struct *t)
* affinity change increased the number of allowed CPUs and the
* deferred fixup did not run yet.
*/
- if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode)))
return false;
/*
* A failed fork(2) cleanup never gets here, so @current must have
@@ -10762,7 +10785,7 @@ static void mm_cid_work_fn(struct work_struct *work)
if (!mm_update_max_cids(mm))
return;
/* Affinity changes can only switch back to task mode */
- if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode)))
return;
}
mm_cid_fixup_cpus_to_tasks(mm);
@@ -10783,8 +10806,7 @@ static void mm_cid_irq_work(struct irq_work *work)
void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
{
mm->mm_cid.max_cids = 0;
- mm->mm_cid.percpu = 0;
- mm->mm_cid.transit = 0;
+ mm->mm_cid.mode = 0;
mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
mm->mm_cid.users = 0;
mm->mm_cid.pcpu_thrs = 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eff2073..f85fd6b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3816,7 +3816,8 @@ static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigne
__this_cpu_write(mm->mm_cid.pcpu->cid, cid);
}
-static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid)
+static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid,
+ unsigned int mode)
{
unsigned int max_cids, tcid = t->mm_cid.cid;
struct mm_struct *mm = t->mm;
@@ -3842,15 +3843,16 @@ static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int
if (!cid_on_cpu(cpu_cid))
cpu_cid = cid_to_cpu_cid(mm_get_cid(mm));
- /* Set the transition mode flag if required */
- if (READ_ONCE(mm->mm_cid.transit))
+ /* Handle the transition mode flag if required */
+ if (mode & MM_CID_TRANSIT)
cpu_cid = cpu_cid_to_cid(cpu_cid) | MM_CID_TRANSIT;
}
mm_cid_update_pcpu_cid(mm, cpu_cid);
mm_cid_update_task_cid(t, cpu_cid);
}
-static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid)
+static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid,
+ unsigned int mode)
{
unsigned int max_cids, tcid = t->mm_cid.cid;
struct mm_struct *mm = t->mm;
@@ -3876,7 +3878,7 @@ static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int
if (!cid_on_task(tcid))
tcid = mm_get_cid(mm);
/* Set the transition mode flag if required */
- tcid |= READ_ONCE(mm->mm_cid.transit);
+ tcid |= mode & MM_CID_TRANSIT;
}
mm_cid_update_pcpu_cid(mm, tcid);
mm_cid_update_task_cid(t, tcid);
@@ -3885,16 +3887,17 @@ static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int
static __always_inline void mm_cid_schedin(struct task_struct *next)
{
struct mm_struct *mm = next->mm;
- unsigned int cpu_cid;
+ unsigned int cpu_cid, mode;
if (!next->mm_cid.active)
return;
cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid);
- if (likely(!READ_ONCE(mm->mm_cid.percpu)))
- mm_cid_from_task(next, cpu_cid);
+ mode = READ_ONCE(mm->mm_cid.mode);
+ if (likely(!cid_on_cpu(mode)))
+ mm_cid_from_task(next, cpu_cid, mode);
else
- mm_cid_from_cpu(next, cpu_cid);
+ mm_cid_from_cpu(next, cpu_cid, mode);
}
static __always_inline void mm_cid_schedout(struct task_struct *prev)
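As an aside, the four mode states documented in the comment above and the
transitions implemented by the XOR in mm_update_max_cids() plus the final
store in mm_cid_complete_transit() can be restated as a tiny standalone
state machine. This is only an illustrative sketch: the real MM_CID_TRANSIT
and MM_CID_ONCPU definitions live in the kernel headers, the bit values
below are assumed purely for demonstration, and the smp_mb() ordering
around the stores is intentionally omitted here.
/* Standalone illustration, not kernel code. */
#include <assert.h>
#define MM_CID_TRANSIT	0x1u	/* assumed value, for illustration only */
#define MM_CID_ONCPU	0x2u	/* assumed value, for illustration only */
/* Same flip as in mm_update_max_cids(): enter the transition phase */
static unsigned int start_transition(unsigned int mode)
{
	return mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU);
}
/* Same effect as mm_cid_complete_transit(): publish the final mode */
static unsigned int complete_transition(unsigned int mode)
{
	return mode & ~MM_CID_TRANSIT;
}
int main(void)
{
	unsigned int mode = 0;				/* per task */
	mode = start_transition(mode);			/* task -> CPU */
	assert(mode == (MM_CID_ONCPU | MM_CID_TRANSIT));
	mode = complete_transition(mode);		/* per CPU */
	assert(mode == MM_CID_ONCPU);
	mode = start_transition(mode);			/* CPU -> task */
	assert(mode == MM_CID_TRANSIT);
	mode = complete_transition(mode);		/* per task */
	assert(mode == 0);
	return 0;
}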
When an exiting task initiates the switch from per CPU back to per task
mode, it has already dropped its CID and marked itself inactive. But a
leftover from an earlier iteration of the rework then reassigns the per
CPU CID to the exiting task with the transition bit set.
That's wrong as the task is already marked CID inactive, which leaves it in
an inconsistent state. It's harmless because the CID is marked in transit and
therefore dropped back into the pool when the exiting task schedules out
either through preemption or the final schedule().
Simply drop the per CPU CID when the exiting task triggered the transition.
Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions")
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
kernel/sched/core.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10727,8 +10727,14 @@ void sched_mm_cid_exit(struct task_struc
scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
if (!__sched_mm_cid_exit(t))
return;
- /* Mode change required. Transfer currents CID */
- mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
+ /*
+ * Mode change. The task has the CID unset
+ * already. The CPU CID is still valid and
+ * does not have MM_CID_TRANSIT set as the
+ * mode change has just taken effect under
+ * mm::mm_cid::lock. Drop it.
+ */
+ mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu));
}
mm_cid_fixup_cpus_to_tasks(mm);
return;
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: 007d84287c7466ca68a5809b616338214dc5b77b
Gitweb: https://git.kernel.org/tip/007d84287c7466ca68a5809b616338214dc5b77b
Author: Thomas Gleixner <tglx@kernel.org>
AuthorDate: Mon, 02 Feb 2026 10:39:50 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 04 Feb 2026 12:21:12 +01:00
sched/mmcid: Drop per CPU CID immediately when switching to per task mode
When an exiting task initiates the switch from per CPU back to per task
mode, it has already dropped its CID and marked itself inactive. But a
leftover from an earlier iteration of the rework then reassigns the per
CPU CID to the exiting task with the transition bit set.
That's wrong as the task is already marked CID inactive, which leaves it in
an inconsistent state. It's harmless because the CID is marked in transit and
therefore dropped back into the pool when the exiting task schedules out
either through preemption or the final schedule().
Simply drop the per CPU CID when the exiting task triggered the transition.
Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions")
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20260201192835.032221009@kernel.org
---
kernel/sched/core.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8580283..8549849 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10727,8 +10727,14 @@ void sched_mm_cid_exit(struct task_struct *t)
scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
if (!__sched_mm_cid_exit(t))
return;
- /* Mode change required. Transfer currents CID */
- mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
+ /*
+ * Mode change. The task has the CID unset
+ * already. The CPU CID is still valid and
+ * does not have MM_CID_TRANSIT set as the
+ * mode change has just taken effect under
+ * mm::mm_cid::lock. Drop it.
+ */
+ mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu));
}
mm_cid_fixup_cpus_to_tasks(mm);
return;
During the investigation of the various transition mode issues,
instrumentation revealed that the number of bitmap operations can be
significantly reduced when a task with a transitional CID schedules out
after the fixup function has completed and disabled the transition mode.
At that point the mode is stable and therefore it is not required to drop
the transitional CID back into the pool. As the fixup is complete, the
potential exhaustion of the CID pool is no longer possible, so the CID can
be transferred to the scheduling out task or to the CPU depending on the
current ownership mode.
The racy snapshot of mm_cid::mode, which contains both the ownership state
and the transition bit, is valid because the runqueue lock is held and the
fixup function of a concurrent mode switch is serialized.
Assigning the ownership right there not only spares the bitmap access for
dropping the CID, it also avoids it when the task is scheduled back in, as
it directly hits the fast path in both modes when the CID is within the
optimal range. If it's outside the range, the next schedule in will need to
converge, so dropping it right away is sensible. In the good case this also
allows the task to go into the fast path on the next schedule in operation.
With a thread pool benchmark which is configured to cross the mode switch
boundaries frequently this reduces the number of bitmap operations by about
30% and increases the fastpath utilization in the low single digit
percentage range.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/sched/sched.h | 23 +++++++++++++++++++++--
1 file changed, 21 insertions(+), 2 deletions(-)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3902,12 +3902,31 @@ static __always_inline void mm_cid_sched
static __always_inline void mm_cid_schedout(struct task_struct *prev)
{
+ struct mm_struct *mm = prev->mm;
+ unsigned int mode, cid;
+
/* During mode transitions CIDs are temporary and need to be dropped */
if (likely(!cid_in_transit(prev->mm_cid.cid)))
return;
- mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid));
- prev->mm_cid.cid = MM_CID_UNSET;
+ mode = READ_ONCE(mm->mm_cid.mode);
+ cid = cid_from_transit_cid(prev->mm_cid.cid);
+
+ /*
+ * If transition mode is done, transfer ownership when the CID is
+ * within the convergence range to optimize the next schedule in.
+ */
+ if (!cid_in_transit(mode) && cid < READ_ONCE(mm->mm_cid.max_cids)) {
+ if (cid_on_cpu(mode))
+ cid = cid_to_cpu_cid(cid);
+
+ /* Update both so that the next schedule in goes into the fast path */
+ mm_cid_update_pcpu_cid(mm, cid);
+ prev->mm_cid.cid = cid;
+ } else {
+ mm_drop_cid(mm, cid);
+ prev->mm_cid.cid = MM_CID_UNSET;
+ }
}
static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next)
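For clarity, the decision mm_cid_schedout() takes once the transition has
completed can be restated as a standalone helper. Again this is only an
illustration: the flag values are assumed, the return value is simplified,
and the real code updates both the per CPU slot and prev->mm_cid.cid as
shown in the hunk above.
/* Standalone illustration, not kernel code. */
#include <assert.h>
#define MM_CID_TRANSIT	0x1u	/* assumed value, for illustration only */
#define MM_CID_ONCPU	0x2u	/* assumed value, for illustration only */
enum disposition { KEEP_FOR_TASK, KEEP_FOR_CPU, DROP_TO_POOL };
static enum disposition schedout_disposition(unsigned int mode, unsigned int cid,
					     unsigned int max_cids)
{
	/* Transition still in flight or CID outside the convergence range:
	 * drop the CID back into the pool, as before this change. */
	if ((mode & MM_CID_TRANSIT) || cid >= max_cids)
		return DROP_TO_POOL;
	/* Otherwise transfer ownership according to the current mode so the
	 * next schedule in can hit the fast path without a bitmap access. */
	return (mode & MM_CID_ONCPU) ? KEEP_FOR_CPU : KEEP_FOR_TASK;
}
int main(void)
{
	assert(schedout_disposition(MM_CID_TRANSIT, 2, 8) == DROP_TO_POOL);
	assert(schedout_disposition(MM_CID_ONCPU | MM_CID_TRANSIT, 2, 8) == DROP_TO_POOL);
	assert(schedout_disposition(0, 9, 8) == DROP_TO_POOL);
	assert(schedout_disposition(0, 2, 8) == KEEP_FOR_TASK);
	assert(schedout_disposition(MM_CID_ONCPU, 2, 8) == KEEP_FOR_CPU);
	return 0;
}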
On 2026-02-02 04:39, Thomas Gleixner wrote:
>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: 4463c7aa11a6e67169ae48c6804968960c4bffea
Gitweb: https://git.kernel.org/tip/4463c7aa11a6e67169ae48c6804968960c4bffea
Author: Thomas Gleixner <tglx@kernel.org>
AuthorDate: Mon, 02 Feb 2026 10:39:55 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 04 Feb 2026 12:21:12 +01:00
sched/mmcid: Optimize transitional CIDs when scheduling out
During the investigation of the various transition mode issues,
instrumentation revealed that the number of bitmap operations can be
significantly reduced when a task with a transitional CID schedules out
after the fixup function has completed and disabled the transition mode.
At that point the mode is stable and therefore it is not required to drop
the transitional CID back into the pool. As the fixup is complete, the
potential exhaustion of the CID pool is no longer possible, so the CID can
be transferred to the scheduling out task or to the CPU depending on the
current ownership mode.
The racy snapshot of mm_cid::mode, which contains both the ownership state
and the transition bit, is valid because the runqueue lock is held and the
fixup function of a concurrent mode switch is serialized.
Assigning the ownership right there not only spares the bitmap access for
dropping the CID, it also avoids it when the task is scheduled back in, as
it directly hits the fast path in both modes when the CID is within the
optimal range. If it's outside the range, the next schedule in will need to
converge, so dropping it right away is sensible. In the good case this also
allows the task to go into the fast path on the next schedule in operation.
With a thread pool benchmark which is configured to cross the mode switch
boundaries frequently this reduces the number of bitmap operations by about
30% and increases the fastpath utilization in the low single digit
percentage range.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20260201192835.100194627@kernel.org
---
kernel/sched/sched.h | 23 +++++++++++++++++++++--
1 file changed, 21 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f85fd6b..bd350e4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3902,12 +3902,31 @@ static __always_inline void mm_cid_schedin(struct task_struct *next)
static __always_inline void mm_cid_schedout(struct task_struct *prev)
{
+ struct mm_struct *mm = prev->mm;
+ unsigned int mode, cid;
+
/* During mode transitions CIDs are temporary and need to be dropped */
if (likely(!cid_in_transit(prev->mm_cid.cid)))
return;
- mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid));
- prev->mm_cid.cid = MM_CID_UNSET;
+ mode = READ_ONCE(mm->mm_cid.mode);
+ cid = cid_from_transit_cid(prev->mm_cid.cid);
+
+ /*
+ * If transition mode is done, transfer ownership when the CID is
+ * within the convergence range to optimize the next schedule in.
+ */
+ if (!cid_in_transit(mode) && cid < READ_ONCE(mm->mm_cid.max_cids)) {
+ if (cid_on_cpu(mode))
+ cid = cid_to_cpu_cid(cid);
+
+ /* Update both so that the next schedule in goes into the fast path */
+ mm_cid_update_pcpu_cid(mm, cid);
+ prev->mm_cid.cid = cid;
+ } else {
+ mm_drop_cid(mm, cid);
+ prev->mm_cid.cid = MM_CID_UNSET;
+ }
}
static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next)