From nobody Fri Dec 19 22:01:53 2025
Message-ID: <20251015172834.377849729@linutronix.de>
From: Thomas Gleixner 
To: LKML 
Cc: Peter Zijlstra , Gabriele Monaco , Mathieu Desnoyers , Michael Jeanson , Jens Axboe , "Paul E. McKenney" , "Gautham R. 
Shenoy" , Florian Weimer , Tim Chen , TCMalloc Team Subject: [patch 01/19] sched/mmcid: Revert the complex CID management References: <20251015164952.694882104@linutronix.de> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Date: Wed, 15 Oct 2025 19:29:25 +0200 (CEST) Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" The CID management is a complex beast, which affects both scheduling and task migration. The compaction mechanism forces random tasks of a process into task work on exit to user space causing latency spikes. Revert back to the initial simple bitmap allocating mechanics, which are known to have scalability issues as that allows to gradually build up a replacement functionality in a reviewable way. Signed-off-by: Thomas Gleixner --- include/linux/mm_types.h | 53 ---- kernel/fork.c | 5=20 kernel/sched/core.c | 514 +-----------------------------------------= ----- kernel/sched/sched.h | 291 +++----------------------- 4 files changed, 64 insertions(+), 799 deletions(-) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/87tt0k3oks.ffs@tglx --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -924,13 +924,9 @@ struct vm_area_struct { #define vma_policy(vma) NULL #endif =20 -#ifdef CONFIG_SCHED_MM_CID struct mm_cid { - u64 time; - int cid; - int recent_cid; + unsigned int cid; }; -#endif =20 struct kioctx_table; struct iommu_mm_data; @@ -993,12 +989,6 @@ struct mm_struct { * runqueue locks. */ struct mm_cid __percpu *pcpu_cid; - /* - * @mm_cid_next_scan: Next mm_cid scan (in jiffies). - * - * When the next mm_cid scan is due (in jiffies). - */ - unsigned long mm_cid_next_scan; /** * @nr_cpus_allowed: Number of CPUs allowed for mm. * @@ -1007,14 +997,6 @@ struct mm_struct { */ unsigned int nr_cpus_allowed; /** - * @max_nr_cid: Maximum number of allowed concurrency - * IDs allocated. - * - * Track the highest number of allowed concurrency IDs - * allocated for the mm. - */ - atomic_t max_nr_cid; - /** * @cpus_allowed_lock: Lock protecting mm cpus_allowed. * * Provide mutual exclusion for mm cpus_allowed and @@ -1325,35 +1307,7 @@ static inline void vma_iter_init(struct =20 #ifdef CONFIG_SCHED_MM_CID =20 -enum mm_cid_state { - MM_CID_UNSET =3D -1U, /* Unset state has lazy_put flag set. */ - MM_CID_LAZY_PUT =3D (1U << 31), -}; - -static inline bool mm_cid_is_unset(int cid) -{ - return cid =3D=3D MM_CID_UNSET; -} - -static inline bool mm_cid_is_lazy_put(int cid) -{ - return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT); -} - -static inline bool mm_cid_is_valid(int cid) -{ - return !(cid & MM_CID_LAZY_PUT); -} - -static inline int mm_cid_set_lazy_put(int cid) -{ - return cid | MM_CID_LAZY_PUT; -} - -static inline int mm_cid_clear_lazy_put(int cid) -{ - return cid & ~MM_CID_LAZY_PUT; -} +#define MM_CID_UNSET (~0U) =20 /* * mm_cpus_allowed: Union of all mm's threads allowed CPUs. 
@@ -1386,11 +1340,8 @@ static inline void mm_init_cid(struct mm struct mm_cid *pcpu_cid =3D per_cpu_ptr(mm->pcpu_cid, i); =20 pcpu_cid->cid =3D MM_CID_UNSET; - pcpu_cid->recent_cid =3D MM_CID_UNSET; - pcpu_cid->time =3D 0; } mm->nr_cpus_allowed =3D p->nr_cpus_allowed; - atomic_set(&mm->max_nr_cid, 0); raw_spin_lock_init(&mm->cpus_allowed_lock); cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); cpumask_clear(mm_cidmask(mm)); --- a/kernel/fork.c +++ b/kernel/fork.c @@ -955,10 +955,9 @@ static struct task_struct *dup_task_stru #endif =20 #ifdef CONFIG_SCHED_MM_CID - tsk->mm_cid =3D -1; - tsk->last_mm_cid =3D -1; + tsk->mm_cid =3D MM_CID_UNSET; + tsk->last_mm_cid =3D MM_CID_UNSET; tsk->mm_cid_active =3D 0; - tsk->migrate_from_cpu =3D -1; #endif return tsk; =20 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2126,8 +2126,6 @@ void activate_task(struct rq *rq, struct { if (task_on_rq_migrating(p)) flags |=3D ENQUEUE_MIGRATED; - if (flags & ENQUEUE_MIGRATED) - sched_mm_cid_migrate_to(rq, p); =20 enqueue_task(rq, p, flags); =20 @@ -3364,7 +3362,6 @@ void set_task_cpu(struct task_struct *p, if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; - sched_mm_cid_migrate_from(p); perf_event_task_migrate(p); } =20 @@ -5344,8 +5341,7 @@ context_switch(struct rq *rq, struct tas } } =20 - /* switch_mm_cid() requires the memory barriers above. */ - switch_mm_cid(rq, prev, next); + switch_mm_cid(prev, next); =20 /* * Tell rseq that the task was scheduled in. Must be after @@ -5636,7 +5632,6 @@ void sched_tick(void) resched_latency =3D cpu_resched_latency(rq); calc_global_load_tick(rq); sched_core_tick(rq); - task_tick_mm_cid(rq, donor); scx_tick(rq); =20 rq_unlock(rq, &rf); @@ -10408,522 +10403,47 @@ void call_trace_sched_update_nr_running( } =20 #ifdef CONFIG_SCHED_MM_CID - /* - * @cid_lock: Guarantee forward-progress of cid allocation. - * - * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_= lock - * is only used when contention is detected by the lock-free allocation so - * forward progress can be guaranteed. - */ -DEFINE_RAW_SPINLOCK(cid_lock); - -/* - * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock. - * - * When @use_cid_lock is 0, the cid allocation is lock-free. When contenti= on is - * detected, it is set to 1 to ensure that all newly coming allocations are - * serialized by @cid_lock until the allocation which detected contention - * completes and sets @use_cid_lock back to 0. This guarantees forward pro= gress - * of a cid allocation. - */ -int use_cid_lock; - -/* - * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cp= u cid - * concurrently with respect to the execution of the source runqueue conte= xt - * switch. - * - * There is one basic properties we want to guarantee here: - * - * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is act= ively - * used by a task. That would lead to concurrent allocation of the cid and - * userspace corruption. - * - * Provide this guarantee by introducing a Dekker memory ordering to guara= ntee - * that a pair of loads observe at least one of a pair of stores, which ca= n be - * shown as: - * - * X =3D Y =3D 0 - * - * w[X]=3D1 w[Y]=3D1 - * MB MB - * r[Y]=3Dy r[X]=3Dx - * - * Which guarantees that x=3D=3D0 && y=3D=3D0 is impossible. 
But rather th= an using - * values 0 and 1, this algorithm cares about specific state transitions o= f the - * runqueue current task (as updated by the scheduler context switch), and= the - * per-mm/cpu cid value. - * - * Let's introduce task (Y) which has task->mm =3D=3D mm and task (N) whic= h has - * task->mm !=3D mm for the rest of the discussion. There are two schedule= r state - * transitions on context switch we care about: - * - * (TSA) Store to rq->curr with transition from (N) to (Y) - * - * (TSB) Store to rq->curr with transition from (Y) to (N) - * - * On the remote-clear side, there is one transition we care about: - * - * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag - * - * There is also a transition to UNSET state which can be performed from a= ll - * sides (scheduler, remote-clear). It is always performed with a cmpxchg = which - * guarantees that only a single thread will succeed: - * - * (TMB) cmpxchg to *pcpu_cid to mark UNSET - * - * Just to be clear, what we do _not_ want to happen is a transition to UN= SET - * when a thread is actively using the cid (property (1)). - * - * Let's looks at the relevant combinations of TSA/TSB, and TMA transition= s. - * - * Scenario A) (TSA)+(TMA) (from next task perspective) - * - * CPU0 CPU1 - * - * Context switch CS-1 Remote-clear - * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY= (TMA) - * (implied barrier after cmpx= chg) - * - switch_mm_cid() - * - memory barrier (see switch_mm_cid() - * comment explaining how this barrier - * is combined with other scheduler - * barriers) - * - mm_cid_get (next) - * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->cur= r) - * - * This Dekker ensures that either task (Y) is observed by the - * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both = are - * observed. - * - * If task (Y) store is observed by rcu_dereference(), it means that there= is - * still an active task on the cpu. Remote-clear will therefore not transi= tion - * to UNSET, which fulfills property (1). - * - * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE= (), - * it will move its state to UNSET, which clears the percpu cid perhaps - * uselessly (which is not an issue for correctness). Because task (Y) is = not - * observed, CPU1 can move ahead to set the state to UNSET. Because moving - * state to UNSET is done with a cmpxchg expecting that the old state has = the - * LAZY flag set, only one thread will successfully UNSET. - * - * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0 - * will observe the LAZY flag and transition to UNSET (perhaps uselessly),= and - * CPU1 will observe task (Y) and do nothing more, which is fine. - * - * What we are effectively preventing with this Dekker is a scenario where - * neither LAZY flag nor store (Y) are observed, which would fail property= (1) - * because this would UNSET a cid which is actively used. + * When a task exits, the MM CID held by the task is not longer required as + * the task cannot return to user space. 
*/ - -void sched_mm_cid_migrate_from(struct task_struct *t) -{ - t->migrate_from_cpu =3D task_cpu(t); -} - -static -int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, - struct task_struct *t, - struct mm_cid *src_pcpu_cid) -{ - struct mm_struct *mm =3D t->mm; - struct task_struct *src_task; - int src_cid, last_mm_cid; - - if (!mm) - return -1; - - last_mm_cid =3D t->last_mm_cid; - /* - * If the migrated task has no last cid, or if the current - * task on src rq uses the cid, it means the source cid does not need - * to be moved to the destination cpu. - */ - if (last_mm_cid =3D=3D -1) - return -1; - src_cid =3D READ_ONCE(src_pcpu_cid->cid); - if (!mm_cid_is_valid(src_cid) || last_mm_cid !=3D src_cid) - return -1; - - /* - * If we observe an active task using the mm on this rq, it means we - * are not the last task to be migrated from this cpu for this mm, so - * there is no need to move src_cid to the destination cpu. - */ - guard(rcu)(); - src_task =3D rcu_dereference(src_rq->curr); - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm =3D=3D mm) { - t->last_mm_cid =3D -1; - return -1; - } - - return src_cid; -} - -static -int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, - struct task_struct *t, - struct mm_cid *src_pcpu_cid, - int src_cid) -{ - struct task_struct *src_task; - struct mm_struct *mm =3D t->mm; - int lazy_cid; - - if (src_cid =3D=3D -1) - return -1; - - /* - * Attempt to clear the source cpu cid to move it to the destination - * cpu. - */ - lazy_cid =3D mm_cid_set_lazy_put(src_cid); - if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid)) - return -1; - - /* - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm matches the scheduler barrier in context_switch() - * between store to rq->curr and load of prev and next task's - * per-mm/cpu cid. - * - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm_cid_active matches the barrier in - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and - * sched_mm_cid_after_execve() between store to t->mm_cid_active and - * load of per-mm/cpu cid. - */ - - /* - * If we observe an active task using the mm on this rq after setting - * the lazy-put flag, this task will be responsible for transitioning - * from lazy-put flag set to MM_CID_UNSET. - */ - scoped_guard (rcu) { - src_task =3D rcu_dereference(src_rq->curr); - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm =3D=3D mm) { - /* - * We observed an active task for this mm, there is therefore - * no point in moving this cid to the destination cpu. - */ - t->last_mm_cid =3D -1; - return -1; - } - } - - /* - * The src_cid is unused, so it can be unset. - */ - if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) - return -1; - WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET); - return src_cid; -} - -/* - * Migration to dst cpu. Called with dst_rq lock held. - * Interrupts are disabled, which keeps the window of cid ownership withou= t the - * source rq lock held small. - */ -void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) -{ - struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; - struct mm_struct *mm =3D t->mm; - int src_cid, src_cpu; - bool dst_cid_is_set; - struct rq *src_rq; - - lockdep_assert_rq_held(dst_rq); - - if (!mm) - return; - src_cpu =3D t->migrate_from_cpu; - if (src_cpu =3D=3D -1) { - t->last_mm_cid =3D -1; - return; - } - /* - * Move the src cid if the dst cid is unset. 
This keeps id - * allocation closest to 0 in cases where few threads migrate around - * many CPUs. - * - * If destination cid or recent cid is already set, we may have - * to just clear the src cid to ensure compactness in frequent - * migrations scenarios. - * - * It is not useful to clear the src cid when the number of threads is - * greater or equal to the number of allowed CPUs, because user-space - * can expect that the number of allowed cids can reach the number of - * allowed CPUs. - */ - dst_pcpu_cid =3D per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); - dst_cid_is_set =3D !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) || - !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid)); - if (dst_cid_is_set && atomic_read(&mm->mm_users) >=3D READ_ONCE(mm->nr_cp= us_allowed)) - return; - src_pcpu_cid =3D per_cpu_ptr(mm->pcpu_cid, src_cpu); - src_rq =3D cpu_rq(src_cpu); - src_cid =3D __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid= ); - if (src_cid =3D=3D -1) - return; - src_cid =3D __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu= _cid, - src_cid); - if (src_cid =3D=3D -1) - return; - if (dst_cid_is_set) { - __mm_cid_put(mm, src_cid); - return; - } - /* Move src_cid to dst cpu. */ - mm_cid_snapshot_time(dst_rq, mm); - WRITE_ONCE(dst_pcpu_cid->cid, src_cid); - WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid); -} - -static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid = *pcpu_cid, - int cpu) -{ - struct rq *rq =3D cpu_rq(cpu); - struct task_struct *t; - int cid, lazy_cid; - - cid =3D READ_ONCE(pcpu_cid->cid); - if (!mm_cid_is_valid(cid)) - return; - - /* - * Clear the cpu cid if it is set to keep cid allocation compact. If - * there happens to be other tasks left on the source cpu using this - * mm, the next task using this mm will reallocate its cid on context - * switch. - */ - lazy_cid =3D mm_cid_set_lazy_put(cid); - if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid)) - return; - - /* - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm matches the scheduler barrier in context_switch() - * between store to rq->curr and load of prev and next task's - * per-mm/cpu cid. - * - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm_cid_active matches the barrier in - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and - * sched_mm_cid_after_execve() between store to t->mm_cid_active and - * load of per-mm/cpu cid. - */ - - /* - * If we observe an active task using the mm on this rq after setting - * the lazy-put flag, that task will be responsible for transitioning - * from lazy-put flag set to MM_CID_UNSET. - */ - scoped_guard (rcu) { - t =3D rcu_dereference(rq->curr); - if (READ_ONCE(t->mm_cid_active) && t->mm =3D=3D mm) - return; - } - - /* - * The cid is unused, so it can be unset. - * Disable interrupts to keep the window of cid ownership without rq - * lock small. - */ - scoped_guard (irqsave) { - if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) - __mm_cid_put(mm, cid); - } -} - -static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) -{ - struct rq *rq =3D cpu_rq(cpu); - struct mm_cid *pcpu_cid; - struct task_struct *curr; - u64 rq_clock; - - /* - * rq->clock load is racy on 32-bit but one spurious clear once in a - * while is irrelevant. 
- */ - rq_clock =3D READ_ONCE(rq->clock); - pcpu_cid =3D per_cpu_ptr(mm->pcpu_cid, cpu); - - /* - * In order to take care of infrequently scheduled tasks, bump the time - * snapshot associated with this cid if an active task using the mm is - * observed on this rq. - */ - scoped_guard (rcu) { - curr =3D rcu_dereference(rq->curr); - if (READ_ONCE(curr->mm_cid_active) && curr->mm =3D=3D mm) { - WRITE_ONCE(pcpu_cid->time, rq_clock); - return; - } - } - - if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) - return; - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); -} - -static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu, - int weight) -{ - struct mm_cid *pcpu_cid; - int cid; - - pcpu_cid =3D per_cpu_ptr(mm->pcpu_cid, cpu); - cid =3D READ_ONCE(pcpu_cid->cid); - if (!mm_cid_is_valid(cid) || cid < weight) - return; - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); -} - -static void task_mm_cid_work(struct callback_head *work) -{ - unsigned long now =3D jiffies, old_scan, next_scan; - struct task_struct *t =3D current; - struct cpumask *cidmask; - struct mm_struct *mm; - int weight, cpu; - - WARN_ON_ONCE(t !=3D container_of(work, struct task_struct, cid_work)); - - work->next =3D work; /* Prevent double-add */ - if (t->flags & PF_EXITING) - return; - mm =3D t->mm; - if (!mm) - return; - old_scan =3D READ_ONCE(mm->mm_cid_next_scan); - next_scan =3D now + msecs_to_jiffies(MM_CID_SCAN_DELAY); - if (!old_scan) { - unsigned long res; - - res =3D cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan); - if (res !=3D old_scan) - old_scan =3D res; - else - old_scan =3D next_scan; - } - if (time_before(now, old_scan)) - return; - if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan)) - return; - cidmask =3D mm_cidmask(mm); - /* Clear cids that were not recently used. */ - for_each_possible_cpu(cpu) - sched_mm_cid_remote_clear_old(mm, cpu); - weight =3D cpumask_weight(cidmask); - /* - * Clear cids that are greater or equal to the cidmask weight to - * recompact it. - */ - for_each_possible_cpu(cpu) - sched_mm_cid_remote_clear_weight(mm, cpu, weight); -} - -void init_sched_mm_cid(struct task_struct *t) -{ - struct mm_struct *mm =3D t->mm; - int mm_users =3D 0; - - if (mm) { - mm_users =3D atomic_read(&mm->mm_users); - if (mm_users =3D=3D 1) - mm->mm_cid_next_scan =3D jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY); - } - t->cid_work.next =3D &t->cid_work; /* Protect against double add */ - init_task_work(&t->cid_work, task_mm_cid_work); -} - -void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) -{ - struct callback_head *work =3D &curr->cid_work; - unsigned long now =3D jiffies; - - if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || - work->next !=3D work) - return; - if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) - return; - - /* No page allocation under rq lock */ - task_work_add(curr, work, TWA_RESUME); -} - void sched_mm_cid_exit_signals(struct task_struct *t) { struct mm_struct *mm =3D t->mm; - struct rq *rq; =20 - if (!mm) + if (!mm || !t->mm_cid_active) return; =20 - preempt_disable(); - rq =3D this_rq(); - guard(rq_lock_irqsave)(rq); - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 0); - /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). 
- */ - smp_mb(); - mm_cid_put(mm); - t->last_mm_cid =3D t->mm_cid =3D -1; + guard(preempt)(); + t->mm_cid_active =3D 0; + if (t->mm_cid !=3D MM_CID_UNSET) { + cpumask_clear_cpu(t->mm_cid, mm_cidmask(mm)); + t->mm_cid =3D MM_CID_UNSET; + } } =20 +/* Deactivate MM CID allocation across execve() */ void sched_mm_cid_before_execve(struct task_struct *t) { - struct mm_struct *mm =3D t->mm; - struct rq *rq; - - if (!mm) - return; - - preempt_disable(); - rq =3D this_rq(); - guard(rq_lock_irqsave)(rq); - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 0); - /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). - */ - smp_mb(); - mm_cid_put(mm); - t->last_mm_cid =3D t->mm_cid =3D -1; + sched_mm_cid_exit_signals(t); } =20 +/* Reactivate MM CID after successful execve() */ void sched_mm_cid_after_execve(struct task_struct *t) { struct mm_struct *mm =3D t->mm; - struct rq *rq; =20 if (!mm) return; =20 - preempt_disable(); - rq =3D this_rq(); - scoped_guard (rq_lock_irqsave, rq) { - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 1); - /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). - */ - smp_mb(); - t->last_mm_cid =3D t->mm_cid =3D mm_cid_get(rq, t, mm); - } + guard(preempt)(); + t->mm_cid_active =3D 1; + mm_cid_select(t); } =20 void sched_mm_cid_fork(struct task_struct *t) { - WARN_ON_ONCE(!t->mm || t->mm_cid !=3D -1); + WARN_ON_ONCE(!t->mm || t->mm_cid !=3D MM_CID_UNSET); t->mm_cid_active =3D 1; } #endif /* CONFIG_SCHED_MM_CID */ --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3534,288 +3534,83 @@ extern void sched_dynamic_update(int mod extern const char *preempt_modes[]; =20 #ifdef CONFIG_SCHED_MM_CID - -#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ -#define MM_CID_SCAN_DELAY 100 /* 100ms */ - -extern raw_spinlock_t cid_lock; -extern int use_cid_lock; - -extern void sched_mm_cid_migrate_from(struct task_struct *t); -extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct = *t); -extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr); -extern void init_sched_mm_cid(struct task_struct *t); - -static inline void __mm_cid_put(struct mm_struct *mm, int cid) -{ - if (cid < 0) - return; - cpumask_clear_cpu(cid, mm_cidmask(mm)); -} - -/* - * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition = to - * the MM_CID_UNSET state without holding the rq lock, but the rq lock nee= ds to - * be held to transition to other states. - * - * State transitions synchronized with cmpxchg or try_cmpxchg need to be - * consistent across CPUs, which prevents use of this_cpu_cmpxchg. 
- */ -static inline void mm_cid_put_lazy(struct task_struct *t) +static inline void init_sched_mm_cid(struct task_struct *t) { struct mm_struct *mm =3D t->mm; - struct mm_cid __percpu *pcpu_cid =3D mm->pcpu_cid; - int cid; + unsigned int max_cid; =20 - lockdep_assert_irqs_disabled(); - cid =3D __this_cpu_read(pcpu_cid->cid); - if (!mm_cid_is_lazy_put(cid) || - !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) + if (!mm) return; - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); -} - -static inline int mm_cid_pcpu_unset(struct mm_struct *mm) -{ - struct mm_cid __percpu *pcpu_cid =3D mm->pcpu_cid; - int cid, res; =20 - lockdep_assert_irqs_disabled(); - cid =3D __this_cpu_read(pcpu_cid->cid); - for (;;) { - if (mm_cid_is_unset(cid)) - return MM_CID_UNSET; - /* - * Attempt transition from valid or lazy-put to unset. - */ - res =3D cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET); - if (res =3D=3D cid) - break; - cid =3D res; - } - return cid; + /* Preset last_mm_cid */ + max_cid =3D min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->m= m_users)); + t->last_mm_cid =3D max_cid - 1; } =20 -static inline void mm_cid_put(struct mm_struct *mm) +static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, u= nsigned int max_cids) { - int cid; + struct mm_struct *mm =3D t->mm; =20 - lockdep_assert_irqs_disabled(); - cid =3D mm_cid_pcpu_unset(mm); - if (cid =3D=3D MM_CID_UNSET) - return; - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); + if (cid >=3D max_cids) + return false; + if (cpumask_test_and_set_cpu(cid, mm_cidmask(mm))) + return false; + t->mm_cid =3D t->last_mm_cid =3D cid; + __this_cpu_write(mm->pcpu_cid->cid, cid); + return true; } =20 -static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct= *mm) +static inline bool mm_cid_get(struct task_struct *t) { - struct cpumask *cidmask =3D mm_cidmask(mm); - struct mm_cid __percpu *pcpu_cid =3D mm->pcpu_cid; - int cid, max_nr_cid, allowed_max_nr_cid; + struct mm_struct *mm =3D t->mm; + unsigned int max_cids; =20 - /* - * After shrinking the number of threads or reducing the number - * of allowed cpus, reduce the value of max_nr_cid so expansion - * of cid allocation will preserve cache locality if the number - * of threads or allowed cpus increase again. - */ - max_nr_cid =3D atomic_read(&mm->max_nr_cid); - while ((allowed_max_nr_cid =3D min_t(int, READ_ONCE(mm->nr_cpus_allowed), - atomic_read(&mm->mm_users))), - max_nr_cid > allowed_max_nr_cid) { - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */ - if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)= ) { - max_nr_cid =3D allowed_max_nr_cid; - break; - } - } - /* Try to re-use recent cid. This improves cache locality. */ - cid =3D __this_cpu_read(pcpu_cid->recent_cid); - if (!mm_cid_is_unset(cid) && cid < max_nr_cid && - !cpumask_test_and_set_cpu(cid, cidmask)) - return cid; - /* - * Expand cid allocation if the maximum number of concurrency - * IDs allocated (max_nr_cid) is below the number cpus allowed - * and number of threads. Expanding cid allocation as much as - * possible improves cache locality. - */ - cid =3D max_nr_cid; - while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_= users)) { - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */ - if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) - continue; - if (!cpumask_test_and_set_cpu(cid, cidmask)) - return cid; - } - /* - * Find the first available concurrency id. 
- * Retry finding first zero bit if the mask is temporarily - * filled. This only happens during concurrent remote-clear - * which owns a cid without holding a rq lock. - */ - for (;;) { - cid =3D cpumask_first_zero(cidmask); - if (cid < READ_ONCE(mm->nr_cpus_allowed)) - break; - cpu_relax(); - } - if (cpumask_test_and_set_cpu(cid, cidmask)) - return -1; + max_cids =3D min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->= mm_users)); =20 - return cid; -} + /* Try to reuse the last CID of this task */ + if (__mm_cid_get(t, t->last_mm_cid, max_cids)) + return true; =20 -/* - * Save a snapshot of the current runqueue time of this cpu - * with the per-cpu cid value, allowing to estimate how recently it was us= ed. - */ -static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *m= m) -{ - struct mm_cid *pcpu_cid =3D per_cpu_ptr(mm->pcpu_cid, cpu_of(rq)); + /* Try to reuse the last CID of this mm on this CPU */ + if (__mm_cid_get(t, __this_cpu_read(mm->pcpu_cid->cid), max_cids)) + return true; =20 - lockdep_assert_rq_held(rq); - WRITE_ONCE(pcpu_cid->time, rq->clock); + /* Try the first zero bit in the cidmask. */ + return __mm_cid_get(t, cpumask_first_zero(mm_cidmask(mm)), max_cids); } =20 -static inline int __mm_cid_get(struct rq *rq, struct task_struct *t, - struct mm_struct *mm) +static inline void mm_cid_select(struct task_struct *t) { - int cid; - - /* - * All allocations (even those using the cid_lock) are lock-free. If - * use_cid_lock is set, hold the cid_lock to perform cid allocation to - * guarantee forward progress. - */ - if (!READ_ONCE(use_cid_lock)) { - cid =3D __mm_cid_try_get(t, mm); - if (cid >=3D 0) - goto end; - raw_spin_lock(&cid_lock); - } else { - raw_spin_lock(&cid_lock); - cid =3D __mm_cid_try_get(t, mm); - if (cid >=3D 0) - goto unlock; - } - - /* - * cid concurrently allocated. Retry while forcing following - * allocations to use the cid_lock to ensure forward progress. - */ - WRITE_ONCE(use_cid_lock, 1); /* - * Set use_cid_lock before allocation. Only care about program order - * because this is only required for forward progress. + * mm_cid_get() can fail when the maximum CID, which is determined + * by min(mm->nr_cpus_allowed, mm->mm_users) changes concurrently. + * That's a transient failure as there cannot be more tasks + * concurrently on a CPU (or about to be scheduled in) than that. */ - barrier(); - /* - * Retry until it succeeds. It is guaranteed to eventually succeed once - * all newcoming allocations observe the use_cid_lock flag set. - */ - do { - cid =3D __mm_cid_try_get(t, mm); - cpu_relax(); - } while (cid < 0); - /* - * Allocate before clearing use_cid_lock. Only care about - * program order because this is for forward progress. 
- */ - barrier(); - WRITE_ONCE(use_cid_lock, 0); -unlock: - raw_spin_unlock(&cid_lock); -end: - mm_cid_snapshot_time(rq, mm); - - return cid; -} - -static inline int mm_cid_get(struct rq *rq, struct task_struct *t, - struct mm_struct *mm) -{ - struct mm_cid __percpu *pcpu_cid =3D mm->pcpu_cid; - struct cpumask *cpumask; - int cid; - - lockdep_assert_rq_held(rq); - cpumask =3D mm_cidmask(mm); - cid =3D __this_cpu_read(pcpu_cid->cid); - if (mm_cid_is_valid(cid)) { - mm_cid_snapshot_time(rq, mm); - return cid; - } - if (mm_cid_is_lazy_put(cid)) { - if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); + for (;;) { + if (mm_cid_get(t)) + break; } - cid =3D __mm_cid_get(rq, t, mm); - __this_cpu_write(pcpu_cid->cid, cid); - __this_cpu_write(pcpu_cid->recent_cid, cid); - - return cid; } =20 -static inline void switch_mm_cid(struct rq *rq, - struct task_struct *prev, - struct task_struct *next) +static inline void switch_mm_cid(struct task_struct *prev, struct task_str= uct *next) { - /* - * Provide a memory barrier between rq->curr store and load of - * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition. - * - * Should be adapted if context_switch() is modified. - */ - if (!next->mm) { // to kernel - /* - * user -> kernel transition does not guarantee a barrier, but - * we can use the fact that it performs an atomic operation in - * mmgrab(). - */ - if (prev->mm) // from user - smp_mb__after_mmgrab(); - /* - * kernel -> kernel transition does not change rq->curr->mm - * state. It stays NULL. - */ - } else { // to user - /* - * kernel -> user transition does not provide a barrier - * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu]. - * Provide it here. - */ - if (!prev->mm) { // from kernel - smp_mb(); - } else { // from user - /* - * user->user transition relies on an implicit - * memory barrier in switch_mm() when - * current->mm changes. If the architecture - * switch_mm() does not have an implicit memory - * barrier, it is emitted here. If current->mm - * is unchanged, no barrier is needed. 
- */ - smp_mb__after_switch_mm(); - } - } if (prev->mm_cid_active) { - mm_cid_snapshot_time(rq, prev->mm); - mm_cid_put_lazy(prev); - prev->mm_cid =3D -1; + if (prev->mm_cid !=3D MM_CID_UNSET) + cpumask_clear_cpu(prev->mm_cid, mm_cidmask(prev->mm)); + prev->mm_cid =3D MM_CID_UNSET; } + if (next->mm_cid_active) { - next->last_mm_cid =3D next->mm_cid =3D mm_cid_get(rq, next, next->mm); + mm_cid_select(next); rseq_sched_set_task_mm_cid(next, next->mm_cid); } } =20 #else /* !CONFIG_SCHED_MM_CID: */ -static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, = struct task_struct *next) { } -static inline void sched_mm_cid_migrate_from(struct task_struct *t) { } -static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_= struct *t) { } -static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *cur= r) { } static inline void init_sched_mm_cid(struct task_struct *t) { } +static inline void mm_cid_select(struct task_struct *t) { } +static inline void switch_mm_cid(struct task_struct *prev, struct task_str= uct *next) { } #endif /* !CONFIG_SCHED_MM_CID */ =20 extern u64 avg_vruntime(struct cfs_rq *cfs_rq); From nobody Fri Dec 19 22:01:53 2025 Received: from galois.linutronix.de (Galois.linutronix.de [193.142.43.55]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3179033A02E for ; Wed, 15 Oct 2025 17:29:29 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=193.142.43.55 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549372; cv=none; b=ZjpixsWjPKMPyYw6mt0LkmJGWRk/pGybsuAQBBoSRhOjdE0Zw3/fXNIdvRgS+uQvVLTkwZDjQ1q6WlYqo3KS1gsinXlGqKM7rfawEfFtV/dk3vfn1/nUcratsUsbssjVP6BpusofbHshZwtR9s59GtHwWvYIdDAAOW/E9mStAd8= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549372; c=relaxed/simple; bh=2f42Vbcvk40En3sm6G3dcea2Hl/jXRicjf0v54YPKjk=; h=Message-ID:From:To:Cc:Subject:References:MIME-Version: Content-Type:Date; b=joSO+sLUZkeuuya0G+WtJ/08ou2pfjzhlbRZV8r3Jqj3CEgPNL9wUmcwoBAzJC59CpX0sCI6hqkihjXJBszyg44MmzWF7U8dHyeHkSvlFmms6RVlcgUt6x6dsDcObCYRhWgyXLDPq/+hgzFrrwByMEu8aJicJYHMflk6DIObivs= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de; spf=pass smtp.mailfrom=linutronix.de; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=1WPH7ji5; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=axjrsh0S; arc=none smtp.client-ip=193.142.43.55 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linutronix.de Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="1WPH7ji5"; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="axjrsh0S" Message-ID: <20251015172834.441852474@linutronix.de> DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1760549368; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=bakm3nMrmHedCES3Cwgl/XpWnEckVyWKNP1je7s3PFE=; b=1WPH7ji5MCuaZMWFqmZLX41iEewpXbmfxtyef/oMiVWbiWvOjuuYvDe68jr1WTM+/SzfV/ MWbAEaQ42d5CRLWfi3Epx55XlSpPYbJgzJOEjoEVL6qCtI3+ru0ooWR/6tLxt5K/Vtw/c0 
4cR7TEr6wX597cMiK08xLNdUlXsZ1dlzqM5bZIPfgEyCB7TOW5AEIFPgMrNfec+HrXrISD hN3ec85iKhJ0KJ62D/WVPMVLMiNBSiQ7kXVDCniyPkVtapDYpGhsGX+leDexWZwnc6tkbC FpnLLYn3yMG/lnIg1AoN/FBc9r3WJkqsaHn+nqGJlpO22avTC0B4T69V3y5tuQ==
From: Thomas Gleixner 
To: LKML 
Cc: Peter Zijlstra , Gabriele Monaco , Mathieu Desnoyers , Michael Jeanson , Jens Axboe , "Paul E. McKenney" , "Gautham R. Shenoy" , Florian Weimer , Tim Chen , TCMalloc Team 
Subject: [patch 02/19] sched/mmcid: Use proper data structures
References: <20251015164952.694882104@linutronix.de>
MIME-Version: 1.0
Date: Wed, 15 Oct 2025 19:29:27 +0200 (CEST)
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Having a lot of CID-specific members in struct task_struct and struct mm_struct does not make the code easier to read. Encapsulate the CID-specific parts in dedicated data structures and keep them separate from the structures they are embedded in.

No functional change.

Signed-off-by: Thomas Gleixner --- include/linux/mm_types.h | 56 +++++++++++-----------------------------= ----- include/linux/rseq_types.h | 42 +++++++++++++++++++++++++++++++++ include/linux/sched.h | 11 +------- init/init_task.c | 3 ++ kernel/fork.c | 6 ++-- kernel/sched/core.c | 16 ++++++------ kernel/sched/sched.h | 26 ++++++++++---------- 7 files changed, 85 insertions(+), 75 deletions(-) --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -20,6 +20,7 @@ #include #include #include +#include =20 #include =20 @@ -924,10 +925,6 @@ struct vm_area_struct { #define vma_policy(vma) NULL #endif =20 -struct mm_cid { - unsigned int cid; -}; - struct kioctx_table; struct iommu_mm_data; struct mm_struct { @@ -980,30 +977,9 @@ struct mm_struct { */ atomic_t mm_users; =20 -#ifdef CONFIG_SCHED_MM_CID - /** - * @pcpu_cid: Per-cpu current cid. - * - * Keep track of the currently allocated mm_cid for each cpu. - * The per-cpu mm_cid values are serialized by their respective - * runqueue locks. - */ - struct mm_cid __percpu *pcpu_cid; - /** - * @nr_cpus_allowed: Number of CPUs allowed for mm. - * - * Number of CPUs allowed in the union of all mm's - * threads allowed CPUs. - */ - unsigned int nr_cpus_allowed; - /** - * @cpus_allowed_lock: Lock protecting mm cpus_allowed. - * - * Provide mutual exclusion for mm cpus_allowed and - * mm nr_cpus_allowed updates. - */ - raw_spinlock_t cpus_allowed_lock; -#endif + /* MM CID related storage */ + struct mm_mm_cid mm_cid; + #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* size of all page tables */ #endif @@ -1306,9 +1282,6 @@ static inline void vma_iter_init(struct } =20 #ifdef CONFIG_SCHED_MM_CID - -#define MM_CID_UNSET (~0U) - /* * mm_cpus_allowed: Union of all mm's threads allowed CPUs. 
*/ @@ -1337,20 +1310,20 @@ static inline void mm_init_cid(struct mm int i; =20 for_each_possible_cpu(i) { - struct mm_cid *pcpu_cid =3D per_cpu_ptr(mm->pcpu_cid, i); + struct mm_cid_pcpu *pcpu =3D per_cpu_ptr(mm->mm_cid.pcpu, i); =20 - pcpu_cid->cid =3D MM_CID_UNSET; + pcpu->cid =3D MM_CID_UNSET; } - mm->nr_cpus_allowed =3D p->nr_cpus_allowed; - raw_spin_lock_init(&mm->cpus_allowed_lock); + mm->mm_cid.nr_cpus_allowed =3D p->nr_cpus_allowed; + raw_spin_lock_init(&mm->mm_cid.lock); cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); cpumask_clear(mm_cidmask(mm)); } =20 static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_st= ruct *p) { - mm->pcpu_cid =3D alloc_percpu_noprof(struct mm_cid); - if (!mm->pcpu_cid) + mm->mm_cid.pcpu =3D alloc_percpu_noprof(struct mm_cid_pcpu); + if (!mm->mm_cid.pcpu) return -ENOMEM; mm_init_cid(mm, p); return 0; @@ -1359,8 +1332,8 @@ static inline int mm_alloc_cid_noprof(st =20 static inline void mm_destroy_cid(struct mm_struct *mm) { - free_percpu(mm->pcpu_cid); - mm->pcpu_cid =3D NULL; + free_percpu(mm->mm_cid.pcpu); + mm->mm_cid.pcpu =3D NULL; } =20 static inline unsigned int mm_cid_size(void) @@ -1375,10 +1348,9 @@ static inline void mm_set_cpus_allowed(s if (!mm) return; /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */ - raw_spin_lock(&mm->cpus_allowed_lock); + guard(raw_spinlock)(&mm->mm_cid.lock); cpumask_or(mm_allowed, mm_allowed, cpumask); - WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed)); - raw_spin_unlock(&mm->cpus_allowed_lock); + WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed)); } #else /* CONFIG_SCHED_MM_CID */ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p= ) { } --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -90,4 +90,46 @@ struct rseq_data { struct rseq_data { }; #endif /* !CONFIG_RSEQ */ =20 +#ifdef CONFIG_SCHED_MM_CID + +#define MM_CID_UNSET (~0U) + +/** + * struct sched_mm_cid - Storage for per task MM CID data + * @active: MM CID is active for the task + * @cid: The CID associated to the task + * @last_cid: The last CID associated to the task + */ +struct sched_mm_cid { + unsigned int active; + unsigned int cid; + unsigned int last_cid; +}; + +/** + * struct mm_cid_pcpu - Storage for per CPU MM_CID data + * @cid: The CID associated to the CPU + */ +struct mm_cid_pcpu { + unsigned int cid; +}; + +/** + * struct mm_mm_cid - Storage for per MM CID data + * @pcpu: Per CPU storage for CIDs associated to a CPU + * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. Th= e map + * is growth only. + * @lock: Spinlock to protect all fields except @pcpu. It also protects + * the MM cid cpumask and the MM cidmask bitmap. 
+ */ +struct mm_mm_cid { + struct mm_cid_pcpu __percpu *pcpu; + unsigned int nr_cpus_allowed; + raw_spinlock_t lock; +}; +#else /* CONFIG_SCHED_MM_CID */ +struct mm_cid { }; +struct sched_mm_cid { }; +#endif /* !CONFIG_SCHED_MM_CID */ + #endif --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1399,14 +1399,7 @@ struct task_struct { #endif /* CONFIG_NUMA_BALANCING */ =20 struct rseq_data rseq; - -#ifdef CONFIG_SCHED_MM_CID - int mm_cid; /* Current cid in mm */ - int last_mm_cid; /* Most recent cid in mm */ - int migrate_from_cpu; - int mm_cid_active; /* Whether cid bitmap is active */ - struct callback_head cid_work; -#endif + struct sched_mm_cid mm_cid; =20 struct tlbflush_unmap_batch tlb_ubc; =20 @@ -2300,7 +2293,7 @@ void sched_mm_cid_fork(struct task_struc void sched_mm_cid_exit_signals(struct task_struct *t); static inline int task_mm_cid(struct task_struct *t) { - return t->mm_cid; + return t->mm_cid.cid; } #else static inline void sched_mm_cid_before_execve(struct task_struct *t) { } --- a/init/init_task.c +++ b/init/init_task.c @@ -220,6 +220,9 @@ struct task_struct init_task __aligned(L #ifdef CONFIG_SECCOMP_FILTER .seccomp =3D { .filter_count =3D ATOMIC_INIT(0) }, #endif +#ifdef CONFIG_SCHED_MM_CID + .mm_cid =3D { .cid =3D MM_CID_UNSET, }, +#endif }; EXPORT_SYMBOL(init_task); =20 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -955,9 +955,9 @@ static struct task_struct *dup_task_stru #endif =20 #ifdef CONFIG_SCHED_MM_CID - tsk->mm_cid =3D MM_CID_UNSET; - tsk->last_mm_cid =3D MM_CID_UNSET; - tsk->mm_cid_active =3D 0; + tsk->mm_cid.cid =3D MM_CID_UNSET; + tsk->mm_cid.last_cid =3D MM_CID_UNSET; + tsk->mm_cid.active =3D 0; #endif return tsk; =20 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10411,14 +10411,14 @@ void sched_mm_cid_exit_signals(struct ta { struct mm_struct *mm =3D t->mm; =20 - if (!mm || !t->mm_cid_active) + if (!mm || !t->mm_cid.active) return; =20 guard(preempt)(); - t->mm_cid_active =3D 0; - if (t->mm_cid !=3D MM_CID_UNSET) { - cpumask_clear_cpu(t->mm_cid, mm_cidmask(mm)); - t->mm_cid =3D MM_CID_UNSET; + t->mm_cid.active =3D 0; + if (t->mm_cid.cid !=3D MM_CID_UNSET) { + cpumask_clear_cpu(t->mm_cid.cid, mm_cidmask(mm)); + t->mm_cid.cid =3D MM_CID_UNSET; } } =20 @@ -10437,14 +10437,14 @@ void sched_mm_cid_after_execve(struct ta return; =20 guard(preempt)(); - t->mm_cid_active =3D 1; + t->mm_cid.active =3D 1; mm_cid_select(t); } =20 void sched_mm_cid_fork(struct task_struct *t) { - WARN_ON_ONCE(!t->mm || t->mm_cid !=3D MM_CID_UNSET); - t->mm_cid_active =3D 1; + WARN_ON_ONCE(!t->mm || t->mm_cid.cid !=3D MM_CID_UNSET); + t->mm_cid.active =3D 1; } #endif /* CONFIG_SCHED_MM_CID */ =20 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3543,8 +3543,8 @@ static inline void init_sched_mm_cid(str return; =20 /* Preset last_mm_cid */ - max_cid =3D min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->m= m_users)); - t->last_mm_cid =3D max_cid - 1; + max_cid =3D min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read= (&mm->mm_users)); + t->mm_cid.last_cid =3D max_cid - 1; } =20 static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, u= nsigned int max_cids) @@ -3555,8 +3555,8 @@ static inline bool __mm_cid_get(struct t return false; if (cpumask_test_and_set_cpu(cid, mm_cidmask(mm))) return false; - t->mm_cid =3D t->last_mm_cid =3D cid; - __this_cpu_write(mm->pcpu_cid->cid, cid); + t->mm_cid.cid =3D t->mm_cid.last_cid =3D cid; + __this_cpu_write(mm->mm_cid.pcpu->cid, cid); return true; } =20 @@ -3565,14 +3565,14 @@ static inline 
bool mm_cid_get(struct tas struct mm_struct *mm =3D t->mm; unsigned int max_cids; =20 - max_cids =3D min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->= mm_users)); + max_cids =3D min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_rea= d(&mm->mm_users)); =20 /* Try to reuse the last CID of this task */ - if (__mm_cid_get(t, t->last_mm_cid, max_cids)) + if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids)) return true; =20 /* Try to reuse the last CID of this mm on this CPU */ - if (__mm_cid_get(t, __this_cpu_read(mm->pcpu_cid->cid), max_cids)) + if (__mm_cid_get(t, __this_cpu_read(mm->mm_cid.pcpu->cid), max_cids)) return true; =20 /* Try the first zero bit in the cidmask. */ @@ -3595,15 +3595,15 @@ static inline void mm_cid_select(struct =20 static inline void switch_mm_cid(struct task_struct *prev, struct task_str= uct *next) { - if (prev->mm_cid_active) { - if (prev->mm_cid !=3D MM_CID_UNSET) - cpumask_clear_cpu(prev->mm_cid, mm_cidmask(prev->mm)); - prev->mm_cid =3D MM_CID_UNSET; + if (prev->mm_cid.active) { + if (prev->mm_cid.cid !=3D MM_CID_UNSET) + cpumask_clear_cpu(prev->mm_cid.cid, mm_cidmask(prev->mm)); + prev->mm_cid.cid =3D MM_CID_UNSET; } =20 - if (next->mm_cid_active) { + if (next->mm_cid.active) { mm_cid_select(next); - rseq_sched_set_task_mm_cid(next, next->mm_cid); + rseq_sched_set_task_mm_cid(next, next->mm_cid.cid); } } From nobody Fri Dec 19 22:01:53 2025 Received: from galois.linutronix.de (Galois.linutronix.de [193.142.43.55]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id AF4DD33CE8B for ; Wed, 15 Oct 2025 17:29:31 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=193.142.43.55 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549373; cv=none; b=uVd0Wp+41awHhnoTfeqjnv2S8xZeLhfejdZSqpMkpGkQMQ/fg+zibVU024+JoLGLa3Q8XbDJqkxbgtptK85kCYRsgk54GTjC7QR5uJWbPS+xrMSk8N9lmBNvjtt0U41F3klKEplST9fguAd8ZrV/6R3t1Myliyc/LCB/FcdWaDU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549373; c=relaxed/simple; bh=WtdlZTEAm/qo378Vyp88v4Zz0OrT231u5G47EVKWV8U=; h=Message-ID:From:To:Cc:Subject:References:MIME-Version: Content-Type:Date; b=KegCSCwMSC50Hgq8gHmWucGXcIrPGPu+mOd1PJI3So+f9KDx/y5fgoapuErPWYpheHFUq1erqmidzztimNs5vHfoOZj+RbYB2YonRcstS47mqww4Tv1CH78tjBZhmKwSwkMcJV+EFRDykX6RkDJw/H+Ve06fvQ31T+SkeX0MT8s= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de; spf=pass smtp.mailfrom=linutronix.de; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=VVWBeUG2; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=kuaQLNSs; arc=none smtp.client-ip=193.142.43.55 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linutronix.de Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="VVWBeUG2"; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="kuaQLNSs" Message-ID: <20251015172834.504490216@linutronix.de> DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1760549369; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: 
references:references; bh=6NYtpKGSbQFI9f0vnFaHT+ApHPfnQIXwG205RTbsJ5g=; b=VVWBeUG29pdYL3NFpVUGD1jPXY+BN+qXf1j/1YaQIVGwvnhOaWxEG3/QCpB5kb545D/b2n 2Nq4qRAse3nQQFbenxv7a2m8z/cXHeopt3ysKtVxJrYZAadfNIJJFbF8jty43nckjv3YS5 h+ypW0PxnWT3KiM53vZ6yY/ECCZrgSBeHRqDpKZypx1b0alFlJsT5gcmqss7mfKflsg3sU 2cbRjq1XdoXKLm3N9sws/2xPzj+FOetOK+/0WC5Tg/0xnEzOk8JdNKKqPgOARzHEIjlvxG uREOaSTQwNsyzRzsLeQZ4E0OGU0Y/ZMSFWX4KqlW1nb4WD+E8XwzocIwvkKNIw== DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020e; t=1760549369; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=6NYtpKGSbQFI9f0vnFaHT+ApHPfnQIXwG205RTbsJ5g=; b=kuaQLNSs46QZF/FnCorNs9AdSn7rFfCija+QcOH2TVAHUFr3GydWBB4c9VDTv/upcgqupf o1xWN680CEZq7gCw== From: Thomas Gleixner To: LKML Cc: Peter Zijlstra , Gabriele Monaco , Mathieu Desnoyers , Michael Jeanson , Jens Axboe , "Paul E. McKenney" , "Gautham R. Shenoy" , Florian Weimer , Tim Chen , TCMalloc Team Subject: [patch 03/19] sched/mmcid: Cacheline align MM CID storage References: <20251015164952.694882104@linutronix.de> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Date: Wed, 15 Oct 2025 19:29:29 +0200 (CEST) Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Both the per CPU storage and the data in mm_struct are heavily used in context switch. As they can end up next to other frequently modified data, they are subject to false sharing. Make them cache line aligned. Signed-off-by: Thomas Gleixner --- include/linux/rseq_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -112,7 +112,7 @@ struct sched_mm_cid { */ struct mm_cid_pcpu { unsigned int cid; -}; +}____cacheline_aligned_in_smp; =20 /** * struct mm_mm_cid - Storage for per MM CID data @@ -126,7 +126,7 @@ struct mm_mm_cid { struct mm_cid_pcpu __percpu *pcpu; unsigned int nr_cpus_allowed; raw_spinlock_t lock; -}; +}____cacheline_aligned_in_smp; #else /* CONFIG_SCHED_MM_CID */ struct mm_cid { }; struct sched_mm_cid { }; From nobody Fri Dec 19 22:01:53 2025 Received: from galois.linutronix.de (Galois.linutronix.de [193.142.43.55]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 5697F33EB07 for ; Wed, 15 Oct 2025 17:29:33 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=193.142.43.55 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549375; cv=none; b=ttSHlZYDOJhLDu8e0U/neuMRFhlkLdBkxru0wBUuPN6nweDQoZP42MO+rI0g6vcc1skeN2JsiAVpRql63C/t8wuHaexN+4FK7PmI6qmqtQIZKtvIj3WNgVnUOA4MGkEk4Ub1vDenhkXq+T1XPDxcs646Px/lz3DwCQ8iKvingZA= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549375; c=relaxed/simple; bh=fAulOyMtoDIHtPffcLIZNosiilQd7SnxrdYCJYwu6vc=; h=Message-ID:From:To:Cc:Subject:References:MIME-Version: Content-Type:Date; b=QLk75D4RVJ4P1VfJ+4QHtjnM/twr2GloWd+f5y5WL0iC9hYWRwAMjMv/7pWwSWixMoc4cVWUoWMrLXj5EGGMURu1hBwd9TMhk/DzH1GwjNqhLhrlJeXhe7az0Ix3N149GDrUmVJwdvWXK75DB0m16lkNHUVjmgQkxFO8ilhXV7k= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de; spf=pass smtp.mailfrom=linutronix.de; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=2v8diJWF; 
dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=JAsIVnJb; arc=none smtp.client-ip=193.142.43.55 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linutronix.de Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="2v8diJWF"; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="JAsIVnJb" Message-ID: <20251015172834.567565394@linutronix.de> DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1760549371; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=UAGiBz58l3yttdLeaOKha2U+mDMRTGzch+NUHw/3WXo=; b=2v8diJWFvvLFX7iIXY6/wKbLFHQqysbRUjDzo4wfeAyIpYc6KeiN1f7/mE94kCBWcp4Oqd qaQF+r9A9xZHeB0r4iMhpNTtaw9o2zvYtxC2HfVkW7JMZxUjMvMGojD2oVLcVFhFsUM0jz rEARAOpG+saBVoNDKDWWDgLwhTANQAhWXT2RsfmauF1UgAWzUjKosAFqWR77KHfpxEZXgL 7rYiHKe8GkYQrcmkliisz/0DAb1FyybSRXaDAUVi4fLuutzY2uxGhAieCXHNdwheJW28Vw IR5RUkmFFLqyh1RdobgVOAv0DjfN+jRJX996OBGIrDVrXcy1oKDESevLIGLy7g== DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020e; t=1760549371; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=UAGiBz58l3yttdLeaOKha2U+mDMRTGzch+NUHw/3WXo=; b=JAsIVnJbHG4vYW9GpYj6LYu7QApv2grY7/wafdWNtNYnA4g+F8ACbnKWybCI3auxh1iGzM iEJqWWMnvy/McyDg== From: Thomas Gleixner To: LKML Cc: Peter Zijlstra , Gabriele Monaco , Mathieu Desnoyers , Michael Jeanson , Jens Axboe , "Paul E. McKenney" , "Gautham R. Shenoy" , Florian Weimer , Tim Chen , TCMalloc Team Subject: [patch 04/19] sched: Fixup whitespace damage References: <20251015164952.694882104@linutronix.de> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Date: Wed, 15 Oct 2025 19:29:31 +0200 (CEST) Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" With whitespace checks enabled in the editor this makes eyes bleed. Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5309,19 +5309,16 @@ context_switch(struct rq *rq, struct tas * * kernel -> user switch + mmdrop_lazy_tlb() active * user -> user switch - * - * switch_mm_cid() needs to be updated if the barriers provided - * by context_switch() are modified. */ - if (!next->mm) { // to kernel + if (!next->mm) { // to kernel enter_lazy_tlb(prev->active_mm, next); =20 next->active_mm =3D prev->active_mm; - if (prev->mm) // from user + if (prev->mm) // from user mmgrab_lazy_tlb(prev->active_mm); else prev->active_mm =3D NULL; - } else { // to user + } else { // to user membarrier_switch_mm(rq, prev->active_mm, next->mm); /* * sys_membarrier() requires an smp_mb() between setting @@ -5334,7 +5331,7 @@ context_switch(struct rq *rq, struct tas switch_mm_irqs_off(prev->active_mm, next->mm, next); lru_gen_use_mm(next->mm); =20 - if (!prev->mm) { // from kernel + if (!prev->mm) { // from kernel /* will mmdrop_lazy_tlb() in finish_task_switch(). 
*/ rq->prev_mm =3D prev->active_mm; prev->active_mm =3D NULL; From nobody Fri Dec 19 22:01:53 2025 Received: from galois.linutronix.de (Galois.linutronix.de [193.142.43.55]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id C4796340DB1 for ; Wed, 15 Oct 2025 17:29:37 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=193.142.43.55 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549379; cv=none; b=dWqf/kluPyfKmIB0CLxmukVKT0+OinbDpTCUbqlTqxfPlEE7uuwAwI95VCGNXJKtEutOxPI7uMfISELiO7YWo8avLEQCpbGNx0gD6DEgVjhGHx9Xftt7zy0DEumZIRnuAxks3PJbQxpAf8MQoxZYbgVMLLgP182Hca8TbP2LKEU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549379; c=relaxed/simple; bh=7fpwOABA3SzrIN9DCSw4qLEkvmleWEchFx4lA/unLrw=; h=Message-ID:From:To:Cc:Subject:References:MIME-Version: Content-Type:Date; b=OGfsG7IVOQhJZlvMfPxW6uwB/0Gt9u036QV7FtgJ8SNSihSCDD6EG+ESMWrq0AfSm3yzcQs1dWniwWbgjFv0HZ676oQMU2NW1oseMD+b7cfnIplDmLMGSvyMaOvyuo2Hsb+IsQTKaXBGJTUP01hn9hVQqYC6qvlfWAU6rMQJeUA= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de; spf=pass smtp.mailfrom=linutronix.de; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=Cy5jyVUq; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=BEYff9w4; arc=none smtp.client-ip=193.142.43.55 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linutronix.de Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="Cy5jyVUq"; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="BEYff9w4" Message-ID: <20251015172834.631482357@linutronix.de> DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1760549373; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=79E8Bkn/ABS3eLdwGxQjGfkAWpdXEHtzjpjYM5kWPc0=; b=Cy5jyVUqXcNivDkPeTjWFXB3ReLXinHn7eiiev+r2cDxZg5JrP4+nU5T8Zl51wAw9ePOjT sXBATlX9RKqNqn3lYFkK6FtN1z6VlBjXiuztxKyfEQ0iSF+ygFAh4pEwQ7fbd1dZa9ev52 YY3AHbCTrTUKIONamGhWxC9V9yQ1XgycQlRNHruFH1LeANF2m0CfsmPORkAC6+L2HugfRo MlKmOZBX4p5y0JDb4g37S8wClnsz3UCitc8Y293+2alHCMKdQgHINt6CCMdNnvCjijT4kR ZLs5l5z2rHVkuocwHnooedqc+FNfqJ/4p95sZuU64GGD6QWK8MmN02g0SsH5XA== DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020e; t=1760549373; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=79E8Bkn/ABS3eLdwGxQjGfkAWpdXEHtzjpjYM5kWPc0=; b=BEYff9w4t55VbgMm7qnr/dbOvDVnXls5XjImYBK81a4L9kH+4QV9FynE/C+92XimpxzN+l IxDqtIyPRYDIQ9Bg== From: Thomas Gleixner To: LKML Cc: Peter Zijlstra , Gabriele Monaco , Mathieu Desnoyers , Michael Jeanson , Jens Axboe , "Paul E. McKenney" , "Gautham R. 
Shenoy" , Florian Weimer , Tim Chen , TCMalloc Team Subject: [patch 05/19] sched/mmcid: Move scheduler code out of global header References: <20251015164952.694882104@linutronix.de> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Date: Wed, 15 Oct 2025 19:29:32 +0200 (CEST) Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" This is only used in the scheduler core code, so there is no point to have it in a global header. Signed-off-by: Thomas Gleixner --- include/linux/mm_types.h | 13 ------------- kernel/sched/core.c | 20 ++++++++++++++++++-- 2 files changed, 18 insertions(+), 15 deletions(-) --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1341,27 +1341,14 @@ static inline unsigned int mm_cid_size(v return 2 * cpumask_size(); /* mm_cpus_allowed(), mm_cidmask(). */ } =20 -static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct = cpumask *cpumask) -{ - struct cpumask *mm_allowed =3D mm_cpus_allowed(mm); - - if (!mm) - return; - /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */ - guard(raw_spinlock)(&mm->mm_cid.lock); - cpumask_or(mm_allowed, mm_allowed, cpumask); - WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed)); -} #else /* CONFIG_SCHED_MM_CID */ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p= ) { } static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p= ) { return 0; } static inline void mm_destroy_cid(struct mm_struct *mm) { } - static inline unsigned int mm_cid_size(void) { return 0; } -static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct = cpumask *cpumask) { } #endif /* CONFIG_SCHED_MM_CID */ =20 struct mmu_gather; --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2704,6 +2704,8 @@ int push_cpu_stop(void *arg) return 0; } =20 +static inline void mm_update_cpus_allowed(struct mm_struct *mm, const cpum= ask_t *affmask); + /* * sched_class::set_cpus_allowed must do the below, but is not required to * actually call this function. @@ -2763,7 +2765,7 @@ static void put_prev_task(rq, p); =20 p->sched_class->set_cpus_allowed(p, ctx); - mm_set_cpus_allowed(p->mm, ctx->new_mask); + mm_update_cpus_allowed(p->mm, ctx->new_mask); =20 if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); @@ -10404,6 +10406,18 @@ void call_trace_sched_update_nr_running( * When a task exits, the MM CID held by the task is not longer required as * the task cannot return to user space. */ +static inline void mm_update_cpus_allowed(struct mm_struct *mm, const stru= ct cpumask *affmsk) +{ + struct cpumask *mm_allowed =3D mm_cpus_allowed(mm); + + if (!mm) + return; + /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. 
*/ + guard(raw_spinlock)(&mm->mm_cid.lock); + cpumask_or(mm_allowed, mm_allowed, affmsk); + WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed)); +} + void sched_mm_cid_exit_signals(struct task_struct *t) { struct mm_struct *mm =3D t->mm; @@ -10443,7 +10457,9 @@ void sched_mm_cid_fork(struct task_struc WARN_ON_ONCE(!t->mm || t->mm_cid.cid !=3D MM_CID_UNSET); t->mm_cid.active =3D 1; } -#endif /* CONFIG_SCHED_MM_CID */ +#else /* CONFIG_SCHED_MM_CID */ +static inline void mm_update_cpus_allowed(struct mm_struct *mm, const stru= ct cpumask *affmsk) { } +#endif /* !CONFIG_SCHED_MM_CID */ =20 #ifdef CONFIG_SCHED_CLASS_EXT void sched_deq_and_put_task(struct task_struct *p, int queue_flags, From nobody Fri Dec 19 22:01:53 2025 Received: from galois.linutronix.de (Galois.linutronix.de [193.142.43.55]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id C4712340D85 for ; Wed, 15 Oct 2025 17:29:37 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=193.142.43.55 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549379; cv=none; b=H6+jPaRC5XSvRwlrtdLRkW9aRVRnJNuFoMiSiwoGFyAJHsQ20VXhFl3KdpT8GSrTsn+rNG+P5eQ/Zr1kf3MlT7qWeZ9/Q4mkWDDEOk80J73lS4tMk/19VulZgpi4Kk1urrCA4a5iGAX+SpO98iMA2RavBgv2cxwfZXb4RgIKxCM= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549379; c=relaxed/simple; bh=9pWswLndSYH3OlKEHdAXCEVlAcboLxHWcLk6AbLtBBg=; h=Message-ID:From:To:Cc:Subject:References:MIME-Version: Content-Type:Date; b=LiVbEXe9zYLsXFyGVduD1SBcwkMyMn5DhOL2oMrNBqnJubC2gv1BPCYXUJZfPrtaQ4p4Cz1SKxVlMOXK/OM3+A/H3dP5BXaMVgbc36DZCSuieL+YwBftJA3sD1wowN+EMt4700TT90C3bCGPiPyE8R2egIKtwJ+WiCBmGwDhY7I= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de; spf=pass smtp.mailfrom=linutronix.de; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=jXT79sOI; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=VsBM5X55; arc=none smtp.client-ip=193.142.43.55 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linutronix.de Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="jXT79sOI"; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="VsBM5X55" Message-ID: <20251015172834.694547089@linutronix.de> DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1760549375; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=l5dqy6vQM2tKFDkysaDM6Dpk5DSc9c4FVIzZnpmdwOo=; b=jXT79sOIkWPg2A6CvH8sbwR+buTNddLmZGcFXtnb12arGQozofxIDAwadTx8M311wXh5LK CxjtQVqAk/Tp0nOg3WDmTx283nbCukqKOJd3jC1aQKZbJPWSs/r1LKZpMszcK2KyVQkoeP uYsUWQxN1FIE5aV5wj6HqciG8AgsuH5cMAtyeIRJCss+aMveKJ5+WIqbPK+Dq+zQKzaEAb 6VZkWvXgV41fl+1ialJeX556yP4LRTyr5sne4ddiWHOR5Xq1ootvLWUZMoDnRiJoLABDQu LWLIJfQitc38wVTnCxI7DfH3wBdxmiYNsBYdiR0cIEWv16ri3vTYlJ7IFHjFlw== DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020e; t=1760549375; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: 
references:references; bh=l5dqy6vQM2tKFDkysaDM6Dpk5DSc9c4FVIzZnpmdwOo=; b=VsBM5X55zPNC3XSZHLMDPrulrN3Vniy7BPLHvG1CxWbhrY7YS9AQjX1SqgZuzWOXqFaSYx Tg4ZxkwO2Cf+SIDw== From: Thomas Gleixner To: LKML Cc: Peter Zijlstra , Gabriele Monaco , Mathieu Desnoyers , Michael Jeanson , Jens Axboe , "Paul E. McKenney" , "Gautham R. Shenoy" , Florian Weimer , Tim Chen , TCMalloc Team Subject: [patch 06/19] sched/mmcid: Prevent pointless work in mm_update_cpus_allowed() References: <20251015164952.694882104@linutronix.de> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Date: Wed, 15 Oct 2025 19:29:34 +0200 (CEST) Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" The @nr_cpus_allowed management does way too much useless work for the common case where a process starts with unrestricted affinity, i.e. @nr_cpus_allowed is equal to the number of possible CPUs right away. Add a check whether that limit is reached already and then avoid the whole cpumask update and evaluation. Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2719,6 +2719,7 @@ void set_cpus_allowed_common(struct task =20 cpumask_copy(&p->cpus_mask, ctx->new_mask); p->nr_cpus_allowed =3D cpumask_weight(ctx->new_mask); + mm_update_cpus_allowed(p->mm, ctx->new_mask); =20 /* * Swap in a new user_cpus_ptr if SCA_USER flag set @@ -2765,7 +2766,6 @@ static void put_prev_task(rq, p); =20 p->sched_class->set_cpus_allowed(p, ctx); - mm_update_cpus_allowed(p->mm, ctx->new_mask); =20 if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); @@ -10408,12 +10408,20 @@ void call_trace_sched_update_nr_running( */ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const stru= ct cpumask *affmsk) { - struct cpumask *mm_allowed =3D mm_cpus_allowed(mm); + struct cpumask *mm_allowed; =20 - if (!mm) + if (!mm || READ_ONCE(mm->mm_cid.nr_cpus_allowed) =3D=3D nr_cpu_ids) return; - /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */ + + /* + * mm::mm_cid::mm_cpus_allowed is the superset of each threads + * allowed CPUs mask which means it can only grow. 
+ */ guard(raw_spinlock)(&mm->mm_cid.lock); + /* Check again under the lock */ + if (mm->mm_cid.nr_cpus_allowed =3D=3D nr_cpu_ids) + return; + mm_allowed =3D mm_cpus_allowed(mm); cpumask_or(mm_allowed, mm_allowed, affmsk); WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed)); } From nobody Fri Dec 19 22:01:53 2025 Received: from galois.linutronix.de (Galois.linutronix.de [193.142.43.55]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6CE9833CEB1 for ; Wed, 15 Oct 2025 17:29:39 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=193.142.43.55 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549381; cv=none; b=cNEkaWfp1rAshi2Pe6Opqh710KMj8WpXlc93VEkqaXVY15tT27c68f18yhod2PLOgqWHm8YcYmuVMpIKiz+I38+YO9VzOZ2fM0MVIf9CzYJ4UdPkjtfST6CwjaOq3+uUhRs+rmD+/KYxrBcQ2vNDeWZEop68L2/Ph9HVPBCo2LU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549381; c=relaxed/simple; bh=3o3PKrl0qbbK1n7C9MNgQWn1awkfE6SwIZ71lmiq0xA=; h=Message-ID:From:To:Cc:Subject:References:MIME-Version: Content-Type:Date; b=VccOlsYr2C0cA6kVL1zVzBLI2AOdgEdj9jKT7pJXKDIgVL2yabb5r9Q08Md5OCCZQxPHEZLskQTRGMc2SKO/5zhT0HPJ4uMOzTna0/ftlxQKx/l7f4bThMZkiYlKXayBNgBqRXk+DNfm+qrB+dTNOLrBe0jYNdxIgJzjfXecVTo= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de; spf=pass smtp.mailfrom=linutronix.de; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=4PYXNKF0; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=LHaZu2bF; arc=none smtp.client-ip=193.142.43.55 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linutronix.de Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="4PYXNKF0"; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="LHaZu2bF" Message-ID: <20251015172834.757776587@linutronix.de> DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1760549377; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=9gNMHN79jDZMeM2Dx/h4kedzc9Grels48SIn/1YFPyw=; b=4PYXNKF0hemvrKt5eLeD/ADcxpl2cSE0BQ6uGxfjgJm9eKzclkg72DkxJR0De5occ1URLH aK4h+UAyAmFd9fSZS4U54vaSFdQzSA6aWSCA1UORPftISPkSUq+MQEJJFqPaEKTyXy6XS3 PdtZQ0y0qMZ+W/MuQioFE+0VgHVAF6KQDenFJeF4EeG+5hpMcPsAt4mzDOOyfacyNlv9cZ Z062c9wYRkUViDpnIpO+hq2o8xjxQv9/9raXoJN0nfAwR1iARd7J2Dyi+EaK8VI3B6c0y0 BI+ishh5nIP7qumpVziXz5Iqr9lifGZhH1n6gkO9QmIPeIcLoUO64DIEIlpBtw== DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020e; t=1760549377; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=9gNMHN79jDZMeM2Dx/h4kedzc9Grels48SIn/1YFPyw=; b=LHaZu2bFXlmmcSSfPUZ1Llf/Bmfv7mZickml05TSSWfpHPi0/qbGr483cwq9NuAz3jBISH jX3QbqD+7Bnk7SDg== From: Thomas Gleixner To: LKML Cc: Peter Zijlstra , Gabriele Monaco , Mathieu Desnoyers , Michael Jeanson , Jens Axboe , "Paul E. McKenney" , "Gautham R. 
Shenoy" , Florian Weimer , Tim Chen , TCMalloc Team , Yury Norov Subject: [patch 07/19] cpumask: Introduce cpumask_or_weight() References: <20251015164952.694882104@linutronix.de> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Date: Wed, 15 Oct 2025 19:29:36 +0200 (CEST) Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" CID management OR's two cpumasks and then calculates the weight on the result. That's inefficient as that has to walk the same stuff twice. As this is done with runqueue lock held, there is a real benefit of speeding this up. Provide cpumask_or_weight() and the corresponding bitmap functions which return the weight of the OR result right away. Signed-off-by: Thomas Gleixner Cc: Yury Norov Acked-by: Yury Norov (NVIDIA) --- include/linux/bitmap.h | 15 +++++++++++++++ include/linux/cpumask.h | 16 ++++++++++++++++ lib/bitmap.c | 17 +++++++++++++++++ 3 files changed, 48 insertions(+) --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -45,6 +45,7 @@ struct device; * bitmap_copy(dst, src, nbits) *dst =3D *src * bitmap_and(dst, src1, src2, nbits) *dst =3D *src1 & *src2 * bitmap_or(dst, src1, src2, nbits) *dst =3D *src1 | *src2 + * bitmap_or_weight(dst, src1, src2, nbits) *dst =3D *src1 | *src2. Re= turns Hamming Weight of dst * bitmap_xor(dst, src1, src2, nbits) *dst =3D *src1 ^ *src2 * bitmap_andnot(dst, src1, src2, nbits) *dst =3D *src1 & ~(*src2) * bitmap_complement(dst, src, nbits) *dst =3D ~(*src) @@ -165,6 +166,8 @@ bool __bitmap_and(unsigned long *dst, co const unsigned long *bitmap2, unsigned int nbits); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); +unsigned int __bitmap_or_weight(unsigned long *dst, const unsigned long *b= itmap1, + const unsigned long *bitmap2, unsigned int nbits); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, @@ -338,6 +341,18 @@ void bitmap_or(unsigned long *dst, const } =20 static __always_inline +unsigned int bitmap_or_weight(unsigned long *dst, const unsigned long *src= 1, + const unsigned long *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) { + *dst =3D *src1 | *src2; + return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits)); + } else { + return __bitmap_or_weight(dst, src1, src2, nbits); + } +} + +static __always_inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -729,6 +729,22 @@ void cpumask_or(struct cpumask *dstp, co } =20 /** + * cpumask_or_weight - *dstp =3D *src1p | *src2p and return the weight of = the result + * @dstp: the cpumask result + * @src1p: the first input + * @src2p: the second input + * + * Return: The number of bits set in the resulting cpumask @dstp + */ +static __always_inline +unsigned int cpumask_or_weight(struct cpumask *dstp, const struct cpumask = *src1p, + const struct cpumask *src2p) +{ + return bitmap_or_weight(cpumask_bits(dstp), cpumask_bits(src1p), + cpumask_bits(src2p), small_cpumask_bits); +} + +/** * cpumask_xor - *dstp =3D *src1p ^ *src2p * @dstp: the cpumask result * @src1p: the first input --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -253,6 +253,23 @@ void __bitmap_or(unsigned long *dst, con } EXPORT_SYMBOL(__bitmap_or); =20 +unsigned 
int __bitmap_or_weight(unsigned long *dst, const unsigned long *b= itmap1, + const unsigned long *bitmap2, unsigned int bits) +{ + unsigned int k, w =3D 0; + + for (k =3D 0; k < bits / BITS_PER_LONG; k++) { + dst[k] =3D bitmap1[k] | bitmap2[k]; + w +=3D hweight_long(dst[k]); + } + + if (bits % BITS_PER_LONG) { + dst[k] =3D bitmap1[k] | bitmap2[k]; + w +=3D hweight_long(dst[k] & BITMAP_LAST_WORD_MASK(bits)); + } + return w; +} + void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { From nobody Fri Dec 19 22:01:53 2025 Received: from galois.linutronix.de (Galois.linutronix.de [193.142.43.55]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id F0946341651 for ; Wed, 15 Oct 2025 17:29:40 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=193.142.43.55 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549382; cv=none; b=AHT0FxVdklqnFDgx8z1IQoQaD5VkZaiwvFGrZ2MecMS/IvnGPc7b6c/hDR3TgiVGxTNjOZ9jp6LKR+RDHAxqXHR6PvahSmvCttfr9BZXJBS/aDfa/sOzj3blQ5kCqW4a1PQMefi4C7mKLP4jPa74YQWoYvitIF4U/SZHgdjVBjI= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549382; c=relaxed/simple; bh=MtiEhz0wHdPyjeaAls39ddozfjqipwFqzCW6ZelbBxk=; h=Message-ID:From:To:Cc:Subject:References:MIME-Version: Content-Type:Date; b=ndQQ25sLPy54R+fz3R8A5sJw2TcJ/Ztsyj8M8nJRd5DoPVMOri8k2OUkdDoQINNdyADHbZDD+wknuCix1Dei11tt1BgkLwGazCtH9pOd2OZrWe5yuFzR4j6oBcwanSDn2LtYuJ+5UtS9U0Mjc8G+tURQt9RoRR0V9PANyDDi47U= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de; spf=pass smtp.mailfrom=linutronix.de; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=QLp81ZOL; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=86Qc9Qlh; arc=none smtp.client-ip=193.142.43.55 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linutronix.de Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="QLp81ZOL"; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="86Qc9Qlh" Message-ID: <20251015172834.821167837@linutronix.de> DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1760549379; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=6czr5gj00vNJ5vkTWTx9oZXFl88VrqJbsZtU7Kcm6x8=; b=QLp81ZOLhfcx3sO8rXZq+reWBE5w6cu1zukdwyArqCoq2F8EF0LtIMC1T9c0IOpSwEFYCj uY7Ak+B9O4CLNo9Ki/wzXicYEwEJ8QR1FTvPKUm5iRX+xXfZFh/Mq9TgnqxWsCgQelllaU PobX/6uzdKwD6UE/9XvrD/6p84ITXXK6Yu+9T0zlYrdis0CIvHiIVO+c3xpNZcqiUP3/bN meG08+FpWOSrie98uSGqr9CLskA8zcqKlBD8wCEGqsvyaUbliQKGQZGwlV5wd7l+/0azm0 B7c3d+a9nKV0cEofPXs7lhLFtZkHiigExIgMUJBAtJIrwBBZVvDBp++40/tp8w== DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020e; t=1760549379; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=6czr5gj00vNJ5vkTWTx9oZXFl88VrqJbsZtU7Kcm6x8=; b=86Qc9QlhxN1HtLpCnIgcpVvymUJ3058/OgLX10dTgFqNgA4UUuq36I9GC614w+XwA9ZD8h oUvP0XnOjdzOPrCw== From: 
Thomas Gleixner To: LKML Cc: Peter Zijlstra , Gabriele Monaco , Mathieu Desnoyers , Michael Jeanson , Jens Axboe , "Paul E. McKenney" , "Gautham R. Shenoy" , Florian Weimer , Tim Chen , TCMalloc Team Subject: [patch 08/19] sched/mmcid: Use cpumask_or_weight() References: <20251015164952.694882104@linutronix.de> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Date: Wed, 15 Oct 2025 19:29:38 +0200 (CEST) Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Use cpumask_or_weight() instead of cpumask_or() and cpumask_weight() on the result, which walks the same bitmap twice. Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10409,6 +10409,7 @@ void call_trace_sched_update_nr_running( static inline void mm_update_cpus_allowed(struct mm_struct *mm, const stru= ct cpumask *affmsk) { struct cpumask *mm_allowed; + unsigned int weight; =20 if (!mm || READ_ONCE(mm->mm_cid.nr_cpus_allowed) =3D=3D nr_cpu_ids) return; @@ -10422,8 +10423,8 @@ static inline void mm_update_cpus_allowe if (mm->mm_cid.nr_cpus_allowed =3D=3D nr_cpu_ids) return; mm_allowed =3D mm_cpus_allowed(mm); - cpumask_or(mm_allowed, mm_allowed, affmsk); - WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed)); + weight =3D cpumask_or_weight(mm_allowed, mm_allowed, affmsk); + WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, weight); } =20 void sched_mm_cid_exit_signals(struct task_struct *t) From nobody Fri Dec 19 22:01:53 2025 Received: from galois.linutronix.de (Galois.linutronix.de [193.142.43.55]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 34E612DFF18 for ; Wed, 15 Oct 2025 17:29:43 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=193.142.43.55 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549384; cv=none; b=cS6XJqzsMrUKwpC/BpXKQP1rrh21VNnXG5O0L5AlIfmCwZDqQ2VEnjxheYmQJN6biuHeBrvtfioqHlA0Agy3NMtMfzh+h7u+hVkbRzRp218yNZA8cDTRuFZV7js8lsijiYuKJ7/1Uv4VcGfE24tYC2rkNH6JRhvc+3IjF9mqkcI= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549384; c=relaxed/simple; bh=sTLgufpAYUrvdEdtY1yc8s8j7BbqzgMcwvNAxTHLyVc=; h=Message-ID:From:To:Cc:Subject:References:MIME-Version: Content-Type:Date; b=dLdkrrGkUXaLoLnPXpQvkMZoKhJ514zmg3AhRjhikXYKP6URayMJjVpH4OPJ3NUwH32tb1mZDDF16otgtivywGY2IGFRBodZ7JvAYMRE0Zo5MTWLtnFbMZlW1eT1tfOASX3dybGEjccqVpw2fCDs0AMsINwQ0HymDU/IcKp+jFc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de; spf=pass smtp.mailfrom=linutronix.de; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=uF267Tt0; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=IOtW3iuq; arc=none smtp.client-ip=193.142.43.55 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linutronix.de Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="uF267Tt0"; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="IOtW3iuq" Message-ID: <20251015172834.884261347@linutronix.de> 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1760549381; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=J1lzC60maM+Wx9z41/PXq87Ndq3QPPxpl8N3vVQkT6I=; b=uF267Tt08mTkpaw74/xmNRIpLE8jkOk80iD6UjTDTNojLhDU1Snz5kUDAyHBscsIdF2BQG I469CNdNeaThz3ZRo35g6gwUsK/+sA6aGe1dcukLi9ED44P6+s6s2cO+bRCwj+oSpPwXvA poFrH4lS8kKjI8tnlKjVfQamdIGB4mbpY/JKhBWeQouLN5o97Bo8GgDnZ/HEQgbfukRG+h 1gCxkCbcg29e155Q5fN7vrAZAznMI1R+SahFwAiKLgz0skBaBRHUQb01ecirGQdOPUpv5r PrUFjEP4D2ZCrvLm/oUHevZd/SYO1zSP2H8N3/ZJSVmkVOnWk7J6Pcfxn1m94Q== DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020e; t=1760549381; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=J1lzC60maM+Wx9z41/PXq87Ndq3QPPxpl8N3vVQkT6I=; b=IOtW3iuqj/wfXckQrMldoEVfffwocb384cJcY6QXHjpsgd7BqKZCNkCtbSAF3bem+2W0l+ To09Gsrl0CwUusDw== From: Thomas Gleixner To: LKML Cc: Peter Zijlstra , Gabriele Monaco , Mathieu Desnoyers , Michael Jeanson , Jens Axboe , "Paul E. McKenney" , "Gautham R. Shenoy" , Florian Weimer , Tim Chen , TCMalloc Team Subject: [patch 09/19] sched/mmcid: Convert mm CID mask to a bitmap References: <20251015164952.694882104@linutronix.de> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Date: Wed, 15 Oct 2025 19:29:40 +0200 (CEST) Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" This is truly a bitmap and just conveniently uses a cpumask because the maximum size of the bitmap is nr_cpu_ids. But that prevents to do searches for a zero bit in a limited range, which is helpful to provide an efficient mechanism to consolidate the CID space when the number of users decreases. Signed-off-by: Thomas Gleixner --- include/linux/mm_types.h | 6 +++--- kernel/sched/core.c | 2 +- kernel/sched/sched.h | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1296,13 +1296,13 @@ static inline cpumask_t *mm_cpus_allowed } =20 /* Accessor for struct mm_struct's cidmask. 
*/ -static inline cpumask_t *mm_cidmask(struct mm_struct *mm) +static inline unsigned long *mm_cidmask(struct mm_struct *mm) { unsigned long cid_bitmap =3D (unsigned long)mm_cpus_allowed(mm); =20 /* Skip mm_cpus_allowed */ cid_bitmap +=3D cpumask_size(); - return (struct cpumask *)cid_bitmap; + return (unsigned long *)cid_bitmap; } =20 static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) @@ -1317,7 +1317,7 @@ static inline void mm_init_cid(struct mm mm->mm_cid.nr_cpus_allowed =3D p->nr_cpus_allowed; raw_spin_lock_init(&mm->mm_cid.lock); cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); - cpumask_clear(mm_cidmask(mm)); + bitmap_zero(mm_cidmask(mm), nr_cpu_ids); } =20 static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_st= ruct *p) --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10437,7 +10437,7 @@ void sched_mm_cid_exit_signals(struct ta guard(preempt)(); t->mm_cid.active =3D 0; if (t->mm_cid.cid !=3D MM_CID_UNSET) { - cpumask_clear_cpu(t->mm_cid.cid, mm_cidmask(mm)); + clear_bit(t->mm_cid.cid, mm_cidmask(mm)); t->mm_cid.cid =3D MM_CID_UNSET; } } --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3553,7 +3553,7 @@ static inline bool __mm_cid_get(struct t =20 if (cid >=3D max_cids) return false; - if (cpumask_test_and_set_cpu(cid, mm_cidmask(mm))) + if (test_and_set_bit(cid, mm_cidmask(mm))) return false; t->mm_cid.cid =3D t->mm_cid.last_cid =3D cid; __this_cpu_write(mm->mm_cid.pcpu->cid, cid); @@ -3576,7 +3576,7 @@ static inline bool mm_cid_get(struct tas return true; =20 /* Try the first zero bit in the cidmask. */ - return __mm_cid_get(t, cpumask_first_zero(mm_cidmask(mm)), max_cids); + return __mm_cid_get(t, find_first_zero_bit(mm_cidmask(mm), nr_cpu_ids), m= ax_cids); } =20 static inline void mm_cid_select(struct task_struct *t) @@ -3597,7 +3597,7 @@ static inline void switch_mm_cid(struct { if (prev->mm_cid.active) { if (prev->mm_cid.cid !=3D MM_CID_UNSET) - cpumask_clear_cpu(prev->mm_cid.cid, mm_cidmask(prev->mm)); + clear_bit(prev->mm_cid.cid, mm_cidmask(prev->mm)); prev->mm_cid.cid =3D MM_CID_UNSET; } From nobody Fri Dec 19 22:01:53 2025 Received: from galois.linutronix.de (Galois.linutronix.de [193.142.43.55]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2ACBD341AAE for ; Wed, 15 Oct 2025 17:29:44 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=193.142.43.55 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549386; cv=none; b=VsBhBJwsSN6zMJYeegnb1/yXS/xkOBVvJpAidSbqHXYdLrlafP5oPuqXon/BrvVwwWY14lkyyUlL7tI9mmnUe3evINJ0MsQMyobRFOV5JI/qzx9XcbBezuE8GGKweAAm3jXJ6202vp1gdQ2bO9FT5XHhxFwd5rIGjmLPM0gcVM8= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549386; c=relaxed/simple; bh=LGC5d2CwjKxGi8CiLe0Iy8t7nBfUegW3c4jMF3VjwWw=; h=Message-ID:From:To:Cc:Subject:References:MIME-Version: Content-Type:Date; b=ORQLwHxvhvzeX83Hh/IaaeZEbssBtJ7/JY7pL2zCtbuhMTIttFgH79eR/2CYxPN1j4g2Kr4ZpEH5e+s+5+bKM7cNE3v4WstU4Su9wOcUUcuKHJCIePeZv/BPP7lLl1PlSYJLTuICTxLzx1XWPkLamPS75mwclBglPIFUbUsyceg= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de; spf=pass smtp.mailfrom=linutronix.de; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=zNrdfZRd; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=oUbNZge3; arc=none 
smtp.client-ip=193.142.43.55 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linutronix.de Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="zNrdfZRd"; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="oUbNZge3" Message-ID: <20251015172834.947237114@linutronix.de> DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1760549383; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=MlMcgpBHCcOIkGa/6GuJAY6BtPSrUqB+AgtqLnUK3s0=; b=zNrdfZRdKvjQReDv1ILALmNN0nK+Bf9BmhnWcVZ8Mo8QXTZlSsHbng5bUFRk8W9lHxerA3 lcJ9Jc+VnElsdxSz1sH/hz0E31sU/ad58SZdRFHNwv2EwiuU70DhAnxl+s9kI04JsZk5q7 u5QOirV+2BWvNR8HDZjGzod3P0fwxw67MuJ1FBKWaLqOeQsYmT+GlwjpR8y1gMscYWQHcL 8yGnytGkqPnvWQwdRSXOXFqBxzycuEZXCx1bwirQe/vEvYRsXOzBAwzyFFfE3JQy4wZvIQ k/FtNnZBsRWwTn8s6CCZhs6no5vSIVl86djD90f9f+bfUvGr45iJ38be1aG4Vw== DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020e; t=1760549383; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=MlMcgpBHCcOIkGa/6GuJAY6BtPSrUqB+AgtqLnUK3s0=; b=oUbNZge3nzVM2C81f9lLJcuerlZh9f44RG01tEv6G5DSg+px9CdGtrhpQ39ffiG7dTttE5 sotf+t2OzI9swQBg== From: Thomas Gleixner To: LKML Cc: Peter Zijlstra , Gabriele Monaco , Mathieu Desnoyers , Michael Jeanson , Jens Axboe , "Paul E. McKenney" , "Gautham R. Shenoy" , Florian Weimer , Tim Chen , TCMalloc Team Subject: [patch 10/19] signal: Move MMCID exit out of sighand lock References: <20251015164952.694882104@linutronix.de> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Date: Wed, 15 Oct 2025 19:29:42 +0200 (CEST) Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" There is no need anymore to keep this under sighand lock as the current code and the upcoming replacement are not depending on the exit state of a task anymore. That allows to use a mutex in the exit path. 
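For illustration only, not part of the patch: one of the removed call sites ran under sighand->siglock, a spinlock taken with interrupts disabled, where sleeping locks are forbidden. Invoking the exit hook from do_exit() instead keeps it in plain preemptible task context, which is what permits the mutex added later in this series. A compile-only sketch of the resulting call order, using stub declarations rather than the real kernel prototypes:

/* Stub declarations for the sketch; not the kernel headers */
struct task_struct;
void sched_mm_cid_exit(struct task_struct *tsk);	/* may sleep once it takes the mm_cid mutex */
void exit_signals(struct task_struct *tsk);		/* acquires sighand->siglock internally */

static void do_exit_order_sketch(struct task_struct *tsk)
{
	/* Plain preemptible task context: sleeping locks are allowed here */
	sched_mm_cid_exit(tsk);
	/* The spinlocked PF_EXITING section follows afterwards */
	exit_signals(tsk);
}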
Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 4 ++-- kernel/exit.c | 1 + kernel/sched/core.c | 4 ++-- kernel/signal.c | 2 -- 4 files changed, 5 insertions(+), 6 deletions(-) --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2290,7 +2290,7 @@ static __always_inline void alloc_tag_re void sched_mm_cid_before_execve(struct task_struct *t); void sched_mm_cid_after_execve(struct task_struct *t); void sched_mm_cid_fork(struct task_struct *t); -void sched_mm_cid_exit_signals(struct task_struct *t); +void sched_mm_cid_exit(struct task_struct *t); static inline int task_mm_cid(struct task_struct *t) { return t->mm_cid.cid; @@ -2299,7 +2299,7 @@ static inline int task_mm_cid(struct tas static inline void sched_mm_cid_before_execve(struct task_struct *t) { } static inline void sched_mm_cid_after_execve(struct task_struct *t) { } static inline void sched_mm_cid_fork(struct task_struct *t) { } -static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } +static inline void sched_mm_cid_exit(struct task_struct *t) { } static inline int task_mm_cid(struct task_struct *t) { /* --- a/kernel/exit.c +++ b/kernel/exit.c @@ -905,6 +905,7 @@ void __noreturn do_exit(long code) user_events_exit(tsk); =20 io_uring_files_cancel(); + sched_mm_cid_exit(tsk); exit_signals(tsk); /* sets PF_EXITING */ =20 seccomp_filter_release(tsk); --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10427,7 +10427,7 @@ static inline void mm_update_cpus_allowe WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, weight); } =20 -void sched_mm_cid_exit_signals(struct task_struct *t) +void sched_mm_cid_exit(struct task_struct *t) { struct mm_struct *mm =3D t->mm; =20 @@ -10445,7 +10445,7 @@ void sched_mm_cid_exit_signals(struct ta /* Deactivate MM CID allocation across execve() */ void sched_mm_cid_before_execve(struct task_struct *t) { - sched_mm_cid_exit_signals(t); + sched_mm_cid_exit(t); } =20 /* Reactivate MM CID after successful execve() */ --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3125,7 +3125,6 @@ void exit_signals(struct task_struct *ts cgroup_threadgroup_change_begin(tsk); =20 if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) { - sched_mm_cid_exit_signals(tsk); tsk->flags |=3D PF_EXITING; cgroup_threadgroup_change_end(tsk); return; @@ -3136,7 +3135,6 @@ void exit_signals(struct task_struct *ts * From now this task is not visible for group-wide signals, * see wants_signal(), do_signal_stop(). 
*/ - sched_mm_cid_exit_signals(tsk); tsk->flags |=3D PF_EXITING; =20 cgroup_threadgroup_change_end(tsk); From nobody Fri Dec 19 22:01:53 2025 Received: from galois.linutronix.de (Galois.linutronix.de [193.142.43.55]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id C967C341AC0 for ; Wed, 15 Oct 2025 17:29:46 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=193.142.43.55 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549388; cv=none; b=lAmXJgEbMadBgHhuKH2YWLIH1TPvoZOqihwBdsmKfsSnbfjk3/gq0U0OQY4NoJaOM/QY7e6x/rp/i7VDW5VZX5pn7TZEGZzsdVvXnwiMdL3p5vM5nhMWUBZd5xcjZ3EfG41gItcRW2CvPc9zHY4u+M41wA5auzVVtESEq90/B0Y= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549388; c=relaxed/simple; bh=Cn+g8jVsk1IyAMUDj5C16SswRIn6DXyX19WzMjIUxxw=; h=Message-ID:From:To:Cc:Subject:References:MIME-Version: Content-Type:Date; b=svZNQg0LDEwBIsyl653rh+WTv+cMv+0kv1zVaXHHVwUTzuRolyFERFjWP7lumwNnXXqZFry80/UuziTdJHSrOFqSEaBWWuox4SgAPMpusU2Nu1I2cGyS743qLCAbu/l8vZ43uJDzfU8A6fkRPWZJ1G7/9zqAZJ2ng3qIAF8O6I0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de; spf=pass smtp.mailfrom=linutronix.de; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=r7yvOYw4; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=DyrnmOso; arc=none smtp.client-ip=193.142.43.55 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linutronix.de Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="r7yvOYw4"; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="DyrnmOso" Message-ID: <20251015172835.007019151@linutronix.de> DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1760549385; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=pRibfw9Eka6K7CsuhEpXbYgthEZSPRccLhsB23qNyDs=; b=r7yvOYw46RcC5QB5+Fssn3NGecry4hLi5BRIrJuDR9gzCwix6B3BpXW54X89Fz6yfYmvym HoBoMPWlZhmNUC8p0VMvx1IosWXEjg7x1WfQhWrIm4q8EirvvnV3ppr+na/U/+Nixnhnba jPJAsTTqlNNhvHGD+lD9HTFQPF7Bbwn6DVhYWjRqXW/myjcALbRY2V07wCc0wFU9pVqDT2 8seZKx4ebqGKu3h6Ll/UuhAoQQTSGXQ2ESzI+o6PcpRdCh0mIt1j+ljpj1mQ/XZ3tgN3cN JH0PknE6m8SWB5uHmXuP0B8SUizE1ONm/e2OMw6kIWyGcwM1EV/uLoKTaGNWsg== DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020e; t=1760549385; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=pRibfw9Eka6K7CsuhEpXbYgthEZSPRccLhsB23qNyDs=; b=DyrnmOsovFnCwhu6gad5RkDbek6u3gN+j9jRFFSStl5SQMODBYlzSq085SR/I2uad19/MN 3uyylJuM+xhgQAAQ== From: Thomas Gleixner To: LKML Cc: Peter Zijlstra , Gabriele Monaco , Mathieu Desnoyers , Michael Jeanson , Jens Axboe , "Paul E. McKenney" , "Gautham R. 
Shenoy" , Florian Weimer , Tim Chen , TCMalloc Team Subject: [patch 11/19] sched/mmcid: Move initialization out of line References: <20251015164952.694882104@linutronix.de> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Date: Wed, 15 Oct 2025 19:29:44 +0200 (CEST) Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" It's getting bigger soon, so just move it out of line to the rest of the code. Signed-off-by: Thomas Gleixner --- include/linux/mm_types.h | 15 +-------------- kernel/sched/core.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 14 deletions(-) --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1305,20 +1305,7 @@ static inline unsigned long *mm_cidmask( return (unsigned long *)cid_bitmap; } =20 -static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) -{ - int i; - - for_each_possible_cpu(i) { - struct mm_cid_pcpu *pcpu =3D per_cpu_ptr(mm->mm_cid.pcpu, i); - - pcpu->cid =3D MM_CID_UNSET; - } - mm->mm_cid.nr_cpus_allowed =3D p->nr_cpus_allowed; - raw_spin_lock_init(&mm->mm_cid.lock); - cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); - bitmap_zero(mm_cidmask(mm), nr_cpu_ids); -} +void mm_init_cid(struct mm_struct *mm, struct task_struct *p); =20 static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_st= ruct *p) { --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10466,6 +10466,20 @@ void sched_mm_cid_fork(struct task_struc WARN_ON_ONCE(!t->mm || t->mm_cid.cid !=3D MM_CID_UNSET); t->mm_cid.active =3D 1; } + +void mm_init_cid(struct mm_struct *mm, struct task_struct *p) +{ + struct mm_cid_pcpu __percpu *pcpu =3D mm->mm_cid.pcpu; + int cpu; + + for_each_possible_cpu(cpu) + per_cpu_ptr(pcpu, cpu)->cid =3D MM_CID_UNSET; + + mm->mm_cid.nr_cpus_allowed =3D p->nr_cpus_allowed; + raw_spin_lock_init(&mm->mm_cid.lock); + cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); + bitmap_zero(mm_cidmask(mm), nr_cpu_ids); +} #else /* CONFIG_SCHED_MM_CID */ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const stru= ct cpumask *affmsk) { } #endif /* !CONFIG_SCHED_MM_CID */ From nobody Fri Dec 19 22:01:53 2025 Received: from galois.linutronix.de (Galois.linutronix.de [193.142.43.55]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 1D97F342C81 for ; Wed, 15 Oct 2025 17:29:48 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=193.142.43.55 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549391; cv=none; b=q9iB+VHNA7FUV/ipOYwKXb7VpfaUcrF7iQ8RMNFWANc825xIJHnkfMnNL1KdEeU8zmOEDgG562+IjoQx3iONN0NYbmMCbaAMob/3+g++hOv8yYdc8cpDf6jK5ZRhu6jYBiSKLvCeYDBCUjud+Z8T98gvP6Gstx+3JPZiDvDffEs= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549391; c=relaxed/simple; bh=JTJkJjiUOE8/izhGx/Wm+O58YQgBD7ZsC87IE88+cMg=; h=Message-ID:From:To:Cc:Subject:References:MIME-Version: Content-Type:Date; b=SVg9C9/oPMrPP4q66+A86x7S5Oa527yb6wrASgskrC7JHHYdw8j9DIHnpL+43jZWoq9AMLl8IhGAf8RITmtO0DyZIRE58fykAh4vHy4Tahwm3WpWY80e/aZRsJlI7mlBlzx8LrkLBa4ESlQKUI6fyD3KACG9K39VfqLIZrMlDEY= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de; spf=pass smtp.mailfrom=linutronix.de; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=432gjQMG; dkim=permerror (0-bit key) 
header.d=linutronix.de header.i=@linutronix.de header.b=J+48rji1; arc=none smtp.client-ip=193.142.43.55 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linutronix.de Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="432gjQMG"; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="J+48rji1" Message-ID: <20251015172835.068114686@linutronix.de> DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1760549387; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=ov/K9g3aVE2nIBgc7w2Mu8zuwzO5UAbpBaMU43faj8c=; b=432gjQMG6B14rpa7HPhBkhHnflgxs2aVrcXFyj7cwhYXpmtpuef8IcZjzF61+Uds9COLVt zETUIuzHGIYSuBNxNSgde695L4C4xtVn+Bke4kkeUy+7BwoW4+Pk6H3MVa/IHYG6KFqWsF hSEGmjyhBeakj40KUIuhw1OsKI3UO3zcH+Nhb3FgzuuAvqZcWwnUGkKC2XHDQDLDRpyG3V bwQFuGDbZMTiBWR4JXPmEwwOE7uKHUeDzocaeRDVYYBDcYazbMXV3/5/Buee/R7V1e/1dx gaMBF7sGC4TxHVwsXJPG9SlWHpyM2aEemEbN4vtTNUwJjUxsKiEe2W2zwLjYLg== DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020e; t=1760549387; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=ov/K9g3aVE2nIBgc7w2Mu8zuwzO5UAbpBaMU43faj8c=; b=J+48rji1XS5OnpW4W4EPH+CIgdIqxW7JTlDzUvueuZgRbCpU0dzb9jYtAm3gOAH8WsllIm oOfF93CZm50Yx8BA== From: Thomas Gleixner To: LKML Cc: Peter Zijlstra , Gabriele Monaco , Mathieu Desnoyers , Michael Jeanson , Jens Axboe , "Paul E. McKenney" , "Gautham R. Shenoy" , Florian Weimer , Tim Chen , TCMalloc Team Subject: [patch 12/19] sched/mmcid: Provide precomputed maximal value References: <20251015164952.694882104@linutronix.de> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Date: Wed, 15 Oct 2025 19:29:46 +0200 (CEST) Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Reading mm::mm_users and mm:::mm_cid::nr_cpus_allowed everytime to compute the maximal CID value is just wasteful as that value is only changing on fork(), exit() and eventually when the affinity changes. So it can be easily precomputed at those points and provided in mm::mm_cid for consumption in the hot path. But there is an issue with using mm::mm_users for accounting because that does not necessarily reflect the number of user space tasks as other kernel code can take temporary references on the MM which skew the picture. Solve that by adding a users counter to struct mm_mm_cid, which is modified by fork() and exit() and used for precomputing under mm_mm_cid::lock. Signed-off-by: Thomas Gleixner --- include/linux/rseq_types.h | 6 ++++ kernel/fork.c | 1=20 kernel/sched/core.c | 66 ++++++++++++++++++++++++++++++++--------= ----- kernel/sched/sched.h | 3 -- 4 files changed, 56 insertions(+), 20 deletions(-) --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -117,14 +117,20 @@ struct mm_cid_pcpu { /** * struct mm_mm_cid - Storage for per MM CID data * @pcpu: Per CPU storage for CIDs associated to a CPU + * @max_cids: The exclusive maximum CID value for allocation and converga= nce * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. Th= e map * is growth only. 
+ * @users: The number of tasks sharing this MM. Seperate from mm::mm_users + * as that is modified by mmget()/mm_put() by other entities which + * do not actually share the MM. * @lock: Spinlock to protect all fields except @pcpu. It also protects * the MM cid cpumask and the MM cidmask bitmap. */ struct mm_mm_cid { struct mm_cid_pcpu __percpu *pcpu; + unsigned int max_cids; unsigned int nr_cpus_allowed; + unsigned int users; raw_spinlock_t lock; }____cacheline_aligned_in_smp; #else /* CONFIG_SCHED_MM_CID */ --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2449,6 +2449,7 @@ static bool need_futex_hash_allocate_def exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) { + sched_mm_cid_exit(p); mm_clear_owner(p->mm, p); mmput(p->mm); } --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4517,7 +4517,6 @@ static void __sched_fork(unsigned long c init_numa_balancing(clone_flags, p); p->wake_entry.u_flags =3D CSD_TYPE_TTWU; p->migration_pending =3D NULL; - init_sched_mm_cid(p); } =20 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -10403,15 +10402,32 @@ void call_trace_sched_update_nr_running( =20 #ifdef CONFIG_SCHED_MM_CID /* - * When a task exits, the MM CID held by the task is not longer required as - * the task cannot return to user space. + * Update the CID range properties when the constraints change. Invoked via + * fork(), exit() and affinity changes */ +static void mm_update_max_cids(struct mm_struct *mm) +{ + struct mm_mm_cid *mc =3D &mm->mm_cid; + unsigned int max_cids; + + lockdep_assert_held(&mm->mm_cid.lock); + + /* Calculate the new maximum constraint */ + max_cids =3D min(mc->nr_cpus_allowed, mc->users); + WRITE_ONCE(mc->max_cids, max_cids); +} + static inline void mm_update_cpus_allowed(struct mm_struct *mm, const stru= ct cpumask *affmsk) { struct cpumask *mm_allowed; unsigned int weight; =20 - if (!mm || READ_ONCE(mm->mm_cid.nr_cpus_allowed) =3D=3D nr_cpu_ids) + /* + * Nothing to do when the mask is already maxed out or the user + * count dropped to zero. + */ + if (!mm || READ_ONCE(mm->mm_cid.nr_cpus_allowed) =3D=3D nr_cpu_ids || + !READ_ONCE(mm->mm_cid.users)) return; =20 /* @@ -10420,13 +10436,34 @@ static inline void mm_update_cpus_allowe */ guard(raw_spinlock)(&mm->mm_cid.lock); /* Check again under the lock */ - if (mm->mm_cid.nr_cpus_allowed =3D=3D nr_cpu_ids) + if (mm->mm_cid.nr_cpus_allowed =3D=3D nr_cpu_ids || !mm->mm_cid.users) return; mm_allowed =3D mm_cpus_allowed(mm); weight =3D cpumask_or_weight(mm_allowed, mm_allowed, affmsk); + if (weight =3D=3D mm->mm_cid.nr_cpus_allowed) + return; WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, weight); + mm_update_max_cids(mm); } =20 +void sched_mm_cid_fork(struct task_struct *t) +{ + struct mm_struct *mm =3D t->mm; + + WARN_ON_ONCE(!mm || t->mm_cid.cid !=3D MM_CID_UNSET); + + guard(raw_spinlock)(&mm->mm_cid.lock); + t->mm_cid.active =3D 1; + mm->mm_cid.users++; + /* Preset last_cid for mm_cid_select() */ + t->mm_cid.last_cid =3D READ_ONCE(mm->mm_cid.max_cids) - 1; + mm_update_max_cids(mm); +} + +/* + * When a task exits, the MM CID held by the task is not longer required as + * the task cannot return to user space. 
+ */ void sched_mm_cid_exit(struct task_struct *t) { struct mm_struct *mm =3D t->mm; @@ -10434,12 +10471,14 @@ void sched_mm_cid_exit(struct task_struc if (!mm || !t->mm_cid.active) return; =20 - guard(preempt)(); + guard(raw_spinlock)(&mm->mm_cid.lock); t->mm_cid.active =3D 0; + mm->mm_cid.users--; if (t->mm_cid.cid !=3D MM_CID_UNSET) { clear_bit(t->mm_cid.cid, mm_cidmask(mm)); t->mm_cid.cid =3D MM_CID_UNSET; } + mm_update_max_cids(mm); } =20 /* Deactivate MM CID allocation across execve() */ @@ -10451,22 +10490,11 @@ void sched_mm_cid_before_execve(struct t /* Reactivate MM CID after successful execve() */ void sched_mm_cid_after_execve(struct task_struct *t) { - struct mm_struct *mm =3D t->mm; - - if (!mm) - return; - + sched_mm_cid_fork(t); guard(preempt)(); - t->mm_cid.active =3D 1; mm_cid_select(t); } =20 -void sched_mm_cid_fork(struct task_struct *t) -{ - WARN_ON_ONCE(!t->mm || t->mm_cid.cid !=3D MM_CID_UNSET); - t->mm_cid.active =3D 1; -} - void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { struct mm_cid_pcpu __percpu *pcpu =3D mm->mm_cid.pcpu; @@ -10475,7 +10503,9 @@ void mm_init_cid(struct mm_struct *mm, s for_each_possible_cpu(cpu) per_cpu_ptr(pcpu, cpu)->cid =3D MM_CID_UNSET; =20 + mm->mm_cid.max_cids =3D 0; mm->mm_cid.nr_cpus_allowed =3D p->nr_cpus_allowed; + mm->mm_cid.users =3D 0; raw_spin_lock_init(&mm->mm_cid.lock); cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); bitmap_zero(mm_cidmask(mm), nr_cpu_ids); --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3565,7 +3565,7 @@ static inline bool mm_cid_get(struct tas struct mm_struct *mm =3D t->mm; unsigned int max_cids; =20 - max_cids =3D min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_rea= d(&mm->mm_users)); + max_cids =3D READ_ONCE(mm->mm_cid.max_cids); =20 /* Try to reuse the last CID of this task */ if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids)) @@ -3608,7 +3608,6 @@ static inline void switch_mm_cid(struct } =20 #else /* !CONFIG_SCHED_MM_CID: */ -static inline void init_sched_mm_cid(struct task_struct *t) { } static inline void mm_cid_select(struct task_struct *t) { } static inline void switch_mm_cid(struct task_struct *prev, struct task_str= uct *next) { } #endif /* !CONFIG_SCHED_MM_CID */ From nobody Fri Dec 19 22:01:53 2025 Received: from galois.linutronix.de (Galois.linutronix.de [193.142.43.55]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 15EFD342C97 for ; Wed, 15 Oct 2025 17:29:50 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=193.142.43.55 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549393; cv=none; b=rPQhYDFfpEJX8f0jYR3WqjuvBvNdlbfXWlYDhgocJ3haIuqqkoZdVHFUjwJdtMdYpk1T9RLol2rN6yg20YNRisP9fZmylm6JZvx+G1cZgtvWCSx0zTuvlnppsxCYJhR1/QijRl7W21hwTOCbFa0Izil6dS1ziYcf4NG34yKPOvU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549393; c=relaxed/simple; bh=WiNcuGyRY58l9VyXAdWrg1hbcNj6k2rznNOlyX0sMjU=; h=Message-ID:From:To:Cc:Subject:References:MIME-Version: Content-Type:Date; b=R4LgNHGoMgqmvJMN+/e52L2yoUUC5rd3be/cdw4BUzTZzxivLnUbJiOtwv/NJ3K+swjJHNIQsBMXA/DpIgXlnoMWklpWaoGn0jN+KD2fDZU+PsYQAePgVDlFmuhZbPM05oD7aG6EYZ6PiXAK73z8KjlItKQuYuxv4gkIAlenbeE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de; spf=pass smtp.mailfrom=linutronix.de; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de 
header.b=pLq91Kwj; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=6FmyS+0A; arc=none smtp.client-ip=193.142.43.55 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linutronix.de Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="pLq91Kwj"; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="6FmyS+0A" Message-ID: <20251015172835.128647487@linutronix.de> DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1760549389; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=rI5tG705H4SVQi23Z67G4+IXkytvRxi+xhu0kDxKPO4=; b=pLq91Kwj9DXG1V40UJ5gHhOOeT51cCviGZEWBcH1TcK/nxc8A79locLz6qOA619nICAcoN MzTwZk+cKM/nhREFKUPOPIcjVooP+Tym12/KS5VHyGDDdV8alLSMikFsZPRR2i244fk4c7 lNXi9JKY4cJgvdhlWYlmuNEvyE2WaUf5lSO3R0fL41YsvKnI8e3hlh08VuO7kRKn7sHDsz pJ3yQxsP/FfNnEozHw38SKJ1ZzY9WYY+MxCWGEGLNcXBYlprGh85YAghA3JKeUkPi7WREN CAJ3LQU9vzwQMMXrjsKTZ6bGyuznZFfvuImjvKRM8tDImkfC6GMvikmkm6lROQ== DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020e; t=1760549389; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=rI5tG705H4SVQi23Z67G4+IXkytvRxi+xhu0kDxKPO4=; b=6FmyS+0AQHRjpSpI1xU2nw7bTft2/CyFwgFTcSAIdH5lZeel57svPRwZhCODB7FM6jPJfR TEEzmPY/lvm7XJCg== From: Thomas Gleixner To: LKML Cc: Peter Zijlstra , Gabriele Monaco , Mathieu Desnoyers , Michael Jeanson , Jens Axboe , "Paul E. McKenney" , "Gautham R. Shenoy" , Florian Weimer , Tim Chen , TCMalloc Team Subject: [patch 13/19] sched/mmcid: Serialize sched_mm_cid_fork()/exit() with a mutex References: <20251015164952.694882104@linutronix.de> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Date: Wed, 15 Oct 2025 19:29:48 +0200 (CEST) Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Prepare for the new CID management scheme which puts the CID ownership transition into the fork() and exit() slow path by serializing sched_mm_cid_fork()/exit() with it, so task list and cpu mask walks can be done in interruptible and preemptible code. The contention on it is not worse than on other concurrency controls in the fork()/exit() machinery. Signed-off-by: Thomas Gleixner --- include/linux/rseq_types.h | 2 ++ kernel/sched/core.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -125,6 +125,7 @@ struct mm_cid_pcpu { * do not actually share the MM. * @lock: Spinlock to protect all fields except @pcpu. It also protects * the MM cid cpumask and the MM cidmask bitmap. 
+ * @mutex: Mutex to serialize forks and exits related to this mm */ struct mm_mm_cid { struct mm_cid_pcpu __percpu *pcpu; @@ -132,6 +133,7 @@ struct mm_mm_cid { unsigned int nr_cpus_allowed; unsigned int users; raw_spinlock_t lock; + struct mutex mutex; }____cacheline_aligned_in_smp; #else /* CONFIG_SCHED_MM_CID */ struct mm_cid { }; --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10402,6 +10402,25 @@ void call_trace_sched_update_nr_running( =20 #ifdef CONFIG_SCHED_MM_CID /* + * Concurrency IDentifier management + * + * Serialization rules: + * + * mm::mm_cid::mutex: Serializes fork() and exit() and therefore + * protects mm::mm_cid::users. + * + * mm::mm_cid::lock: Serializes mm_update_max_cids() and + * mm_update_cpus_allowed(). Nests in mm_cid::mutex + * and runqueue lock. + * + * The mm_cidmask bitmap is not protected by any of the mm::mm_cid locks + * and can only be modified with atomic operations. + * + * The mm::mm_cid:pcpu per CPU storage is protected by the CPUs runqueue + * lock. + */ + +/* * Update the CID range properties when the constraints change. Invoked via * fork(), exit() and affinity changes */ @@ -10452,6 +10471,7 @@ void sched_mm_cid_fork(struct task_struc =20 WARN_ON_ONCE(!mm || t->mm_cid.cid !=3D MM_CID_UNSET); =20 + guard(mutex)(&mm->mm_cid.mutex); guard(raw_spinlock)(&mm->mm_cid.lock); t->mm_cid.active =3D 1; mm->mm_cid.users++; @@ -10471,6 +10491,7 @@ void sched_mm_cid_exit(struct task_struc if (!mm || !t->mm_cid.active) return; =20 + guard(mutex)(&mm->mm_cid.mutex); guard(raw_spinlock)(&mm->mm_cid.lock); t->mm_cid.active =3D 0; mm->mm_cid.users--; @@ -10507,6 +10528,7 @@ void mm_init_cid(struct mm_struct *mm, s mm->mm_cid.nr_cpus_allowed =3D p->nr_cpus_allowed; mm->mm_cid.users =3D 0; raw_spin_lock_init(&mm->mm_cid.lock); + mutex_init(&mm->mm_cid.mutex); cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); bitmap_zero(mm_cidmask(mm), nr_cpu_ids); } From nobody Fri Dec 19 22:01:53 2025 Received: from galois.linutronix.de (Galois.linutronix.de [193.142.43.55]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id BCFF0342C9E for ; Wed, 15 Oct 2025 17:29:53 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=193.142.43.55 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549395; cv=none; b=bZJTcIkgxpoLUKQAObD2px76omA7ZM0KLKhZhV9qwjAYy20/vPryIG0eEkoaEX/w1n2ylIQkjmLnLjSPZ3AWXqbNu1we2B2PcborW1gJH37Jg0TAHdtEYWFQUmcEOuH99dLzSGKFv8xy/c3BOWGrMyh+MNkAmqnXWxSyQc2qC8Y= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549395; c=relaxed/simple; bh=5D0C1mmQsDAxcuzxoXCEVjtkgh+jZcBXlKnMVVKR6tQ=; h=Message-ID:From:To:Cc:Subject:References:MIME-Version: Content-Type:Date; b=SnoSlPpFSyUBuqiQ30ai45r1sViFH1FAUdTu+w6eZ3vEoPGGrp7ljdCRM2UpyXSSC20aoNSqJsqom9QLJYX5YXOZAJo4SdtABq5NBGYPVVrur3iXX+Dijt4i01c1j+G+raOXWA3Adjbvh6J566+5t7m8GuMR4VVDdhzQXTpmvak= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de; spf=pass smtp.mailfrom=linutronix.de; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=KNqrQDHS; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=FaGmoJnP; arc=none smtp.client-ip=193.142.43.55 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de Authentication-Results: smtp.subspace.kernel.org; spf=pass 
smtp.mailfrom=linutronix.de Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="KNqrQDHS"; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="FaGmoJnP" Message-ID: <20251015172835.190173603@linutronix.de> DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1760549391; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=pFVciMXUyxyULBxl1v3fLMw6h0CcuKGrvokqooQX3wk=; b=KNqrQDHS2yqMqDY+mjoAjK8spHSOnJiyAQBfOZI+u5p9TDuVC4QnfAw4XKJa59Nd1VquVr 8s5bWtUXl45zp/XvIkLVjgI/lLyi1nuMAUxPjfyjwATRbZQvqBMPKMIjne308mewrfLzj0 EVYfH3CF5/ebMW1A5Mbd4fBPdAvqnCj27keybyH6CQ/TPNNWJxYAld1sAHBaUJlYRfRVms om0nKXtFBYRw6jH1yhLmLi9wl4iJQ0Unf6Rju/YIDk34Ljci4EBOoMUMDjAYU3W5Vab6zP hZMHTaWpBzs+ev/Yxbb9+GmoNRTI3Tu4BJe2NDO4R62L6V46Jf+1HPOK66ON7Q== DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020e; t=1760549391; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=pFVciMXUyxyULBxl1v3fLMw6h0CcuKGrvokqooQX3wk=; b=FaGmoJnPRSn3eZom8LcojLl+984lhEkBN9qC0TPXSgelN8qq+ibqOvxXPkyAgUqGlUo97U AM7qIpMuhmnApsCw== From: Thomas Gleixner To: LKML Cc: Peter Zijlstra , Gabriele Monaco , Mathieu Desnoyers , Michael Jeanson , Jens Axboe , "Paul E. McKenney" , "Gautham R. Shenoy" , Florian Weimer , Tim Chen , TCMalloc Team Subject: [patch 14/19] sched/mmcid: Introduce per task/CPU ownership infrastrcuture References: <20251015164952.694882104@linutronix.de> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Date: Wed, 15 Oct 2025 19:29:50 +0200 (CEST) Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" The MM CID management has two fundamental requirements: 1) It has to guarantee that at no given point in time the same CID is used by concurrent tasks in userspace. 2) The CID space must not exceed the number of possible CPUs in a system. While most allocators (glibc, tcmalloc, jemalloc) do not care about that, there seems to be at least some LTTng library depending on it. The CID space compaction itself is not a functional correctness requirement, it is only a useful optimization mechanism to reduce the memory foot print in unused user space pools. The optimal CID space is: min(nr_tasks, nr_cpus_allowed); Where @nr_tasks is the number of actual user space threads associated to the mm and @nr_cpus_allowed is the superset of all task affinities. It is growth only as it would be insane to take a racy snapshot of all task affinities when the affinity of one task changes just do redo it 2 milliseconds later when the next task changes it's affinity. That means that as long as the number of tasks is lower or equal than the number of CPUs allowed, each task owns a CID. If the number of tasks exceeds the number of CPUs allowed it switches to per CPU mode, where the CPUs own the CIDs and the tasks borrow them as long as they are scheduled in. For transition periods CIDs can go beyond the optimal space as long as they don't go beyond the number of possible CPUs. The current upstream implementation tries to keep the CID with the task even in overcommit situations, which complicates task migration. 
It also has to do the CID space consolidation work from a task work in the exit to user space path. As that work is assigned to a random task related to a MM this can inflict unwanted exit latencies. This can be done differently by implementing a strict CID ownership mechanism. Either the CIDs are owned by the tasks or by the CPUs. The latter provides less locality when tasks are heavily migrating, but there is no justification to optimize for overcommit scenarios and thereby penalazing everyone else. Provide the basic infrastructure to implement this: - Change the UNSET marker to BIT(31) from ~0U - Add the ONCPU marker as BIT(30) That allows to check for ownership trivialy and provides a simple check for UNSET as well. Signed-off-by: Thomas Gleixner --- include/linux/rseq_types.h | 3 ++- include/linux/sched.h | 6 +++--- kernel/sched/core.c | 7 +++++++ kernel/sched/sched.h | 44 ++++++++++++++++++++++++++++++++++++++++= ++++ 4 files changed, 56 insertions(+), 4 deletions(-) --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -92,7 +92,8 @@ struct rseq_data { }; =20 #ifdef CONFIG_SCHED_MM_CID =20 -#define MM_CID_UNSET (~0U) +#define MM_CID_UNSET BIT(31) +#define MM_CID_ONCPU BIT(30) =20 /** * struct sched_mm_cid - Storage for per task MM CID data --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2291,16 +2291,16 @@ void sched_mm_cid_before_execve(struct t void sched_mm_cid_after_execve(struct task_struct *t); void sched_mm_cid_fork(struct task_struct *t); void sched_mm_cid_exit(struct task_struct *t); -static inline int task_mm_cid(struct task_struct *t) +static __always_inline int task_mm_cid(struct task_struct *t) { - return t->mm_cid.cid; + return t->mm_cid.cid & ~MM_CID_ONCPU; } #else static inline void sched_mm_cid_before_execve(struct task_struct *t) { } static inline void sched_mm_cid_after_execve(struct task_struct *t) { } static inline void sched_mm_cid_fork(struct task_struct *t) { } static inline void sched_mm_cid_exit(struct task_struct *t) { } -static inline int task_mm_cid(struct task_struct *t) +static __always_inline int task_mm_cid(struct task_struct *t) { /* * Use the processor id as a fall-back when the mm cid feature is --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10418,6 +10418,13 @@ void call_trace_sched_update_nr_running( * * The mm::mm_cid:pcpu per CPU storage is protected by the CPUs runqueue * lock. + * + * CID ownership: + * + * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or + * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the + * MM_CID_ONCPU bit set. This bit is filtered out by task_cid() when it + * is actualy handed over to user space in the RSEQ memory. 
*/ =20 /* --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3534,6 +3534,50 @@ extern void sched_dynamic_update(int mod extern const char *preempt_modes[]; =20 #ifdef CONFIG_SCHED_MM_CID + +static __always_inline bool cid_on_cpu(unsigned int cid) +{ + return cid & MM_CID_ONCPU; +} + +static __always_inline unsigned int cpu_cid_to_cid(unsigned int cid) +{ + return cid & ~MM_CID_ONCPU; +} + +static __always_inline unsigned int cid_to_cpu_cid(unsigned int cid) +{ + return cid | MM_CID_ONCPU; +} + +static __always_inline bool cid_on_task(unsigned int cid) +{ + /* True if neither MM_CID_ONCPU nor MM_CID_UNSET set */ + return cid < MM_CID_ONCPU; +} + +static __always_inline void mm_drop_cid(struct mm_struct *mm, unsigned int= cid) +{ + clear_bit(cid, mm_cidmask(mm)); +} + +static __always_inline void mm_unset_cid_on_task(struct task_struct *t) +{ + unsigned int cid =3D t->mm_cid.cid; + + t->mm_cid.cid =3D MM_CID_UNSET; + if (cid_on_task(cid)) + mm_drop_cid(t->mm, cid); +} + +static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struc= t mm_cid_pcpu *pcp) +{ + /* Clear the ONCPU bit, but do not set UNSET in the per CPU storage */ + pcp->cid =3D cpu_cid_to_cid(pcp->cid); + mm_drop_cid(mm, pcp->cid); +} + +/* Active implementation */ static inline void init_sched_mm_cid(struct task_struct *t) { struct mm_struct *mm =3D t->mm; From nobody Fri Dec 19 22:01:53 2025 Received: from galois.linutronix.de (Galois.linutronix.de [193.142.43.55]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id B9CEC3431E9 for ; Wed, 15 Oct 2025 17:29:57 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=193.142.43.55 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549399; cv=none; b=d0hUebjwCAmMh64d3+Vq1kIWLOZSvC1kJkFKBzXsV46+Hge9ly9F9ue2CHZb5S4sfKWSzT+uQO0YP+GHw8K4fkEvKJjE6/Fvig2ilLvuoVqGR7m/Vo0I+8vbSxKNyexscpMMIKWl8kEvHWGx3NW9bkmNXjPcspiHZNUxpj5Raeg= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549399; c=relaxed/simple; bh=Xxod1oyqga6XkJag5SeJ8gPZM5QaRoiDKgy19Ybg7qM=; h=Message-ID:From:To:Cc:Subject:References:MIME-Version: Content-Type:Date; b=LQFnF15bUc9q/WkGoWJWIZ7AkpHIoua9f8CJHrArH9VSJmLLDYK2fowvVSs9kVp1+S94POnW1yrytWDKAsuu3f4NEnTaibywhpoc9tmKrpsiEF3NrqaMkyqxcppzbkumYZfMjgFA8gQ26oW/GqFFX9EqfQUWcPf5nroVtgLelZg= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de; spf=pass smtp.mailfrom=linutronix.de; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=wMZXJv/V; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=s41i1sDl; arc=none smtp.client-ip=193.142.43.55 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linutronix.de Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="wMZXJv/V"; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="s41i1sDl" Message-ID: <20251015172835.252130919@linutronix.de> DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1760549393; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: 
references:references; bh=Qlc2E7wHHwQG2HHxnIWIivpMdawkwpkQBf7obNqOoSw=; b=wMZXJv/V0jStj9CaD+Tu32neP/BLCkfi7sIa3rtRHL+dZsHYBkY915GZbtiD64/jUqbPJl Fw5nCCkrrYbkKr7FP9ZEEYmJW+svJdIGXPpzVOG8pCnUvqycD2cHMri4Cx2GYUAwpknK3I gsXF23ZJjbg9V2Ziy+8t35EW2Md65qE/oolA1pv+cN7gtHkUWnEByoNOnDbm1tTiejiuOK ry2MX7viF8ja9eJ7JAd6tTIb130gv1LI+J8zFthnbJhWYPKypzWiuo6OFKMJunRHB92s7t YDWNZ8SR4cXp3HM98hHPmeNPIkxfbB58ATBN+pjDwmXFT4rIX6I3lSpy/+VktA== DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020e; t=1760549393; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=Qlc2E7wHHwQG2HHxnIWIivpMdawkwpkQBf7obNqOoSw=; b=s41i1sDlKBknisEkDp/RrJRKhAbryzfwjrcYXTaOOANCa6ck74p8OvRMGpuDlcgpgvbd2m YlPO4vkthVIV+EAg== From: Thomas Gleixner To: LKML Cc: Peter Zijlstra , Gabriele Monaco , Mathieu Desnoyers , Michael Jeanson , Jens Axboe , "Paul E. McKenney" , "Gautham R. Shenoy" , Florian Weimer , Tim Chen , TCMalloc Team Subject: [patch 15/19] sched/mmcid: Provide new scheduler CID mechanism References: <20251015164952.694882104@linutronix.de> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Date: Wed, 15 Oct 2025 19:29:52 +0200 (CEST) Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" The MM CID management has two fundamental requirements: 1) It has to guarantee that at no given point in time the same CID is used by concurrent tasks in userspace. 2) The CID space must not exceed the number of possible CPUs in a system. While most allocators (glibc, tcmalloc, jemalloc) do not care about that, there seems to be at least some LTTng library depending on it. The CID space compaction itself is not a functional correctness requirement, it is only a useful optimization mechanism to reduce the memory foot print in unused user space pools. The optimal CID space is: min(nr_tasks, nr_cpus_allowed); Where @nr_tasks is the number of actual user space threads associated to the mm and @nr_cpus_allowed is the superset of all task affinities. It is growth only as it would be insane to take a racy snapshot of all task affinities when the affinity of one task changes just do redo it 2 milliseconds later when the next task changes it's affinity. That means that as long as the number of tasks is lower or equal than the number of CPUs allowed, each task owns a CID. If the number of tasks exceeds the number of CPUs allowed it switches to per CPU mode, where the CPUs own the CIDs and the tasks borrow them as long as they are scheduled in. For transition periods CIDs can go beyond the optimal space as long as they don't go beyond the number of possible CPUs. The current upstream implementation tries to keep the CID with the task even in overcommit situations, which complicates task migration. It also has to do the CID space consolidation work from a task work in the exit to user space path. As that work is assigned to a random task related to a MM this can inflict unwanted exit latencies. Implement the context switch parts of a strict ownership mechanism to address this. This removes all work from a task which schedules out. That's a benefit as tasks which schedule out have the related shared mm:mm_cid data and the per CPU storage cache cold when the task has a big enough cache foot print while doing work in user space as perf top clearly shows. 
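For reference, user space only ever consumes the CID as a dense index into
per-CID data. A minimal user space sketch (not part of this series), assuming
glibc has already registered rseq, the installed kernel headers expose
rseq::mm_cid, and "struct pool" stands in for whatever per-CID state the
application keeps:

	#include <stddef.h>
	#include <sys/rseq.h>	/* struct rseq, __rseq_offset (glibc >= 2.35) */

	struct pool { void *freelist; };	/* application defined per-CID state */

	static struct pool *pools;		/* sized to the number of possible CPUs */

	static inline unsigned int current_mm_cid(void)
	{
		struct rseq *rs = (struct rseq *)((char *)__builtin_thread_pointer() +
						  __rseq_offset);

		/* The kernel updates mm_cid at context switch time */
		return *(volatile unsigned int *)&rs->mm_cid;
	}

	static inline struct pool *current_pool(void)
	{
		/* Requirement #2 above guarantees this never indexes out of bounds */
		return &pools[current_mm_cid()];
	}

A real allocator would use the value inside an rseq critical section or with
additional synchronization; the sketch only illustrates why the CID must be
unique among concurrently running threads and bounded by the number of
possible CPUs. On the kernel side the remaining work is concentrated on the
schedule in path.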
The task which schedules in has to check whether: 1) The ownership mode changed 2) The CID is within the optimal CID space In stable situations this results in zero work. The only short disruption is when ownership mode changes or when the associated CID is not in the optimal CID space. The latter only happens when tasks exit and therefore the optimal CID space shrinks. That mechanism is strictly optimized for the common case where no change happens. The only case where it actually causes a temporary one time spike is on mode changes when and only when a lot of tasks related to a MM schedule exactly at the same time and have eventually to compete on allocating a CID from the bitmap. In the sysbench test case which triggered the spinlock contention in the initial CID code, __schedule() drops significantly in perf top on a 128 Core (256 threads) machine when running sysbench with 255 threads, which fits into the task mode limit of 256 together with the parent thread: Upstream rseq/perf branch +CID rework =20 0.42% 0.37% 0.32% [k] __schedule Increasing the number of threads to 256, which puts the test process into per CPU mode looks about the same. Signed-off-by: Thomas Gleixner --- include/linux/rseq.h | 8 +- include/linux/rseq_types.h | 3 + kernel/sched/core.c | 1=20 kernel/sched/sched.h | 130 ++++++++++++++++++++++++++++++++++++++++= ++++- 4 files changed, 137 insertions(+), 5 deletions(-) --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -71,13 +71,13 @@ static __always_inline void rseq_sched_s } =20 /* - * Invoked from __set_task_cpu() when a task migrates to enforce an IDs - * update. + * Invoked from __set_task_cpu() when a task migrates or from + * mm_cid_schedin() when the CID changes to enforce an IDs update. * * This does not raise TIF_NOTIFY_RESUME as that happens in * rseq_sched_switch_event(). */ -static __always_inline void rseq_sched_set_task_cpu(struct task_struct *t,= unsigned int cpu) +static __always_inline void rseq_sched_set_ids_changed(struct task_struct = *t) { t->rseq.event.ids_changed =3D true; } @@ -176,7 +176,7 @@ static inline void rseq_fork(struct task static inline void rseq_handle_slowpath(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_reg= s *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } -static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned= int cpu) { } +static inline void rseq_sched_set_ids_changed(struct task_struct *t) { } static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsig= ned int cid) { } static inline void rseq_force_update(void) { } static inline void rseq_virt_userspace_exit(void) { } --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -118,6 +118,7 @@ struct mm_cid_pcpu { /** * struct mm_mm_cid - Storage for per MM CID data * @pcpu: Per CPU storage for CIDs associated to a CPU + * @percpu: Set, when CIDs are in per CPU mode * @max_cids: The exclusive maximum CID value for allocation and converga= nce * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. Th= e map * is growth only. 
@@ -129,7 +130,9 @@ struct mm_cid_pcpu { * @mutex: Mutex to serialize forks and exits related to this mm */ struct mm_mm_cid { + /* Hotpath read mostly members */ struct mm_cid_pcpu __percpu *pcpu; + unsigned int percpu; unsigned int max_cids; unsigned int nr_cpus_allowed; unsigned int users; --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10532,6 +10532,7 @@ void mm_init_cid(struct mm_struct *mm, s per_cpu_ptr(pcpu, cpu)->cid =3D MM_CID_UNSET; =20 mm->mm_cid.max_cids =3D 0; + mm->mm_cid.percpu =3D 0; mm->mm_cid.nr_cpus_allowed =3D p->nr_cpus_allowed; mm->mm_cid.users =3D 0; raw_spin_lock_init(&mm->mm_cid.lock); --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2206,7 +2206,7 @@ static inline void __set_task_cpu(struct smp_wmb(); WRITE_ONCE(task_thread_info(p)->cpu, cpu); p->wake_cpu =3D cpu; - rseq_sched_set_task_cpu(p, cpu); + rseq_sched_set_ids_changed(p); #endif /* CONFIG_SMP */ } =20 @@ -3577,6 +3577,134 @@ static __always_inline void mm_drop_cid_ mm_drop_cid(mm, pcp->cid); } =20 +static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int= max_cids) +{ + unsigned int cid =3D find_first_zero_bit(mm_cidmask(mm), max_cids); + + if (cid >=3D max_cids) + return MM_CID_UNSET; + if (test_and_set_bit(cid, mm_cidmask(mm))) + return MM_CID_UNSET; + return cid; +} + +static inline unsigned int mm_get_cid(struct mm_struct *mm) +{ + unsigned int cid =3D __mm_get_cid(mm, READ_ONCE(mm->mm_cid.max_cids)); + + for (; cid =3D=3D MM_CID_UNSET; cpu_relax()) + cid =3D __mm_get_cid(mm, nr_cpu_ids); + + return cid; +} + +static inline unsigned int mm_cid_converge(struct mm_struct *mm, unsigned = int orig_cid, + unsigned int max_cids) +{ + unsigned int new_cid, cid =3D cpu_cid_to_cid(orig_cid); + + /* Is it in the optimal CID space? */ + if (likely(cid < max_cids)) + return orig_cid; + + /* Try to find one in the optimal space. Otherwise keep the provided. 
*/ + new_cid =3D __mm_get_cid(mm, max_cids); + if (new_cid !=3D MM_CID_UNSET) { + mm_drop_cid(mm, cid); + /* Preserve the ONCPU mode of the original CID */ + return new_cid | (orig_cid & MM_CID_ONCPU); + } + return orig_cid; +} + +static __always_inline void mm_cid_update_task_cid(struct task_struct *t, = unsigned int cid) +{ + if (t->mm_cid.cid !=3D cid) { + t->mm_cid.cid =3D cid; + rseq_sched_set_ids_changed(t); + } +} + +static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, u= nsigned int cid) +{ + __this_cpu_write(mm->mm_cid.pcpu->cid, cid); +} + +static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigne= d int cpu_cid) +{ + unsigned int max_cids, tcid =3D t->mm_cid.cid; + struct mm_struct *mm =3D t->mm; + + max_cids =3D READ_ONCE(mm->mm_cid.max_cids); + /* Optimize for the common case where both have the ONCPU bit set */ + if (likely(cid_on_cpu(cpu_cid & tcid))) { + if (likely(cpu_cid_to_cid(cpu_cid) < max_cids)) { + mm_cid_update_task_cid(t, cpu_cid); + return; + } + /* Try to converge into the optimal CID space */ + cpu_cid =3D mm_cid_converge(mm, cpu_cid, max_cids); + } else { + /* Hand over or drop the task owned CID */ + if (cid_on_task(tcid)) { + if (cid_on_cpu(cpu_cid)) + mm_unset_cid_on_task(t); + else + cpu_cid =3D cid_to_cpu_cid(tcid); + } + /* Still nothing, allocate a new one */ + if (!cid_on_cpu(cpu_cid)) + cpu_cid =3D cid_to_cpu_cid(mm_get_cid(mm)); + } + mm_cid_update_pcpu_cid(mm, cpu_cid); + mm_cid_update_task_cid(t, cpu_cid); +} + +static __always_inline void mm_cid_from_task(struct task_struct *t, unsign= ed int cpu_cid) +{ + unsigned int max_cids, tcid =3D t->mm_cid.cid; + struct mm_struct *mm =3D t->mm; + + max_cids =3D READ_ONCE(mm->mm_cid.max_cids); + /* Optimize for the common case, where both have the ONCPU bit clear */ + if (likely(cid_on_task(tcid | cpu_cid))) { + if (likely(tcid < max_cids)) { + mm_cid_update_pcpu_cid(mm, tcid); + return; + } + /* Try to converge into the optimal CID space */ + tcid =3D mm_cid_converge(mm, tcid, max_cids); + } else { + /* Hand over or drop the CPU owned CID */ + if (cid_on_cpu(cpu_cid)) { + if (cid_on_task(tcid)) + mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu)); + else + tcid =3D cpu_cid_to_cid(cpu_cid); + } + /* Still nothing, allocate a new one */ + if (!cid_on_task(tcid)) + tcid =3D mm_get_cid(mm); + } + mm_cid_update_pcpu_cid(mm, tcid); + mm_cid_update_task_cid(t, tcid); +} + +static __always_inline void mm_cid_schedin(struct task_struct *next) +{ + struct mm_struct *mm =3D next->mm; + unsigned int cpu_cid; + + if (!next->mm_cid.active) + return; + + cpu_cid =3D __this_cpu_read(mm->mm_cid.pcpu->cid); + if (likely(!READ_ONCE(mm->mm_cid.percpu))) + mm_cid_from_task(next, cpu_cid); + else + mm_cid_from_cpu(next, cpu_cid); +} + /* Active implementation */ static inline void init_sched_mm_cid(struct task_struct *t) { From nobody Fri Dec 19 22:01:53 2025 Received: from galois.linutronix.de (Galois.linutronix.de [193.142.43.55]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 5F48533EB05 for ; Wed, 15 Oct 2025 17:29:57 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=193.142.43.55 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549399; cv=none; b=onkIsjhNxWMQqr5UpTaSWN3RZRmtPefpygzpLUZmQDVtDZAtSoaAdEt+EDxPszm1+Xft51zIyZ5yCCg+ISyQ6L3s4pBwQqxkSt9IftXXUBD8mPFUuCxbM9QTaZudqF5wBjaIf54pKUXrCse6/KQWmvhRnQ5zd2oZeJ5S05lnH0U= 
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549399; c=relaxed/simple; bh=mOQPtcaiRCLqZ/b7tbn9dVat/xil8Rd4/tkz8Q6Eds8=; h=Message-ID:From:To:Cc:Subject:References:MIME-Version: Content-Type:Date; b=uAlIjtWAlRDsrdpM7Do3gQsIgjtvCtso632LAYEdcUn1tLqSaun/W8PNasnR3Eq3JYWu6hXjdhLa3e88qmMMZUY9kupKVk64JMz6yiLscf6ttFM8TJ9SEHMOXY6dyeiJrpz4HkRFZ65tHKgUNZtd7voUhIf6/w1E9iJFEldYfbQ= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de; spf=pass smtp.mailfrom=linutronix.de; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=VJ4Zjp1S; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=d5I9A+xl; arc=none smtp.client-ip=193.142.43.55 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linutronix.de Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="VJ4Zjp1S"; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="d5I9A+xl" Message-ID: <20251015172835.313805515@linutronix.de> DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1760549395; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=/s0DG47jXUf/Vwc4iZhFxfx8t5KUk/sbBFCf8Rh8hY4=; b=VJ4Zjp1S0Zm0YRw5Zys4D6gChwK43Yggjgs8Dt2McYhQ45/7qxeuXfyvXCiNb6JyNS8hD5 euO6IUmalyYcvIXNXHVuWUv9Km0b+KDfo6ap7FUWJaoqja6P1XzVxLzr40NGdy8T8hQyz7 4fR9jopTNmKpwC9nCTIOazUL3pci3owfLXpCR+Z4EmvIOrCaOOYrEoYv1/JpyQeAVIj+jx ZrAO8GFFxNur4t23rbeDhDtuVThl7KDmZbDgl3C7n0P49PtHQBMkkfabqZD3tASVDDiQlo SZQXoS8jN9m95/w+niAY5KdA2TNyqS91riDq5qc+Fi8tDy9TKw3dlIDuHMikGg== DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020e; t=1760549395; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=/s0DG47jXUf/Vwc4iZhFxfx8t5KUk/sbBFCf8Rh8hY4=; b=d5I9A+xloS3l/d0fjD06ouJZiwwpEkbwk98kFw7xkd7pgK/kE0Rwzfw3FwrHqY4srXUrMY bXtFidhNmleHqHCw== From: Thomas Gleixner To: LKML Cc: Peter Zijlstra , Gabriele Monaco , Mathieu Desnoyers , Michael Jeanson , Jens Axboe , "Paul E. McKenney" , "Gautham R. Shenoy" , Florian Weimer , Tim Chen , TCMalloc Team Subject: [patch 16/19] sched/mmcid: Provide CID ownership mode fixup functions References: <20251015164952.694882104@linutronix.de> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Date: Wed, 15 Oct 2025 19:29:54 +0200 (CEST) Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" CIDs are either owned by tasks or by CPUs. The ownership mode depends on the number of tasks related to a MM and the number of CPUs on which these tasks are theoretically allowed to run on. Theoretically because that number is the superset of CPU affinities of all tasks which only grows and never shrinks. 
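The mode selection spelled out in the following paragraphs boils down to two
comparisons. As a compact model only (the authoritative code is
__mm_update_max_cids(), mm_cid_calc_pcpu_thrs() and mm_update_max_cids() in
the diff below, which additionally record the threshold at switch time and
handle serialization and the deferred update flag):

	/* Model only: no locking, no side effects */
	static bool model_wants_percpu_mode(const struct mm_mm_cid *mc)
	{
		unsigned int opt_cids  = min(mc->nr_cpus_allowed, mc->users);
		unsigned int max_cids  = min(opt_cids + opt_cids / 4, nr_cpu_ids);
		unsigned int pcpu_thrs = min(opt_cids - opt_cids / 4, nr_cpu_ids / 2);

		if (!mc->percpu)
			return mc->users > max_cids;	/* grow into per CPU mode */

		return mc->users >= pcpu_thrs;		/* stay until users drop below */
	}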
Switching to per CPU mode happens when the user count becomes greater than the maximum number of CIDs, which is calculated by: opt_cids =3D min(mm_cid::nr_cpus_allowed, mm_cid::users); max_cids =3D min(1.25 * opt_cids, nr_cpu_ids); The +25% allowance is useful for tight CPU masks in scenarios where only a few threads are created and destroyed to avoid frequent mode switches. Though this allowance shrinks, the closer opt_cids becomes to nr_cpu_ids, which is the (unfortunate) hard ABI limit. At the point of switching to per CPU mode the new user is not yet visible in the system, so the task which initiated the fork() runs the fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and either transfers each tasks owned CID to the CPU the task runs on or drops it into the CID pool if a task is not on a CPU at that point in time. Tasks which schedule in before the task walk reaches them do the handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes it's guaranteed that no task related to that MM owns a CID anymore. Switching back to task mode happens when the user count goes below the threshold which was recorded on the per CPU mode switch: pcpu_thrs =3D min(opt_cids - (opt_cids / 4), nr_cpu_ids / 2); This threshold is updated when a affinity change increases the number of allowed CPUs for the MM, which might cause a switch back to per task mode. If the switch back was initiated by a exiting task, then that task runs the fixup function. If it was initiated by a affinity change, then it's run either in the deferred update function in context of a workqueue or by a task which forks a new one or by a task which exits. Whatever happens first. mm_cid_fixup_cpus_to_task() walks through the possible CPUs and either transfers the CPU owned CIDs to a related task which runs on the CPU or drops it into the pool. Tasks which schedule in on a CPU which the walk did not cover yet do the handover themself. As the goal is to avoid serialization of the scheduler hotpath, this requires that the switch back threshold is maximally nr_cpu_ids / 2. Otherwise the CID space might become exhausted when tasks are scheduled in on CPUs which already transferred ownership before the fixup function was able to free or transfer enough CIDs. That would result in a live lock because the task loops in mm_get_cid() with runqueue lock held and the fixup function is stuck on that runqueue lock. When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID related to that MM is owned by a CPU anymore. Signed-off-by: Thomas Gleixner --- include/linux/rseq_types.h | 10 + kernel/sched/core.c | 251 ++++++++++++++++++++++++++++++++++++++++= ----- 2 files changed, 234 insertions(+), 27 deletions(-) --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -125,8 +125,9 @@ struct mm_cid_pcpu { * @users: The number of tasks sharing this MM. Seperate from mm::mm_users * as that is modified by mmget()/mm_put() by other entities which * do not actually share the MM. - * @lock: Spinlock to protect all fields except @pcpu. It also protects - * the MM cid cpumask and the MM cidmask bitmap. + * @pcpu_thrs: Threshold for switching back from per CPU mode + * @update_deferred: A deferred switch back to per task mode is pending. 
+ * @lock: Spinlock to protect against affinity setting which can't take @= mutex * @mutex: Mutex to serialize forks and exits related to this mm */ struct mm_mm_cid { @@ -134,8 +135,13 @@ struct mm_mm_cid { struct mm_cid_pcpu __percpu *pcpu; unsigned int percpu; unsigned int max_cids; + + /* Low frequency modified */ unsigned int nr_cpus_allowed; unsigned int users; + unsigned int pcpu_thrs; + unsigned int update_deferred; + raw_spinlock_t lock; struct mutex mutex; }____cacheline_aligned_in_smp; --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10425,27 +10425,116 @@ void call_trace_sched_update_nr_running( * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the * MM_CID_ONCPU bit set. This bit is filtered out by task_cid() when it * is actualy handed over to user space in the RSEQ memory. + * + * Mode switching: + * + * Switching to per CPU mode happens when the user count becomes greater + * than the maximum number of CIDs, which is calculated by: + * + * opt_cids =3D min(mm_cid::nr_cpus_allowed, mm_cid::users); + * max_cids =3D min(1.25 * opt_cids, nr_cpu_ids); + * + * The +25% allowance is useful for tight CPU masks in scenarios where only + * a few threads are created and destroyed to avoid frequent mode + * switches. Though this allowance shrinks, the closer opt_cids becomes to + * nr_cpu_ids, which is the (unfortunate) hard ABI limit. + * + * At the point of switching to per CPU mode the new user is not yet + * visible in the system, so the task which initiated the fork() runs the + * fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and + * either transfers each tasks owned CID to the CPU the task runs on or + * drops it into the CID pool if a task is not on a CPU at that point in + * time. Tasks which schedule in before the task walk reaches them do the + * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() complet= es + * it's guaranteed that no task related to that MM owns a CID anymore. + * + * Switching back to task mode happens when the user count goes below the + * threshold which was recorded on the per CPU mode switch: + * + * pcpu_thrs =3D min(opt_cids - (opt_cids / 4), nr_cpu_ids / 2); + * + * This threshold is updated when a affinity change increases the number of + * allowed CPUs for the MM, which might cause a switch back to per task + * mode. + * + * If the switch back was initiated by a exiting task, then that task runs + * the fixup function. If it was initiated by a affinity change, then it's + * run either in the deferred update function in context of a workqueue or + * by a task which forks a new one or by a task which exits. Whatever + * happens first. mm_cid_fixup_cpus_to_task() walks through the possible + * CPUs and either transfers the CPU owned CIDs to a related task which + * runs on the CPU or drops it into the pool. Tasks which schedule in on a + * CPU which the walk did not cover yet do the handover themself. + * + * As the goal is to avoid serialization of the scheduler hotpath, this + * requires that the switch back threshold is maximally nr_cpu_ids / 2. + * Otherwise the CID space might become exhausted when tasks are scheduled + * in on CPUs which already transferred ownership before the fixup function + * was able to free or transfer enough CIDs. That would result in a live + * lock because the task loops in mm_get_cid() with runqueue lock held and + * the fixup function is stuck on that runqueue lock. 
+ * + * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID + * related to that MM is owned by a CPU anymore. */ =20 /* * Update the CID range properties when the constraints change. Invoked via * fork(), exit() and affinity changes */ -static void mm_update_max_cids(struct mm_struct *mm) +static void __mm_update_max_cids(struct mm_mm_cid *mc) +{ + unsigned int opt_cids, max_cids; + + /* Calculate the new optimal constraint */ + opt_cids =3D min(mc->nr_cpus_allowed, mc->users); + + /* Adjust the maximum CIDs to +25% limited by nr_cpu_ids */ + max_cids =3D min(opt_cids + (opt_cids / 4), nr_cpu_ids); + WRITE_ONCE(mc->max_cids, max_cids); +} + +static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc) +{ + unsigned int opt_cids; + + opt_cids =3D min(mc->nr_cpus_allowed, mc->users); + /* See mode switch documentation above! */ + return min(opt_cids - opt_cids / 4, nr_cpu_ids / 2); +} + +static bool mm_update_max_cids(struct mm_struct *mm) { struct mm_mm_cid *mc =3D &mm->mm_cid; - unsigned int max_cids; =20 lockdep_assert_held(&mm->mm_cid.lock); =20 - /* Calculate the new maximum constraint */ - max_cids =3D min(mc->nr_cpus_allowed, mc->users); - WRITE_ONCE(mc->max_cids, max_cids); + /* Clear deferred mode switch flag. A change is handled by the caller */ + mc->update_deferred =3D false; + __mm_update_max_cids(mc); + + /* Check whether owner mode must be changed */ + if (!mc->percpu) { + /* Enable per CPU mode when the number of users is above max_cids */ + if (mc->users > mc->max_cids) + mc->pcpu_thrs =3D mm_cid_calc_pcpu_thrs(mc); + } else { + /* Switch back to per task if user count under threshold */ + if (mc->users < mc->pcpu_thrs) + mc->pcpu_thrs =3D 0; + } + + /* Mode change required? */ + if (!!mc->percpu =3D=3D !!mc->pcpu_thrs) + return false; + WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs); + return true; } =20 static inline void mm_update_cpus_allowed(struct mm_struct *mm, const stru= ct cpumask *affmsk) { struct cpumask *mm_allowed; + struct mm_mm_cid *mc; unsigned int weight; =20 /* @@ -10455,21 +10544,130 @@ static inline void mm_update_cpus_allowe if (!mm || READ_ONCE(mm->mm_cid.nr_cpus_allowed) =3D=3D nr_cpu_ids || !READ_ONCE(mm->mm_cid.users)) return; - /* * mm::mm_cid::mm_cpus_allowed is the superset of each threads * allowed CPUs mask which means it can only grow. 
*/ - guard(raw_spinlock)(&mm->mm_cid.lock); + mc =3D &mm->mm_cid; + guard(raw_spinlock)(&mc->lock); /* Check again under the lock */ - if (mm->mm_cid.nr_cpus_allowed =3D=3D nr_cpu_ids || !mm->mm_cid.users) + if (mc->nr_cpus_allowed =3D=3D nr_cpu_ids || !mc->users) return; + mm_allowed =3D mm_cpus_allowed(mm); weight =3D cpumask_or_weight(mm_allowed, mm_allowed, affmsk); - if (weight =3D=3D mm->mm_cid.nr_cpus_allowed) + if (weight =3D=3D mc->nr_cpus_allowed) + return; + + WRITE_ONCE(mc->nr_cpus_allowed, weight); + __mm_update_max_cids(mc); + if (!mc->percpu) return; - WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, weight); - mm_update_max_cids(mm); + + /* Adjust the threshold to the wider set */ + mc->pcpu_thrs =3D mm_cid_calc_pcpu_thrs(mc); + + /* Scheduling of deferred mode switch goes here */ +} + +static inline void mm_cid_transfer_to_task(struct task_struct *t, struct m= m_cid_pcpu *pcp) +{ + if (cid_on_cpu(t->mm_cid.cid)) { + t->mm_cid.cid =3D cpu_cid_to_cid(t->mm_cid.cid); + pcp->cid =3D t->mm_cid.cid; + } +} + +static void __maybe_unused mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm) +{ + unsigned int cpu; + + /* Walk the CPUs and fixup all stale CIDs */ + for_each_possible_cpu(cpu) { + struct mm_cid_pcpu *pcp =3D per_cpu_ptr(mm->mm_cid.pcpu, cpu); + struct rq *rq =3D cpu_rq(cpu); + + /* Remote access to mm::mm_cid::pcpu requires rq_lock */ + guard(rq_lock_irq)(rq); + if (cid_on_cpu(pcp->cid)) { + /* If rq->curr has @mm, fix it up right here */ + if (rq->curr->mm =3D=3D mm && rq->curr->mm_cid.active) + mm_cid_transfer_to_task(rq->curr, pcp); + else + mm_drop_cid_on_cpu(mm, pcp); + } + } +} + +static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm= _cid_pcpu *pcp) +{ + if (cid_on_task(t->mm_cid.cid)) { + t->mm_cid.cid =3D cid_to_cpu_cid(t->mm_cid.cid); + pcp->cid =3D t->mm_cid.cid; + } +} + +static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_stru= ct *mm) +{ + /* Remote access to mm::mm_cid::pcpu requires rq_lock */ + guard(task_rq_lock)(t); + if (t->mm !=3D mm) + return false; + if (cid_on_task(t->mm_cid.cid)) { + /* If running on the CPU, transfer the CID, otherwise drop it */ + if (task_rq(t)->curr =3D=3D t) + mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t))); + else + mm_unset_cid_on_task(t); + } + return true; +} + +static void __maybe_unused mm_cid_fixup_tasks_to_cpus(void) +{ + struct mm_struct *mm =3D current->mm; + struct task_struct *p, *t; + unsigned int users; + + /* + * This can obviously race with a concurrent affinity change, which + * increases the number of allowed CPUs for this mm, but that does + * not affect the mode and only changes the CID constraints. A + * possible switch back to per task mode happens either in the + * deferred handler function or in the next fork()/exit(). + * + * The caller has already transferred. The newly incoming task is + * already accounted for, but not yet visible. + */ + users =3D mm->mm_cid.users - 2; + if (!users) + return; + + guard(rcu)(); + for_other_threads(current, t) { + mm_cid_fixup_task_to_cpu(t, mm); + users--; + } + + if (!users) + return; + + /* Happens only for VM_CLONE processes. 
*/ + for_each_process_thread(p, t) { + if (t =3D=3D current || t->mm !=3D mm) + continue; + if (mm_cid_fixup_task_to_cpu(t, mm)) { + if (--users =3D=3D 0) + return; + } + } +} + +static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct = *mm) +{ + t->mm_cid.active =3D 1; + mm->mm_cid.users++; + return mm_update_max_cids(mm); } =20 void sched_mm_cid_fork(struct task_struct *t) @@ -10479,12 +10677,19 @@ void sched_mm_cid_fork(struct task_struc WARN_ON_ONCE(!mm || t->mm_cid.cid !=3D MM_CID_UNSET); =20 guard(mutex)(&mm->mm_cid.mutex); - guard(raw_spinlock)(&mm->mm_cid.lock); - t->mm_cid.active =3D 1; - mm->mm_cid.users++; - /* Preset last_cid for mm_cid_select() */ - t->mm_cid.last_cid =3D READ_ONCE(mm->mm_cid.max_cids) - 1; - mm_update_max_cids(mm); + scoped_guard(raw_spinlock, &mm->mm_cid.lock) { + sched_mm_cid_add_user(t, mm); + /* Preset last_cid for mm_cid_select() */ + t->mm_cid.last_cid =3D mm->mm_cid.max_cids - 1; + } +} + +static bool sched_mm_cid_remove_user(struct task_struct *t) +{ + t->mm_cid.active =3D 0; + mm_unset_cid_on_task(t); + t->mm->mm_cid.users--; + return mm_update_max_cids(t->mm); } =20 /* @@ -10499,14 +10704,8 @@ void sched_mm_cid_exit(struct task_struc return; =20 guard(mutex)(&mm->mm_cid.mutex); - guard(raw_spinlock)(&mm->mm_cid.lock); - t->mm_cid.active =3D 0; - mm->mm_cid.users--; - if (t->mm_cid.cid !=3D MM_CID_UNSET) { - clear_bit(t->mm_cid.cid, mm_cidmask(mm)); - t->mm_cid.cid =3D MM_CID_UNSET; - } - mm_update_max_cids(mm); + scoped_guard(raw_spinlock, &mm->mm_cid.lock) + sched_mm_cid_remove_user(t); } =20 /* Deactivate MM CID allocation across execve() */ @@ -10535,6 +10734,8 @@ void mm_init_cid(struct mm_struct *mm, s mm->mm_cid.percpu =3D 0; mm->mm_cid.nr_cpus_allowed =3D p->nr_cpus_allowed; mm->mm_cid.users =3D 0; + mm->mm_cid.pcpu_thrs =3D 0; + mm->mm_cid.update_deferred =3D 0; raw_spin_lock_init(&mm->mm_cid.lock); mutex_init(&mm->mm_cid.mutex); cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); From nobody Fri Dec 19 22:01:53 2025 Received: from galois.linutronix.de (Galois.linutronix.de [193.142.43.55]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 986503431EC for ; Wed, 15 Oct 2025 17:29:58 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=193.142.43.55 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549400; cv=none; b=nBPascpoK4oFE5AWbuAs40FMFWs5X0yMkiZwqIG2Ap9oWGGNNMMVwE0MNG6NXAjRBzuAPOgswWRFfyWmx+R9PTIlsCBd8EMgWnxcs41hH/IDd0rlYcGfDAUQeq49WXtj6CKZRtXFl+UIXwSU04uoolbouKDY15amJBZ3ILHODyQ= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549400; c=relaxed/simple; bh=GoiRX33kGm2vsBAgraaQ2dJcbW0n5konRrbN4aG3wfk=; h=Message-ID:From:To:Cc:Subject:References:MIME-Version: Content-Type:Date; b=gXHj4om0kADN7HHNU5Q4idwaPaJcMOJWe4T4yR36lNbzukfXdg6O25r/4teEToiJlBi6mNSBszlOzXqSXy44aX4jH4XiabupRjT21pzxyeN0jQcSliSRwnZmjNHGI4Zxeyh6SgehdTJBz43XyhQrJOYPfyXV4rUNptHwtxRFP68= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de; spf=pass smtp.mailfrom=linutronix.de; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=usKH+/RX; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=mP+RV8jN; arc=none smtp.client-ip=193.142.43.55 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de 
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linutronix.de Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="usKH+/RX"; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="mP+RV8jN" Message-ID: <20251015172835.375051021@linutronix.de> DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1760549397; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=LDQh6T+IJXDzKvHhdN2SgGNv5hIdajiORp6yTi65eas=; b=usKH+/RXLPIIvyP1GS6/Ru+Gs1SswWxXnRA1ctkvTLqCCW2pGOEl3HxNeyvAONsB6EU01M xe9J5J5SqHzmWKucLk8vjslhgaiCWYqQ3TOHln1Z6EFh5zF2aXWTiAw5nxsrWvpTb4sQVD j8/6OsYmhWfM4JflucoXMnEI7ewjiU1ry0ICfaHqL/FWn5FHG+hqGryhyC3rNNUvtyPNG5 3Y3K06X+4zqOKpvZc8hi5UUE3nI7pu1WxRq5pm+nR8X06yxRtg/0nmM3wxE15E6Djng1xC Su0ipKgnVe8H+zB+2crIiWydJsBoKcPvWlF7Va6j9q6vH7A/7qA6khJuT+wScQ== DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020e; t=1760549397; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=LDQh6T+IJXDzKvHhdN2SgGNv5hIdajiORp6yTi65eas=; b=mP+RV8jNphx7SBSCgad8Cv+49YpD6JJnI0UKh/wj9w61BdOieN8/9lHlQ47HT/nGATxW9C F1Eiwh8efXK4wWDw== From: Thomas Gleixner To: LKML Cc: Peter Zijlstra , Gabriele Monaco , Mathieu Desnoyers , Michael Jeanson , Jens Axboe , "Paul E. McKenney" , "Gautham R. Shenoy" , Florian Weimer , Tim Chen , TCMalloc Team Subject: [patch 17/19] irqwork: Move data struct to a types header References: <20251015164952.694882104@linutronix.de> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Date: Wed, 15 Oct 2025 19:29:56 +0200 (CEST) Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" ... to avoid header recursion hell. 
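The point of the split, as an illustrative sketch (the in-tree consumer is
struct mm_mm_cid in the next patch; "struct deferred_update" below is made up
for the example): a header which only defines data types can embed struct
irq_work by value via the new irq_work_types.h, while code that actually uses
the irq_work API keeps including the full irq_work.h:

	/* some_types.h: types only, must not pull in API headers */
	#include <linux/irq_work_types.h>

	struct deferred_update {
		struct irq_work	irq_work;	/* embedded by value, complete type required */
	};

	/* some_code.c: uses the API, includes the full header */
	#include <linux/irq_work.h>

	static void deferred_update_queue(struct deferred_update *du)
	{
		irq_work_queue(&du->irq_work);
	}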
Signed-off-by: Thomas Gleixner --- include/linux/irq_work.h | 9 ++------- include/linux/irq_work_types.h | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 7 deletions(-) --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -2,8 +2,9 @@ #ifndef _LINUX_IRQ_WORK_H #define _LINUX_IRQ_WORK_H =20 -#include +#include #include +#include =20 /* * An entry can be in one of four states: @@ -14,12 +15,6 @@ * busy NULL, 2 -> {free, claimed} : callback in progress, can be cla= imed */ =20 -struct irq_work { - struct __call_single_node node; - void (*func)(struct irq_work *); - struct rcuwait irqwait; -}; - #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ .node =3D { .u_flags =3D (_flags), }, \ .func =3D (_func), \ --- /dev/null +++ b/include/linux/irq_work_types.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_IRQ_WORK_TYPES_H +#define _LINUX_IRQ_WORK_TYPES_H + +#include +#include + +struct irq_work { + struct __call_single_node node; + void (*func)(struct irq_work *); + struct rcuwait irqwait; +}; + +#endif From nobody Fri Dec 19 22:01:53 2025 Received: from galois.linutronix.de (Galois.linutronix.de [193.142.43.55]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id EBD62343215 for ; Wed, 15 Oct 2025 17:30:01 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=193.142.43.55 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549403; cv=none; b=J+Etm9hNAvSTDYm8TqApuS+lbcxwuABksTQPyVcq1irGAk0wCUzPX6La5H8OeyQfhro9sksR3Ds6Kllkk2Bqo/lrLPDBY7p7E43t52T7xpPa3MlVt2l5IIpKsY7qCcbdNEq8sVFp2AFw38UEDfmhfCqBlniUPQN/EF4X40Q+5+k= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549403; c=relaxed/simple; bh=ngWj+GWmj2f9H2VtFKmQZ4jMk/az4KaE7BA1y4O3V7U=; h=Message-ID:From:To:Cc:Subject:References:MIME-Version: Content-Type:Date; b=q84L2Tyz7bux8L7TyTc6JjhszFAD15zMhHttqh3xG+trKyWWerlISe5KJvrTqAOc2P4url99QtGqe9Ej0SRVqG8MnSA4RpcJSrfB1tOf/KjTg/5r0+rKEQtfcDXmgJGKcoY9BXyIlmIybdJ8tLLSs8Il+C4OF1JnPA9eZsugknc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de; spf=pass smtp.mailfrom=linutronix.de; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=VCyI0SC4; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=Ir7QbTM6; arc=none smtp.client-ip=193.142.43.55 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linutronix.de Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="VCyI0SC4"; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="Ir7QbTM6" Message-ID: <20251015172835.436893677@linutronix.de> DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1760549399; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=TsxaaJ2h7vbj4zTWPeO2h6tlX4WG3sdVTWO/SJZts8E=; b=VCyI0SC4E8JDkLlWcLaLIjvgws3dlYVbSYNorU5R328GYuKEihQeyOs+Xi3yXEEAKf0CUa Cqofeb2KUazXAwTum0xZQDxl2SLEaPG9x+C5H4feif+Eti2gnm4jqmF6R/2Ki8zHsox6Fd IeyjnmyKdvE0wDLmCEmpCOTSwPoNnWBb2PMtIUzbNZkeqY/KEylWwgW/as61PluKPN2okw 
N1Tts0z095Ps05zKO0jvgt6CQTntZejqzv86yevx2kDq6icQYuC8VLkF6PdWuw7F7zsy+E Pr+fSP9ibU4LoGUv6/w86dAn1EAK64ENA5hTsZHiBJNstTTDJlNqv/Jy9nQ7Hg== DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020e; t=1760549399; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=TsxaaJ2h7vbj4zTWPeO2h6tlX4WG3sdVTWO/SJZts8E=; b=Ir7QbTM6m6C//KoHFfJTW372rUwpQPpxOSqfAVneztQnYnwXWERugVt9PJEzmZQ5NgUGRf 5tCKPHElOHfUrbBQ== From: Thomas Gleixner To: LKML Cc: Peter Zijlstra , Gabriele Monaco , Mathieu Desnoyers , Michael Jeanson , Jens Axboe , "Paul E. McKenney" , "Gautham R. Shenoy" , Florian Weimer , Tim Chen , TCMalloc Team Subject: [patch 18/19] sched/mmcid: Implement deferred mode change References: <20251015164952.694882104@linutronix.de> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Date: Wed, 15 Oct 2025 19:29:58 +0200 (CEST) Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" When affinity changes cause an increase of the number of CPUs allowed for tasks which are related to a MM, that might results in a situation where the ownership mode can go back from per CPU mode to per task mode. As affinity changes happen with runqueue lock held there is no way to do the actual mode change and required fixup right there. Add the infrastructure to defer it to a workqueue. The scheduled work can race with a fork() or exit(). Whatever happens first takes care of it. Signed-off-by: Thomas Gleixner --- include/linux/rseq_types.h | 8 ++++++ kernel/sched/core.c | 58 +++++++++++++++++++++++++++++++++++++++-= ----- 2 files changed, 59 insertions(+), 7 deletions(-) --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -2,7 +2,9 @@ #ifndef _LINUX_RSEQ_TYPES_H #define _LINUX_RSEQ_TYPES_H =20 +#include #include +#include =20 #ifdef CONFIG_RSEQ struct rseq; @@ -127,6 +129,8 @@ struct mm_cid_pcpu { * do not actually share the MM. * @pcpu_thrs: Threshold for switching back from per CPU mode * @update_deferred: A deferred switch back to per task mode is pending. + * @irq_work: irq_work to handle the affinity mode change case + * @work: Regular work to handle the affinity mode change case * @lock: Spinlock to protect against affinity setting which can't take @= mutex * @mutex: Mutex to serialize forks and exits related to this mm */ @@ -142,6 +146,10 @@ struct mm_mm_cid { unsigned int pcpu_thrs; unsigned int update_deferred; =20 + /* Rarely used. Moves @lock and @mutex into the second cacheline */ + struct irq_work irq_work; + struct work_struct work; + raw_spinlock_t lock; struct mutex mutex; }____cacheline_aligned_in_smp; --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10566,8 +10566,17 @@ static inline void mm_update_cpus_allowe =20 /* Adjust the threshold to the wider set */ mc->pcpu_thrs =3D mm_cid_calc_pcpu_thrs(mc); + /* Switch back to per task mode? 
*/ + if (mc->users >=3D mc->pcpu_thrs) + return; + + /* Don't queue twice */ + if (mc->update_deferred) + return; =20 - /* Scheduling of deferred mode switch goes here */ + /* Queue the irq work, which schedules the real work */ + mc->update_deferred =3D true; + irq_work_queue(&mc->irq_work); } =20 static inline void mm_cid_transfer_to_task(struct task_struct *t, struct m= m_cid_pcpu *pcp) @@ -10578,7 +10587,7 @@ static inline void mm_cid_transfer_to_ta } } =20 -static void __maybe_unused mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm) +static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm) { unsigned int cpu; =20 @@ -10722,14 +10731,47 @@ void sched_mm_cid_after_execve(struct ta mm_cid_select(t); } =20 -void mm_init_cid(struct mm_struct *mm, struct task_struct *p) +static void mm_cid_work_fn(struct work_struct *work) { - struct mm_cid_pcpu __percpu *pcpu =3D mm->mm_cid.pcpu; - int cpu; + struct mm_struct *mm =3D container_of(work, struct mm_struct, mm_cid.work= ); + + /* Make it compile, but not functional yet */ + if (!IS_ENABLED(CONFIG_NEW_MM_CID)) + return; =20 - for_each_possible_cpu(cpu) - per_cpu_ptr(pcpu, cpu)->cid =3D MM_CID_UNSET; + guard(mutex)(&mm->mm_cid.mutex); + /* Did the last user task exit already? */ + if (!mm->mm_cid.users) + return; =20 + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + /* Have fork() or exit() handled it already? */ + if (!mm->mm_cid.update_deferred) + return; + /* This clears mm_cid::update_deferred */ + if (!mm_update_max_cids(mm)) + return; + /* Affinity changes can only switch back to task mode */ + if (WARN_ON_ONCE(mm->mm_cid.percpu)) + return; + } + mm_cid_fixup_cpus_to_tasks(mm); +} + +static void mm_cid_irq_work(struct irq_work *work) +{ + struct mm_struct *mm =3D container_of(work, struct mm_struct, mm_cid.irq_= work); + + /* + * Needs to be unconditional because mm_cid::lock cannot be held + * when scheduling work as mm_update_cpus_allowed() nests inside + * rq::lock and schedule_work() might end up in wakeup... 
+ */ + schedule_work(&mm->mm_cid.work); +} + +void mm_init_cid(struct mm_struct *mm, struct task_struct *p) +{ mm->mm_cid.max_cids =3D 0; mm->mm_cid.percpu =3D 0; mm->mm_cid.nr_cpus_allowed =3D p->nr_cpus_allowed; @@ -10738,6 +10780,8 @@ void mm_init_cid(struct mm_struct *mm, s mm->mm_cid.update_deferred =3D 0; raw_spin_lock_init(&mm->mm_cid.lock); mutex_init(&mm->mm_cid.mutex); + mm->mm_cid.irq_work =3D IRQ_WORK_INIT_HARD(mm_cid_irq_work); + INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn); cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); bitmap_zero(mm_cidmask(mm), nr_cpu_ids); } From nobody Fri Dec 19 22:01:53 2025 Received: from galois.linutronix.de (Galois.linutronix.de [193.142.43.55]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 76F1E343D63 for ; Wed, 15 Oct 2025 17:30:02 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=193.142.43.55 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549404; cv=none; b=G2E6XbgbkHBonMcq/HFelsSCtTyDuNPJEJE2sg/xg6oYvrGD0M/kMcb0BEdqpOFnlxbRTj+h6k0dQea58/HNkNT+4oNCTfX39f6JEi3g7zhz8kFLTlDGfiSQ5UE/YR4uJeWR3PKCA34GeRKXepABtwGjY0cqV3gj9madhwv6ZXk= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1760549404; c=relaxed/simple; bh=0K4fv2AJCUkhxe48bYIU/vRDtRk+pFjMr73nL2NLOfs=; h=Message-ID:From:To:Cc:Subject:References:MIME-Version: Content-Type:Date; b=bB57W8NJMYZKwspXD6D9hIzcTqa/llG/r1r7OuXLErLjnqTmGRqef8k2kCYZIxv4MWWb/WAOebc6I5peolCZ+29yXT2sd9SCq9GmzOmJf2rQ1AVyMBYaAg6UZxWRAi6wsImrZpKC9T7OZRq64xNfhhs8L/NcngZjtEtoI9U1qww= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de; spf=pass smtp.mailfrom=linutronix.de; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=rfvJgKOO; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b=9BZze9kB; arc=none smtp.client-ip=193.142.43.55 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linutronix.de Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linutronix.de Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="rfvJgKOO"; dkim=permerror (0-bit key) header.d=linutronix.de header.i=@linutronix.de header.b="9BZze9kB" Message-ID: <20251015172835.497158969@linutronix.de> DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1760549401; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=IdbvqWwp2x8aghZk+yjpSDK9ScBW1vSPmZGBckMJu2I=; b=rfvJgKOOrDlfCYaYh3nA31pEKPhbII0ykbyWZz5VvN3feaWZp0RPPB1rxQIZRV54GQ57Gv fGnvM+iAEFsKtwpyS/YcuAd+mRsqWC2lxFqAmSC+ezguq4u/yia6jEIbjKAZHS5ghPGX5I Tv0xliMMhIKgJQAnRvXN1rNUPj/ip4hCDMTJZI77YwYr1CRofd+yXSQp9jrq3U9FkYXprh 2Ef3mVPS+irWC6+OE/VMxfjr1uMlaBqUg++Zv2tY6BkF0kDWhMpNrrkkqpwXpqe6hWbi4x g6hFdu8553z7AVH8FjpmAb/Ja9QgxPadF8doNygsnUnWODBI1GT7VHm6z+ynng== DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020e; t=1760549401; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: references:references; bh=IdbvqWwp2x8aghZk+yjpSDK9ScBW1vSPmZGBckMJu2I=; 
b=9BZze9kBBuumu38EXdZHTd5bzBZ+zc4ccAEDlzPVbIfHZ2ZF3El92jMte+PDSMZIabs+Ky Qt5IB1D1yfATnaDA== From: Thomas Gleixner To: LKML Cc: Peter Zijlstra , Gabriele Monaco , Mathieu Desnoyers , Michael Jeanson , Jens Axboe , "Paul E. McKenney" , "Gautham R. Shenoy" , Florian Weimer , Tim Chen , TCMalloc Team Subject: [patch 19/19] sched/mmcid: Switch over to the new mechanism References: <20251015164952.694882104@linutronix.de> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Date: Wed, 15 Oct 2025 19:30:00 +0200 (CEST) Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Now that all pieces are in place, change the implementations of sched_mm_cid_fork() and sched_mm_cid_exit() to adhere to the new strict ownership scheme and switch context_switch() over to use the new mm_cid_schedin() functionality. The common case is that there is no mode change required, which makes fork() and exit() just update the user count and the constraints. In case that a new user would exceed the CID space limit the fork() context handles the transition to per CPU mode with mm::mm_cid::mutex held. exit() handles the transition back to per task mode when the user count drops below the switch back threshold. fork() might also be forced to handle a deferred switch back to per task mode, when a affinity change increased the number of allowed CPUs enough. Signed-off-by: Thomas Gleixner --- include/linux/rseq.h | 19 ------- include/linux/rseq_types.h | 8 +-- kernel/fork.c | 1=20 kernel/sched/core.c | 109 ++++++++++++++++++++++++++++++++++++++--= ----- kernel/sched/sched.h | 78 -------------------------------- 5 files changed, 99 insertions(+), 116 deletions(-) --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -82,24 +82,6 @@ static __always_inline void rseq_sched_s t->rseq.event.ids_changed =3D true; } =20 -/* - * Invoked from switch_mm_cid() in context switch when the task gets a MM - * CID assigned. - * - * This does not raise TIF_NOTIFY_RESUME as that happens in - * rseq_sched_switch_event(). - */ -static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct = *t, unsigned int cid) -{ - /* - * Requires a comparison as the switch_mm_cid() code does not - * provide a conditional for it readily. So avoid excessive updates - * when nothing changes. 
-	 */
-	if (t->rseq.ids.mm_cid != cid)
-		t->rseq.event.ids_changed = true;
-}
-
 /* Enforce a full update after RSEQ registration and when execve() failed */
 static inline void rseq_force_update(void)
 {
@@ -177,7 +159,6 @@ static inline void rseq_handle_slowpath(
 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
 static inline void rseq_sched_switch_event(struct task_struct *t) { }
 static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
-static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { }
 static inline void rseq_force_update(void) { }
 static inline void rseq_virt_userspace_exit(void) { }
 static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { }
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -100,18 +100,18 @@ struct rseq_data {
 };
 /**
  * struct sched_mm_cid - Storage for per task MM CID data
  * @active:	MM CID is active for the task
- * @cid:	The CID associated to the task
- * @last_cid:	The last CID associated to the task
+ * @cid:	The CID associated to the task either permanently or
+ *		borrowed from the CPU
  */
 struct sched_mm_cid {
 	unsigned int active;
 	unsigned int cid;
-	unsigned int last_cid;
 };
 
 /**
  * struct mm_cid_pcpu - Storage for per CPU MM_CID data
- * @cid:	The CID associated to the CPU
+ * @cid:	The CID associated to the CPU either permanently or
+ *		while a task with a CID is running
  */
 struct mm_cid_pcpu {
 	unsigned int cid;
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -956,7 +956,6 @@ static struct task_struct *dup_task_stru
 
 #ifdef CONFIG_SCHED_MM_CID
 	tsk->mm_cid.cid = MM_CID_UNSET;
-	tsk->mm_cid.last_cid = MM_CID_UNSET;
 	tsk->mm_cid.active = 0;
 #endif
 	return tsk;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5339,7 +5339,7 @@ context_switch(struct rq *rq, struct tas
 		}
 	}
 
-	switch_mm_cid(prev, next);
+	mm_cid_schedin(next);
 
 	/*
 	 * Tell rseq that the task was scheduled in. Must be after
@@ -10632,7 +10632,7 @@ static bool mm_cid_fixup_task_to_cpu(str
 	return true;
 }
 
-static void __maybe_unused mm_cid_fixup_tasks_to_cpus(void)
+static void mm_cid_fixup_tasks_to_cpus(void)
 {
 	struct mm_struct *mm = current->mm;
 	struct task_struct *p, *t;
@@ -10682,14 +10682,42 @@ static bool sched_mm_cid_add_user(struct
 void sched_mm_cid_fork(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
+	bool percpu;
 
 	WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
 
 	guard(mutex)(&mm->mm_cid.mutex);
-	scoped_guard(raw_spinlock, &mm->mm_cid.lock) {
-		sched_mm_cid_add_user(t, mm);
-		/* Preset last_cid for mm_cid_select() */
-		t->mm_cid.last_cid = mm->mm_cid.max_cids - 1;
+	scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+		struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu);
+
+		/* First user ? */
+		if (!mm->mm_cid.users) {
+			sched_mm_cid_add_user(t, mm);
+			t->mm_cid.cid = mm_get_cid(mm);
+			/* Required for execve() */
+			pcp->cid = t->mm_cid.cid;
+			return;
+		}
+
+		if (!sched_mm_cid_add_user(t, mm)) {
+			if (!mm->mm_cid.percpu)
+				t->mm_cid.cid = mm_get_cid(mm);
+			return;
+		}
+
+		/* Handle the mode change and transfer current's CID */
+		percpu = !!mm->mm_cid.percpu;
+		if (!percpu)
+			mm_cid_transfer_to_task(current, pcp);
+		else
+			mm_cid_transfer_to_cpu(current, pcp);
+	}
+
+	if (percpu) {
+		mm_cid_fixup_tasks_to_cpus();
+	} else {
+		mm_cid_fixup_cpus_to_tasks(mm);
+		t->mm_cid.cid = mm_get_cid(mm);
+	}
 }
 
@@ -10701,6 +10729,30 @@ static bool sched_mm_cid_remove_user(str
 	return mm_update_max_cids(t->mm);
 }
 
+static bool __sched_mm_cid_exit(struct task_struct *t)
+{
+	struct mm_struct *mm = t->mm;
+
+	if (!sched_mm_cid_remove_user(t))
+		return false;
+	/*
+	 * Contrary to fork() this only deals with a switch back to per
+	 * task mode either because the above decreased users or an
+	 * affinity change increased the number of allowed CPUs and the
+	 * deferred fixup did not run yet.
+	 */
+	if (WARN_ON_ONCE(mm->mm_cid.percpu))
+		return false;
+	/*
+	 * A failed fork(2) cleanup never gets here, so @current must have
+	 * the same MM as @t. That's true for exit() and the failed
+	 * pthread_create() cleanup case.
+	 */
+	if (WARN_ON_ONCE(current->mm != mm))
+		return false;
+	return true;
+}
+
 /*
  * When a task exits, the MM CID held by the task is not longer required as
  * the task cannot return to user space.
@@ -10711,10 +10763,43 @@ void sched_mm_cid_exit(struct task_struc
 
 	if (!mm || !t->mm_cid.active)
 		return;
+	/*
+	 * Ensure that only one instance is doing MM CID operations within
+	 * a MM. The common case is uncontended. The rare fixup case adds
+	 * some overhead.
+	 */
+	scoped_guard(mutex, &mm->mm_cid.mutex) {
+		/* mm_cid::mutex is sufficient to protect mm_cid::users */
+		if (likely(mm->mm_cid.users > 1)) {
+			scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+				if (!__sched_mm_cid_exit(t))
+					return;
+				/* Mode change required. Transfer current's CID */
+				mm_cid_transfer_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
+			}
+			mm_cid_fixup_cpus_to_tasks(mm);
+			return;
+		}
+		/* Last user */
+		scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+			/* Required across execve() */
+			if (t == current)
+				mm_cid_transfer_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu));
+			/* Ignore mode change. There is nothing to do. */
+			sched_mm_cid_remove_user(t);
+		}
+	}
 
-	guard(mutex)(&mm->mm_cid.mutex);
-	scoped_guard(raw_spinlock, &mm->mm_cid.lock)
-		sched_mm_cid_remove_user(t);
+	/*
+	 * As this is the last user (execve(), process exit or failed
+	 * fork(2)) there is no concurrency anymore.
+	 *
+	 * Synchronize possibly pending work to ensure that there are no
+	 * dangling references left. @t->mm_cid.users is zero so nothing
+	 * can queue this work anymore.
+	 */
+	irq_work_sync(&mm->mm_cid.irq_work);
+	cancel_work_sync(&mm->mm_cid.work);
 }
 
 /* Deactivate MM CID allocation across execve() */
@@ -10727,18 +10812,12 @@ void sched_mm_cid_before_execve(struct t
 void sched_mm_cid_after_execve(struct task_struct *t)
 {
 	sched_mm_cid_fork(t);
-	guard(preempt)();
-	mm_cid_select(t);
 }
 
 static void mm_cid_work_fn(struct work_struct *work)
 {
 	struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work);
 
-	/* Make it compile, but not functional yet */
-	if (!IS_ENABLED(CONFIG_NEW_MM_CID))
-		return;
-
 	guard(mutex)(&mm->mm_cid.mutex);
 	/* Did the last user task exit already? */
 	if (!mm->mm_cid.users)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3704,84 +3704,8 @@ static __always_inline void mm_cid_sched
 	else
 		mm_cid_from_cpu(next, cpu_cid);
 }
-
-/* Active implementation */
-static inline void init_sched_mm_cid(struct task_struct *t)
-{
-	struct mm_struct *mm = t->mm;
-	unsigned int max_cid;
-
-	if (!mm)
-		return;
-
-	/* Preset last_mm_cid */
-	max_cid = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users));
-	t->mm_cid.last_cid = max_cid - 1;
-}
-
-static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigned int max_cids)
-{
-	struct mm_struct *mm = t->mm;
-
-	if (cid >= max_cids)
-		return false;
-	if (test_and_set_bit(cid, mm_cidmask(mm)))
-		return false;
-	t->mm_cid.cid = t->mm_cid.last_cid = cid;
-	__this_cpu_write(mm->mm_cid.pcpu->cid, cid);
-	return true;
-}
-
-static inline bool mm_cid_get(struct task_struct *t)
-{
-	struct mm_struct *mm = t->mm;
-	unsigned int max_cids;
-
-	max_cids = READ_ONCE(mm->mm_cid.max_cids);
-
-	/* Try to reuse the last CID of this task */
-	if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids))
-		return true;
-
-	/* Try to reuse the last CID of this mm on this CPU */
-	if (__mm_cid_get(t, __this_cpu_read(mm->mm_cid.pcpu->cid), max_cids))
-		return true;
-
-	/* Try the first zero bit in the cidmask. */
-	return __mm_cid_get(t, find_first_zero_bit(mm_cidmask(mm), nr_cpu_ids), max_cids);
-}
-
-static inline void mm_cid_select(struct task_struct *t)
-{
-	/*
-	 * mm_cid_get() can fail when the maximum CID, which is determined
-	 * by min(mm->nr_cpus_allowed, mm->mm_users) changes concurrently.
-	 * That's a transient failure as there cannot be more tasks
-	 * concurrently on a CPU (or about to be scheduled in) than that.
-	 */
-	for (;;) {
-		if (mm_cid_get(t))
-			break;
-	}
-}
-
-static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
-{
-	if (prev->mm_cid.active) {
-		if (prev->mm_cid.cid != MM_CID_UNSET)
-			clear_bit(prev->mm_cid.cid, mm_cidmask(prev->mm));
-		prev->mm_cid.cid = MM_CID_UNSET;
-	}
-
-	if (next->mm_cid.active) {
-		mm_cid_select(next);
-		rseq_sched_set_task_mm_cid(next, next->mm_cid.cid);
-	}
-}
-
 #else /* !CONFIG_SCHED_MM_CID: */
-static inline void mm_cid_select(struct task_struct *t) { }
-static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
+static inline void mm_cid_schedin(struct task_struct *next) { }
 #endif /* !CONFIG_SCHED_MM_CID */
 
 extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
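The mode selection described in the changelog of this patch boils down to bookkeeping on the user count versus the CID space limit, with a transfer of CID ownership whenever the mode flips. The following stand-alone model is a minimal sketch of that decision only, assuming invented names and an invented threshold field; it does not reflect the actual kernel data structures or locking.

/*
 * Stand-alone model of the per-task vs. per-CPU mode decision described
 * in the changelog. All names and the threshold are illustrative; the
 * real state lives in mm->mm_cid and is serialized by mm_cid::mutex and
 * mm_cid::lock.
 */
#include <stdbool.h>

struct cid_mode_model {
	unsigned int users;		/* tasks sharing the MM */
	unsigned int max_cids;		/* CID space limit */
	unsigned int switch_back;	/* hysteresis threshold, below max_cids */
	bool percpu;			/* true: CPUs own the CIDs, false: tasks own them */
};

/* fork(): adding a user may exceed the CID space -> switch to per CPU mode */
static bool cid_model_fork(struct cid_mode_model *m)
{
	m->users++;
	if (!m->percpu && m->users > m->max_cids) {
		m->percpu = true;
		return true;	/* caller transfers CIDs from tasks to CPUs */
	}
	return false;
}

/* exit(): dropping below the switch-back threshold -> back to per task mode */
static bool cid_model_exit(struct cid_mode_model *m)
{
	m->users--;
	if (m->percpu && m->users < m->switch_back) {
		m->percpu = false;
		return true;	/* caller transfers CIDs from CPUs to tasks */
	}
	return false;
}

Keeping the switch-back threshold below the limit is the usual hysteresis trick: it avoids flapping between the two modes when the user count oscillates around the CID space limit.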