All callers of do_set_cpus_allowed() only take p->pi_lock, which is
not sufficient to actually change the cpumask. Again, this is mostly
ok in these cases, but it results in unnecessarily complicated
reasoning.
Furthermore, there is no reason whatsoever not to just take all the
required locks, so do just that.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/kthread.c | 15 +++++----------
kernel/sched/core.c | 21 +++++++--------------
kernel/sched/sched.h | 5 +++++
3 files changed, 17 insertions(+), 24 deletions(-)
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -593,18 +593,16 @@ EXPORT_SYMBOL(kthread_create_on_node);
static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
{
- unsigned long flags;
-
if (!wait_task_inactive(p, state)) {
WARN_ON(1);
return;
}
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
+ do_set_cpus_allowed(p, mask);
+
/* It's safe because the task is inactive. */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- do_set_cpus_allowed(p, mask);
p->flags |= PF_NO_SETAFFINITY;
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
@@ -857,7 +855,6 @@ int kthread_affine_preferred(struct task
{
struct kthread *kthread = to_kthread(p);
cpumask_var_t affinity;
- unsigned long flags;
int ret = 0;
if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {
@@ -882,10 +879,8 @@ int kthread_affine_preferred(struct task
list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
kthread_fetch_affinity(kthread, affinity);
- /* It's safe because the task is inactive. */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- do_set_cpus_allowed(p, affinity);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
+ do_set_cpus_allowed(p, affinity);
mutex_unlock(&kthreads_hotplug_lock);
out:
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2703,18 +2703,14 @@ __do_set_cpus_allowed(struct task_struct
bool queued, running;
lockdep_assert_held(&p->pi_lock);
+ lockdep_assert_rq_held(rq);
queued = task_on_rq_queued(p);
running = task_current_donor(rq, p);
- if (queued) {
- /*
- * Because __kthread_bind() calls this on blocked tasks without
- * holding rq->lock.
- */
- lockdep_assert_rq_held(rq);
+ if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
- }
+
if (running)
put_prev_task(rq, p);
@@ -2743,7 +2739,10 @@ void do_set_cpus_allowed(struct task_str
struct rcu_head rcu;
};
- __do_set_cpus_allowed(p, &ac);
+ scoped_guard (__task_rq_lock, p) {
+ update_rq_clock(scope.rq);
+ __do_set_cpus_allowed(p, &ac);
+ }
/*
* Because this is called with p->pi_lock held, it is not possible
@@ -3518,12 +3517,6 @@ static int select_fallback_rq(int cpu, s
}
fallthrough;
case possible:
- /*
- * XXX When called from select_task_rq() we only
- * hold p->pi_lock and again violate locking order.
- *
- * More yuck to audit.
- */
do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
state = fail;
break;
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1822,6 +1822,11 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct
task_rq_unlock(_T->rq, _T->lock, &_T->rf),
struct rq *rq; struct rq_flags rf)
+DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
+ _T->rq = __task_rq_lock(_T->lock, &_T->rf),
+ __task_rq_unlock(_T->rq, &_T->rf),
+ struct rq *rq; struct rq_flags rf)
+
static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
On Wed, Sep 10, 2025 at 05:44:16PM +0200, Peter Zijlstra wrote: > All callers of do_set_cpus_allowed() only take p->pi_lock, which is > not sufficient to actually change the cpumask. Again, this is mostly > ok in these cases, but it results in unnecessarily complicated > reasoning. We're seeing lockups on some arm64 platforms in -next with the LTP cpuhotplug02 test, the machine sits there repeatedly complaining that RCU is stalled on IPIs: Running tests....... Name: cpuhotplug02 Date: Wed Oct 29 17:22:13 UTC 2025 Desc: What happens to a process when its CPU is offlined? CPU is 1 <3>[ 89.915745] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks: <3>[ 89.922145] rcu: 1-...0: (0 ticks this GP) idle=28f4/1/0x4000000000000000 softirq=2203/2203 fqs=195 <3>[ 89.931570] rcu: (detected by 4, t=5256 jiffies, g=10357, q=7 ncpus=6) <6>[ 89.938465] Sending NMI from CPU 4 to CPUs 1: <3>[ 99.944589] rcu: rcu_preempt kthread starved for 2629 jiffies! g10357 f0x0 RCU_GP_DOING_FQS(6) ->state=0x0 ->cpu=0 <3>[ 99.955226] rcu: Unless rcu_preempt kthread gets sufficient CPU time, OOM is now expected behavior. <3>[ 99.964637] rcu: RCU grace-period kthread stack dump: <6>[ 99.969957] task:rcu_preempt state:R running task stack:0 pid:15 tgid:15 ppid:2 task_flags:0x208040 flags:0x00000012 <6>[ 99.982871] Call trace: <6>[ 99.985582] __switch_to+0xf0/0x1c0 (T) <6>[ 99.989702] arch_send_call_function_single_ipi+0x30/0x3c <6>[ 99.995389] __smp_call_single_queue+0xa0/0xb0 <6>[ 100.000118] irq_work_queue_on+0x78/0xd0 <6>[ 100.004323] rcu_watching_snap_recheck+0x304/0x350 <6>[ 100.009394] force_qs_rnp+0x1d0/0x364 <6>[ 100.013330] rcu_gp_fqs_loop+0x324/0x500 <6>[ 100.017527] rcu_gp_kthread+0x134/0x160 <6>[ 100.021640] kthread+0x12c/0x204 <6>[ 100.025146] ret_from_fork+0x10/0x20 <3>[ 100.029000] rcu: Stack dump where RCU GP kthread last ran: with the same stack trace repeating ad infinitum. 
A bisect converges fairly smoothly on this commit which looks plausible though I've not really looked closely: git bisect start # status: waiting for both good and bad commits # bad: [f9ba12abc5282bf992f9a9ae87ad814fd03a0270] Add linux-next specific files for 20251029 git bisect bad f9ba12abc5282bf992f9a9ae87ad814fd03a0270 # status: waiting for good commit(s), bad commit known # good: [58efa7cdf77aae9e595b5f195d53d9abc37f1ecf] Merge branch 'for-linux-next-fixes' of https://gitlab.freedesktop.org/drm/misc/kernel.git git bisect good 58efa7cdf77aae9e595b5f195d53d9abc37f1ecf # good: [196c1d2131e9e2326e4a6a79eaa1ea54bdc90056] Merge branch 'libcrypto-next' of https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux.git git bisect good 196c1d2131e9e2326e4a6a79eaa1ea54bdc90056 # good: [47af99b9fa06d7207d03f53099c58ab145819c20] Merge branch 'for-next' of https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git git bisect good 47af99b9fa06d7207d03f53099c58ab145819c20 # bad: [53ac14eeef9a69b4e881a5cd8d56ecf054a25dc3] Merge branch 'for-next' of https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git git bisect bad 53ac14eeef9a69b4e881a5cd8d56ecf054a25dc3 # good: [9ac3f65ed6bd03cc83d86c50e51caa1d223e9e76] Merge branch 'for-next' of https://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git git bisect good 9ac3f65ed6bd03cc83d86c50e51caa1d223e9e76 # good: [301e1f4a2740ba1f8f312412b88fb9aabe3be7ec] Merge branch into tip/master: 'objtool/core' git bisect good 301e1f4a2740ba1f8f312412b88fb9aabe3be7ec # bad: [cca14814f28673e48bab2f1db13db420c33a2848] Merge branch 'for-next' of https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git git bisect bad cca14814f28673e48bab2f1db13db420c33a2848 # bad: [9b6f0572b2700cad5f3eaee3ca190ae960f56b80] Merge branch into tip/master: 'x86/entry' git bisect bad 9b6f0572b2700cad5f3eaee3ca190ae960f56b80 # bad: [4c95380701f58b8112f0b891de8d160e4199e19d] sched/ext: Fold balance_scx() into pick_task_scx() git bisect bad 
4c95380701f58b8112f0b891de8d160e4199e19d # good: [6455ad5346c9cf755fa9dda6e326c4028fb3c853] sched: Move sched_class::prio_changed() into the change pattern git bisect good 6455ad5346c9cf755fa9dda6e326c4028fb3c853 # bad: [46a177fb01e52ec0e3f9eab9b217a0f7c8909eeb] sched: Add locking comments to sched_class methods git bisect bad 46a177fb01e52ec0e3f9eab9b217a0f7c8909eeb # bad: [abfc01077df66593f128d966fdad1d042facc9ac] sched: Fix do_set_cpus_allowed() locking git bisect bad abfc01077df66593f128d966fdad1d042facc9ac # good: [942b8db965006cf655d356162f7091a9238da94e] sched: Fix migrate_disable_switch() locking git bisect good 942b8db965006cf655d356162f7091a9238da94e # first bad commit: [abfc01077df66593f128d966fdad1d042facc9ac] sched: Fix do_set_cpus_allowed() locking
On Thu, Oct 30, 2025 at 12:12:01AM +0000, Mark Brown wrote: > On Wed, Sep 10, 2025 at 05:44:16PM +0200, Peter Zijlstra wrote: > > > All callers of do_set_cpus_allowed() only take p->pi_lock, which is > > not sufficient to actually change the cpumask. Again, this is mostly > > ok in these cases, but it results in unnecessarily complicated > > reasoning. > > We're seeing lockups on some arm64 platforms in -next with the LTP > cpuhotplug02 test, the machine sits there repeatedly complaining that > RCU is stalled on IPIs: Did not this help? https://lkml.kernel.org/r/20251027110133.GI3245006@noisy.programming.kicks-ass.net
On Thu, Oct 30, 2025 at 10:07:15AM +0100, Peter Zijlstra wrote: > On Thu, Oct 30, 2025 at 12:12:01AM +0000, Mark Brown wrote: > > We're seeing lockups on some arm64 platforms in -next with the LTP > > cpuhotplug02 test, the machine sits there repeatedly complaining that > > RCU is stalled on IPIs: > Did not this help? > https://lkml.kernel.org/r/20251027110133.GI3245006@noisy.programming.kicks-ass.net It looks like that hadn't landed in yesterday's -next - it showed up today and things do indeed look a lot happier, thanks!
© 2016 - 2026 Red Hat, Inc.