We've identified a regression introduced by commit abfc01077df6 ("sched: Fix
do_set_cpus_allowed() locking") that causes a reproducible deadlock during CPU
hotplug testing on s390x.
While running the cpuhotplug02.sh test from LTP, which dynamically
offlines and onlines CPUs, we consistently see the system enter a
stalled state.
Observed behavior:
- migration/N attempts to migrate a task currently executing on another
CPU.
- Concurrently, rcu_sched tries to complete an RCU grace period.
- Both threads are blocked on spinlocks (e.g., arch_spin_lock_wait),
likely due to lock contention.
- Neither thread progresses; the grace period stalls.
- The kernel detects the stall and triggers a crash dump.
Sys info:
RELEASE: 6.18.0-20251021.rc2.git224.fe45352cd106.63.fc42.s390x+next
CPUS: 32
TASKS: 623
MEMORY: 16 GB
Crash log excerpt:
[ 6146.992159] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
[ 6146.992173] rcu: 1-...0: (5 ticks this GP) idle=cea4/1/0x4000000000000000 softirq=1055899/1055901 fqs=4769
[ 6146.992236] rcu: (detected by 3, t=240013 jiffies, g=2041729, q=14778 ncpus=32)
[ 6146.992240] Task dump for CPU 1:
[ 6146.992241] task:migration/1 state:R running task stack:0 pid:22 tgid:22 ppid:2 task_flags:0x4208040 flags:0x00000000
[ 6146.992246] Stopper: __balance_push_cpu_stop+0x0/0x230 <- balance_push+0xea/0x170
[ 6146.992254] Call Trace:
[ 6146.992255] [<000000009d9e2300>] 0x9d9e2300
[ 6146.992280] rcu: rcu_sched kthread starved for 210010 jiffies! g2041729 f0x2 RCU_GP_DOING_FQS(6) ->state=0x0 ->cpu=23
[ 6146.992287] rcu: Unless rcu_sched kthread gets sufficient CPU time, OOM is now expected behavior.
[ 6146.992288] rcu: RCU grace-period kthread stack dump:
[ 6146.992289] task:rcu_sched state:R running task stack:0 pid:16 tgid:16 ppid:2 task_flags:0x208040 flags:0x00000010
[ 6146.992294] Call Trace:
[ 6146.992295] [<0700000000000001>] 0x700000000000001
[ 6146.992298] [<000002e1fb072998>] arch_spin_lock_wait+0xc8/0x110
[ 6146.992303] [<000002e1fa239d06>] raw_spin_rq_lock_nested+0x96/0xc0
[ 6146.992306] [<000002e1fa23bc90>] resched_cpu+0x50/0xc0
[ 6146.992309] [<000002e1fa29d646>] force_qs_rnp+0x306/0x3e0
[ 6146.992314] [<000002e1fa29ed30>] rcu_gp_fqs_loop+0x430/0x6e0
[ 6146.992316] [<000002e1fa2a1b0e>] rcu_gp_kthread+0x1ee/0x270
[ 6146.992320] [<000002e1fa228edc>] kthread+0x12c/0x250
[ 6146.992323] [<000002e1fa19ccfc>] __ret_from_fork+0x3c/0x150
[ 6146.992328] [<000002e1fb0800ba>] ret_from_fork+0xa/0x30
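The stopper callback reported for migration/1 (__balance_push_cpu_stop) and the
rcu_sched backtrace (resched_cpu() stuck in raw_spin_rq_lock_nested()) both
point at a runqueue lock that is taken and never released. One plausible
sequence is sketched below purely as an illustration: the select_fallback_rq()
and do_set_cpus_allowed() fragments are copied from the diff further down,
while the assumption that __balance_push_cpu_stop() already holds both
p->pi_lock and rq->lock when it picks a fallback CPU comes from reading the
mainline balance_push path and has not been confirmed against this dump.

	/* Abbreviated sketch of the suspected path, not verbatim kernel code. */
	static int __balance_push_cpu_stop(void *arg)	/* stopper on the outgoing CPU */
	{
		struct task_struct *p = arg;
		struct rq *rq = this_rq();

		/* assumed: p->pi_lock and rq->lock are held from here on */
		int cpu = select_fallback_rq(rq->cpu, p);
		/* ... __migrate_task(), unlock ... */
	}

	/* select_fallback_rq(), 'possible' case, as in the diff below: */
	do_set_cpus_allowed(p, task_cpu_fallback_mask(p));

	/* do_set_cpus_allowed(), as changed by the commit: */
	scoped_guard (__task_rq_lock, p) {	/* re-takes task_rq(p)->lock, which    */
		update_rq_clock(scope.rq);	/* this CPU already holds, so the      */
		__do_set_cpus_allowed(p, &ac);	/* guard spins in arch_spin_lock_wait  */
	}

If that is what happens, CPU 1 spins on its own rq lock inside the stopper,
and rcu_sched in turn spins on the same lock in resched_cpu(), which would
match both backtraces above.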
The following commit has been merged into the sched/core branch of tip:
Commit-ID: abfc01077df66593f128d966fdad1d042facc9ac
Gitweb: https://git.kernel.org/tip/abfc01077df66593f128d966fdad1d042facc9ac
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Wed, 10 Sep 2025 09:51:06 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:52 +02:00
sched: Fix do_set_cpus_allowed() locking
All callers of do_set_cpus_allowed() only take p->pi_lock, which is
not sufficient to actually change the cpumask. Again, this is mostly
OK in these cases, but it results in unnecessarily complicated
reasoning.
Furthermore, there is no reason whatsoever not to just take all the
required locks, so do just that.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
---
kernel/kthread.c | 15 +++++----------
kernel/sched/core.c | 21 +++++++--------------
kernel/sched/sched.h | 5 +++++
3 files changed, 17 insertions(+), 24 deletions(-)
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 31b072e..832bd2a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -593,18 +593,16 @@ EXPORT_SYMBOL(kthread_create_on_node);
static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
{
- unsigned long flags;
-
if (!wait_task_inactive(p, state)) {
WARN_ON(1);
return;
}
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
+ do_set_cpus_allowed(p, mask);
+
/* It's safe because the task is inactive. */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- do_set_cpus_allowed(p, mask);
p->flags |= PF_NO_SETAFFINITY;
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
@@ -857,7 +855,6 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
{
struct kthread *kthread = to_kthread(p);
cpumask_var_t affinity;
- unsigned long flags;
int ret = 0;
if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {
@@ -882,10 +879,8 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
kthread_fetch_affinity(kthread, affinity);
- /* It's safe because the task is inactive. */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- do_set_cpus_allowed(p, affinity);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
+ do_set_cpus_allowed(p, affinity);
mutex_unlock(&kthreads_hotplug_lock);
out:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f2d16d1..805e650 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2668,18 +2668,14 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
bool queued, running;
lockdep_assert_held(&p->pi_lock);
+ lockdep_assert_rq_held(rq);
queued = task_on_rq_queued(p);
running = task_current_donor(rq, p);
- if (queued) {
- /*
- * Because __kthread_bind() calls this on blocked tasks without
- * holding rq->lock.
- */
- lockdep_assert_rq_held(rq);
+ if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
- }
+
if (running)
put_prev_task(rq, p);
@@ -2708,7 +2704,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
struct rcu_head rcu;
};
- __do_set_cpus_allowed(p, &ac);
+ scoped_guard (__task_rq_lock, p) {
+ update_rq_clock(scope.rq);
+ __do_set_cpus_allowed(p, &ac);
+ }
/*
* Because this is called with p->pi_lock held, it is not possible
@@ -3483,12 +3482,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
}
fallthrough;
case possible:
- /*
- * XXX When called from select_task_rq() we only
- * hold p->pi_lock and again violate locking order.
- *
- * More yuck to audit.
- */
do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
state = fail;
break;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bcde43d..b23ce9c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1847,6 +1847,11 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
task_rq_unlock(_T->rq, _T->lock, &_T->rf),
struct rq *rq; struct rq_flags rf)
+DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
+ _T->rq = __task_rq_lock(_T->lock, &_T->rf),
+ __task_rq_unlock(_T->rq, &_T->rf),
+ struct rq *rq; struct rq_flags rf)
+
static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
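For readers unfamiliar with the guard infrastructure used here:
DEFINE_LOCK_GUARD_1(__task_rq_lock, ...) in the sched.h hunk defines a
scope-based lock class whose constructor runs __task_rq_lock() and whose
destructor runs __task_rq_unlock(), with the rq pointer and rq_flags carried
as extra members. Hand-expanded for illustration only (the real macros in
<linux/cleanup.h> rely on a cleanup attribute and differ in detail), the
scoped_guard() added to do_set_cpus_allowed() behaves roughly like:

	{
		struct {
			struct task_struct *lock;	/* the guard argument: p */
			struct rq *rq;			/* extra member          */
			struct rq_flags rf;		/* extra member          */
		} scope = { .lock = p };

		/* "constructor": take the rq lock for task_rq(p) */
		scope.rq = __task_rq_lock(scope.lock, &scope.rf);

		update_rq_clock(scope.rq);
		__do_set_cpus_allowed(p, &ac);

		/* "destructor": runs on every exit from the scope */
		__task_rq_unlock(scope.rq, &scope.rf);
	}

Together with the lockdep_assert_rq_held(rq) added to __do_set_cpus_allowed(),
this makes the locking rule uniform: callers provide p->pi_lock, and
do_set_cpus_allowed() acquires the runqueue lock itself.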