[PATCH] rcu: Break rcu_node_0 --> &rq->__lock order

Posted by Peter Zijlstra 2 years, 2 months ago
On Tue, Oct 31, 2023 at 07:29:04AM -0700, Paul E. McKenney wrote:
> Other than the de-alphabetization of the local variables, it looks
> plausible to me.  Frederic's suggestion also sounds plausible to me.

Having spent the better part of the past two decades using upside-down
xmas trees for local variables, this alphabet thing is obnoxious :-)

But your code, your rules.

To reduce the number of alphabet songs required, I've taken the liberty
to move a few variables into a narrower scope, hope that doesn't offend.
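
(For illustration only, not part of the patch itself: "narrower scope" here
means declarations like

	rcu_for_each_leaf_node(rnp) {
		unsigned long mask = 0;		/* was declared at function scope */
		unsigned long rsmask = 0;	/* new in this patch */
		...
	}

instead of declaring mask and rdp at the top of force_qs_rnp(); see the
hunks below.)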

---
Subject: rcu: Break rcu_node_0 --> &rq->__lock order
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 31 Oct 2023 09:53:08 +0100

Commit 851a723e45d1 ("sched: Always clear user_cpus_ptr in
do_set_cpus_allowed()") added a kfree() call to free any user
provided affinity mask, if present. It was changed later to use
kfree_rcu() in commit 9a5418bc48ba ("sched/core: Use kfree_rcu()
in do_set_cpus_allowed()") to avoid a circular locking dependency
problem.
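
As a reminder of the pattern (a minimal sketch with a made-up struct, not
the actual scheduler data structures): the two-argument kfree_rcu() needs
an rcu_head embedded in the object and frees it only after an RCU grace
period has elapsed, so the free no longer happens synchronously in the
caller's locking context.

	struct foo {
		struct rcu_head rcu;	/* required by kfree_rcu(ptr, field) */
		/* payload */
	};

	kfree(p);		/* immediate free */
	kfree_rcu(p, rcu);	/* queued, kfree()d after a grace period */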

It turns out that even kfree_rcu() isn't safe for avoiding the
circular locking problem. As reported by the kernel test robot,
the following circular locking dependency now exists:

  &rdp->nocb_lock --> rcu_node_0 --> &rq->__lock

Solve this by breaking the rcu_node_0 --> &rq->__lock chain by moving
the resched_cpu() call out from under the rcu_node lock.
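
The shape of the fix (a condensed sketch of the force_qs_rnp() hunk below,
not a literal copy): record which CPUs need the kick in a local mask while
holding the rcu_node lock, and only call resched_cpu(), which takes
rq->__lock, once that lock has been dropped.

	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
		...
		if (ret < 0)			/* CPU needs a resched */
			rsmask |= rdp->grpmask;	/* defer, do not resched_cpu() here */
	}
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);

	/* rcu_node lock no longer held: taking rq->__lock is now safe. */
	for_each_leaf_node_cpu_mask(rnp, cpu, rsmask)
		resched_cpu(cpu);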

[peterz: heavily borrowed from Waiman's Changelog]
Fixes: 851a723e45d1 ("sched: Always clear user_cpus_ptr in do_set_cpus_allowed()")
Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/oe-lkp/202310302207.a25f1a30-oliver.sang@intel.com
---
 kernel/rcu/tree.c |   34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -754,14 +754,19 @@ static int dyntick_save_progress_counter
 }
 
 /*
- * Return true if the specified CPU has passed through a quiescent
- * state by virtue of being in or having passed through an dynticks
- * idle state since the last call to dyntick_save_progress_counter()
- * for this same CPU, or by virtue of having been offline.
+ * Returns positive if the specified CPU has passed through a quiescent state
+ * by virtue of being in or having passed through an dynticks idle state since
+ * the last call to dyntick_save_progress_counter() for this same CPU, or by
+ * virtue of having been offline.
+ *
+ * Returns negative if the specified CPU needs a force resched.
+ *
+ * Returns zero otherwise.
  */
 static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 {
 	unsigned long jtsq;
+	int ret = 0;
 	struct rcu_node *rnp = rdp->mynode;
 
 	/*
@@ -847,8 +852,8 @@ static int rcu_implicit_dynticks_qs(stru
 	    (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) ||
 	     rcu_state.cbovld)) {
 		WRITE_ONCE(rdp->rcu_urgent_qs, true);
-		resched_cpu(rdp->cpu);
 		WRITE_ONCE(rdp->last_fqs_resched, jiffies);
+		ret = -1;
 	}
 
 	/*
@@ -891,7 +896,7 @@ static int rcu_implicit_dynticks_qs(stru
 		}
 	}
 
-	return 0;
+	return ret;
 }
 
 /* Trace-event wrapper function for trace_rcu_future_grace_period.  */
@@ -2257,15 +2262,15 @@ static void force_qs_rnp(int (*f)(struct
 {
 	int cpu;
 	unsigned long flags;
-	unsigned long mask;
-	struct rcu_data *rdp;
 	struct rcu_node *rnp;
 
 	rcu_state.cbovld = rcu_state.cbovldnext;
 	rcu_state.cbovldnext = false;
 	rcu_for_each_leaf_node(rnp) {
+		unsigned long mask = 0;
+		unsigned long rsmask = 0;
+
 		cond_resched_tasks_rcu_qs();
-		mask = 0;
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		rcu_state.cbovldnext |= !!rnp->cbovldmask;
 		if (rnp->qsmask == 0) {
@@ -2283,11 +2288,17 @@ static void force_qs_rnp(int (*f)(struct
 			continue;
 		}
 		for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
+			struct rcu_data *rdp;
+			int ret;
+
 			rdp = per_cpu_ptr(&rcu_data, cpu);
-			if (f(rdp)) {
+			ret = f(rdp);
+			if (ret > 0) {
 				mask |= rdp->grpmask;
 				rcu_disable_urgency_upon_qs(rdp);
 			}
+			if (ret < 0)
+				rsmask |= rdp->grpmask;
 		}
 		if (mask != 0) {
 			/* Idle/offline CPUs, report (releases rnp->lock). */
@@ -2296,6 +2307,9 @@ static void force_qs_rnp(int (*f)(struct
 			/* Nothing to do here, so just drop the lock. */
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		}
+
+		for_each_leaf_node_cpu_mask(rnp, cpu, rsmask)
+			resched_cpu(cpu);
 	}
 }

Re: [PATCH] rcu: Break rcu_node_0 --> &rq->__lock order
Posted by Waiman Long 2 years, 2 months ago
On 10/31/23 16:02, Peter Zijlstra wrote:
> On Tue, Oct 31, 2023 at 07:29:04AM -0700, Paul E. McKenney wrote:
>> Other than the de-alphabetization of the local variables, it looks
>> plausible to me.  Frederic's suggestion also sounds plausible to me.
> Having spent the better part of the past two decades using upside-down
> xmas trees for local variables, this alphabet thing is obnoxious :-)
>
> But your code, your rules.
>
> To reduce the number of alphabet songs required, I've taken the liberty
> to move a few variables into a narrower scope, hope that doesn't offend.
>
> ---
> Subject: rcu: Break rcu_node_0 --> &rq->__lock order
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Tue, 31 Oct 2023 09:53:08 +0100
>
> Commit 851a723e45d1 ("sched: Always clear user_cpus_ptr in
> do_set_cpus_allowed()") added a kfree() call to free any user
> provided affinity mask, if present. It was changed later to use
> kfree_rcu() in commit 9a5418bc48ba ("sched/core: Use kfree_rcu()
> in do_set_cpus_allowed()") to avoid a circular locking dependency
> problem.
>
> It turns out that even kfree_rcu() isn't safe for avoiding the
> circular locking problem. As reported by the kernel test robot,
> the following circular locking dependency now exists:
>
>    &rdp->nocb_lock --> rcu_node_0 --> &rq->__lock
>
> Solve this by breaking the rcu_node_0 --> &rq->__lock chain by moving
> the resched_cpu() call out from under the rcu_node lock.
>
> [peterz: heavily borrowed from Waiman's Changelog]
> Fixes: 851a723e45d1 ("sched: Always clear user_cpus_ptr in do_set_cpus_allowed()")
> Reported-by: kernel test robot <oliver.sang@intel.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Link: https://lore.kernel.org/oe-lkp/202310302207.a25f1a30-oliver.sang@intel.com

Thanks for addressing this issue. I am fine with your way as long as it
gets the job done. I am not familiar enough with the RCU code to do a
proper review, but I do get the general idea of your change and it looks
good to me.

Acked-by: Waiman Long <longman@redhat.com>

Re: [PATCH] rcu: Break rcu_node_0 --> &rq->__lock order
Posted by Paul E. McKenney 2 years, 2 months ago
On Tue, Oct 31, 2023 at 09:02:28PM +0100, Peter Zijlstra wrote:
> On Tue, Oct 31, 2023 at 07:29:04AM -0700, Paul E. McKenney wrote:
> > Other than the de-alphabetization of the local variables, it looks
> > plausible to me.  Frederic's suggestion also sounds plausible to me.
> 
> Having spent the better part of the past two decades using upside-down
> xmas trees for local variables, this alphabet thing is obnoxious :-)
> 
> But your code, your rules.
> 
> To reduce the number of alphabet songs required, I've taken the liberty
> to move a few variables into a narrower scope, hope that doesn't offend.

I have no problem with pushing local variables to local scopes!  ;-)

> ---
> Subject: rcu: Break rcu_node_0 --> &rq->__lock order
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Tue, 31 Oct 2023 09:53:08 +0100
> 
> Commit 851a723e45d1 ("sched: Always clear user_cpus_ptr in
> do_set_cpus_allowed()") added a kfree() call to free any user
> provided affinity mask, if present. It was changed later to use
> kfree_rcu() in commit 9a5418bc48ba ("sched/core: Use kfree_rcu()
> in do_set_cpus_allowed()") to avoid a circular locking dependency
> problem.
> 
> It turns out that even kfree_rcu() isn't safe for avoiding the
> circular locking problem. As reported by the kernel test robot,
> the following circular locking dependency now exists:
> 
>   &rdp->nocb_lock --> rcu_node_0 --> &rq->__lock
> 
> Solve this by breaking the rcu_node_0 --> &rq->__lock chain by moving
> the resched_cpu() call out from under the rcu_node lock.
> 
> [peterz: heavily borrowed from Waiman's Changelog]
> Fixes: 851a723e45d1 ("sched: Always clear user_cpus_ptr in do_set_cpus_allowed()")
> Reported-by: kernel test robot <oliver.sang@intel.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Link: https://lore.kernel.org/oe-lkp/202310302207.a25f1a30-oliver.sang@intel.com

This passes light testing, so I have queued it for further review and
testing.

							Thanx, Paul
