The WakeOvfIsDeferred code path in __call_rcu_nocb_wake() attempts to
wake rcuog when the callback count exceeds qhimark and callbacks are
not yet done with their GP (newly queued or awaiting GP). However,
extensive testing shows that this wake is always redundant.
In the flooding case, rcuog is always waiting for a GP to finish, so
waking the rcuog thread is pointless: the timer wakeup only adds
overhead, and rcuog simply wakes up and goes back to sleep, achieving
nothing.
This path also imposes an unnecessary full memory barrier and
additional timer expiry modifications.
The root cause is that WakeOvfIsDeferred fires when
!rcu_segcblist_ready_cbs() (GP not complete), but waking rcuog cannot
accelerate GP completion.
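
For reference, this is the pre-patch gating condition removed by the
hunk below, annotated with why the wake cannot help:

	smp_mb(); /* Enqueue before timer_pending(). */
	if ((rdp->nocb_cb_sleep ||                      /* rcuoc asleep, or ... */
	     !rcu_segcblist_ready_cbs(&rdp->cblist)) && /* ... GP not complete, */
	    !timer_pending(&rdp_gp->nocb_timer)) {      /* and no timer armed.  */
		/* rcuog is already waiting on that GP; forcing a deferred
		 * wake here cannot make the GP complete any sooner. */
		wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
				   TPS("WakeOvfIsDeferred"));
	}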
This commit therefore removes this path, and also adds some rdp flags
to ensure we don't lose wakeups.
Tested with rcutorture scenarios TREE01, TREE05, and TREE08 (all NOCB
configurations); all pass. Also stress tested with a kernel module
that floods call_rcu() to trigger the overload condition; the
resulting observations confirm the findings above.
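
The flooder module is not included in this posting; a minimal sketch of
that sort of module follows (the names, the loop count, and the lack of
throttling are illustrative assumptions, not the actual test code used):

	#include <linux/module.h>
	#include <linux/slab.h>
	#include <linux/rcupdate.h>

	struct flood_item {
		struct rcu_head rh;
	};

	static void flood_cb(struct rcu_head *rhp)
	{
		kfree(container_of(rhp, struct flood_item, rh));
	}

	static int __init rcu_flood_init(void)
	{
		int i;

		/* Queue far more callbacks than qhimark (default 10000)
		 * to drive __call_rcu_nocb_wake() into its overload path. */
		for (i = 0; i < 1000000; i++) {
			struct flood_item *item;

			item = kmalloc(sizeof(*item), GFP_KERNEL);
			if (!item)
				break;
			call_rcu(&item->rh, flood_cb);
		}
		return 0;
	}

	static void __exit rcu_flood_exit(void)
	{
		rcu_barrier();	/* Drain all queued callbacks before unload. */
	}

	module_init(rcu_flood_init);
	module_exit(rcu_flood_exit);
	MODULE_LICENSE("GPL");
	MODULE_DESCRIPTION("call_rcu() flooder sketch");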
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
kernel/rcu/tree.h | 3 ++-
kernel/rcu/tree_nocb.h | 43 +++++++++++++++++-------------------------
2 files changed, 19 insertions(+), 27 deletions(-)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index b8bbe7960cda..03ff62f28d00 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -257,6 +257,8 @@ struct rcu_data {
 	unsigned long nocb_gp_loops;	/* # passes through wait code. */
 	struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
 	bool nocb_cb_sleep;		/* Is the nocb CB thread asleep? */
+	bool nocb_gp_wake_attempt;	/* Was rcuog wake attempted? */
+	bool nocb_gp_serving;		/* Is rcuog serving this rdp? */
 	struct task_struct *nocb_cb_kthread;
 	struct list_head nocb_head_rdp; /*
 					 * Head of rcu_data list in wakeup chain,
@@ -301,7 +303,6 @@ struct rcu_data {
 #define RCU_NOCB_WAKE_BYPASS	1
 #define RCU_NOCB_WAKE_LAZY	2
 #define RCU_NOCB_WAKE		3
-#define RCU_NOCB_WAKE_FORCE	4
 
 #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
 					/* For jiffies_till_first_fqs and */
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index e6cd56603cad..9f58589f08d7 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -518,10 +518,8 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 }
 
 /*
- * Awaken the no-CBs grace-period kthread if needed, either due to it
- * legitimately being asleep or due to overload conditions.
- *
- * If warranted, also wake up the kthread servicing this CPUs queues.
+ * Awaken the no-CBs grace-period kthread if needed due to it legitimately
+ * being asleep.
  */
 static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 				 unsigned long flags)
@@ -533,7 +531,6 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 	long lazy_len;
 	long len;
 	struct task_struct *t;
-	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
 
 	// If we are being polled or there is no kthread, just leave.
 	t = READ_ONCE(rdp->nocb_gp_kthread);
@@ -549,24 +546,26 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 	lazy_len = READ_ONCE(rdp->lazy_len);
 	if (was_alldone) {
 		rdp->qlen_last_fqs_check = len;
+		rdp->nocb_gp_wake_attempt = true;
+		rcu_nocb_unlock(rdp);
 		// Only lazy CBs in bypass list
 		if (lazy_len && bypass_len == lazy_len) {
-			rcu_nocb_unlock(rdp);
 			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
 					   TPS("WakeLazy"));
 		} else if (!irqs_disabled_flags(flags)) {
 			/* ... if queue was empty ... */
-			rcu_nocb_unlock(rdp);
 			wake_nocb_gp(rdp, false);
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 					    TPS("WakeEmpty"));
 		} else {
-			rcu_nocb_unlock(rdp);
 			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
 					   TPS("WakeEmptyIsDeferred"));
 		}
+
+		return;
 	} else if (len > rdp->qlen_last_fqs_check + qhimark) {
-		/* ... or if many callbacks queued. */
+		/* Callback overload condition. */
+		WARN_ON_ONCE(!rdp->nocb_gp_wake_attempt && !rdp->nocb_gp_serving);
 		rdp->qlen_last_fqs_check = len;
 		j = jiffies;
 		if (j != rdp->nocb_gp_adv_time &&
@@ -575,21 +574,10 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 			rcu_advance_cbs_nowake(rdp->mynode, rdp);
 			rdp->nocb_gp_adv_time = j;
 		}
-		smp_mb(); /* Enqueue before timer_pending(). */
-		if ((rdp->nocb_cb_sleep ||
-		     !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
-		    !timer_pending(&rdp_gp->nocb_timer)) {
-			rcu_nocb_unlock(rdp);
-			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
-					   TPS("WakeOvfIsDeferred"));
-		} else {
-			rcu_nocb_unlock(rdp);
-			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
-		}
-	} else {
-		rcu_nocb_unlock(rdp);
-		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
 	}
+
+	rcu_nocb_unlock(rdp);
+	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
 }
 
 static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
@@ -702,10 +690,13 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 				    bypass_ncbs > 2 * qhimark)) {
 			flush_bypass = true;
 		} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
+			rdp->nocb_gp_wake_attempt = false;
+			rdp->nocb_gp_serving = false;
 			rcu_nocb_unlock_irqrestore(rdp, flags);
 			continue; /* No callbacks here, try next. */
 		}
 
+		rdp->nocb_gp_serving = true;
 		if (flush_bypass) {
 			// Bypass full or old, so flush it.
 			(void)rcu_nocb_try_flush_bypass(rdp, j);
@@ -966,7 +957,6 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
 					   unsigned long flags)
 	__releases(rdp_gp->nocb_gp_lock)
 {
-	int ndw;
 	int ret;
 
 	if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
@@ -974,8 +964,7 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
 		return false;
 	}
 
-	ndw = rdp_gp->nocb_defer_wakeup;
-	ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+	ret = __wake_nocb_gp(rdp_gp, rdp, false, flags);
 	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
 
 	return ret;
@@ -1077,6 +1066,8 @@ static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
 	rcu_nocb_lock_irqsave(rdp, flags);
 	WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
 	WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist));
+	rdp->nocb_gp_wake_attempt = false;
+	rdp->nocb_gp_serving = false;
 	rcu_nocb_unlock_irqrestore(rdp, flags);
 
 	wake_gp = rcu_nocb_queue_toggle_rdp(rdp);
--
2.34.1
On Thu, Dec 25, 2025 at 02:44:50AM -0500, Joel Fernandes wrote:
> The WakeOvfIsDeferred code path in __call_rcu_nocb_wake() attempts to
> wake rcuog when the callback count exceeds qhimark and callbacks are
> not yet done with their GP (newly queued or awaiting GP). However,
> extensive testing shows that this wake is always redundant.
>
> In the flooding case, rcuog is always waiting for a GP to finish, so
> waking the rcuog thread is pointless: the timer wakeup only adds
> overhead, and rcuog simply wakes up and goes back to sleep, achieving
> nothing.
>
> This path also imposes an unnecessary full memory barrier and
> additional timer expiry modifications.
>
> The root cause is that WakeOvfIsDeferred fires when
> !rcu_segcblist_ready_cbs() (GP not complete), but waking rcuog cannot
> accelerate GP completion.
>
> This commit therefore removes this path, and also adds some rdp flags
> to ensure we don't lose wakeups.
There should be two patches: one that removes the useless path and the
other that adds the debugging.
>
> Tested with rcutorture scenarios TREE01, TREE05, and TREE08 (all NOCB
> configurations); all pass. Also stress tested with a kernel module
> that floods call_rcu() to trigger the overload condition; the
> resulting observations confirm the findings above.
>
> Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
Cool! Just a few comments:
> @@ -549,24 +546,26 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
>  	lazy_len = READ_ONCE(rdp->lazy_len);
>  	if (was_alldone) {
>  		rdp->qlen_last_fqs_check = len;
> +		rdp->nocb_gp_wake_attempt = true;
> +		rcu_nocb_unlock(rdp);
>  		// Only lazy CBs in bypass list
>  		if (lazy_len && bypass_len == lazy_len) {
> -			rcu_nocb_unlock(rdp);
>  			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
>  					   TPS("WakeLazy"));
>  		} else if (!irqs_disabled_flags(flags)) {
>  			/* ... if queue was empty ... */
> -			rcu_nocb_unlock(rdp);
>  			wake_nocb_gp(rdp, false);
>  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>  					    TPS("WakeEmpty"));
>  		} else {
> -			rcu_nocb_unlock(rdp);
>  			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
>  					   TPS("WakeEmptyIsDeferred"));
>  		}
> +
> +		return;
>  	} else if (len > rdp->qlen_last_fqs_check + qhimark) {
> -		/* ... or if many callbacks queued. */
> +		/* Callback overload condition. */
> +		WARN_ON_ONCE(!rdp->nocb_gp_wake_attempt && !rdp->nocb_gp_serving);
With this test, the point of ->nocb_gp_serving is unclear given that both
states are cleared in the same place but ->nocb_gp_serving is set later by
the gp kthread. ->nocb_gp_serving implies ->nocb_gp_wake_attempt so the above
test is the same as WARN_ON_ONCE(!rdp->nocb_gp_wake_attempt).
In fact ->nocb_gp_wake_attempt alone probably makes sense?
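
I.e., something like this sketch of the simplified check:

	} else if (len > rdp->qlen_last_fqs_check + qhimark) {
		/* Callback overload condition. */
		WARN_ON_ONCE(!rdp->nocb_gp_wake_attempt);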
>  		rdp->qlen_last_fqs_check = len;
>  		j = jiffies;
>  		if (j != rdp->nocb_gp_adv_time &&
> @@ -575,21 +574,10 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
>  			rcu_advance_cbs_nowake(rdp->mynode, rdp);
>  			rdp->nocb_gp_adv_time = j;
>  		}
> -		smp_mb(); /* Enqueue before timer_pending(). */
You need to remove the pairing smp_mb__after_spinlock() in
do_nocb_deferred_wakeup_timer().
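
For reference, the pairing site currently looks roughly like this
(paraphrased and trimmed; check the tree for the exact context):

	static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
	{
		unsigned long flags;
		struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
		...
		raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags);
		smp_mb__after_spinlock(); /* Timer expire before wakeup. */
		do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags);
	}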
Thanks.
--
Frederic Weisbecker
SUSE Labs
Hi Frederic,
On Thu, Dec 25, 2025 at 11:20:23PM +0100, Frederic Weisbecker wrote:
> On Thu, Dec 25, 2025 at 02:44:50AM -0500, Joel Fernandes wrote:
> > The WakeOvfIsDeferred code path in __call_rcu_nocb_wake() attempts to
> > wake rcuog when the callback count exceeds qhimark and callbacks are
> > not yet done with their GP (newly queued or awaiting GP). However,
> > extensive testing shows that this wake is always redundant.
> >
> > In the flooding case, rcuog is always waiting for a GP to finish, so
> > waking the rcuog thread is pointless: the timer wakeup only adds
> > overhead, and rcuog simply wakes up and goes back to sleep, achieving
> > nothing.
> >
> > This path also imposes an unnecessary full memory barrier and
> > additional timer expiry modifications.
> >
> > The root cause is that WakeOvfIsDeferred fires when
> > !rcu_segcblist_ready_cbs() (GP not complete), but waking rcuog cannot
> > accelerate GP completion.
> >
> > This commit therefore removes this path, and also adds some rdp flags
> > to ensure we don't lose wakeups.
>
> There should be two patches: one that removes the useless path and the
> other that adds the debugging.
Sure, will split.
> > Tested with rcutorture scenarios TREE01, TREE05, and TREE08 (all NOCB
> > configurations); all pass. Also stress tested with a kernel module
> > that floods call_rcu() to trigger the overload condition; the
> > resulting observations confirm the findings above.
> >
> > Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
>
> Cool! Just a few comments:
>
> > @@ -549,24 +546,26 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
> >  	lazy_len = READ_ONCE(rdp->lazy_len);
> >  	if (was_alldone) {
> >  		rdp->qlen_last_fqs_check = len;
> > +		rdp->nocb_gp_wake_attempt = true;
> > +		rcu_nocb_unlock(rdp);
> >  		// Only lazy CBs in bypass list
> >  		if (lazy_len && bypass_len == lazy_len) {
> > -			rcu_nocb_unlock(rdp);
> >  			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
> >  					   TPS("WakeLazy"));
> >  		} else if (!irqs_disabled_flags(flags)) {
> >  			/* ... if queue was empty ... */
> > -			rcu_nocb_unlock(rdp);
> >  			wake_nocb_gp(rdp, false);
> >  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
> >  					    TPS("WakeEmpty"));
> >  		} else {
> > -			rcu_nocb_unlock(rdp);
> >  			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
> >  					   TPS("WakeEmptyIsDeferred"));
> >  		}
> > +
> > +		return;
> >  	} else if (len > rdp->qlen_last_fqs_check + qhimark) {
> > -		/* ... or if many callbacks queued. */
> > +		/* Callback overload condition. */
> > +		WARN_ON_ONCE(!rdp->nocb_gp_wake_attempt && !rdp->nocb_gp_serving);
>
> With this test, the point of ->nocb_gp_serving is unclear given that both
> states are cleared in the same place but ->nocb_gp_serving is set later by
> the gp kthread. ->nocb_gp_serving implies ->nocb_gp_wake_attempt so the above
> test is the same as WARN_ON_ONCE(!rdp->nocb_gp_wake_attempt).
>
> In fact ->nocb_gp_wake_attempt alone probably makes sense?
Ah, true. I got a bit paranoid about false-positive warnings, hence the
extra variable. However, on further analysis I realized that the nocb
lock already prevents potential false positives. So yes, I will just use
the single variable. Thanks.
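
That is, both sides manipulate the flag under the rdp's nocb lock, so an
interleaving like the following sketch (illustrative, not actual code)
cannot produce a false positive:

	enqueuing CPU                        rcuog (nocb_gp_wait())
	-------------                        ----------------------
	rcu_nocb_lock(rdp);
	rdp->nocb_gp_wake_attempt = true;
	rcu_nocb_unlock(rdp);
	                                     rcu_nocb_lock(rdp);
	                                     /* Cleared only when the
	                                      * cblist is empty: */
	                                     rdp->nocb_gp_wake_attempt = false;
	                                     rcu_nocb_unlock_irqrestore(rdp, flags);

The overload-path WARN also runs under the lock and only with a non-empty
cblist, while the flag is only ever cleared when the cblist is empty, so
the WARN cannot observe a stale clear.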
>
> >  		rdp->qlen_last_fqs_check = len;
> >  		j = jiffies;
> >  		if (j != rdp->nocb_gp_adv_time &&
> > @@ -575,21 +574,10 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
> >  			rcu_advance_cbs_nowake(rdp->mynode, rdp);
> >  			rdp->nocb_gp_adv_time = j;
> >  		}
> > -		smp_mb(); /* Enqueue before timer_pending(). */
>
> You need to remove the pairing smp_mb__after_spinlock() in
> do_nocb_deferred_wakeup_timer().
Ah, will do. Thanks!
- Joel