[PATCH -next 4/8] rcu/nocb: Add warning if no rcuog wake up attempt happened during overload

Joel Fernandes posted 8 patches 1 month, 1 week ago
[PATCH -next 4/8] rcu/nocb: Add warning if no rcuog wake up attempt happened during overload
Posted by Joel Fernandes 1 month, 1 week ago
To be sure we have no rcog wake ups that were lost, add a warning
to cover the case where the rdp is overloaded with callbacks but
no wake up was attempted.

Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 kernel/rcu/tree.c      | 4 ++++
 kernel/rcu/tree.h      | 1 +
 kernel/rcu/tree_nocb.h | 6 +++++-
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 293bbd9ac3f4..78c045a5ef03 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3767,6 +3767,10 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
 		debug_rcu_head_unqueue(&rdp->barrier_head);
 		rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
 	}
+#ifdef CONFIG_RCU_NOCB_CPU
+	if (wake_nocb)
+		rdp->nocb_gp_wake_attempt = true;
+#endif
 	rcu_nocb_unlock(rdp);
 	if (wake_nocb)
 		wake_nocb_gp(rdp, false);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 653fb4ba5852..74bd6a2a2f84 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -257,6 +257,7 @@ struct rcu_data {
 	unsigned long nocb_gp_loops;	/* # passes through wait code. */
 	struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
 	bool nocb_cb_sleep;		/* Is the nocb CB thread asleep? */
+	bool nocb_gp_wake_attempt;	/* Was a rcuog wakeup attempted? */
 	struct task_struct *nocb_cb_kthread;
 	struct list_head nocb_head_rdp; /*
 					 * Head of rcu_data list in wakeup chain,
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index daff2756cd90..7e9d465c8ab1 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -546,6 +546,7 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 	lazy_len = READ_ONCE(rdp->lazy_len);
 	if (was_alldone) {
 		rdp->qlen_last_fqs_check = len;
+		rdp->nocb_gp_wake_attempt = true;
 		rcu_nocb_unlock(rdp);
 		// Only lazy CBs in bypass list
 		if (lazy_len && bypass_len == lazy_len) {
@@ -563,7 +564,8 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 
 		return;
 	} else if (len > rdp->qlen_last_fqs_check + qhimark) {
-		/* ... or if many callbacks queued. */
+		/* Callback overload condition. */
+		WARN_ON_ONCE(!rdp->nocb_gp_wake_attempt);
 		rdp->qlen_last_fqs_check = len;
 		j = jiffies;
 		if (j != rdp->nocb_gp_adv_time &&
@@ -688,6 +690,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 		     bypass_ncbs > 2 * qhimark)) {
 			flush_bypass = true;
 		} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
+			rdp->nocb_gp_wake_attempt = false;
 			rcu_nocb_unlock_irqrestore(rdp, flags);
 			continue; /* No callbacks here, try next. */
 		}
@@ -1254,6 +1257,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 			continue;
 		}
 		rcu_nocb_try_flush_bypass(rdp, jiffies);
+		rdp->nocb_gp_wake_attempt = true;
 		rcu_nocb_unlock_irqrestore(rdp, flags);
 		wake_nocb_gp(rdp, false);
 		sc->nr_to_scan -= _count;
-- 
2.34.1
Re: [PATCH -next 4/8] rcu/nocb: Add warning if no rcuog wake up attempt happened during overload
Posted by Frederic Weisbecker 1 month ago
Le Thu, Jan 01, 2026 at 11:34:13AM -0500, Joel Fernandes a écrit :
> To be sure we have no rcog wake ups that were lost, add a warning
> to cover the case where the rdp is overloaded with callbacks but
> no wake up was attempted.
> 
> Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
> ---
>  kernel/rcu/tree.c      | 4 ++++
>  kernel/rcu/tree.h      | 1 +
>  kernel/rcu/tree_nocb.h | 6 +++++-
>  3 files changed, 10 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 293bbd9ac3f4..78c045a5ef03 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3767,6 +3767,10 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
>  		debug_rcu_head_unqueue(&rdp->barrier_head);
>  		rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
>  	}
> +#ifdef CONFIG_RCU_NOCB_CPU
> +	if (wake_nocb)
> +		rdp->nocb_gp_wake_attempt = true;
> +#endif

entrain only queues a callback if the list is non-empty. And if it's
non-empty, rdp->nocb_gp_wake_attempt should be true already.

>  	rcu_nocb_unlock(rdp);
>  	if (wake_nocb)
>  		wake_nocb_gp(rdp, false);
> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> index 653fb4ba5852..74bd6a2a2f84 100644
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -257,6 +257,7 @@ struct rcu_data {
>  	unsigned long nocb_gp_loops;	/* # passes through wait code. */
>  	struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
>  	bool nocb_cb_sleep;		/* Is the nocb CB thread asleep? */
> +	bool nocb_gp_wake_attempt;	/* Was a rcuog wakeup attempted? */

How about nocb_gp_handling ?

>  	struct task_struct *nocb_cb_kthread;
>  	struct list_head nocb_head_rdp; /*
>  					 * Head of rcu_data list in wakeup chain,
> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
> index daff2756cd90..7e9d465c8ab1 100644
> --- a/kernel/rcu/tree_nocb.h
> +++ b/kernel/rcu/tree_nocb.h
> @@ -546,6 +546,7 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
>  	lazy_len = READ_ONCE(rdp->lazy_len);
>  	if (was_alldone) {
>  		rdp->qlen_last_fqs_check = len;
> +		rdp->nocb_gp_wake_attempt = true;
>  		rcu_nocb_unlock(rdp);
>  		// Only lazy CBs in bypass list
>  		if (lazy_len && bypass_len == lazy_len) {
> @@ -563,7 +564,8 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
>  
>  		return;
>  	} else if (len > rdp->qlen_last_fqs_check + qhimark) {
> -		/* ... or if many callbacks queued. */
> +		/* Callback overload condition. */
> +		WARN_ON_ONCE(!rdp->nocb_gp_wake_attempt);
>  		rdp->qlen_last_fqs_check = len;
>  		j = jiffies;
>  		if (j != rdp->nocb_gp_adv_time &&
> @@ -688,6 +690,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
>  		     bypass_ncbs > 2 * qhimark)) {
>  			flush_bypass = true;
>  		} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
> +			rdp->nocb_gp_wake_attempt = false;

This is when nocb_cb_wait() is done with callbacks but nocb_gp_wait() is done
with them sooner, when the grace period is done for all pending callbacks.

Something like this would perhaps be more accurate:

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index e6cd56603cad..52010cbeaa76 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -746,6 +746,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 			needwait_gp = true;
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 					    TPS("NeedWaitGP"));
+		} else if (!rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
+			rdp->nocb_gp_wake_attempt = false;
 		}
 		if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
 			needwake = rdp->nocb_cb_sleep;


>  			rcu_nocb_unlock_irqrestore(rdp, flags);
>  			continue; /* No callbacks here, try next. */
>  		}
> @@ -1254,6 +1257,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
>  			continue;
>  		}
>  		rcu_nocb_try_flush_bypass(rdp, jiffies);
> +		rdp->nocb_gp_wake_attempt = true;

Same here, we should expect rdp->nocb_gp_wake_attempt to be already on since
there are lazy callbacks. That's a good opportunity to test the related assertion
though.

Thanks.

>  		rcu_nocb_unlock_irqrestore(rdp, flags);
>  		wake_nocb_gp(rdp, false);
>  		sc->nr_to_scan -= _count;
> -- 
> 2.34.1
> 

-- 
Frederic Weisbecker
SUSE Labs
Re: [PATCH -next 4/8] rcu/nocb: Add warning if no rcuog wake up attempt happened during overload
Posted by Joel Fernandes 1 month ago
Hi Frederic,

On Thu, Jan 08, 2026 at 06:22:45PM +0100, Frederic Weisbecker wrote:
> Le Thu, Jan 01, 2026 at 11:34:13AM -0500, Joel Fernandes a écrit :
> > To be sure we have no rcog wake ups that were lost, add a warning
> > to cover the case where the rdp is overloaded with callbacks but
> > no wake up was attempted.
> > 
> > Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
> > ---
> >  kernel/rcu/tree.c      | 4 ++++
> >  kernel/rcu/tree.h      | 1 +
> >  kernel/rcu/tree_nocb.h | 6 +++++-
> >  3 files changed, 10 insertions(+), 1 deletion(-)
> > 
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index 293bbd9ac3f4..78c045a5ef03 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -3767,6 +3767,10 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
> >  		debug_rcu_head_unqueue(&rdp->barrier_head);
> >  		rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
> >  	}
> > +#ifdef CONFIG_RCU_NOCB_CPU
> > +	if (wake_nocb)
> > +		rdp->nocb_gp_wake_attempt = true;
> > +#endif
> 
> entrain only queues a callback if the list is non-empty. And if it's
> non-empty, rdp->nocb_gp_wake_attempt should be true already.

This is true, we don't need to track this wake up. I will replace it with a
WARN.

> >  	rcu_nocb_unlock(rdp);
> >  	if (wake_nocb)
> >  		wake_nocb_gp(rdp, false);
> > diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> > index 653fb4ba5852..74bd6a2a2f84 100644
> > --- a/kernel/rcu/tree.h
> > +++ b/kernel/rcu/tree.h
> > @@ -257,6 +257,7 @@ struct rcu_data {
> >  	unsigned long nocb_gp_loops;	/* # passes through wait code. */
> >  	struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
> >  	bool nocb_cb_sleep;		/* Is the nocb CB thread asleep? */
> > +	bool nocb_gp_wake_attempt;	/* Was a rcuog wakeup attempted? */
> 
> How about nocb_gp_handling ?

This is a better name indeed, considering that we also track this for
deferred wakeups of the GP thread.

> >  	struct task_struct *nocb_cb_kthread;
> >  	struct list_head nocb_head_rdp; /*
> >  					 * Head of rcu_data list in wakeup chain,
> > diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
> > index daff2756cd90..7e9d465c8ab1 100644
> > --- a/kernel/rcu/tree_nocb.h
> > +++ b/kernel/rcu/tree_nocb.h
> > @@ -546,6 +546,7 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
> >  	lazy_len = READ_ONCE(rdp->lazy_len);
> >  	if (was_alldone) {
> >  		rdp->qlen_last_fqs_check = len;
> > +		rdp->nocb_gp_wake_attempt = true;
> >  		rcu_nocb_unlock(rdp);
> >  		// Only lazy CBs in bypass list
> >  		if (lazy_len && bypass_len == lazy_len) {
> > @@ -563,7 +564,8 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
> >  
> >  		return;
> >  	} else if (len > rdp->qlen_last_fqs_check + qhimark) {
> > -		/* ... or if many callbacks queued. */
> > +		/* Callback overload condition. */
> > +		WARN_ON_ONCE(!rdp->nocb_gp_wake_attempt);
> >  		rdp->qlen_last_fqs_check = len;
> >  		j = jiffies;
> >  		if (j != rdp->nocb_gp_adv_time &&
> > @@ -688,6 +690,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
> >  		     bypass_ncbs > 2 * qhimark)) {
> >  			flush_bypass = true;
> >  		} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
> > +			rdp->nocb_gp_wake_attempt = false;
> 
> This is when nocb_cb_wait() is done with callbacks but nocb_gp_wait() is done
> with them sooner, when the grace period is done for all pending callbacks.
> 
> Something like this would perhaps be more accurate:
> 
> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
> index e6cd56603cad..52010cbeaa76 100644
> --- a/kernel/rcu/tree_nocb.h
> +++ b/kernel/rcu/tree_nocb.h
> @@ -746,6 +746,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
>  			needwait_gp = true;
>  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>  					    TPS("NeedWaitGP"));
> +		} else if (!rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
> +			rdp->nocb_gp_wake_attempt = false;
>  		}

Hmm, I am trying to understand why this suggestion is better than what I
already have. It is one extra line and adds another conditional.

Also shouldn't it be:

  } else if (!rcu_cblist_n_cbs(&rdp->nocb_bypass) &&
             rcu_segcblist_empty(&rdp->cblist)) {
      rdp->nocb_gp_wake_attempt = false;
  }

  ?

My goal was to mark wake_attempt as false when ALL callbacks on the rdp were
drained. IOW, the GP thread is done with the rdp.

>  		if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
>  			needwake = rdp->nocb_cb_sleep;
> 
> 
> >  			rcu_nocb_unlock_irqrestore(rdp, flags);
> >  			continue; /* No callbacks here, try next. */
> >  		}
> > @@ -1254,6 +1257,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> >  			continue;
> >  		}
> >  		rcu_nocb_try_flush_bypass(rdp, jiffies);
> > +		rdp->nocb_gp_wake_attempt = true;
> 
> Same here, we should expect rdp->nocb_gp_wake_attempt to be already on since
> there are lazy callbacks. That's a good opportunity to test the related assertion
> though.

Good point! I will turn it into a WARN.

Btw, I have more patches coming to simplify nocb_gp_wait()... it is quite long :)

thanks,

 - Joel
Re: [PATCH -next 4/8] rcu/nocb: Add warning if no rcuog wake up attempt happened during overload
Posted by Frederic Weisbecker 1 month ago
Le Thu, Jan 08, 2026 at 10:49:30PM -0500, Joel Fernandes a écrit :
> > > @@ -688,6 +690,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
> > >  		     bypass_ncbs > 2 * qhimark)) {
> > >  			flush_bypass = true;
> > >  		} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
> > > +			rdp->nocb_gp_wake_attempt = false;
> > 
> > This is when nocb_cb_wait() is done with callbacks but nocb_gp_wait() is done
> > with them sooner, when the grace period is done for all pending callbacks.
> > 
> > Something like this would perhaps be more accurate:
> > 
> > diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
> > index e6cd56603cad..52010cbeaa76 100644
> > --- a/kernel/rcu/tree_nocb.h
> > +++ b/kernel/rcu/tree_nocb.h
> > @@ -746,6 +746,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
> >  			needwait_gp = true;
> >  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
> >  					    TPS("NeedWaitGP"));
> > +		} else if (!rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
> > +			rdp->nocb_gp_wake_attempt = false;
> >  		}
> 
> Hmm, I am trying to understand why this suggestion is better than what I
> already have. It is one extra line and adds another conditional.
> 
> Also shouldn't it be:
> 
>   } else if (!rcu_cblist_n_cbs(&rdp->nocb_bypass) &&
>              rcu_segcblist_empty(&rdp->cblist)) {
>       rdp->nocb_gp_wake_attempt = false;
>   }
> 
>   ?

This else already means that rcu_segcblist_nextgp() returned false because there
is no pending callbacks. rcu_segcblist_empty() is different because it also test
done callbacks.

> 
> My goal was to mark wake_attempt as false when ALL callbacks on the rdp were
> drained. IOW, the GP thread is done with the rdp.

So nocb_gp_wait (the rcuog kthreads) handle the pending callbacks, advancing
them throughout grace periods until they are moved to the done state.

But once they are moved as done, the callbacks are the responsibility of
nocb_cb_wait() (the rcuo kthreads) and nocb_gp_wait() stops paying attention
to that rdp if there are no more pending callbacks.

So with your initial patch, you still have rdp->nocb_gp_wake_attempt == true
even when there are only done callbacks. But without an appropriate wake-up
after subsequent enqueue, nocb_gp_wait() may ignore new callbacks, event though
rdp->nocb_gp_wake_attempt == true suggests otherwise.

> Btw, I have more patches coming to simplify nocb_gp_wait()... it is quite long
> :)

Cool :)

Thanks.

-- 
Frederic Weisbecker
SUSE Labs
Re: [PATCH -next 4/8] rcu/nocb: Add warning if no rcuog wake up attempt happened during overload
Posted by Joel Fernandes 1 month ago

On 1/9/2026 9:36 AM, Frederic Weisbecker wrote:
> Le Thu, Jan 08, 2026 at 10:49:30PM -0500, Joel Fernandes a écrit :
>>>> @@ -688,6 +690,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
>>>>  		     bypass_ncbs > 2 * qhimark)) {
>>>>  			flush_bypass = true;
>>>>  		} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
>>>> +			rdp->nocb_gp_wake_attempt = false;
>>>
>>> This is when nocb_cb_wait() is done with callbacks but nocb_gp_wait() is done
>>> with them sooner, when the grace period is done for all pending callbacks.
>>>
>>> Something like this would perhaps be more accurate:
>>>
>>> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
>>> index e6cd56603cad..52010cbeaa76 100644
>>> --- a/kernel/rcu/tree_nocb.h
>>> +++ b/kernel/rcu/tree_nocb.h
>>> @@ -746,6 +746,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
>>>  			needwait_gp = true;
>>>  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>>>  					    TPS("NeedWaitGP"));
>>> +		} else if (!rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
>>> +			rdp->nocb_gp_wake_attempt = false;
>>>  		}
>>
>> Hmm, I am trying to understand why this suggestion is better than what I
>> already have. It is one extra line and adds another conditional.
>>
>> Also shouldn't it be:
>>
>>   } else if (!rcu_cblist_n_cbs(&rdp->nocb_bypass) &&
>>              rcu_segcblist_empty(&rdp->cblist)) {
>>       rdp->nocb_gp_wake_attempt = false;
>>   }
>>
>>   ?
> 
> This else already means that rcu_segcblist_nextgp() returned false because there
> is no pending callbacks. rcu_segcblist_empty() is different because it also test
> done callbacks.
> 
>>
>> My goal was to mark wake_attempt as false when ALL callbacks on the rdp were
>> drained. IOW, the GP thread is done with the rdp.
> 
> So nocb_gp_wait (the rcuog kthreads) handle the pending callbacks, advancing
> them throughout grace periods until they are moved to the done state.
> 
> But once they are moved as done, the callbacks are the responsibility of
> nocb_cb_wait() (the rcuo kthreads) and nocb_gp_wait() stops paying attention
> to that rdp if there are no more pending callbacks.
> 
> So with your initial patch, you still have rdp->nocb_gp_wake_attempt == true
> even when there are only done callbacks. But without an appropriate wake-up
> after subsequent enqueue, nocb_gp_wait() may ignore new callbacks, event though
> rdp->nocb_gp_wake_attempt == true suggests otherwise.

Ah, got it! I was clubbing the acting of waking up rcuog to that of both the
rcuog and rcuop/s threads. Your suggestion, instead, is more accurate so I will
do it that way instead. Thanks for the explanations!

 - Joel