[PATCH] rcu: Fix rcu_read_unlock() deadloop due to softirq

Yao Kai posted 1 patch 1 month, 3 weeks ago
There is a newer version of this series
kernel/rcu/tree.h        |  2 +-
kernel/rcu/tree_plugin.h | 15 +++++++++------
2 files changed, 10 insertions(+), 7 deletions(-)
[PATCH] rcu: Fix rcu_read_unlock() deadloop due to softirq
Posted by Yao Kai 1 month, 3 weeks ago
Commit 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in
__rcu_read_unlock()") removed the recursion-protection code from
__rcu_read_unlock(). As a result, with ftrace enabled, the call to
raise_softirq_irqoff() in rcu_read_unlock_special() can recurse
indefinitely, as shown below:

WARNING: CPU: 0 PID: 0 at kernel/trace/trace.c:3021 __ftrace_trace_stack.constprop.0+0x172/0x180
Modules linked in: my_irq_work(O)
CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G O 6.18.0-rc7-dirty #23 PREEMPT(full)
Tainted: [O]=OOT_MODULE
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
RIP: 0010:__ftrace_trace_stack.constprop.0+0x172/0x180
RSP: 0018:ffffc900000034a8 EFLAGS: 00010002
RAX: 0000000000000000 RBX: 0000000000000004 RCX: 0000000000000000
RDX: 0000000000000003 RSI: ffffffff826d7b87 RDI: ffffffff826e9329
RBP: 0000000000090009 R08: 0000000000000005 R09: ffffffff82afbc4c
R10: 0000000000000008 R11: 0000000000011d7a R12: 0000000000000000
R13: ffff888003874100 R14: 0000000000000003 R15: ffff8880038c1054
FS:  0000000000000000(0000) GS:ffff8880fa8ea000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000055b31fa7f540 CR3: 00000000078f4005 CR4: 0000000000770ef0
PKRU: 55555554
Call Trace:
 <IRQ>
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 unwind_next_frame+0x203/0x9b0
 __unwind_start+0x15d/0x1c0
 arch_stack_walk+0x62/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 unwind_next_frame+0x203/0x9b0
 __unwind_start+0x15d/0x1c0
 arch_stack_walk+0x62/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 unwind_next_frame+0x203/0x9b0
 __unwind_start+0x15d/0x1c0
 arch_stack_walk+0x62/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 __is_insn_slot_addr+0x54/0x70
 kernel_text_address+0x48/0xc0
 __kernel_text_address+0xd/0x40
 unwind_get_return_address+0x1e/0x40
 arch_stack_walk+0x9c/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 __raise_softirq_irqoff+0x61/0x80
 __flush_smp_call_function_queue+0x115/0x420
 __sysvec_call_function_single+0x17/0xb0
 sysvec_call_function_single+0x8c/0xc0
 </IRQ>

Commit b41642c87716 ("rcu: Fix rcu_read_unlock() deadloop due to IRQ work")
fixed the infinite loop in rcu_read_unlock_special() for IRQ work by
setting a flag before calling irq_work_queue_on(). Fix this issue the same
way by setting that flag before calling raise_softirq_irqoff(), and rename
the flag to defer_qs_pending since it now covers both IRQ work and softirq.

Fixes: 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in __rcu_read_unlock()")
Reported-by: Tengda Wu <wutengda2@huawei.com>
Signed-off-by: Yao Kai <yaokai34@huawei.com>
---
 kernel/rcu/tree.h        |  2 +-
 kernel/rcu/tree_plugin.h | 15 +++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index b8bbe7960cda..2265b9c2906e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -203,7 +203,7 @@ struct rcu_data {
 					/*  during and after the last grace */
 					/* period it is aware of. */
 	struct irq_work defer_qs_iw;	/* Obtain later scheduler attention. */
-	int defer_qs_iw_pending;	/* Scheduler attention pending? */
+	int defer_qs_pending;		/* irqwork or softirq pending? */
 	struct work_struct strict_work;	/* Schedule readers for strict GPs. */
 
 	/* 2) batch handling */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index dbe2d02be824..95ad967adcf3 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -487,8 +487,8 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 	union rcu_special special;
 
 	rdp = this_cpu_ptr(&rcu_data);
-	if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING)
-		rdp->defer_qs_iw_pending = DEFER_QS_IDLE;
+	if (rdp->defer_qs_pending == DEFER_QS_PENDING)
+		rdp->defer_qs_pending = DEFER_QS_IDLE;
 
 	/*
 	 * If RCU core is waiting for this CPU to exit its critical section,
@@ -645,7 +645,7 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
 	 * 5. Deferred QS reporting does not happen.
 	 */
 	if (rcu_preempt_depth() > 0)
-		WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE);
+		WRITE_ONCE(rdp->defer_qs_pending, DEFER_QS_IDLE);
 }
 
 /*
@@ -747,7 +747,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
 			// Using softirq, safe to awaken, and either the
 			// wakeup is free or there is either an expedited
 			// GP in flight or a potential need to deboost.
-			raise_softirq_irqoff(RCU_SOFTIRQ);
+			if (rdp->defer_qs_pending != DEFER_QS_PENDING) {
+				rdp->defer_qs_pending = DEFER_QS_PENDING;
+				raise_softirq_irqoff(RCU_SOFTIRQ);
+			}
 		} else {
 			// Enabling BH or preempt does reschedule, so...
 			// Also if no expediting and no possible deboosting,
@@ -755,11 +758,11 @@ static void rcu_read_unlock_special(struct task_struct *t)
 			// tick enabled.
 			set_need_resched_current();
 			if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
-			    needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING &&
+			    needs_exp && rdp->defer_qs_pending != DEFER_QS_PENDING &&
 			    cpu_online(rdp->cpu)) {
 				// Get scheduler to re-evaluate and call hooks.
 				// If !IRQ_WORK, FQS scan will eventually IPI.
-				rdp->defer_qs_iw_pending = DEFER_QS_PENDING;
+				rdp->defer_qs_pending = DEFER_QS_PENDING;
 				irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
 			}
 		}
-- 
2.43.0
Re: [PATCH] rcu: Fix rcu_read_unlock() deadloop due to softirq
Posted by Joel Fernandes 1 month, 2 weeks ago
On Thu, Dec 18, 2025 at 03:49:50PM +0800, Yao Kai wrote:
> Commit 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in
> __rcu_read_unlock()") removes the recursion-protection code from
> __rcu_read_unlock(). Therefore, we could invoke the deadloop in
> raise_softirq_irqoff() with ftrace enabled as follows:
> 
> WARNING: CPU: 0 PID: 0 at kernel/trace/trace.c:3021 __ftrace_trace_stack.constprop.0+0x172/0x180
> Modules linked in: my_irq_work(O)
> CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G O 6.18.0-rc7-dirty #23 PREEMPT(full)
> Tainted: [O]=OOT_MODULE
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
> RIP: 0010:__ftrace_trace_stack.constprop.0+0x172/0x180
> RSP: 0018:ffffc900000034a8 EFLAGS: 00010002
> RAX: 0000000000000000 RBX: 0000000000000004 RCX: 0000000000000000
> RDX: 0000000000000003 RSI: ffffffff826d7b87 RDI: ffffffff826e9329
> RBP: 0000000000090009 R08: 0000000000000005 R09: ffffffff82afbc4c
> R10: 0000000000000008 R11: 0000000000011d7a R12: 0000000000000000
> R13: ffff888003874100 R14: 0000000000000003 R15: ffff8880038c1054
> FS:  0000000000000000(0000) GS:ffff8880fa8ea000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 000055b31fa7f540 CR3: 00000000078f4005 CR4: 0000000000770ef0
> PKRU: 55555554
> Call Trace:
>  <IRQ>
>  trace_buffer_unlock_commit_regs+0x6d/0x220
>  trace_event_buffer_commit+0x5c/0x260
>  trace_event_raw_event_softirq+0x47/0x80
>  raise_softirq_irqoff+0x6e/0xa0
>  rcu_read_unlock_special+0xb1/0x160
>  unwind_next_frame+0x203/0x9b0
>  __unwind_start+0x15d/0x1c0
>  arch_stack_walk+0x62/0xf0
>  stack_trace_save+0x48/0x70
>  __ftrace_trace_stack.constprop.0+0x144/0x180
>  trace_buffer_unlock_commit_regs+0x6d/0x220
>  trace_event_buffer_commit+0x5c/0x260
>  trace_event_raw_event_softirq+0x47/0x80
>  raise_softirq_irqoff+0x6e/0xa0
>  rcu_read_unlock_special+0xb1/0x160
>  unwind_next_frame+0x203/0x9b0
>  __unwind_start+0x15d/0x1c0
>  arch_stack_walk+0x62/0xf0
>  stack_trace_save+0x48/0x70
>  __ftrace_trace_stack.constprop.0+0x144/0x180
>  trace_buffer_unlock_commit_regs+0x6d/0x220
>  trace_event_buffer_commit+0x5c/0x260
>  trace_event_raw_event_softirq+0x47/0x80
>  raise_softirq_irqoff+0x6e/0xa0
>  rcu_read_unlock_special+0xb1/0x160
>  unwind_next_frame+0x203/0x9b0
>  __unwind_start+0x15d/0x1c0
>  arch_stack_walk+0x62/0xf0
>  stack_trace_save+0x48/0x70
>  __ftrace_trace_stack.constprop.0+0x144/0x180
>  trace_buffer_unlock_commit_regs+0x6d/0x220
>  trace_event_buffer_commit+0x5c/0x260
>  trace_event_raw_event_softirq+0x47/0x80
>  raise_softirq_irqoff+0x6e/0xa0
>  rcu_read_unlock_special+0xb1/0x160
>  __is_insn_slot_addr+0x54/0x70
>  kernel_text_address+0x48/0xc0
>  __kernel_text_address+0xd/0x40
>  unwind_get_return_address+0x1e/0x40
>  arch_stack_walk+0x9c/0xf0
>  stack_trace_save+0x48/0x70
>  __ftrace_trace_stack.constprop.0+0x144/0x180
>  trace_buffer_unlock_commit_regs+0x6d/0x220
>  trace_event_buffer_commit+0x5c/0x260
>  trace_event_raw_event_softirq+0x47/0x80
>  __raise_softirq_irqoff+0x61/0x80
>  __flush_smp_call_function_queue+0x115/0x420
>  __sysvec_call_function_single+0x17/0xb0
>  sysvec_call_function_single+0x8c/0xc0
>  </IRQ>
> 
> Commit b41642c87716 ("rcu: Fix rcu_read_unlock() deadloop due to IRQ work")
> fixed the infinite loop in rcu_read_unlock_special() for IRQ work by
> setting a flag before calling irq_work_queue_on(). We fix this issue by
> setting the same flag before calling raise_softirq_irqoff() and rename the
> flag to defer_qs_pending for more common.
> 
> Fixes: 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in __rcu_read_unlock()")
> Reported-by: Tengda Wu <wutengda2@huawei.com>
> Signed-off-by: Yao Kai <yaokai34@huawei.com>

Good change! It is the exact same pattern and both IRQ work and softirq
raising are for deferred QS purposes.

Reviewed-by: Joel Fernandes <joelagnelf@nvidia.com>

thanks,

 - Joel



> ---
>  kernel/rcu/tree.h        |  2 +-
>  kernel/rcu/tree_plugin.h | 15 +++++++++------
>  2 files changed, 10 insertions(+), 7 deletions(-)
> 
> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> index b8bbe7960cda..2265b9c2906e 100644
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -203,7 +203,7 @@ struct rcu_data {
>  					/*  during and after the last grace */
>  					/* period it is aware of. */
>  	struct irq_work defer_qs_iw;	/* Obtain later scheduler attention. */
> -	int defer_qs_iw_pending;	/* Scheduler attention pending? */
> +	int defer_qs_pending;		/* irqwork or softirq pending? */
>  	struct work_struct strict_work;	/* Schedule readers for strict GPs. */
>  
>  	/* 2) batch handling */
> diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
> index dbe2d02be824..95ad967adcf3 100644
> --- a/kernel/rcu/tree_plugin.h
> +++ b/kernel/rcu/tree_plugin.h
> @@ -487,8 +487,8 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
>  	union rcu_special special;
>  
>  	rdp = this_cpu_ptr(&rcu_data);
> -	if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING)
> -		rdp->defer_qs_iw_pending = DEFER_QS_IDLE;
> +	if (rdp->defer_qs_pending == DEFER_QS_PENDING)
> +		rdp->defer_qs_pending = DEFER_QS_IDLE;
>  
>  	/*
>  	 * If RCU core is waiting for this CPU to exit its critical section,
> @@ -645,7 +645,7 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
>  	 * 5. Deferred QS reporting does not happen.
>  	 */
>  	if (rcu_preempt_depth() > 0)
> -		WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE);
> +		WRITE_ONCE(rdp->defer_qs_pending, DEFER_QS_IDLE);
>  }
>  
>  /*
> @@ -747,7 +747,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
>  			// Using softirq, safe to awaken, and either the
>  			// wakeup is free or there is either an expedited
>  			// GP in flight or a potential need to deboost.
> -			raise_softirq_irqoff(RCU_SOFTIRQ);
> +			if (rdp->defer_qs_pending != DEFER_QS_PENDING) {
> +				rdp->defer_qs_pending = DEFER_QS_PENDING;
> +				raise_softirq_irqoff(RCU_SOFTIRQ);
> +			}
>  		} else {
>  			// Enabling BH or preempt does reschedule, so...
>  			// Also if no expediting and no possible deboosting,
> @@ -755,11 +758,11 @@ static void rcu_read_unlock_special(struct task_struct *t)
>  			// tick enabled.
>  			set_need_resched_current();
>  			if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
> -			    needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING &&
> +			    needs_exp && rdp->defer_qs_pending != DEFER_QS_PENDING &&
>  			    cpu_online(rdp->cpu)) {
>  				// Get scheduler to re-evaluate and call hooks.
>  				// If !IRQ_WORK, FQS scan will eventually IPI.
> -				rdp->defer_qs_iw_pending = DEFER_QS_PENDING;
> +				rdp->defer_qs_pending = DEFER_QS_PENDING;
>  				irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
>  			}
>  		}
> -- 
> 2.43.0
>
Re: [PATCH] rcu: Fix rcu_read_unlock() deadloop due to softirq
Posted by Joel Fernandes 1 month, 2 weeks ago

On 12/19/2025 10:14 AM, Joel Fernandes wrote:
> On Thu, Dec 18, 2025 at 03:49:50PM +0800, Yao Kai wrote:
>> Commit 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in
>> __rcu_read_unlock()") removes the recursion-protection code from
>> __rcu_read_unlock(). Therefore, we could invoke the deadloop in
>> raise_softirq_irqoff() with ftrace enabled as follows:
>>
>> WARNING: CPU: 0 PID: 0 at kernel/trace/trace.c:3021 __ftrace_trace_stack.constprop.0+0x172/0x180
>> Modules linked in: my_irq_work(O)
>> CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G O 6.18.0-rc7-dirty #23 PREEMPT(full)
>> Tainted: [O]=OOT_MODULE
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
>> RIP: 0010:__ftrace_trace_stack.constprop.0+0x172/0x180
>> RSP: 0018:ffffc900000034a8 EFLAGS: 00010002
>> RAX: 0000000000000000 RBX: 0000000000000004 RCX: 0000000000000000
>> RDX: 0000000000000003 RSI: ffffffff826d7b87 RDI: ffffffff826e9329
>> RBP: 0000000000090009 R08: 0000000000000005 R09: ffffffff82afbc4c
>> R10: 0000000000000008 R11: 0000000000011d7a R12: 0000000000000000
>> R13: ffff888003874100 R14: 0000000000000003 R15: ffff8880038c1054
>> FS:  0000000000000000(0000) GS:ffff8880fa8ea000(0000) knlGS:0000000000000000
>> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> CR2: 000055b31fa7f540 CR3: 00000000078f4005 CR4: 0000000000770ef0
>> PKRU: 55555554
>> Call Trace:
>>  <IRQ>
>>  trace_buffer_unlock_commit_regs+0x6d/0x220
>>  trace_event_buffer_commit+0x5c/0x260
>>  trace_event_raw_event_softirq+0x47/0x80
>>  raise_softirq_irqoff+0x6e/0xa0
>>  rcu_read_unlock_special+0xb1/0x160
>>  unwind_next_frame+0x203/0x9b0
>>  __unwind_start+0x15d/0x1c0
>>  arch_stack_walk+0x62/0xf0
>>  stack_trace_save+0x48/0x70
>>  __ftrace_trace_stack.constprop.0+0x144/0x180
>>  trace_buffer_unlock_commit_regs+0x6d/0x220
>>  trace_event_buffer_commit+0x5c/0x260
>>  trace_event_raw_event_softirq+0x47/0x80
>>  raise_softirq_irqoff+0x6e/0xa0
>>  rcu_read_unlock_special+0xb1/0x160
>>  unwind_next_frame+0x203/0x9b0
>>  __unwind_start+0x15d/0x1c0
>>  arch_stack_walk+0x62/0xf0
>>  stack_trace_save+0x48/0x70
>>  __ftrace_trace_stack.constprop.0+0x144/0x180
>>  trace_buffer_unlock_commit_regs+0x6d/0x220
>>  trace_event_buffer_commit+0x5c/0x260
>>  trace_event_raw_event_softirq+0x47/0x80
>>  raise_softirq_irqoff+0x6e/0xa0
>>  rcu_read_unlock_special+0xb1/0x160
>>  unwind_next_frame+0x203/0x9b0
>>  __unwind_start+0x15d/0x1c0
>>  arch_stack_walk+0x62/0xf0
>>  stack_trace_save+0x48/0x70
>>  __ftrace_trace_stack.constprop.0+0x144/0x180
>>  trace_buffer_unlock_commit_regs+0x6d/0x220
>>  trace_event_buffer_commit+0x5c/0x260
>>  trace_event_raw_event_softirq+0x47/0x80
>>  raise_softirq_irqoff+0x6e/0xa0
>>  rcu_read_unlock_special+0xb1/0x160
>>  __is_insn_slot_addr+0x54/0x70
>>  kernel_text_address+0x48/0xc0
>>  __kernel_text_address+0xd/0x40
>>  unwind_get_return_address+0x1e/0x40
>>  arch_stack_walk+0x9c/0xf0
>>  stack_trace_save+0x48/0x70
>>  __ftrace_trace_stack.constprop.0+0x144/0x180
>>  trace_buffer_unlock_commit_regs+0x6d/0x220
>>  trace_event_buffer_commit+0x5c/0x260
>>  trace_event_raw_event_softirq+0x47/0x80
>>  __raise_softirq_irqoff+0x61/0x80
>>  __flush_smp_call_function_queue+0x115/0x420
>>  __sysvec_call_function_single+0x17/0xb0
>>  sysvec_call_function_single+0x8c/0xc0
>>  </IRQ>
>>
>> Commit b41642c87716 ("rcu: Fix rcu_read_unlock() deadloop due to IRQ work")
>> fixed the infinite loop in rcu_read_unlock_special() for IRQ work by
>> setting a flag before calling irq_work_queue_on(). We fix this issue by
>> setting the same flag before calling raise_softirq_irqoff() and rename the
>> flag to defer_qs_pending for more common.
>>
>> Fixes: 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in __rcu_read_unlock()")
>> Reported-by: Tengda Wu <wutengda2@huawei.com>
>> Signed-off-by: Yao Kai <yaokai34@huawei.com>
> 
> Good change! It is the exact same pattern and both IRQ work and softirq
> raising are for deferred QS purposes.
> 
> Reviewed-by: Joel Fernandes <joelagnelf@nvidia.com>
Could you please update the following comment in tree.h to say "An IRQ work or
softirq":

/*
 * An IRQ work (deferred_qs_iw) is used by RCU to get the scheduler's attention.
 * to report quiescent states at the soonest possible time.
 * The request can be in one of the following states:
 * - DEFER_QS_IDLE: An IRQ work is yet to be scheduled.
 * - DEFER_QS_PENDING: An IRQ work was scheduled but either not yet run, or it
 *                     ran and we still haven't reported a quiescent state.
 */
#define DEFER_QS_IDLE           0
#define DEFER_QS_PENDING        1

And resend the patch with my Review tag?

thanks,

 - Joel
Re: [PATCH] rcu: Fix rcu_read_unlock() deadloop due to softirq
Posted by Joel Fernandes 1 month, 2 weeks ago

On 12/19/2025 10:38 AM, Joel Fernandes wrote:
> 
> 
> On 12/19/2025 10:14 AM, Joel Fernandes wrote:
>> On Thu, Dec 18, 2025 at 03:49:50PM +0800, Yao Kai wrote:
>>> Commit 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in
>>> __rcu_read_unlock()") removes the recursion-protection code from
>>> __rcu_read_unlock(). Therefore, we could invoke the deadloop in
>>> raise_softirq_irqoff() with ftrace enabled as follows:
>>>
>>> WARNING: CPU: 0 PID: 0 at kernel/trace/trace.c:3021 __ftrace_trace_stack.constprop.0+0x172/0x180
>>> Modules linked in: my_irq_work(O)
>>> CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G O 6.18.0-rc7-dirty #23 PREEMPT(full)
>>> Tainted: [O]=OOT_MODULE
>>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
>>> RIP: 0010:__ftrace_trace_stack.constprop.0+0x172/0x180
>>> RSP: 0018:ffffc900000034a8 EFLAGS: 00010002
>>> RAX: 0000000000000000 RBX: 0000000000000004 RCX: 0000000000000000
>>> RDX: 0000000000000003 RSI: ffffffff826d7b87 RDI: ffffffff826e9329
>>> RBP: 0000000000090009 R08: 0000000000000005 R09: ffffffff82afbc4c
>>> R10: 0000000000000008 R11: 0000000000011d7a R12: 0000000000000000
>>> R13: ffff888003874100 R14: 0000000000000003 R15: ffff8880038c1054
>>> FS:  0000000000000000(0000) GS:ffff8880fa8ea000(0000) knlGS:0000000000000000
>>> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>> CR2: 000055b31fa7f540 CR3: 00000000078f4005 CR4: 0000000000770ef0
>>> PKRU: 55555554
>>> Call Trace:
>>>  <IRQ>
>>>  trace_buffer_unlock_commit_regs+0x6d/0x220
>>>  trace_event_buffer_commit+0x5c/0x260
>>>  trace_event_raw_event_softirq+0x47/0x80
>>>  raise_softirq_irqoff+0x6e/0xa0
>>>  rcu_read_unlock_special+0xb1/0x160
>>>  unwind_next_frame+0x203/0x9b0
>>>  __unwind_start+0x15d/0x1c0
>>>  arch_stack_walk+0x62/0xf0
>>>  stack_trace_save+0x48/0x70
>>>  __ftrace_trace_stack.constprop.0+0x144/0x180
>>>  trace_buffer_unlock_commit_regs+0x6d/0x220
>>>  trace_event_buffer_commit+0x5c/0x260
>>>  trace_event_raw_event_softirq+0x47/0x80
>>>  raise_softirq_irqoff+0x6e/0xa0
>>>  rcu_read_unlock_special+0xb1/0x160
>>>  unwind_next_frame+0x203/0x9b0
>>>  __unwind_start+0x15d/0x1c0
>>>  arch_stack_walk+0x62/0xf0
>>>  stack_trace_save+0x48/0x70
>>>  __ftrace_trace_stack.constprop.0+0x144/0x180
>>>  trace_buffer_unlock_commit_regs+0x6d/0x220
>>>  trace_event_buffer_commit+0x5c/0x260
>>>  trace_event_raw_event_softirq+0x47/0x80
>>>  raise_softirq_irqoff+0x6e/0xa0
>>>  rcu_read_unlock_special+0xb1/0x160
>>>  unwind_next_frame+0x203/0x9b0
>>>  __unwind_start+0x15d/0x1c0
>>>  arch_stack_walk+0x62/0xf0
>>>  stack_trace_save+0x48/0x70
>>>  __ftrace_trace_stack.constprop.0+0x144/0x180
>>>  trace_buffer_unlock_commit_regs+0x6d/0x220
>>>  trace_event_buffer_commit+0x5c/0x260
>>>  trace_event_raw_event_softirq+0x47/0x80
>>>  raise_softirq_irqoff+0x6e/0xa0
>>>  rcu_read_unlock_special+0xb1/0x160
>>>  __is_insn_slot_addr+0x54/0x70
>>>  kernel_text_address+0x48/0xc0
>>>  __kernel_text_address+0xd/0x40
>>>  unwind_get_return_address+0x1e/0x40
>>>  arch_stack_walk+0x9c/0xf0
>>>  stack_trace_save+0x48/0x70
>>>  __ftrace_trace_stack.constprop.0+0x144/0x180
>>>  trace_buffer_unlock_commit_regs+0x6d/0x220
>>>  trace_event_buffer_commit+0x5c/0x260
>>>  trace_event_raw_event_softirq+0x47/0x80
>>>  __raise_softirq_irqoff+0x61/0x80
>>>  __flush_smp_call_function_queue+0x115/0x420
>>>  __sysvec_call_function_single+0x17/0xb0
>>>  sysvec_call_function_single+0x8c/0xc0
>>>  </IRQ>
>>>
>>> Commit b41642c87716 ("rcu: Fix rcu_read_unlock() deadloop due to IRQ work")
>>> fixed the infinite loop in rcu_read_unlock_special() for IRQ work by
>>> setting a flag before calling irq_work_queue_on(). We fix this issue by
>>> setting the same flag before calling raise_softirq_irqoff() and rename the
>>> flag to defer_qs_pending for more common.
>>>
>>> Fixes: 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in __rcu_read_unlock()")
>>> Reported-by: Tengda Wu <wutengda2@huawei.com>
>>> Signed-off-by: Yao Kai <yaokai34@huawei.com>
>>
>> Good change! It is the exact same pattern and both IRQ work and softirq
>> raising are for deferred QS purposes.
>>
>> Reviewed-by: Joel Fernandes <joelagnelf@nvidia.com>
> Could you please update the following comment in tree.h to say "An IRQ work or
> softirq":
> 
> /*
>  * An IRQ work (deferred_qs_iw) is used by RCU to get the scheduler's attention.
>  * to report quiescent states at the soonest possible time.
>  * The request can be in one of the following states:
>  * - DEFER_QS_IDLE: An IRQ work is yet to be scheduled.
>  * - DEFER_QS_PENDING: An IRQ work was scheduled but either not yet run, or it
>  *                     ran and we still haven't reported a quiescent state.
>  */
> #define DEFER_QS_IDLE           0
> #define DEFER_QS_PENDING        1
> 
> And resend the patch with my Review tag?

Oh, and update the variable name in the comment too (deferred_qs_iw) :) and all
the occurrences of "IRQ work" to "IRQ work or softirq".

 - Joel
Re: [PATCH] rcu: Fix rcu_read_unlock() deadloop due to softirq
Posted by Yao Kai 1 month, 2 weeks ago

On 12/19/2025 11:44 PM, Joel Fernandes wrote:
> 
> 
> On 12/19/2025 10:38 AM, Joel Fernandes wrote:
>>
>>
>> On 12/19/2025 10:14 AM, Joel Fernandes wrote:
>>> On Thu, Dec 18, 2025 at 03:49:50PM +0800, Yao Kai wrote:
>>>> Commit 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in
>>>> __rcu_read_unlock()") removes the recursion-protection code from
>>>> __rcu_read_unlock(). Therefore, we could invoke the deadloop in
>>>> raise_softirq_irqoff() with ftrace enabled as follows:
>>>>
>>>> WARNING: CPU: 0 PID: 0 at kernel/trace/trace.c:3021 __ftrace_trace_stack.constprop.0+0x172/0x180
>>>> Modules linked in: my_irq_work(O)
>>>> CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G O 6.18.0-rc7-dirty #23 PREEMPT(full)
>>>> Tainted: [O]=OOT_MODULE
>>>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
>>>> RIP: 0010:__ftrace_trace_stack.constprop.0+0x172/0x180
>>>> RSP: 0018:ffffc900000034a8 EFLAGS: 00010002
>>>> RAX: 0000000000000000 RBX: 0000000000000004 RCX: 0000000000000000
>>>> RDX: 0000000000000003 RSI: ffffffff826d7b87 RDI: ffffffff826e9329
>>>> RBP: 0000000000090009 R08: 0000000000000005 R09: ffffffff82afbc4c
>>>> R10: 0000000000000008 R11: 0000000000011d7a R12: 0000000000000000
>>>> R13: ffff888003874100 R14: 0000000000000003 R15: ffff8880038c1054
>>>> FS:  0000000000000000(0000) GS:ffff8880fa8ea000(0000) knlGS:0000000000000000
>>>> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>> CR2: 000055b31fa7f540 CR3: 00000000078f4005 CR4: 0000000000770ef0
>>>> PKRU: 55555554
>>>> Call Trace:
>>>>   <IRQ>
>>>>   trace_buffer_unlock_commit_regs+0x6d/0x220
>>>>   trace_event_buffer_commit+0x5c/0x260
>>>>   trace_event_raw_event_softirq+0x47/0x80
>>>>   raise_softirq_irqoff+0x6e/0xa0
>>>>   rcu_read_unlock_special+0xb1/0x160
>>>>   unwind_next_frame+0x203/0x9b0
>>>>   __unwind_start+0x15d/0x1c0
>>>>   arch_stack_walk+0x62/0xf0
>>>>   stack_trace_save+0x48/0x70
>>>>   __ftrace_trace_stack.constprop.0+0x144/0x180
>>>>   trace_buffer_unlock_commit_regs+0x6d/0x220
>>>>   trace_event_buffer_commit+0x5c/0x260
>>>>   trace_event_raw_event_softirq+0x47/0x80
>>>>   raise_softirq_irqoff+0x6e/0xa0
>>>>   rcu_read_unlock_special+0xb1/0x160
>>>>   unwind_next_frame+0x203/0x9b0
>>>>   __unwind_start+0x15d/0x1c0
>>>>   arch_stack_walk+0x62/0xf0
>>>>   stack_trace_save+0x48/0x70
>>>>   __ftrace_trace_stack.constprop.0+0x144/0x180
>>>>   trace_buffer_unlock_commit_regs+0x6d/0x220
>>>>   trace_event_buffer_commit+0x5c/0x260
>>>>   trace_event_raw_event_softirq+0x47/0x80
>>>>   raise_softirq_irqoff+0x6e/0xa0
>>>>   rcu_read_unlock_special+0xb1/0x160
>>>>   unwind_next_frame+0x203/0x9b0
>>>>   __unwind_start+0x15d/0x1c0
>>>>   arch_stack_walk+0x62/0xf0
>>>>   stack_trace_save+0x48/0x70
>>>>   __ftrace_trace_stack.constprop.0+0x144/0x180
>>>>   trace_buffer_unlock_commit_regs+0x6d/0x220
>>>>   trace_event_buffer_commit+0x5c/0x260
>>>>   trace_event_raw_event_softirq+0x47/0x80
>>>>   raise_softirq_irqoff+0x6e/0xa0
>>>>   rcu_read_unlock_special+0xb1/0x160
>>>>   __is_insn_slot_addr+0x54/0x70
>>>>   kernel_text_address+0x48/0xc0
>>>>   __kernel_text_address+0xd/0x40
>>>>   unwind_get_return_address+0x1e/0x40
>>>>   arch_stack_walk+0x9c/0xf0
>>>>   stack_trace_save+0x48/0x70
>>>>   __ftrace_trace_stack.constprop.0+0x144/0x180
>>>>   trace_buffer_unlock_commit_regs+0x6d/0x220
>>>>   trace_event_buffer_commit+0x5c/0x260
>>>>   trace_event_raw_event_softirq+0x47/0x80
>>>>   __raise_softirq_irqoff+0x61/0x80
>>>>   __flush_smp_call_function_queue+0x115/0x420
>>>>   __sysvec_call_function_single+0x17/0xb0
>>>>   sysvec_call_function_single+0x8c/0xc0
>>>>   </IRQ>
>>>>
>>>> Commit b41642c87716 ("rcu: Fix rcu_read_unlock() deadloop due to IRQ work")
>>>> fixed the infinite loop in rcu_read_unlock_special() for IRQ work by
>>>> setting a flag before calling irq_work_queue_on(). We fix this issue by
>>>> setting the same flag before calling raise_softirq_irqoff() and rename the
>>>> flag to defer_qs_pending for more common.
>>>>
>>>> Fixes: 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in __rcu_read_unlock()")
>>>> Reported-by: Tengda Wu <wutengda2@huawei.com>
>>>> Signed-off-by: Yao Kai <yaokai34@huawei.com>
>>>
>>> Good change! It is the exact same pattern and both IRQ work and softirq
>>> raising are for deferred QS purposes.
>>>
>>> Reviewed-by: Joel Fernandes <joelagnelf@nvidia.com>
>> Could you please update the following comment in tree.h to say "An IRQ work or
>> softirq":
>>
>> /*
>>   * An IRQ work (deferred_qs_iw) is used by RCU to get the scheduler's attention.
>>   * to report quiescent states at the soonest possible time.
>>   * The request can be in one of the following states:
>>   * - DEFER_QS_IDLE: An IRQ work is yet to be scheduled.
>>   * - DEFER_QS_PENDING: An IRQ work was scheduled but either not yet run, or it
>>   *                     ran and we still haven't reported a quiescent state.
>>   */
>> #define DEFER_QS_IDLE           0
>> #define DEFER_QS_PENDING        1
>>
>> And resend the patch with my Review tag?
> 
> Oh, and update the variable name in the comment too (deferred_qs_iw) :) and all
> the occurences of "IRQ work" to "IRQ work or softirq".
> 
>   - Joel
> 
> 

Thanks for the review. I will send a new patch later.

  - Kai
[PATCH] rcu: Fix rcu_read_unlock() deadloop due to softirq
Posted by Yao Kai 1 month, 2 weeks ago
Commit 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in
__rcu_read_unlock()") removed the recursion-protection code from
__rcu_read_unlock(). As a result, with ftrace enabled, the call to
raise_softirq_irqoff() in rcu_read_unlock_special() can recurse
indefinitely, as shown below:

WARNING: CPU: 0 PID: 0 at kernel/trace/trace.c:3021 __ftrace_trace_stack.constprop.0+0x172/0x180
Modules linked in: my_irq_work(O)
CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G O 6.18.0-rc7-dirty #23 PREEMPT(full)
Tainted: [O]=OOT_MODULE
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
RIP: 0010:__ftrace_trace_stack.constprop.0+0x172/0x180
RSP: 0018:ffffc900000034a8 EFLAGS: 00010002
RAX: 0000000000000000 RBX: 0000000000000004 RCX: 0000000000000000
RDX: 0000000000000003 RSI: ffffffff826d7b87 RDI: ffffffff826e9329
RBP: 0000000000090009 R08: 0000000000000005 R09: ffffffff82afbc4c
R10: 0000000000000008 R11: 0000000000011d7a R12: 0000000000000000
R13: ffff888003874100 R14: 0000000000000003 R15: ffff8880038c1054
FS:  0000000000000000(0000) GS:ffff8880fa8ea000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000055b31fa7f540 CR3: 00000000078f4005 CR4: 0000000000770ef0
PKRU: 55555554
Call Trace:
 <IRQ>
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 unwind_next_frame+0x203/0x9b0
 __unwind_start+0x15d/0x1c0
 arch_stack_walk+0x62/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 unwind_next_frame+0x203/0x9b0
 __unwind_start+0x15d/0x1c0
 arch_stack_walk+0x62/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 unwind_next_frame+0x203/0x9b0
 __unwind_start+0x15d/0x1c0
 arch_stack_walk+0x62/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 __is_insn_slot_addr+0x54/0x70
 kernel_text_address+0x48/0xc0
 __kernel_text_address+0xd/0x40
 unwind_get_return_address+0x1e/0x40
 arch_stack_walk+0x9c/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 __raise_softirq_irqoff+0x61/0x80
 __flush_smp_call_function_queue+0x115/0x420
 __sysvec_call_function_single+0x17/0xb0
 sysvec_call_function_single+0x8c/0xc0
 </IRQ>

Commit b41642c87716 ("rcu: Fix rcu_read_unlock() deadloop due to IRQ work")
fixed the infinite loop in rcu_read_unlock_special() for IRQ work by
setting a flag before calling irq_work_queue_on(). Fix this issue the same
way by setting that flag before calling raise_softirq_irqoff(), and rename
the flag to defer_qs_pending since it now covers both IRQ work and softirq.

Fixes: 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in __rcu_read_unlock()")
Reported-by: Tengda Wu <wutengda2@huawei.com>
Signed-off-by: Yao Kai <yaokai34@huawei.com>
Reviewed-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 kernel/rcu/tree.h        |  8 ++++----
 kernel/rcu/tree_plugin.h | 15 +++++++++------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index b8bbe7960cda..d1d2f746db3d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -175,11 +175,11 @@ struct rcu_snap_record {
 };
 
 /*
- * An IRQ work (deferred_qs_iw) is used by RCU to get the scheduler's attention.
+ * An IRQ work or softirq (deferred_qs) is used by RCU to get the scheduler's attention
  * to report quiescent states at the soonest possible time.
  * The request can be in one of the following states:
- * - DEFER_QS_IDLE: An IRQ work is yet to be scheduled.
- * - DEFER_QS_PENDING: An IRQ work was scheduled but either not yet run, or it
+ * - DEFER_QS_IDLE: An IRQ work or softirq is yet to be scheduled.
+ * - DEFER_QS_PENDING: An IRQ work or softirq was scheduled but either not yet run, or it
  *                     ran and we still haven't reported a quiescent state.
  */
 #define DEFER_QS_IDLE		0
@@ -203,7 +203,7 @@ struct rcu_data {
 					/*  during and after the last grace */
 					/* period it is aware of. */
 	struct irq_work defer_qs_iw;	/* Obtain later scheduler attention. */
-	int defer_qs_iw_pending;	/* Scheduler attention pending? */
+	int defer_qs_pending;		/* irqwork or softirq pending? */
 	struct work_struct strict_work;	/* Schedule readers for strict GPs. */
 
 	/* 2) batch handling */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index dbe2d02be824..95ad967adcf3 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -487,8 +487,8 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 	union rcu_special special;
 
 	rdp = this_cpu_ptr(&rcu_data);
-	if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING)
-		rdp->defer_qs_iw_pending = DEFER_QS_IDLE;
+	if (rdp->defer_qs_pending == DEFER_QS_PENDING)
+		rdp->defer_qs_pending = DEFER_QS_IDLE;
 
 	/*
 	 * If RCU core is waiting for this CPU to exit its critical section,
@@ -645,7 +645,7 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
 	 * 5. Deferred QS reporting does not happen.
 	 */
 	if (rcu_preempt_depth() > 0)
-		WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE);
+		WRITE_ONCE(rdp->defer_qs_pending, DEFER_QS_IDLE);
 }
 
 /*
@@ -747,7 +747,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
 			// Using softirq, safe to awaken, and either the
 			// wakeup is free or there is either an expedited
 			// GP in flight or a potential need to deboost.
-			raise_softirq_irqoff(RCU_SOFTIRQ);
+			if (rdp->defer_qs_pending != DEFER_QS_PENDING) {
+				rdp->defer_qs_pending = DEFER_QS_PENDING;
+				raise_softirq_irqoff(RCU_SOFTIRQ);
+			}
 		} else {
 			// Enabling BH or preempt does reschedule, so...
 			// Also if no expediting and no possible deboosting,
@@ -755,11 +758,11 @@ static void rcu_read_unlock_special(struct task_struct *t)
 			// tick enabled.
 			set_need_resched_current();
 			if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
-			    needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING &&
+			    needs_exp && rdp->defer_qs_pending != DEFER_QS_PENDING &&
 			    cpu_online(rdp->cpu)) {
 				// Get scheduler to re-evaluate and call hooks.
 				// If !IRQ_WORK, FQS scan will eventually IPI.
-				rdp->defer_qs_iw_pending = DEFER_QS_PENDING;
+				rdp->defer_qs_pending = DEFER_QS_PENDING;
 				irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
 			}
 		}
-- 
2.43.0
[PATCH] rcu: Fix rcu_read_unlock() deadloop due to softirq
Posted by Yao Kai 1 month, 2 weeks ago
Commit 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in
__rcu_read_unlock()") removed the recursion-protection code from
__rcu_read_unlock(). As a result, with ftrace enabled,
raise_softirq_irqoff() can recurse endlessly through
rcu_read_unlock_special() (a "deadloop"), as the following trace shows:

WARNING: CPU: 0 PID: 0 at kernel/trace/trace.c:3021 __ftrace_trace_stack.constprop.0+0x172/0x180
Modules linked in: my_irq_work(O)
CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G O 6.18.0-rc7-dirty #23 PREEMPT(full)
Tainted: [O]=OOT_MODULE
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
RIP: 0010:__ftrace_trace_stack.constprop.0+0x172/0x180
RSP: 0018:ffffc900000034a8 EFLAGS: 00010002
RAX: 0000000000000000 RBX: 0000000000000004 RCX: 0000000000000000
RDX: 0000000000000003 RSI: ffffffff826d7b87 RDI: ffffffff826e9329
RBP: 0000000000090009 R08: 0000000000000005 R09: ffffffff82afbc4c
R10: 0000000000000008 R11: 0000000000011d7a R12: 0000000000000000
R13: ffff888003874100 R14: 0000000000000003 R15: ffff8880038c1054
FS:  0000000000000000(0000) GS:ffff8880fa8ea000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000055b31fa7f540 CR3: 00000000078f4005 CR4: 0000000000770ef0
PKRU: 55555554
Call Trace:
 <IRQ>
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 unwind_next_frame+0x203/0x9b0
 __unwind_start+0x15d/0x1c0
 arch_stack_walk+0x62/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 unwind_next_frame+0x203/0x9b0
 __unwind_start+0x15d/0x1c0
 arch_stack_walk+0x62/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 unwind_next_frame+0x203/0x9b0
 __unwind_start+0x15d/0x1c0
 arch_stack_walk+0x62/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 __is_insn_slot_addr+0x54/0x70
 kernel_text_address+0x48/0xc0
 __kernel_text_address+0xd/0x40
 unwind_get_return_address+0x1e/0x40
 arch_stack_walk+0x9c/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 __raise_softirq_irqoff+0x61/0x80
 __flush_smp_call_function_queue+0x115/0x420
 __sysvec_call_function_single+0x17/0xb0
 sysvec_call_function_single+0x8c/0xc0
 </IRQ>

Commit b41642c87716 ("rcu: Fix rcu_read_unlock() deadloop due to IRQ work")
fixed the infinite loop in rcu_read_unlock_special() for IRQ work by
setting a flag before calling irq_work_queue_on(). Fix this issue the same
way by setting that flag before calling raise_softirq_irqoff(), and rename
the flag to defer_qs_pending because it now covers both IRQ work and
softirq.

Fixes: 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in __rcu_read_unlock()")
Reported-by: Tengda Wu <wutengda2@huawei.com>
Signed-off-by: Yao Kai <yaokai34@huawei.com>
Reviewed-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 kernel/rcu/tree.h        |  8 ++++----
 kernel/rcu/tree_plugin.h | 15 +++++++++------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index b8bbe7960cda..d1d2f746db3d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -175,11 +175,11 @@ struct rcu_snap_record {
 };
 
 /*
- * An IRQ work (deferred_qs_iw) is used by RCU to get the scheduler's attention.
+ * An IRQ work or softirq (deferred_qs) is used by RCU to get the scheduler's attention
  * to report quiescent states at the soonest possible time.
  * The request can be in one of the following states:
- * - DEFER_QS_IDLE: An IRQ work is yet to be scheduled.
- * - DEFER_QS_PENDING: An IRQ work was scheduled but either not yet run, or it
+ * - DEFER_QS_IDLE: An IRQ work or softirq is yet to be scheduled.
+ * - DEFER_QS_PENDING: An IRQ work or softirq was scheduled but either not yet run, or it
  *                     ran and we still haven't reported a quiescent state.
  */
 #define DEFER_QS_IDLE		0
@@ -203,7 +203,7 @@ struct rcu_data {
 					/*  during and after the last grace */
 					/* period it is aware of. */
 	struct irq_work defer_qs_iw;	/* Obtain later scheduler attention. */
-	int defer_qs_iw_pending;	/* Scheduler attention pending? */
+	int defer_qs_pending;		/* irqwork or softirq pending? */
 	struct work_struct strict_work;	/* Schedule readers for strict GPs. */
 
 	/* 2) batch handling */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index dbe2d02be824..95ad967adcf3 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -487,8 +487,8 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 	union rcu_special special;
 
 	rdp = this_cpu_ptr(&rcu_data);
-	if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING)
-		rdp->defer_qs_iw_pending = DEFER_QS_IDLE;
+	if (rdp->defer_qs_pending == DEFER_QS_PENDING)
+		rdp->defer_qs_pending = DEFER_QS_IDLE;
 
 	/*
 	 * If RCU core is waiting for this CPU to exit its critical section,
@@ -645,7 +645,7 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
 	 * 5. Deferred QS reporting does not happen.
 	 */
 	if (rcu_preempt_depth() > 0)
-		WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE);
+		WRITE_ONCE(rdp->defer_qs_pending, DEFER_QS_IDLE);
 }
 
 /*
@@ -747,7 +747,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
 			// Using softirq, safe to awaken, and either the
 			// wakeup is free or there is either an expedited
 			// GP in flight or a potential need to deboost.
-			raise_softirq_irqoff(RCU_SOFTIRQ);
+			if (rdp->defer_qs_pending != DEFER_QS_PENDING) {
+				rdp->defer_qs_pending = DEFER_QS_PENDING;
+				raise_softirq_irqoff(RCU_SOFTIRQ);
+			}
 		} else {
 			// Enabling BH or preempt does reschedule, so...
 			// Also if no expediting and no possible deboosting,
@@ -755,11 +758,11 @@ static void rcu_read_unlock_special(struct task_struct *t)
 			// tick enabled.
 			set_need_resched_current();
 			if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
-			    needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING &&
+			    needs_exp && rdp->defer_qs_pending != DEFER_QS_PENDING &&
 			    cpu_online(rdp->cpu)) {
 				// Get scheduler to re-evaluate and call hooks.
 				// If !IRQ_WORK, FQS scan will eventually IPI.
-				rdp->defer_qs_iw_pending = DEFER_QS_PENDING;
+				rdp->defer_qs_pending = DEFER_QS_PENDING;
 				irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
 			}
 		}
-- 
2.43.0
Re: [PATCH] rcu: Fix rcu_read_unlock() deadloop due to softirq
Posted by Joel Fernandes 1 month, 2 weeks ago

On 12/22/2025 3:06 AM, Yao Kai wrote:
> Commit 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in
> __rcu_read_unlock()") removes the recursion-protection code from
> __rcu_read_unlock(). Therefore, we could invoke the deadloop in
> raise_softirq_irqoff() with ftrace enabled as follows:
> 
> WARNING: CPU: 0 PID: 0 at kernel/trace/trace.c:3021 __ftrace_trace_stack.constprop.0+0x172/0x180
> Modules linked in: my_irq_work(O)
> CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G O 6.18.0-rc7-dirty #23 PREEMPT(full)
> Tainted: [O]=OOT_MODULE
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
> RIP: 0010:__ftrace_trace_stack.constprop.0+0x172/0x180
> RSP: 0018:ffffc900000034a8 EFLAGS: 00010002
> RAX: 0000000000000000 RBX: 0000000000000004 RCX: 0000000000000000
> RDX: 0000000000000003 RSI: ffffffff826d7b87 RDI: ffffffff826e9329
> RBP: 0000000000090009 R08: 0000000000000005 R09: ffffffff82afbc4c
> R10: 0000000000000008 R11: 0000000000011d7a R12: 0000000000000000
> R13: ffff888003874100 R14: 0000000000000003 R15: ffff8880038c1054
> FS:  0000000000000000(0000) GS:ffff8880fa8ea000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 000055b31fa7f540 CR3: 00000000078f4005 CR4: 0000000000770ef0
> PKRU: 55555554
> Call Trace:
>  <IRQ>
>  trace_buffer_unlock_commit_regs+0x6d/0x220
>  trace_event_buffer_commit+0x5c/0x260
>  trace_event_raw_event_softirq+0x47/0x80
>  raise_softirq_irqoff+0x6e/0xa0
>  rcu_read_unlock_special+0xb1/0x160
>  unwind_next_frame+0x203/0x9b0
>  __unwind_start+0x15d/0x1c0
>  arch_stack_walk+0x62/0xf0
>  stack_trace_save+0x48/0x70
>  __ftrace_trace_stack.constprop.0+0x144/0x180
>  trace_buffer_unlock_commit_regs+0x6d/0x220
>  trace_event_buffer_commit+0x5c/0x260
>  trace_event_raw_event_softirq+0x47/0x80
>  raise_softirq_irqoff+0x6e/0xa0
>  rcu_read_unlock_special+0xb1/0x160
>  unwind_next_frame+0x203/0x9b0
>  __unwind_start+0x15d/0x1c0
>  arch_stack_walk+0x62/0xf0
>  stack_trace_save+0x48/0x70
>  __ftrace_trace_stack.constprop.0+0x144/0x180
>  trace_buffer_unlock_commit_regs+0x6d/0x220
>  trace_event_buffer_commit+0x5c/0x260
>  trace_event_raw_event_softirq+0x47/0x80
>  raise_softirq_irqoff+0x6e/0xa0
>  rcu_read_unlock_special+0xb1/0x160
>  unwind_next_frame+0x203/0x9b0
>  __unwind_start+0x15d/0x1c0
>  arch_stack_walk+0x62/0xf0
>  stack_trace_save+0x48/0x70
>  __ftrace_trace_stack.constprop.0+0x144/0x180
>  trace_buffer_unlock_commit_regs+0x6d/0x220
>  trace_event_buffer_commit+0x5c/0x260
>  trace_event_raw_event_softirq+0x47/0x80
>  raise_softirq_irqoff+0x6e/0xa0
>  rcu_read_unlock_special+0xb1/0x160
>  __is_insn_slot_addr+0x54/0x70
>  kernel_text_address+0x48/0xc0
>  __kernel_text_address+0xd/0x40
>  unwind_get_return_address+0x1e/0x40
>  arch_stack_walk+0x9c/0xf0
>  stack_trace_save+0x48/0x70
>  __ftrace_trace_stack.constprop.0+0x144/0x180
>  trace_buffer_unlock_commit_regs+0x6d/0x220
>  trace_event_buffer_commit+0x5c/0x260
>  trace_event_raw_event_softirq+0x47/0x80
>  __raise_softirq_irqoff+0x61/0x80
>  __flush_smp_call_function_queue+0x115/0x420
>  __sysvec_call_function_single+0x17/0xb0
>  sysvec_call_function_single+0x8c/0xc0
>  </IRQ>
> 
> Commit b41642c87716 ("rcu: Fix rcu_read_unlock() deadloop due to IRQ work")
> fixed the infinite loop in rcu_read_unlock_special() for IRQ work by
> setting a flag before calling irq_work_queue_on(). We fix this issue by
> setting the same flag before calling raise_softirq_irqoff() and rename the
> flag to defer_qs_pending for more common.
> 
> Fixes: 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in __rcu_read_unlock()")
> Reported-by: Tengda Wu <wutengda2@huawei.com>
> Signed-off-by: Yao Kai <yaokai34@huawei.com>
> Reviewed-by: Joel Fernandes <joelagnelf@nvidia.com>
Thank you. I will apply this patch. I am preparing a few other RCU patches
from me and others, and will send them together as a series for
review/testing for 7.0.

 - Joel