[PATCH RFC 02/14] rcu: Add per-CPU blocked task lists for PREEMPT_RCU

Posted by Joel Fernandes 1 month ago
Add per-CPU tracking of tasks blocked in RCU read-side critical
sections. Each rcu_data gets a blkd_list protected by blkd_lock,
mirroring the rcu_node blkd_tasks list at per-CPU granularity.

Tasks are added to the per-CPU list when they are preempted within an
RCU read-side critical section and removed in the rcu_read_unlock()
slow path. A WARN_ON_ONCE() in rcu_gp_init() verifies that the per-CPU
lists stay consistent with the corresponding rcu_node blkd_tasks list.
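
In outline (a condensed sketch of the hunks below; names match the
actual diff), the lifecycle is:

	/* On preemption (rcu_note_context_switch()): */
	t->rcu_blocked_cpu = rdp->cpu;
	raw_spin_lock(&rdp->blkd_lock);
	list_add(&t->rcu_rdp_entry, &rdp->blkd_list);
	raw_spin_unlock(&rdp->blkd_lock);

	/* On unlock (rcu_preempt_deferred_qs_irqrestore()): */
	blocked_cpu = t->rcu_blocked_cpu;
	if (blocked_cpu != -1) {
		blocked_rdp = per_cpu_ptr(&rcu_data, blocked_cpu);
		raw_spin_lock(&blocked_rdp->blkd_lock);
		list_del_init(&t->rcu_rdp_entry);
		t->rcu_blocked_cpu = -1;
		raw_spin_unlock(&blocked_rdp->blkd_lock);
	}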

Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 include/linux/sched.h    |  4 ++++
 kernel/fork.c            |  4 ++++
 kernel/rcu/Kconfig       | 12 ++++++++++++
 kernel/rcu/tree.c        | 32 ++++++++++++++++++++++++++++++++
 kernel/rcu/tree.h        |  6 ++++++
 kernel/rcu/tree_plugin.h | 21 +++++++++++++++++++++
 6 files changed, 79 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d395f2810fac..90ce501a568e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -931,6 +931,10 @@ struct task_struct {
 	union rcu_special		rcu_read_unlock_special;
 	struct list_head		rcu_node_entry;
 	struct rcu_node			*rcu_blocked_node;
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+	struct list_head		rcu_rdp_entry;
+	int				rcu_blocked_cpu;
+#endif
 #endif /* #ifdef CONFIG_PREEMPT_RCU */
 
 #ifdef CONFIG_TASKS_RCU
diff --git a/kernel/fork.c b/kernel/fork.c
index b1f3915d5f8e..7a5ba2d2c1b5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1819,6 +1819,10 @@ static inline void rcu_copy_process(struct task_struct *p)
 	p->rcu_read_unlock_special.s = 0;
 	p->rcu_blocked_node = NULL;
 	INIT_LIST_HEAD(&p->rcu_node_entry);
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+	INIT_LIST_HEAD(&p->rcu_rdp_entry);
+	p->rcu_blocked_cpu = -1;
+#endif
 #endif /* #ifdef CONFIG_PREEMPT_RCU */
 #ifdef CONFIG_TASKS_RCU
 	p->rcu_tasks_holdout = false;
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 4d9b21f69eaa..4bb12f1fed09 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -248,6 +248,18 @@ config RCU_EXP_KTHREAD
 
 	  Accept the default if unsure.
 
+config RCU_PER_CPU_BLOCKED_LISTS
+	bool "Use per-CPU blocked task lists in PREEMPT_RCU"
+	depends on PREEMPT_RCU
+	default n
+	help
+	  Enable per-CPU tracking of tasks blocked in RCU read-side
+	  critical sections. This allows the feature to be toggled quickly.
+	  Eventually this option will be removed in favor of always keeping
+	  the optimization enabled.
+
+	  Accept the default if unsure.
+
 config RCU_NOCB_CPU
 	bool "Offload RCU callback processing from boot-selected CPUs"
 	depends on TREE_RCU
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 293bbd9ac3f4..e2b6a4579086 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1809,6 +1809,14 @@ static noinline_for_stack bool rcu_gp_init(void)
 	struct rcu_node *rnp = rcu_get_root();
 	bool start_new_poll;
 	unsigned long old_gp_seq;
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+	struct task_struct *t_verify;
+	int cpu_verify;
+	int rnp_count;
+	int rdp_total;
+	struct rcu_data *rdp_cpu;
+	struct task_struct *t_rdp;
+#endif
 
 	WRITE_ONCE(rcu_state.gp_activity, jiffies);
 	raw_spin_lock_irq_rcu_node(rnp);
@@ -1891,6 +1899,26 @@ static noinline_for_stack bool rcu_gp_init(void)
 		 */
 		arch_spin_lock(&rcu_state.ofl_lock);
 		raw_spin_lock_rcu_node(rnp);
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+		/*
+		 * Verify rdp lists consistent with rnp list. Since the unlock
+		 * path removes from rdp before rnp, we can have tasks that are
+		 * on rnp but not on rdp (in the middle of being removed).
+		 * Therefore rnp_count >= rdp_total is the expected invariant.
+		 */
+		rnp_count = 0;
+		rdp_total = 0;
+		list_for_each_entry(t_verify, &rnp->blkd_tasks, rcu_node_entry)
+			rnp_count++;
+		for (cpu_verify = rnp->grplo; cpu_verify <= rnp->grphi; cpu_verify++) {
+			rdp_cpu = per_cpu_ptr(&rcu_data, cpu_verify);
+			raw_spin_lock(&rdp_cpu->blkd_lock);
+			list_for_each_entry(t_rdp, &rdp_cpu->blkd_list, rcu_rdp_entry)
+				rdp_total++;
+			raw_spin_unlock(&rdp_cpu->blkd_lock);
+		}
+		WARN_ON_ONCE(rnp_count < rdp_total);
+#endif
 		if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
 		    !rnp->wait_blkd_tasks) {
 			/* Nothing to do on this leaf rcu_node structure. */
@@ -4143,6 +4171,10 @@ rcu_boot_init_percpu_data(int cpu)
 	rdp->rcu_onl_gp_state = RCU_GP_CLEANED;
 	rdp->last_sched_clock = jiffies;
 	rdp->cpu = cpu;
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+	raw_spin_lock_init(&rdp->blkd_lock);
+	INIT_LIST_HEAD(&rdp->blkd_list);
+#endif
 	rcu_boot_init_nocb_percpu_data(rdp);
 }
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index b8bbe7960cda..13d5649a80fb 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -294,6 +294,12 @@ struct rcu_data {
 
 	long lazy_len;			/* Length of buffered lazy callbacks. */
 	int cpu;
+
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+	/* 8) Per-CPU blocked task tracking. */
+	raw_spinlock_t blkd_lock;	/* Protects blkd_list. */
+	struct list_head blkd_list;	/* Tasks blocked on this CPU. */
+#endif
 };
 
 /* Values for nocb_defer_wakeup field in struct rcu_data. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 73ba5f4a968d..5d2bde19131a 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -338,6 +338,12 @@ void rcu_note_context_switch(bool preempt)
 		raw_spin_lock_rcu_node(rnp);
 		t->rcu_read_unlock_special.b.blocked = true;
 		t->rcu_blocked_node = rnp;
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+		t->rcu_blocked_cpu = rdp->cpu;
+		raw_spin_lock(&rdp->blkd_lock);
+		list_add(&t->rcu_rdp_entry, &rdp->blkd_list);
+		raw_spin_unlock(&rdp->blkd_lock);
+#endif
 
 		/*
 		 * Verify the CPU's sanity, trace the preemption, and
@@ -485,6 +491,10 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 	struct rcu_data *rdp;
 	struct rcu_node *rnp;
 	union rcu_special special;
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+	int blocked_cpu;
+	struct rcu_data *blocked_rdp;
+#endif
 
 	rdp = this_cpu_ptr(&rcu_data);
 	if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING)
@@ -530,6 +540,17 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 		 * to loop.  Retain a WARN_ON_ONCE() out of sheer paranoia.
 		 */
 		rnp = t->rcu_blocked_node;
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+		/* Remove from per-CPU list if task was added to it. */
+		blocked_cpu = t->rcu_blocked_cpu;
+		if (blocked_cpu != -1) {
+			blocked_rdp = per_cpu_ptr(&rcu_data, blocked_cpu);
+			raw_spin_lock(&blocked_rdp->blkd_lock);
+			list_del_init(&t->rcu_rdp_entry);
+			t->rcu_blocked_cpu = -1;
+			raw_spin_unlock(&blocked_rdp->blkd_lock);
+		}
+#endif
 		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
 		WARN_ON_ONCE(rnp != t->rcu_blocked_node);
 		WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
-- 
2.34.1
Re: [PATCH RFC 02/14] rcu: Add per-CPU blocked task lists for PREEMPT_RCU
Posted by Steven Rostedt 1 month ago
On Fri,  2 Jan 2026 19:23:31 -0500
Joel Fernandes <joelagnelf@nvidia.com> wrote:

> --- a/kernel/rcu/Kconfig
> +++ b/kernel/rcu/Kconfig
> @@ -248,6 +248,18 @@ config RCU_EXP_KTHREAD
>  
>  	  Accept the default if unsure.
>  
> +config RCU_PER_CPU_BLOCKED_LISTS
> +	bool "Use per-CPU blocked task lists in PREEMPT_RCU"
> +	depends on PREEMPT_RCU

> +	default n

Nit: you don't need "default n". Options that don't specify a default
setting already default to "n".

> +	help
> +	  Enable per-CPU tracking of tasks blocked in RCU read-side
> +	  critical sections. This allows the feature to be toggled quickly.
> +	  Eventually this option will be removed in favor of always keeping
> +	  the optimization enabled.
> +
> +	  Accept the default if unsure.

Hmm, RCU is the only place that says "Accept the default". That wording
would usually be for non-boolean (numeric) values. A bool option should
say either "Say N if unsure" or "Say Y if unsure".

> +
>  config RCU_NOCB_CPU
>  	bool "Offload RCU callback processing from boot-selected CPUs"
>  	depends on TREE_RCU
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 293bbd9ac3f4..e2b6a4579086 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -1809,6 +1809,14 @@ static noinline_for_stack bool rcu_gp_init(void)
>  	struct rcu_node *rnp = rcu_get_root();
>  	bool start_new_poll;
>  	unsigned long old_gp_seq;
> +#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
> +	struct task_struct *t_verify;
> +	int cpu_verify;
> +	int rnp_count;
> +	int rdp_total;
> +	struct rcu_data *rdp_cpu;
> +	struct task_struct *t_rdp;
> +#endif
>  
>  	WRITE_ONCE(rcu_state.gp_activity, jiffies);
>  	raw_spin_lock_irq_rcu_node(rnp);
> @@ -1891,6 +1899,26 @@ static noinline_for_stack bool rcu_gp_init(void)
>  		 */
>  		arch_spin_lock(&rcu_state.ofl_lock);
>  		raw_spin_lock_rcu_node(rnp);
> +#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
> +		/*
> +		 * Verify rdp lists consistent with rnp list. Since the unlock
> +		 * path removes from rdp before rnp, we can have tasks that are
> +		 * on rnp but not on rdp (in the middle of being removed).
> +		 * Therefore rnp_count >= rdp_total is the expected invariant.
> +		 */
> +		rnp_count = 0;
> +		rdp_total = 0;
> +		list_for_each_entry(t_verify, &rnp->blkd_tasks, rcu_node_entry)
> +			rnp_count++;
> +		for (cpu_verify = rnp->grplo; cpu_verify <= rnp->grphi; cpu_verify++) {
> +			rdp_cpu = per_cpu_ptr(&rcu_data, cpu_verify);
> +			raw_spin_lock(&rdp_cpu->blkd_lock);
> +			list_for_each_entry(t_rdp, &rdp_cpu->blkd_list, rcu_rdp_entry)
> +				rdp_total++;
> +			raw_spin_unlock(&rdp_cpu->blkd_lock);
> +		}
> +		WARN_ON_ONCE(rnp_count < rdp_total);

This only happens at boot, right? This isn't something that executes at
normal run time, right? Otherwise I would be worried about loops like this
under raw spin locks that could affect RT.

> +#endif
>  		if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
>  		    !rnp->wait_blkd_tasks) {
>  			/* Nothing to do on this leaf rcu_node structure. */
> @@ -4143,6 +4171,10 @@ rcu_boot_init_percpu_data(int cpu)
>  	rdp->rcu_onl_gp_state = RCU_GP_CLEANED;
>  	rdp->last_sched_clock = jiffies;
>  	rdp->cpu = cpu;
> +#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
> +	raw_spin_lock_init(&rdp->blkd_lock);
> +	INIT_LIST_HEAD(&rdp->blkd_list);
> +#endif
>  	rcu_boot_init_nocb_percpu_data(rdp);
>  }
>  
> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> index b8bbe7960cda..13d5649a80fb 100644
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -294,6 +294,12 @@ struct rcu_data {
>  
>  	long lazy_len;			/* Length of buffered lazy callbacks. */
>  	int cpu;
> +
> +#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
> +	/* 8) Per-CPU blocked task tracking. */
> +	raw_spinlock_t blkd_lock;	/* Protects blkd_list. */
> +	struct list_head blkd_list;	/* Tasks blocked on this CPU. */
> +#endif
>  };
>  
>  /* Values for nocb_defer_wakeup field in struct rcu_data. */
> diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
> index 73ba5f4a968d..5d2bde19131a 100644
> --- a/kernel/rcu/tree_plugin.h
> +++ b/kernel/rcu/tree_plugin.h
> @@ -338,6 +338,12 @@ void rcu_note_context_switch(bool preempt)
>  		raw_spin_lock_rcu_node(rnp);
>  		t->rcu_read_unlock_special.b.blocked = true;
>  		t->rcu_blocked_node = rnp;
> +#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
> +		t->rcu_blocked_cpu = rdp->cpu;
> +		raw_spin_lock(&rdp->blkd_lock);
> +		list_add(&t->rcu_rdp_entry, &rdp->blkd_list);
> +		raw_spin_unlock(&rdp->blkd_lock);

Should we use scoped_guard?

		scoped_guard(raw_spinlock, &rdp->blkd_lock) {
			list_add(&t->rcu_rdp_entry, &rdp->blkd_list);
		}

> +#endif
>  
>  		/*
>  		 * Verify the CPU's sanity, trace the preemption, and
> @@ -485,6 +491,10 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
>  	struct rcu_data *rdp;
>  	struct rcu_node *rnp;
>  	union rcu_special special;
> +#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
> +	int blocked_cpu;
> +	struct rcu_data *blocked_rdp;
> +#endif
>  
>  	rdp = this_cpu_ptr(&rcu_data);
>  	if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING)
> @@ -530,6 +540,17 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
>  		 * to loop.  Retain a WARN_ON_ONCE() out of sheer paranoia.
>  		 */
>  		rnp = t->rcu_blocked_node;
> +#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
> +		/* Remove from per-CPU list if task was added to it. */
> +		blocked_cpu = t->rcu_blocked_cpu;


And use guard here?

		if (blocked_cpu != -1) {
			blocked_rdp = per_cpu_ptr(&rcu_data, blocked_cpu);
			guard(raw_spinlock)(&blocked_rdp->blkd_lock);
			list_del_init(&t->rcu_rdp_entry);
			t->rcu_blocked_cpu = -1;
		}

> +		}
> +#endif
>  		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
>  		WARN_ON_ONCE(rnp != t->rcu_blocked_node);
>  		WARN_ON_ONCE(!rcu_is_leaf_node(rnp));

-- Steve