[patch V2 06/37] rseq: Simplify the event notification
Posted by Thomas Gleixner 1 month, 1 week ago
Since commit 0190e4198e47 ("rseq: Deprecate RSEQ_CS_FLAG_NO_RESTART_ON_*
flags") the bits in task::rseq_event_mask are meaningless and setting
them individually is just extra work.

Aside from that, the only relevant point where an event has to be raised
is context switch. Neither the CPU nor the MM CID can change without
going through a context switch.

Collapse them all into a single boolean, which simplifies the code a lot,
and remove the pointless invocations that have been sprinkled all over
the place for no value.
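
The result boils down to a single producer/consumer pair. Condensed
sketch for illustration only, taken from the hunks below:

    /*
     * Producer: invoked at context switch, from execve, from rseq
     * registration and from the membarrier rseq IPI. Signal delivery
     * sets the flag directly before handling it.
     */
    static inline void rseq_sched_switch_event(struct task_struct *t)
    {
            if (t->rseq) {
                    t->rseq_event_pending = true;
                    set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
            }
    }

    /* Consumer, in __rseq_handle_notify_resume() on exit to user space */
    bool event;

    scoped_guard(RSEQ_EVENT_GUARD) {
            event = t->rseq_event_pending;
            t->rseq_event_pending = false;
    }
    if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event)
            ret = rseq_ip_fixup(regs, event);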

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
---
V2: Reduce it to the sched switch event.
---
 fs/exec.c                 |    2 -
 include/linux/rseq.h      |   66 +++++++++-------------------------------------
 include/linux/sched.h     |   10 +++---
 include/uapi/linux/rseq.h |   21 ++++----------
 kernel/rseq.c             |   28 +++++++++++--------
 kernel/sched/core.c       |    5 ---
 kernel/sched/membarrier.c |    8 ++---
 7 files changed, 48 insertions(+), 92 deletions(-)
---
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1775,7 +1775,7 @@ static int bprm_execve(struct linux_binp
 		force_fatal_sig(SIGSEGV);
 
 	sched_mm_cid_after_execve(current);
-	rseq_set_notify_resume(current);
+	rseq_sched_switch_event(current);
 	current->in_execve = 0;
 
 	return retval;
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -3,38 +3,8 @@
 #define _LINUX_RSEQ_H
 
 #ifdef CONFIG_RSEQ
-
-#include <linux/preempt.h>
 #include <linux/sched.h>
 
-#ifdef CONFIG_MEMBARRIER
-# define RSEQ_EVENT_GUARD	irq
-#else
-# define RSEQ_EVENT_GUARD	preempt
-#endif
-
-/*
- * Map the event mask on the user-space ABI enum rseq_cs_flags
- * for direct mask checks.
- */
-enum rseq_event_mask_bits {
-	RSEQ_EVENT_PREEMPT_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
-	RSEQ_EVENT_SIGNAL_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
-	RSEQ_EVENT_MIGRATE_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
-};
-
-enum rseq_event_mask {
-	RSEQ_EVENT_PREEMPT	= (1U << RSEQ_EVENT_PREEMPT_BIT),
-	RSEQ_EVENT_SIGNAL	= (1U << RSEQ_EVENT_SIGNAL_BIT),
-	RSEQ_EVENT_MIGRATE	= (1U << RSEQ_EVENT_MIGRATE_BIT),
-};
-
-static inline void rseq_set_notify_resume(struct task_struct *t)
-{
-	if (t->rseq)
-		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
-}
-
 void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
 
 static inline void rseq_handle_notify_resume(struct pt_regs *regs)
@@ -43,35 +13,27 @@ static inline void rseq_handle_notify_re
 		__rseq_handle_notify_resume(NULL, regs);
 }
 
-static inline void rseq_signal_deliver(struct ksignal *ksig,
-				       struct pt_regs *regs)
+static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
 {
 	if (current->rseq) {
-		scoped_guard(RSEQ_EVENT_GUARD)
-			__set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
+		current->rseq_event_pending = true;
 		__rseq_handle_notify_resume(ksig, regs);
 	}
 }
 
-/* rseq_preempt() requires preemption to be disabled. */
-static inline void rseq_preempt(struct task_struct *t)
+static inline void rseq_sched_switch_event(struct task_struct *t)
 {
-	__set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
-	rseq_set_notify_resume(t);
-}
-
-/* rseq_migrate() requires preemption to be disabled. */
-static inline void rseq_migrate(struct task_struct *t)
-{
-	__set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
-	rseq_set_notify_resume(t);
+	if (t->rseq) {
+		t->rseq_event_pending = true;
+		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+	}
 }
 
 static __always_inline void rseq_exit_to_user_mode(void)
 {
 	if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
-		if (WARN_ON_ONCE(current->rseq && current->rseq_event_mask))
-			current->rseq_event_mask = 0;
+		if (WARN_ON_ONCE(current->rseq && current->rseq_event_pending))
+			current->rseq_event_pending = false;
 	}
 }
 
@@ -85,12 +47,12 @@ static inline void rseq_fork(struct task
 		t->rseq = NULL;
 		t->rseq_len = 0;
 		t->rseq_sig = 0;
-		t->rseq_event_mask = 0;
+		t->rseq_event_pending = false;
 	} else {
 		t->rseq = current->rseq;
 		t->rseq_len = current->rseq_len;
 		t->rseq_sig = current->rseq_sig;
-		t->rseq_event_mask = current->rseq_event_mask;
+		t->rseq_event_pending = current->rseq_event_pending;
 	}
 }
 
@@ -99,15 +61,13 @@ static inline void rseq_execve(struct ta
 	t->rseq = NULL;
 	t->rseq_len = 0;
 	t->rseq_sig = 0;
-	t->rseq_event_mask = 0;
+	t->rseq_event_pending = false;
 }
 
 #else /* CONFIG_RSEQ */
-static inline void rseq_set_notify_resume(struct task_struct *t) { }
 static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { }
 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
-static inline void rseq_preempt(struct task_struct *t) { }
-static inline void rseq_migrate(struct task_struct *t) { }
+static inline void rseq_sched_switch_event(struct task_struct *t) { }
 static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { }
 static inline void rseq_execve(struct task_struct *t) { }
 static inline void rseq_exit_to_user_mode(void) { }
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1401,14 +1401,14 @@ struct task_struct {
 #endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_RSEQ
-	struct rseq __user *rseq;
-	u32 rseq_len;
-	u32 rseq_sig;
+	struct rseq __user		*rseq;
+	u32				rseq_len;
+	u32				rseq_sig;
 	/*
-	 * RmW on rseq_event_mask must be performed atomically
+	 * RmW on rseq_event_pending must be performed atomically
 	 * with respect to preemption.
 	 */
-	unsigned long rseq_event_mask;
+	bool				rseq_event_pending;
 # ifdef CONFIG_DEBUG_RSEQ
 	/*
 	 * This is a place holder to save a copy of the rseq fields for
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -114,20 +114,13 @@ struct rseq {
 	/*
 	 * Restartable sequences flags field.
 	 *
-	 * This field should only be updated by the thread which
-	 * registered this data structure. Read by the kernel.
-	 * Mainly used for single-stepping through rseq critical sections
-	 * with debuggers.
-	 *
-	 * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
-	 *     Inhibit instruction sequence block restart on preemption
-	 *     for this thread.
-	 * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
-	 *     Inhibit instruction sequence block restart on signal
-	 *     delivery for this thread.
-	 * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
-	 *     Inhibit instruction sequence block restart on migration for
-	 *     this thread.
+	 * This field was initialy intended to allow event masking for for
+	 * single-stepping through rseq critical sections with debuggers.
+	 * The kernel does not support this anymore and the relevant bits
+	 * are checked for being always false:
+	 *	- RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
+	 *	- RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
+	 *	- RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
 	 */
 	__u32 flags;
 
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -78,6 +78,12 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/rseq.h>
 
+#ifdef CONFIG_MEMBARRIER
+# define RSEQ_EVENT_GUARD	irq
+#else
+# define RSEQ_EVENT_GUARD	preempt
+#endif
+
 /* The original rseq structure size (including padding) is 32 bytes. */
 #define ORIG_RSEQ_SIZE		32
 
@@ -430,11 +436,11 @@ void __rseq_handle_notify_resume(struct
 	 */
 	if (regs) {
 		/*
-		 * Read and clear the event mask first. If the task was not
-		 * preempted or migrated or a signal is on the way, there
-		 * is no point in doing any of the heavy lifting here on
-		 * production kernels. In that case TIF_NOTIFY_RESUME was
-		 * raised by some other functionality.
+		 * Read and clear the event pending bit first. If the task
+		 * was not preempted or migrated or a signal is on the way,
+		 * there is no point in doing any of the heavy lifting here
+		 * on production kernels. In that case TIF_NOTIFY_RESUME
+		 * was raised by some other functionality.
 		 *
 		 * This is correct because the read/clear operation is
 		 * guarded against scheduler preemption, which makes it CPU
@@ -447,15 +453,15 @@ void __rseq_handle_notify_resume(struct
 		 * with the result handed in to allow the detection of
 		 * inconsistencies.
 		 */
-		u32 event_mask;
+		bool event;
 
 		scoped_guard(RSEQ_EVENT_GUARD) {
-			event_mask = t->rseq_event_mask;
-			t->rseq_event_mask = 0;
+			event = t->rseq_event_pending;
+			t->rseq_event_pending = false;
 		}
 
-		if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event_mask) {
-			ret = rseq_ip_fixup(regs, !!event_mask);
+		if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) {
+			ret = rseq_ip_fixup(regs, event);
 			if (unlikely(ret < 0))
 				goto error;
 		}
@@ -584,7 +590,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
 	 * registered, ensure the cpu_id_start and cpu_id fields
 	 * are updated before returning to user-space.
 	 */
-	rseq_set_notify_resume(current);
+	rseq_sched_switch_event(current);
 
 	return 0;
 }
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3364,7 +3364,6 @@ void set_task_cpu(struct task_struct *p,
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
-		rseq_migrate(p);
 		sched_mm_cid_migrate_from(p);
 		perf_event_task_migrate(p);
 	}
@@ -4795,7 +4794,6 @@ int sched_cgroup_fork(struct task_struct
 		p->sched_task_group = tg;
 	}
 #endif
-	rseq_migrate(p);
 	/*
 	 * We're setting the CPU for the first time, we don't migrate,
 	 * so use __set_task_cpu().
@@ -4859,7 +4857,6 @@ void wake_up_new_task(struct task_struct
 	 * as we're not fully set-up yet.
 	 */
 	p->recent_used_cpu = task_cpu(p);
-	rseq_migrate(p);
 	__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
 	rq = __task_rq_lock(p, &rf);
 	update_rq_clock(rq);
@@ -5153,7 +5150,7 @@ prepare_task_switch(struct rq *rq, struc
 	kcov_prepare_switch(prev);
 	sched_info_switch(rq, prev, next);
 	perf_event_task_sched_out(prev, next);
-	rseq_preempt(prev);
+	rseq_sched_switch_event(prev);
 	fire_sched_out_preempt_notifiers(prev, next);
 	kmap_local_sched_out();
 	prepare_task(next);
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -199,7 +199,7 @@ static void ipi_rseq(void *info)
 	 * is negligible.
 	 */
 	smp_mb();
-	rseq_preempt(current);
+	rseq_sched_switch_event(current);
 }
 
 static void ipi_sync_rq_state(void *info)
@@ -407,9 +407,9 @@ static int membarrier_private_expedited(
 		 * membarrier, we will end up with some thread in the mm
 		 * running without a core sync.
 		 *
-		 * For RSEQ, don't rseq_preempt() the caller.  User code
-		 * is not supposed to issue syscalls at all from inside an
-		 * rseq critical section.
+		 * For RSEQ, don't invoke rseq_sched_switch_event() on the
+		 * caller.  User code is not supposed to issue syscalls at
+		 * all from inside an rseq critical section.
 		 */
 		if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
 			preempt_disable();
Re: [patch V2 06/37] rseq: Simplify the event notification
Posted by Mathieu Desnoyers 1 month, 1 week ago
On 2025-08-23 12:39, Thomas Gleixner wrote:
> Since commit 0190e4198e47 ("rseq: Deprecate RSEQ_CS_FLAG_NO_RESTART_ON_*
> flags") the bits in task::rseq_event_mask are meaningless and setting
> them individually is just extra work.
> 
> Aside from that, the only relevant point where an event has to be raised
> is context switch. Neither the CPU nor the MM CID can change without
> going through a context switch.

Note: we may want to include the numa node id field as well in this
list of fields.

> 
> Collapse them all into a single boolean, which simplifies the code a lot,
> and remove the pointless invocations that have been sprinkled all over
> the place for no value.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> Cc: "Paul E. McKenney" <paulmck@kernel.org>
> Cc: Boqun Feng <boqun.feng@gmail.com>
> ---
> V2: Reduce it to the sched switch event.
> ---
>   fs/exec.c                 |    2 -
>   include/linux/rseq.h      |   66 +++++++++-------------------------------------
>   include/linux/sched.h     |   10 +++---
>   include/uapi/linux/rseq.h |   21 ++++----------
>   kernel/rseq.c             |   28 +++++++++++--------
>   kernel/sched/core.c       |    5 ---
>   kernel/sched/membarrier.c |    8 ++---
>   7 files changed, 48 insertions(+), 92 deletions(-)
> ---
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1775,7 +1775,7 @@ static int bprm_execve(struct linux_binp
>   		force_fatal_sig(SIGSEGV);
>   
>   	sched_mm_cid_after_execve(current);
> -	rseq_set_notify_resume(current);
> +	rseq_sched_switch_event(current);
>   	current->in_execve = 0;
>   
>   	return retval;
> --- a/include/linux/rseq.h
> +++ b/include/linux/rseq.h
> @@ -3,38 +3,8 @@
>   #define _LINUX_RSEQ_H
>   
>   #ifdef CONFIG_RSEQ
> -
> -#include <linux/preempt.h>
>   #include <linux/sched.h>
>   
> -#ifdef CONFIG_MEMBARRIER
> -# define RSEQ_EVENT_GUARD	irq
> -#else
> -# define RSEQ_EVENT_GUARD	preempt
> -#endif
> -
> -/*
> - * Map the event mask on the user-space ABI enum rseq_cs_flags
> - * for direct mask checks.
> - */
> -enum rseq_event_mask_bits {
> -	RSEQ_EVENT_PREEMPT_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
> -	RSEQ_EVENT_SIGNAL_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
> -	RSEQ_EVENT_MIGRATE_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
> -};
> -
> -enum rseq_event_mask {
> -	RSEQ_EVENT_PREEMPT	= (1U << RSEQ_EVENT_PREEMPT_BIT),
> -	RSEQ_EVENT_SIGNAL	= (1U << RSEQ_EVENT_SIGNAL_BIT),
> -	RSEQ_EVENT_MIGRATE	= (1U << RSEQ_EVENT_MIGRATE_BIT),
> -};
> -
> -static inline void rseq_set_notify_resume(struct task_struct *t)
> -{
> -	if (t->rseq)
> -		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
> -}
> -
>   void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
>   
>   static inline void rseq_handle_notify_resume(struct pt_regs *regs)
> @@ -43,35 +13,27 @@ static inline void rseq_handle_notify_re
>   		__rseq_handle_notify_resume(NULL, regs);
>   }
>   
> -static inline void rseq_signal_deliver(struct ksignal *ksig,
> -				       struct pt_regs *regs)
> +static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
>   {
>   	if (current->rseq) {
> -		scoped_guard(RSEQ_EVENT_GUARD)
> -			__set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
> +		current->rseq_event_pending = true;
>   		__rseq_handle_notify_resume(ksig, regs);
>   	}
>   }
>   
> -/* rseq_preempt() requires preemption to be disabled. */
> -static inline void rseq_preempt(struct task_struct *t)
> +static inline void rseq_sched_switch_event(struct task_struct *t)
>   {
> -	__set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
> -	rseq_set_notify_resume(t);
> -}
> -
> -/* rseq_migrate() requires preemption to be disabled. */
> -static inline void rseq_migrate(struct task_struct *t)
> -{
> -	__set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
> -	rseq_set_notify_resume(t);
> +	if (t->rseq) {
> +		t->rseq_event_pending = true;
> +		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
> +	}
>   }
>   
>   static __always_inline void rseq_exit_to_user_mode(void)
>   {
>   	if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
> -		if (WARN_ON_ONCE(current->rseq && current->rseq_event_mask))
> -			current->rseq_event_mask = 0;
> +		if (WARN_ON_ONCE(current->rseq && current->rseq_event_pending))
> +			current->rseq_event_pending = false;
>   	}
>   }
>   
> @@ -85,12 +47,12 @@ static inline void rseq_fork(struct task
>   		t->rseq = NULL;
>   		t->rseq_len = 0;
>   		t->rseq_sig = 0;
> -		t->rseq_event_mask = 0;
> +		t->rseq_event_pending = false;
>   	} else {
>   		t->rseq = current->rseq;
>   		t->rseq_len = current->rseq_len;
>   		t->rseq_sig = current->rseq_sig;
> -		t->rseq_event_mask = current->rseq_event_mask;
> +		t->rseq_event_pending = current->rseq_event_pending;
>   	}
>   }
>   
> @@ -99,15 +61,13 @@ static inline void rseq_execve(struct ta
>   	t->rseq = NULL;
>   	t->rseq_len = 0;
>   	t->rseq_sig = 0;
> -	t->rseq_event_mask = 0;
> +	t->rseq_event_pending = false;
>   }
>   
>   #else /* CONFIG_RSEQ */
> -static inline void rseq_set_notify_resume(struct task_struct *t) { }
>   static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { }
>   static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
> -static inline void rseq_preempt(struct task_struct *t) { }
> -static inline void rseq_migrate(struct task_struct *t) { }
> +static inline void rseq_sched_switch_event(struct task_struct *t) { }
>   static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { }
>   static inline void rseq_execve(struct task_struct *t) { }
>   static inline void rseq_exit_to_user_mode(void) { }
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1401,14 +1401,14 @@ struct task_struct {
>   #endif /* CONFIG_NUMA_BALANCING */
>   
>   #ifdef CONFIG_RSEQ
> -	struct rseq __user *rseq;
> -	u32 rseq_len;
> -	u32 rseq_sig;
> +	struct rseq __user		*rseq;
> +	u32				rseq_len;
> +	u32				rseq_sig;
>   	/*
> -	 * RmW on rseq_event_mask must be performed atomically
> +	 * RmW on rseq_event_pending must be performed atomically
>   	 * with respect to preemption.
>   	 */
> -	unsigned long rseq_event_mask;
> +	bool				rseq_event_pending;

AFAIU, this rseq_event_pending field is now concurrently set from:

- rseq_signal_deliver (without any preempt nor irqoff guard)
- rseq_sched_switch_event (with preemption disabled)

Is it safe to concurrently store to a "bool" field within a structure
without any protection against concurrent stores ? Typically I've used
an integer field just to be on the safe side in that kind of situation.

AFAIR, a bool type needs to be at least 1 byte. Do all architectures
supported by Linux have a single byte store instruction, or can we end
up incorrectly storing to other nearby fields ? (for instance, DEC
Alpha ?)
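
For illustration, the more conservative variant I have used in that kind
of situation looks roughly like this (sketch only, not tested):

    /* Word-sized flag, so the store is a plain aligned word store on
     * every architecture rather than a byte store. */
    u32                             rseq_event_pending;

    /* e.g. in rseq_sched_switch_event() / rseq_signal_deliver(): */
    t->rseq_event_pending = 1;

    /* and the consumer clears it under the guard as before: */
    u32 event;

    scoped_guard(RSEQ_EVENT_GUARD) {
            event = t->rseq_event_pending;
            t->rseq_event_pending = 0;
    }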

>   # ifdef CONFIG_DEBUG_RSEQ
>   	/*
>   	 * This is a place holder to save a copy of the rseq fields for
> --- a/include/uapi/linux/rseq.h
> +++ b/include/uapi/linux/rseq.h
> @@ -114,20 +114,13 @@ struct rseq {
>   	/*
>   	 * Restartable sequences flags field.
>   	 *
> -	 * This field should only be updated by the thread which
> -	 * registered this data structure. Read by the kernel.
> -	 * Mainly used for single-stepping through rseq critical sections
> -	 * with debuggers.
> -	 *
> -	 * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
> -	 *     Inhibit instruction sequence block restart on preemption
> -	 *     for this thread.
> -	 * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
> -	 *     Inhibit instruction sequence block restart on signal
> -	 *     delivery for this thread.
> -	 * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
> -	 *     Inhibit instruction sequence block restart on migration for
> -	 *     this thread.
> +	 * This field was initialy intended to allow event masking for for

initially

for for -> for

> +	 * single-stepping through rseq critical sections with debuggers.
> +	 * The kernel does not support this anymore and the relevant bits
> +	 * are checked for being always false:
> +	 *	- RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
> +	 *	- RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
> +	 *	- RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
>   	 */
>   	__u32 flags;
>   
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -78,6 +78,12 @@
>   #define CREATE_TRACE_POINTS
>   #include <trace/events/rseq.h>
>   
> +#ifdef CONFIG_MEMBARRIER
> +# define RSEQ_EVENT_GUARD	irq
> +#else
> +# define RSEQ_EVENT_GUARD	preempt
> +#endif
> +
>   /* The original rseq structure size (including padding) is 32 bytes. */
>   #define ORIG_RSEQ_SIZE		32
>   
> @@ -430,11 +436,11 @@ void __rseq_handle_notify_resume(struct
>   	 */
>   	if (regs) {
>   		/*
> -		 * Read and clear the event mask first. If the task was not
> -		 * preempted or migrated or a signal is on the way, there
> -		 * is no point in doing any of the heavy lifting here on
> -		 * production kernels. In that case TIF_NOTIFY_RESUME was
> -		 * raised by some other functionality.
> +		 * Read and clear the event pending bit first. If the task
> +		 * was not preempted or migrated or a signal is on the way,
> +		 * there is no point in doing any of the heavy lifting here
> +		 * on production kernels. In that case TIF_NOTIFY_RESUME
> +		 * was raised by some other functionality.
>   		 *
>   		 * This is correct because the read/clear operation is
>   		 * guarded against scheduler preemption, which makes it CPU
> @@ -447,15 +453,15 @@ void __rseq_handle_notify_resume(struct
>   		 * with the result handed in to allow the detection of
>   		 * inconsistencies.
>   		 */
> -		u32 event_mask;
> +		bool event;
>   
>   		scoped_guard(RSEQ_EVENT_GUARD) {
> -			event_mask = t->rseq_event_mask;
> -			t->rseq_event_mask = 0;
> +			event = t->rseq_event_pending;
> +			t->rseq_event_pending = false;
>   		}
>   
> -		if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event_mask) {
> -			ret = rseq_ip_fixup(regs, !!event_mask);
> +		if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) {
> +			ret = rseq_ip_fixup(regs, event);
>   			if (unlikely(ret < 0))
>   				goto error;
>   		}
> @@ -584,7 +590,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
>   	 * registered, ensure the cpu_id_start and cpu_id fields
>   	 * are updated before returning to user-space.
>   	 */
> -	rseq_set_notify_resume(current);
> +	rseq_sched_switch_event(current);
>   
>   	return 0;
>   }
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3364,7 +3364,6 @@ void set_task_cpu(struct task_struct *p,
>   		if (p->sched_class->migrate_task_rq)
>   			p->sched_class->migrate_task_rq(p, new_cpu);
>   		p->se.nr_migrations++;
> -		rseq_migrate(p);

OK yes, all rseq_migrate() calls can go away because a migration requires
a preemption, and those are combined into the same state.

Thanks,

Mathieu

>   		sched_mm_cid_migrate_from(p);
>   		perf_event_task_migrate(p);
>   	}
> @@ -4795,7 +4794,6 @@ int sched_cgroup_fork(struct task_struct
>   		p->sched_task_group = tg;
>   	}
>   #endif
> -	rseq_migrate(p);
>   	/*
>   	 * We're setting the CPU for the first time, we don't migrate,
>   	 * so use __set_task_cpu().
> @@ -4859,7 +4857,6 @@ void wake_up_new_task(struct task_struct
>   	 * as we're not fully set-up yet.
>   	 */
>   	p->recent_used_cpu = task_cpu(p);
> -	rseq_migrate(p);
>   	__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
>   	rq = __task_rq_lock(p, &rf);
>   	update_rq_clock(rq);
> @@ -5153,7 +5150,7 @@ prepare_task_switch(struct rq *rq, struc
>   	kcov_prepare_switch(prev);
>   	sched_info_switch(rq, prev, next);
>   	perf_event_task_sched_out(prev, next);
> -	rseq_preempt(prev);
> +	rseq_sched_switch_event(prev);
>   	fire_sched_out_preempt_notifiers(prev, next);
>   	kmap_local_sched_out();
>   	prepare_task(next);
> --- a/kernel/sched/membarrier.c
> +++ b/kernel/sched/membarrier.c
> @@ -199,7 +199,7 @@ static void ipi_rseq(void *info)
>   	 * is negligible.
>   	 */
>   	smp_mb();
> -	rseq_preempt(current);
> +	rseq_sched_switch_event(current);
>   }
>   
>   static void ipi_sync_rq_state(void *info)
> @@ -407,9 +407,9 @@ static int membarrier_private_expedited(
>   		 * membarrier, we will end up with some thread in the mm
>   		 * running without a core sync.
>   		 *
> -		 * For RSEQ, don't rseq_preempt() the caller.  User code
> -		 * is not supposed to issue syscalls at all from inside an
> -		 * rseq critical section.
> +		 * For RSEQ, don't invoke rseq_sched_switch_event() on the
> +		 * caller.  User code is not supposed to issue syscalls at
> +		 * all from inside an rseq critical section.
>   		 */
>   		if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
>   			preempt_disable();
> 


-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
Re: [patch V2 06/37] rseq: Simplify the event notification
Posted by Thomas Gleixner 1 month ago
On Mon, Aug 25 2025 at 13:36, Mathieu Desnoyers wrote:
> On 2025-08-23 12:39, Thomas Gleixner wrote:
>> Since commit 0190e4198e47 ("rseq: Deprecate RSEQ_CS_FLAG_NO_RESTART_ON_*
>> flags") the bits in task::rseq_event_mask are meaningless and setting
>> them individually is just extra work.
>> 
>> Aside from that, the only relevant point where an event has to be raised
>> is context switch. Neither the CPU nor the MM CID can change without
>> going through a context switch.
>
> Note: we may want to include the numa node id field as well in this
> list of fields.

What for? The node to CPU relationship is not magically changing, so you
can't have a situation where the task stays on the same CPU and suddenly
runs on a different node.

>> -	unsigned long rseq_event_mask;
>> +	bool				rseq_event_pending;
>
> AFAIU, this rseq_event_pending field is now concurrently set from:
>
> - rseq_signal_deliver (without any preempt nor irqoff guard)
> - rseq_sched_switch_event (with preemption disabled)
>
> Is it safe to concurrently store to a "bool" field within a structure
> without any protection against concurrent stores ? Typically I've used
> an integer field just to be on the safe side in that kind of situation.
>
> AFAIR, a bool type needs to be at least 1 byte. Do all architectures
> supported by Linux have a single byte store instruction, or can we end
> up incorrectly storing to other nearby fields ? (for instance, DEC
> Alpha ?)

All architectures which support RSEQ do and I really don't care about
ALPHA, which has other problems than that.

Thanks,

        tglx
Re: [patch V2 06/37] rseq: Simplify the event notification
Posted by Mathieu Desnoyers 4 weeks, 1 day ago
On 2025-09-02 09:39, Thomas Gleixner wrote:
> On Mon, Aug 25 2025 at 13:36, Mathieu Desnoyers wrote:
>> On 2025-08-23 12:39, Thomas Gleixner wrote:
>>> Since commit 0190e4198e47 ("rseq: Deprecate RSEQ_CS_FLAG_NO_RESTART_ON_*
>>> flags") the bits in task::rseq_event_mask are meaningless and setting
>>> them individually is just extra work.
>>>
>>> Aside from that, the only relevant point where an event has to be raised
>>> is context switch. Neither the CPU nor the MM CID can change without
>>> going through a context switch.
>>
>> Note: we may want to include the numa node id field as well in this
>> list of fields.
> 
> What for? The node to CPU relationship is not magically changing, so you
> can't have a situation where the task stays on the same CPU and suddenly
> runs on a different node.

Good point. For the record, I suspect what I was confused about on the
powerpc side is that PAPR [1] allows the architecture to reconfigure
NUMA node to CPU mapping for virtualization use-cases, but AFAIU when
this happens the kernel will keep its own original mapping after the
CPUs were onlined at least once.

This would explain the purpose of the lparnumascore command that returns
a score estimating how much the kernel view of NUMA node to CPU mappings
differs from the current HW.

So AFAIU, from a kernel perspective, the NUMA node to CPU mapping is
invariant after it's been observed.

Out of curiosity, does the hwloc tool return the kernel's
CPU to node mapping as "logical" listing, and the PAPR CPU to node
mapping (which can change dynamically) as "physical" listing ?

[1] https://github.com/ibm-power-utilities/powerpc-utils

> 
>>> -	unsigned long rseq_event_mask;
>>> +	bool				rseq_event_pending;
>>
>> AFAIU, this rseq_event_pending field is now concurrently set from:
>>
>> - rseq_signal_deliver (without any preempt nor irqoff guard)
>> - rseq_sched_switch_event (with preemption disabled)
>>
>> Is it safe to concurrently store to a "bool" field within a structure
>> without any protection against concurrent stores ? Typically I've used
>> an integer field just to be on the safe side in that kind of situation.
>>
>> AFAIR, a bool type needs to be at least 1 byte. Do all architectures
>> supported by Linux have a single byte store instruction, or can we end
>> up incorrectly storing to other nearby fields ? (for instance, DEC
>> Alpha ?)
> 
> All architectures which support RSEQ do and I really don't care about
> ALPHA, which has other problems than that.

OK.

Thanks!

Mathieu

> 
> Thanks,
> 
>          tglx


-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com