This is a follow up on the V3 series, which can be found here:
https://lore.kernel.org/all/20250904185336.943880027@linutronix.de
The V2 posting contains a detailed list of the addressed problems. TLDR:
- A significant amount of pointless RSEQ operations on exit to user
space, which people have reported as a measurable impact after
glibc switched to using RSEQ
- Suboptimal hotpath handling both in the scheduler and on exit to user
space.
This series addresses these issues by:
1) Limiting the RSEQ work to the actual conditions where it is
required. The full benefit is only available for architectures using
the generic entry infrastructure. All others get at least the basic
improvements.
2) Re-implementing the whole user space handling based on proper data
structures and by actually looking at the impact it creates in the
fast path.
3) Moving the actual handling of RSEQ out to the latest point in the exit
path, where possible. This is fully inlined into the fast path to keep
the impact confined.
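For illustration, the fast path hook as it ends up in the delta patch
below (condensed; the real code carries a few more comments):

static __always_inline bool
rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
        /* Only invoked after all other work bits have been handled */
        if (likely(!test_tif_rseq(ti_work)))
                return false;

        /* A fault in the fast path forces another round via TIF_NOTIFY_RESUME */
        if (unlikely(__rseq_exit_to_user_mode_restart(regs)))
                return true;

        clear_tif_rseq();
        return false;
}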
Changes vs. V3:
1) Move _all_ RSEQ related data into an umbrella data structure
When trying to adapt the time slice extension PoC to the rework, I
noticed that just moving the event and IDs into a data structure is
dumb.
Moving all rseq related data into an umbrella struct makes it possible
to simplify fork/exec by reducing them to memset()/memcpy() plus minimal
extra work, which pretty much avoids having to copy or reset further
additions later on (see the condensed struct sketch after this list).
That's a purely mechanical change done with coccinelle on top of V3
and then gradually folded back with scripting into the series.
2) Further simplification of the exit_to_user_mode() integration.
The games with ti_work returned by rseq_exit_to_user_mode() are not
necessary at all and just complicate things.
The point is that the RSEQ code is only invoked once all TIF work
bits other than TIF_RSEQ have been processed. So moving the handling out
of the exit_to_user_mode() loop into a surrounding loop makes it way
simpler and the resulting ASM is more straightforward (see the loop
sketch after this list). The unlikely error case just loops back once
more into the inner loop.
That eliminates the extra TIF_RSEQ optimization of the previous series
(patch 37/37) as it now comes for free immediately when hooking up
TIF_RSEQ. That means the related helper and the ugly goto are gone
too.
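For reference, the umbrella struct and the resulting reset helpers,
condensed from the delta patch below:

struct rseq_data {
        struct rseq __user      *usrptr;
        u32                     len;
        u32                     sig;
        struct rseq_event       event;
        struct rseq_ids         ids;
};

/* fork(CLONE_VM), unregister and execve() all collapse into this */
static inline void rseq_reset(struct task_struct *t)
{
        memset(&t->rseq, 0, sizeof(t->rseq));
        t->rseq.ids.cpu_cid = ~0ULL;
}

static inline void rseq_execve(struct task_struct *t)
{
        rseq_reset(t);
}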
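And the resulting outer/inner loop split in exit_to_user_mode_loop(),
also condensed from the delta below:

__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
                                                     unsigned long ti_work)
{
        for (;;) {
                /* Inner loop handles all work bits except TIF_RSEQ */
                ti_work = __exit_to_user_mode_loop(regs, ti_work);

                /* RSEQ fast path; loops back only in the unlikely fault case */
                if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work)))
                        return ti_work;
                ti_work = read_thread_flags();
        }
}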
Delta patch to V3 is below.
As with the previous version, these patches have a pile of dependencies:
The series depends on the separately posted rseq bugfix:
https://lore.kernel.org/lkml/87o6sj6z95.ffs@tglx/
and the uaccess generic helper series:
https://lore.kernel.org/lkml/20250813150610.521355442@linutronix.de/
and a related futex fix in
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/urgent
The combination of all of them and some other related fixes (rseq
selftests) is available here:
git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git rseq/base
For your convenience all of it is also available as a conglomerate from
git:
git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git rseq/perf
Thanks,
tglx
---
Documentation/admin-guide/kernel-parameters.txt | 4
arch/Kconfig | 4
arch/loongarch/Kconfig | 1
arch/loongarch/include/asm/thread_info.h | 76 +-
arch/riscv/Kconfig | 1
arch/riscv/include/asm/thread_info.h | 31 -
arch/s390/Kconfig | 1
arch/s390/include/asm/thread_info.h | 44 -
arch/x86/Kconfig | 1
arch/x86/entry/syscall_32.c | 3
arch/x86/include/asm/thread_info.h | 76 +-
drivers/hv/mshv_root_main.c | 3
fs/binfmt_elf.c | 2
fs/exec.c | 2
include/asm-generic/thread_info_tif.h | 51 +
include/linux/entry-common.h | 38 -
include/linux/irq-entry-common.h | 68 ++
include/linux/mm.h | 25
include/linux/resume_user_mode.h | 2
include/linux/rseq.h | 235 +++++----
include/linux/rseq_entry.h | 607 +++++++++++++++++++++++
include/linux/rseq_types.h | 93 +++
include/linux/sched.h | 48 +
include/linux/thread_info.h | 5
include/trace/events/rseq.h | 4
include/uapi/linux/rseq.h | 21
init/Kconfig | 28 +
kernel/entry/common.c | 39 -
kernel/entry/syscall-common.c | 8
kernel/ptrace.c | 6
kernel/rseq.c | 623 +++++++++---------------
kernel/sched/core.c | 10
kernel/sched/membarrier.c | 8
kernel/sched/sched.h | 5
virt/kvm/kvm_main.c | 3
35 files changed, 1450 insertions(+), 726 deletions(-)
---
Delta to V3:
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index 2f166e93f016..b8ea95011ec3 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -11,11 +11,11 @@ void __rseq_handle_slowpath(struct pt_regs *regs);
static inline void rseq_handle_slowpath(struct pt_regs *regs)
{
if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
- if (current->rseq_event.slowpath)
+ if (current->rseq.event.slowpath)
__rseq_handle_slowpath(regs);
} else {
/* '&' is intentional to spare one conditional branch */
- if (current->rseq_event.sched_switch & current->rseq_event.has_rseq)
+ if (current->rseq.event.sched_switch & current->rseq.event.has_rseq)
__rseq_handle_slowpath(regs);
}
}
@@ -30,10 +30,10 @@ static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *reg
{
if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
/* '&' is intentional to spare one conditional branch */
- if (current->rseq_event.has_rseq & current->rseq_event.user_irq)
+ if (current->rseq.event.has_rseq & current->rseq.event.user_irq)
__rseq_signal_deliver(ksig->sig, regs);
} else {
- if (current->rseq_event.has_rseq)
+ if (current->rseq.event.has_rseq)
__rseq_signal_deliver(ksig->sig, regs);
}
}
@@ -46,7 +46,7 @@ static inline void rseq_raise_notify_resume(struct task_struct *t)
/* Invoked from context switch to force evaluation on exit to user */
static __always_inline void rseq_sched_switch_event(struct task_struct *t)
{
- struct rseq_event *ev = &t->rseq_event;
+ struct rseq_event *ev = &t->rseq.event;
if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
/*
@@ -64,7 +64,7 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t)
}
} else {
if (ev->has_rseq) {
- t->rseq_event.sched_switch = true;
+ t->rseq.event.sched_switch = true;
rseq_raise_notify_resume(t);
}
}
@@ -79,7 +79,7 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t)
*/
static __always_inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu)
{
- t->rseq_event.ids_changed = true;
+ t->rseq.event.ids_changed = true;
}
/*
@@ -96,16 +96,16 @@ static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, un
* provide a conditional for it readily. So avoid excessive updates
* when nothing changes.
*/
- if (t->rseq_ids.mm_cid != cid)
- t->rseq_event.ids_changed = true;
+ if (t->rseq.ids.mm_cid != cid)
+ t->rseq.event.ids_changed = true;
}
/* Enforce a full update after RSEQ registration and when execve() failed */
static inline void rseq_force_update(void)
{
- if (current->rseq_event.has_rseq) {
- current->rseq_event.ids_changed = true;
- current->rseq_event.sched_switch = true;
+ if (current->rseq.event.has_rseq) {
+ current->rseq.event.ids_changed = true;
+ current->rseq.event.sched_switch = true;
rseq_raise_notify_resume(current);
}
}
@@ -124,15 +124,27 @@ static inline void rseq_force_update(void)
*/
static inline void rseq_virt_userspace_exit(void)
{
+ if (current->rseq.event.sched_switch)
/*
* The generic optimization for deferring RSEQ updates until the next
* exit relies on having a dedicated TIF_RSEQ.
*/
if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) &&
- current->rseq_event.sched_switch)
+ current->rseq.event.sched_switch)
rseq_raise_notify_resume(current);
}
+static inline void rseq_reset(struct task_struct *t)
+{
+ memset(&t->rseq, 0, sizeof(t->rseq));
+ t->rseq.ids.cpu_cid = ~0ULL;
+}
+
+static inline void rseq_execve(struct task_struct *t)
+{
+ rseq_reset(t);
+}
+
/*
* If parent process has a registered restartable sequences area, the
* child inherits. Unregister rseq for a clone with CLONE_VM set.
@@ -140,17 +152,10 @@ static inline void rseq_virt_userspace_exit(void)
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
if (clone_flags & CLONE_VM) {
- t->rseq = NULL;
- t->rseq_len = 0;
- t->rseq_sig = 0;
- t->rseq_ids.cpu_cid = ~0ULL;
- t->rseq_event.all = 0;
+ rseq_reset(t);
} else {
t->rseq = current->rseq;
- t->rseq_len = current->rseq_len;
- t->rseq_sig = current->rseq_sig;
- t->rseq_ids.cpu_cid = ~0ULL;
- t->rseq_event = current->rseq_event;
+ t->rseq.ids.cpu_cid = ~0ULL;
/*
* If it has rseq, force it into the slow path right away
* because it is guaranteed to fault.
@@ -160,22 +165,13 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
* for those who do it's required to enforce the slow path
* as the scheduler sets only TIF_RSEQ.
*/
- if (t->rseq_event.has_rseq) {
- t->rseq_event.slowpath = true;
+ if (t->rseq.event.has_rseq) {
+ t->rseq.event.slowpath = true;
set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
}
}
}
-static inline void rseq_execve(struct task_struct *t)
-{
- t->rseq = NULL;
- t->rseq_len = 0;
- t->rseq_sig = 0;
- t->rseq_ids.cpu_cid = ~0ULL;
- t->rseq_event.all = 0;
-}
-
#else /* CONFIG_RSEQ */
static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index fe6b3cc54d54..ce1ad66c48c3 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -11,7 +11,6 @@ struct rseq_stats {
unsigned long signal;
unsigned long slowpath;
unsigned long fastpath;
- unsigned long quicktif;
unsigned long ids;
unsigned long cs;
unsigned long clear;
@@ -65,7 +64,7 @@ static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
}
#else /* CONFIG_TRACEPOINT */
-static inline void rseq_trace_update(struct task_struct *t) { }
+static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { }
static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
unsigned long offset, unsigned long abort_ip) { }
#endif /* !CONFIG_TRACEPOINT */
@@ -84,7 +83,7 @@ bool rseq_debug_validate_ids(struct task_struct *t);
static __always_inline void rseq_note_user_irq_entry(void)
{
if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
- current->rseq_event.user_irq = true;
+ current->rseq.event.user_irq = true;
}
/*
@@ -172,17 +171,17 @@ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsi
/* abort_ip - 4 is >= 0. See abort_ip check above */
uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
unsafe_get_user(usig, uc_sig, fail);
- if (unlikely(usig != t->rseq_sig))
+ if (unlikely(usig != t->rseq.sig))
goto die;
/* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */
if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
/* If not in interrupt from user context, let it die */
- if (unlikely(!t->rseq_event.user_irq))
+ if (unlikely(!t->rseq.event.user_irq))
goto die;
}
- unsafe_put_user(0ULL, &t->rseq->rseq_cs, fail);
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, fail);
user_access_end();
instruction_pointer_set(regs, (unsigned long)abort_ip);
@@ -191,12 +190,12 @@ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsi
rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
return true;
clear:
- unsafe_put_user(0ULL, &t->rseq->rseq_cs, fail);
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, fail);
user_access_end();
rseq_stat_inc(rseq_stats.clear);
return true;
die:
- t->rseq_event.fatal = true;
+ t->rseq.event.fatal = true;
fail:
user_access_end();
return false;
@@ -209,23 +208,23 @@ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsi
*/
bool rseq_debug_validate_ids(struct task_struct *t)
{
- struct rseq __user *rseq = t->rseq;
+ struct rseq __user *rseq = t->rseq.usrptr;
u32 cpu_id, uval, node_id;
- if (t->rseq_ids.cpu_cid == ~0)
+ if (t->rseq.ids.cpu_cid == ~0)
return true;
/*
* Look it up outside of the user access section as cpu_to_node()
* can end up in debug code.
*/
- node_id = cpu_to_node(t->rseq_ids.cpu_id);
+ node_id = cpu_to_node(t->rseq.ids.cpu_id);
if (!user_read_masked_begin(rseq))
return false;
unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
- if (cpu_id != t->rseq_ids.cpu_id)
+ if (cpu_id != t->rseq.ids.cpu_id)
goto die;
unsafe_get_user(uval, &rseq->cpu_id, efault);
if (uval != cpu_id)
@@ -234,12 +233,12 @@ bool rseq_debug_validate_ids(struct task_struct *t)
if (uval != node_id)
goto die;
unsafe_get_user(uval, &rseq->mm_cid, efault);
- if (uval != t->rseq_ids.mm_cid)
+ if (uval != t->rseq.ids.mm_cid)
goto die;
user_access_end();
return true;
die:
- t->rseq_event.fatal = true;
+ t->rseq.event.fatal = true;
efault:
user_access_end();
return false;
@@ -263,7 +262,7 @@ rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long c
rseq_stat_inc(rseq_stats.cs);
if (unlikely(csaddr >= tasksize)) {
- t->rseq_event.fatal = true;
+ t->rseq.event.fatal = true;
return false;
}
@@ -307,11 +306,11 @@ rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long c
/* The address is guaranteed to be >= 0 and < TASK_SIZE */
uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
unsafe_get_user(usig, uc_sig, fail);
- if (unlikely(usig != t->rseq_sig))
+ if (unlikely(usig != t->rseq.sig))
goto die;
/* Invalidate the critical section */
- unsafe_put_user(0ULL, &t->rseq->rseq_cs, fail);
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, fail);
user_access_end();
/* Update the instruction pointer */
@@ -321,12 +320,12 @@ rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long c
rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
return true;
clear:
- unsafe_put_user(0ULL, &t->rseq->rseq_cs, fail);
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, fail);
user_access_end();
rseq_stat_inc(rseq_stats.clear);
return true;
die:
- t->rseq_event.fatal = true;
+ t->rseq.event.fatal = true;
fail:
user_access_end();
return false;
@@ -358,7 +357,7 @@ static rseq_inline
bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
u32 node_id, u64 *csaddr)
{
- struct rseq __user *rseq = t->rseq;
+ struct rseq __user *rseq = t->rseq.usrptr;
if (static_branch_unlikely(&rseq_debug_enabled)) {
if (!rseq_debug_validate_ids(t))
@@ -377,7 +376,7 @@ bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
user_access_end();
/* Cache the new values */
- t->rseq_ids.cpu_cid = ids->cpu_cid;
+ t->rseq.ids.cpu_cid = ids->cpu_cid;
rseq_stat_inc(rseq_stats.ids);
rseq_trace_update(t, ids);
return true;
@@ -405,7 +404,7 @@ static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *r
*/
if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
if (!static_branch_unlikely(&rseq_debug_enabled)) {
- if (likely(!t->rseq_event.user_irq))
+ if (likely(!t->rseq.event.user_irq))
return true;
}
}
@@ -476,21 +475,21 @@ static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *reg
* A sane compiler requires four instructions for the nothing to do
* case including clearing the events, but your milage might vary.
*/
- if (likely(!(t->rseq_event.sched_switch & t->rseq_event.has_rseq)))
+ if (likely(!(t->rseq.event.sched_switch & t->rseq.event.has_rseq)))
goto done;
rseq_stat_inc(rseq_stats.fastpath);
pagefault_disable();
- if (likely(!t->rseq_event.ids_changed)) {
+ if (likely(!t->rseq.event.ids_changed)) {
/*
* If IDs have not changed rseq_event::user_irq must be true
* See rseq_sched_switch_event().
*/
u64 csaddr;
- if (unlikely(get_user_masked_u64(&csaddr, &t->rseq->rseq_cs)))
+ if (unlikely(get_user_masked_u64(&csaddr, &t->rseq.usrptr->rseq_cs)))
goto fail;
if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
@@ -512,61 +511,55 @@ static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *reg
done:
/* Clear state so next entry starts from a clean slate */
- t->rseq_event.events = 0;
+ t->rseq.event.events = 0;
return false;
fail:
pagefault_enable();
/* Force it into the slow path. Don't clear the state! */
- t->rseq_event.slowpath = true;
+ t->rseq.event.slowpath = true;
set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
return true;
}
+/*
+ * Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS
+ * as that's not upstream yet.
+ */
#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
-# define CHECK_TIF_RSEQ _TIF_RSEQ
+static __always_inline bool test_tif_rseq(unsigned long ti_work)
+{
+ return ti_work & _TIF_RSEQ;
+}
+
static __always_inline void clear_tif_rseq(void)
{
static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME);
clear_thread_flag(TIF_RSEQ);
}
#else
-# define CHECK_TIF_RSEQ 0UL
-static inline void clear_tif_rseq(void) { }
+static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; }
+static __always_inline void clear_tif_rseq(void) { }
#endif
-static __always_inline unsigned long
-rseq_exit_to_user_mode_work(struct pt_regs *regs, unsigned long ti_work, const unsigned long mask)
+static __always_inline bool
+rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
- /*
- * Check if all work bits have been cleared before handling rseq.
- *
- * In case of a seperate TIF_RSEQ this checks for all other bits to
- * be cleared and TIF_RSEQ to be set.
- */
- if ((ti_work & mask) != CHECK_TIF_RSEQ)
- return ti_work;
+ if (likely(!test_tif_rseq(ti_work)))
+ return false;
- if (likely(!__rseq_exit_to_user_mode_restart(regs))) {
- clear_tif_rseq();
- return ti_work & ~CHECK_TIF_RSEQ;
- }
- return ti_work | _TIF_NOTIFY_RESUME;
-}
+ if (unlikely(__rseq_exit_to_user_mode_restart(regs)))
+ return true;
-static __always_inline bool
-rseq_exit_to_user_mode_early(unsigned long ti_work, const unsigned long mask)
-{
- if (IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS))
- return (ti_work & mask) == CHECK_TIF_RSEQ;
+ clear_tif_rseq();
return false;
}
-#endif /* !CONFIG_GENERIC_ENTRY */
+#endif /* CONFIG_GENERIC_ENTRY */
static __always_inline void rseq_syscall_exit_to_user_mode(void)
{
- struct rseq_event *ev = &current->rseq_event;
+ struct rseq_event *ev = &current->rseq.event;
rseq_stat_inc(rseq_stats.exit);
@@ -579,7 +572,7 @@ static __always_inline void rseq_syscall_exit_to_user_mode(void)
static __always_inline void rseq_irqentry_exit_to_user_mode(void)
{
- struct rseq_event *ev = &current->rseq_event;
+ struct rseq_event *ev = &current->rseq.event;
rseq_stat_inc(rseq_stats.exit);
@@ -601,18 +594,11 @@ static inline void rseq_debug_syscall_return(struct pt_regs *regs)
__rseq_debug_syscall_return(regs);
}
#else /* CONFIG_RSEQ */
-static inline unsigned long rseq_exit_to_user_mode_work(struct pt_regs *regs,
- unsigned long ti_work,
- const unsigned long mask)
-{
- return ti_work;
-}
-
-static inline bool rseq_exit_to_user_mode_early(unsigned long ti_work, const unsigned long mask)
+static inline void rseq_note_user_irq_entry(void) { }
+static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs)
{
return false;
}
-static inline void rseq_note_user_irq_entry(void) { }
static inline void rseq_syscall_exit_to_user_mode(void) { }
static inline void rseq_irqentry_exit_to_user_mode(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 68dfc215bbff..9c7a34154de8 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -3,10 +3,11 @@
#define _LINUX_RSEQ_TYPES_H
#include <linux/types.h>
-/* Forward declaration for sched.h */
+
+#ifdef CONFIG_RSEQ
struct rseq;
-/*
+/**
* struct rseq_event - Storage for rseq related event management
* @all: Compound to initialize and clear the data efficiently
* @events: Compound to access events with a single load/store
@@ -50,7 +51,7 @@ struct rseq_event {
};
};
-/*
+/**
* struct rseq_ids - Cache for ids, which need to be updated
* @cpu_cid: Compound of @cpu_id and @mm_cid to make the
* compiler emit a single compare on 64-bit
@@ -69,4 +70,24 @@ struct rseq_ids {
};
};
+/**
+ * struct rseq_data - Storage for all rseq related data
+ * @usrptr: Pointer to the registered user space RSEQ memory
+ * @len: Length of the RSEQ region
+ * @sig: Signature of critial section abort IPs
+ * @event: Storage for event management
+ * @ids: Storage for cached CPU ID and MM CID
+ */
+struct rseq_data {
+ struct rseq __user *usrptr;
+ u32 len;
+ u32 sig;
+ struct rseq_event event;
+ struct rseq_ids ids;
+};
+
+#else /* CONFIG_RSEQ */
+struct rseq_data { };
+#endif /* !CONFIG_RSEQ */
+
#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5ba86a668980..857ed17d443b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1400,13 +1400,7 @@ struct task_struct {
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
-#ifdef CONFIG_RSEQ
- struct rseq __user *rseq;
- u32 rseq_len;
- u32 rseq_sig;
- struct rseq_event rseq_event;
- struct rseq_ids rseq_ids;
-#endif
+ struct rseq_data rseq;
#ifdef CONFIG_SCHED_MM_CID
int mm_cid; /* Current cid in mm */
diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h
index a6ec3f0c8ae7..ce85d650bf4b 100644
--- a/include/trace/events/rseq.h
+++ b/include/trace/events/rseq.h
@@ -21,9 +21,9 @@ TRACE_EVENT(rseq_update,
),
TP_fast_assign(
- __entry->cpu_id = t->rseq_ids.cpu_id;
+ __entry->cpu_id = t->rseq.ids.cpu_id;
__entry->node_id = cpu_to_node(__entry->cpu_id);
- __entry->mm_cid = t->rseq_ids.mm_cid;
+ __entry->mm_cid = t->rseq.ids.mm_cid;
),
TP_printk("cpu_id=%d node_id=%d mm_cid=%d", __entry->cpu_id,
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 73028b98aa6a..cca17016c5da 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -11,26 +11,21 @@
/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
-/**
- * exit_to_user_mode_loop - do any pending work before leaving to user space
- * @regs: Pointer to pt_regs on entry stack
- * @ti_work: TIF work flags as read by the caller
- */
-__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
- unsigned long ti_work)
+#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
+#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ)
+#else
+#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK)
+#endif
+
+static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs,
+ unsigned long ti_work)
{
/*
* Before returning to user space ensure that all pending work
* items have been completed.
- *
- * Optimize for TIF_RSEQ being the only bit set.
*/
- if (rseq_exit_to_user_mode_early(ti_work, EXIT_TO_USER_MODE_WORK)) {
- rseq_stat_inc(rseq_stats.quicktif);
- goto do_rseq;
- }
+ while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) {
- do {
local_irq_enable_exit_to_user(ti_work);
if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
@@ -62,26 +57,29 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
tick_nohz_user_enter_prepare();
ti_work = read_thread_flags();
-
- do_rseq:
- /*
- * This returns the unmodified ti_work, when ti_work is not
- * empty (except for TIF_RSEQ). In that case it waits for
- * the next round to avoid multiple updates in case of
- * rescheduling.
- *
- * When it handles rseq it returns either with empty work
- * on success or with TIF_NOTIFY_RESUME set on failure to
- * kick the handling into the slow path.
- */
- ti_work = rseq_exit_to_user_mode_work(regs, ti_work, EXIT_TO_USER_MODE_WORK);
-
- } while (ti_work & EXIT_TO_USER_MODE_WORK);
+ }
/* Return the latest work state for arch_exit_to_user_mode() */
return ti_work;
}
+/**
+ * exit_to_user_mode_loop - do any pending work before leaving to user space
+ * @regs: Pointer to pt_regs on entry stack
+ * @ti_work: TIF work flags as read by the caller
+ */
+__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+ unsigned long ti_work)
+{
+ for (;;) {
+ ti_work = __exit_to_user_mode_loop(regs, ti_work);
+
+ if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work)))
+ return ti_work;
+ ti_work = read_thread_flags();
+ }
+}
+
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
irqentry_state_t ret = {
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 75a84efad40f..392ec2f75f01 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -793,9 +793,9 @@ static long ptrace_get_rseq_configuration(struct task_struct *task,
unsigned long size, void __user *data)
{
struct ptrace_rseq_configuration conf = {
- .rseq_abi_pointer = (u64)(uintptr_t)task->rseq,
- .rseq_abi_size = task->rseq_len,
- .signature = task->rseq_sig,
+ .rseq_abi_pointer = (u64)(uintptr_t)task->rseq.usrptr,
+ .rseq_abi_size = task->rseq.len,
+ .signature = task->rseq.sig,
.flags = 0,
};
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 01c7402d13a3..52b276a5e004 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -134,7 +134,6 @@ static int rseq_stats_show(struct seq_file *m, void *p)
stats.signal += data_race(per_cpu(rseq_stats.signal, cpu));
stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu));
stats.fastpath += data_race(per_cpu(rseq_stats.fastpath, cpu));
- stats.quicktif += data_race(per_cpu(rseq_stats.quicktif, cpu));
stats.ids += data_race(per_cpu(rseq_stats.ids, cpu));
stats.cs += data_race(per_cpu(rseq_stats.cs, cpu));
stats.clear += data_race(per_cpu(rseq_stats.clear, cpu));
@@ -145,7 +144,6 @@ static int rseq_stats_show(struct seq_file *m, void *p)
seq_printf(m, "signal: %16lu\n", stats.signal);
seq_printf(m, "slowp: %16lu\n", stats.slowpath);
seq_printf(m, "fastp: %16lu\n", stats.fastpath);
- seq_printf(m, "quickt: %16lu\n", stats.quicktif);
seq_printf(m, "ids: %16lu\n", stats.ids);
seq_printf(m, "cs: %16lu\n", stats.cs);
seq_printf(m, "clear: %16lu\n", stats.clear);
@@ -227,7 +225,7 @@ static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
{
u64 csaddr;
- if (get_user_masked_u64(&csaddr, &t->rseq->rseq_cs))
+ if (get_user_masked_u64(&csaddr, &t->rseq.usrptr->rseq_cs))
return false;
if (likely(!csaddr))
return true;
@@ -271,10 +269,10 @@ static void rseq_slowpath_update_usr(struct pt_regs *regs)
* inconsistencies.
*/
scoped_guard(irq) {
+ event = t->rseq.event.sched_switch;
+ t->rseq.event.all &= evt_mask.all;
ids.cpu_id = task_cpu(t);
ids.mm_cid = task_mm_cid(t);
- event = t->rseq_event.sched_switch;
- t->rseq_event.all &= evt_mask.all;
}
if (!event)
@@ -287,7 +285,7 @@ static void rseq_slowpath_update_usr(struct pt_regs *regs)
* Clear the errors just in case this might survive magically, but
* leave the rest intact.
*/
- t->rseq_event.error = 0;
+ t->rseq.event.error = 0;
force_sig(SIGSEGV);
}
}
@@ -325,7 +323,7 @@ void __rseq_signal_deliver(int sig, struct pt_regs *regs)
* Clear the errors just in case this might survive
* magically, but leave the rest intact.
*/
- current->rseq_event.error = 0;
+ current->rseq.event.error = 0;
force_sigsegv(sig);
}
}
@@ -335,9 +333,9 @@ void __rseq_debug_syscall_return(struct pt_regs *regs)
struct task_struct *t = current;
u64 csaddr;
- if (!t->rseq_event.has_rseq)
+ if (!t->rseq.event.has_rseq)
return;
- if (get_user_masked_u64(&csaddr, &t->rseq->rseq_cs))
+ if (get_user_masked_u64(&csaddr, &t->rseq.usrptr->rseq_cs))
goto fail;
if (likely(!csaddr))
return;
@@ -393,33 +391,30 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
if (flags & ~RSEQ_FLAG_UNREGISTER)
return -EINVAL;
/* Unregister rseq for current thread. */
- if (current->rseq != rseq || !current->rseq)
+ if (current->rseq.usrptr != rseq || !current->rseq.usrptr)
return -EINVAL;
- if (rseq_len != current->rseq_len)
+ if (rseq_len != current->rseq.len)
return -EINVAL;
- if (current->rseq_sig != sig)
+ if (current->rseq.sig != sig)
return -EPERM;
if (!rseq_reset_ids())
return -EFAULT;
- current->rseq = NULL;
- current->rseq_sig = 0;
- current->rseq_len = 0;
- current->rseq_event.all = 0;
+ rseq_reset(current);
return 0;
}
if (unlikely(flags))
return -EINVAL;
- if (current->rseq) {
+ if (current->rseq.usrptr) {
/*
* If rseq is already registered, check whether
* the provided address differs from the prior
* one.
*/
- if (current->rseq != rseq || rseq_len != current->rseq_len)
+ if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len)
return -EINVAL;
- if (current->rseq_sig != sig)
+ if (current->rseq.sig != sig)
return -EPERM;
/* Already registered. */
return -EBUSY;
@@ -457,16 +452,16 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
* Activate the registration by setting the rseq area address, length
* and signature in the task struct.
*/
- current->rseq = rseq;
- current->rseq_len = rseq_len;
- current->rseq_sig = sig;
+ current->rseq.usrptr = rseq;
+ current->rseq.len = rseq_len;
+ current->rseq.sig = sig;
/*
* If rseq was previously inactive, and has just been
* registered, ensure the cpu_id_start and cpu_id fields
* are updated before returning to user-space.
*/
- current->rseq_event.has_rseq = true;
+ current->rseq.event.has_rseq = true;
rseq_force_update();
return 0;
From: Thomas Gleixner <tglx@linutronix.de>
The RSEQ critical section mechanism only clears the event mask when a
critical section is registered, otherwise it is stale and collects
bits.
That means once a critical section is installed the first invocation of
that code when TIF_NOTIFY_RESUME is set will abort the critical section,
even when the TIF bit was not raised by the rseq preempt/migrate/signal
helpers.
This also has a performance implication because TIF_NOTIFY_RESUME is a
multiplexing TIF bit, which is utilized by quite some infrastructure. That
means every invocation of __rseq_notify_resume() goes unconditionally
through the heavy lifting of user space access and consistency checks even
if there is no reason to do so.
Keeping the stale event mask around when exiting to user space also
prevents it from being utilized by the upcoming time slice extension
mechanism.
Avoid this by reading and clearing the event mask before doing the user
space critical section access with interrupts or preemption disabled, which
ensures that the read and clear operation is CPU local atomic versus
scheduling and the membarrier IPI.
This is correct as after re-enabling interrupts/preemption any relevant
event will set the bit again and raise TIF_NOTIFY_RESUME, which makes the
user space exit code take another round of TIF bit clearing.
If the event mask was non-zero, invoke the slow path. On debug kernels the
slow path is invoked unconditionally and the result of the event mask
evaluation is handed in.
Add an exit path check after the TIF bit loop, which validates on debug
kernels that the event mask is zero before exiting to user space.
While at it reword the convoluted comment why the pt_regs pointer can be
NULL under certain circumstances.
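Condensed, the resulting logic in __rseq_handle_notify_resume() looks
like this (the full hunk is in the diff below; 't' is current and
'error' is the existing error label):

        u32 event_mask;

        /* CPU-local atomic read/clear vs. preemption and the membarrier IPI */
        scoped_guard(RSEQ_EVENT_GUARD) {
                event_mask = t->rseq_event_mask;
                t->rseq_event_mask = 0;
        }

        /* Debug kernels evaluate unconditionally to detect inconsistencies */
        if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event_mask) {
                ret = rseq_ip_fixup(regs, !!event_mask);
                if (unlikely(ret < 0))
                        goto error;
        }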
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
---
include/linux/irq-entry-common.h | 7 ++--
include/linux/rseq.h | 10 +++++
kernel/rseq.c | 66 ++++++++++++++++++++++++++-------------
3 files changed, 58 insertions(+), 25 deletions(-)
---
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -2,11 +2,12 @@
#ifndef __LINUX_IRQENTRYCOMMON_H
#define __LINUX_IRQENTRYCOMMON_H
+#include <linux/context_tracking.h>
+#include <linux/kmsan.h>
+#include <linux/rseq.h>
#include <linux/static_call_types.h>
#include <linux/syscalls.h>
-#include <linux/context_tracking.h>
#include <linux/tick.h>
-#include <linux/kmsan.h>
#include <linux/unwind_deferred.h>
#include <asm/entry-common.h>
@@ -226,6 +227,8 @@ static __always_inline void exit_to_user
arch_exit_to_user_mode_prepare(regs, ti_work);
+ rseq_exit_to_user_mode();
+
/* Ensure that kernel state is sane for a return to userspace */
kmap_assert_nomap();
lockdep_assert_irqs_disabled();
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -66,6 +66,14 @@ static inline void rseq_migrate(struct t
rseq_set_notify_resume(t);
}
+static __always_inline void rseq_exit_to_user_mode(void)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
+ if (WARN_ON_ONCE(current->rseq && current->rseq_event_mask))
+ current->rseq_event_mask = 0;
+ }
+}
+
/*
* If parent process has a registered restartable sequences area, the
* child inherits. Unregister rseq for a clone with CLONE_VM set.
@@ -118,7 +126,7 @@ static inline void rseq_fork(struct task
static inline void rseq_execve(struct task_struct *t)
{
}
-
+static inline void rseq_exit_to_user_mode(void) { }
#endif
#ifdef CONFIG_DEBUG_RSEQ
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -324,9 +324,9 @@ static bool rseq_warn_flags(const char *
return true;
}
-static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
+static int rseq_check_flags(struct task_struct *t, u32 cs_flags)
{
- u32 flags, event_mask;
+ u32 flags;
int ret;
if (rseq_warn_flags("rseq_cs", cs_flags))
@@ -339,17 +339,7 @@ static int rseq_need_restart(struct task
if (rseq_warn_flags("rseq", flags))
return -EINVAL;
-
- /*
- * Load and clear event mask atomically with respect to
- * scheduler preemption and membarrier IPIs.
- */
- scoped_guard(RSEQ_EVENT_GUARD) {
- event_mask = t->rseq_event_mask;
- t->rseq_event_mask = 0;
- }
-
- return !!event_mask;
+ return 0;
}
static int clear_rseq_cs(struct rseq __user *rseq)
@@ -380,7 +370,7 @@ static bool in_rseq_cs(unsigned long ip,
return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
}
-static int rseq_ip_fixup(struct pt_regs *regs)
+static int rseq_ip_fixup(struct pt_regs *regs, bool abort)
{
unsigned long ip = instruction_pointer(regs);
struct task_struct *t = current;
@@ -398,9 +388,11 @@ static int rseq_ip_fixup(struct pt_regs
*/
if (!in_rseq_cs(ip, &rseq_cs))
return clear_rseq_cs(t->rseq);
- ret = rseq_need_restart(t, rseq_cs.flags);
- if (ret <= 0)
+ ret = rseq_check_flags(t, rseq_cs.flags);
+ if (ret < 0)
return ret;
+ if (!abort)
+ return 0;
ret = clear_rseq_cs(t->rseq);
if (ret)
return ret;
@@ -430,14 +422,44 @@ void __rseq_handle_notify_resume(struct
return;
/*
- * regs is NULL if and only if the caller is in a syscall path. Skip
- * fixup and leave rseq_cs as is so that rseq_sycall() will detect and
- * kill a misbehaving userspace on debug kernels.
+ * If invoked from hypervisors or IO-URING, then @regs is a NULL
+ * pointer, so fixup cannot be done. If the syscall which led to
+ * this invocation was invoked inside a critical section, then it
+ * will either end up in this code again or a possible violation of
+ * a syscall inside a critical region can only be detected by the
+ * debug code in rseq_syscall() in a debug enabled kernel.
*/
if (regs) {
- ret = rseq_ip_fixup(regs);
- if (unlikely(ret < 0))
- goto error;
+ /*
+ * Read and clear the event mask first. If the task was not
+ * preempted or migrated or a signal is on the way, there
+ * is no point in doing any of the heavy lifting here on
+ * production kernels. In that case TIF_NOTIFY_RESUME was
+ * raised by some other functionality.
+ *
+ * This is correct because the read/clear operation is
+ * guarded against scheduler preemption, which makes it CPU
+ * local atomic. If the task is preempted right after
+ * re-enabling preemption then TIF_NOTIFY_RESUME is set
+ * again and this function is invoked another time _before_
+ * the task is able to return to user mode.
+ *
+ * On a debug kernel, invoke the fixup code unconditionally
+ * with the result handed in to allow the detection of
+ * inconsistencies.
+ */
+ u32 event_mask;
+
+ scoped_guard(RSEQ_EVENT_GUARD) {
+ event_mask = t->rseq_event_mask;
+ t->rseq_event_mask = 0;
+ }
+
+ if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event_mask) {
+ ret = rseq_ip_fixup(regs, !!event_mask);
+ if (unlikely(ret < 0))
+ goto error;
+ }
}
if (unlikely(rseq_update_cpu_node_id(t)))
goto error;
From: Thomas Gleixner <tglx@linutronix.de>
Scrolling over tons of pointless
{
}
lines to find the actual code is annoying at best.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
---
include/linux/rseq.h | 47 ++++++++++++-----------------------------------
1 file changed, 12 insertions(+), 35 deletions(-)
---
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -101,44 +101,21 @@ static inline void rseq_execve(struct ta
t->rseq_event_mask = 0;
}
-#else
-
-static inline void rseq_set_notify_resume(struct task_struct *t)
-{
-}
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
- struct pt_regs *regs)
-{
-}
-static inline void rseq_signal_deliver(struct ksignal *ksig,
- struct pt_regs *regs)
-{
-}
-static inline void rseq_preempt(struct task_struct *t)
-{
-}
-static inline void rseq_migrate(struct task_struct *t)
-{
-}
-static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
-{
-}
-static inline void rseq_execve(struct task_struct *t)
-{
-}
+#else /* CONFIG_RSEQ */
+static inline void rseq_set_notify_resume(struct task_struct *t) { }
+static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { }
+static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
+static inline void rseq_preempt(struct task_struct *t) { }
+static inline void rseq_migrate(struct task_struct *t) { }
+static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { }
+static inline void rseq_execve(struct task_struct *t) { }
static inline void rseq_exit_to_user_mode(void) { }
-#endif
+#endif /* !CONFIG_RSEQ */
#ifdef CONFIG_DEBUG_RSEQ
-
void rseq_syscall(struct pt_regs *regs);
-
-#else
-
-static inline void rseq_syscall(struct pt_regs *regs)
-{
-}
-
-#endif
+#else /* CONFIG_DEBUG_RSEQ */
+static inline void rseq_syscall(struct pt_regs *regs) { }
+#endif /* !CONFIG_DEBUG_RSEQ */
#endif /* _LINUX_RSEQ_H */
Move the comment which documents the RSEQ algorithm to the top of the file,
so it does not create horrible diffs later when the actual implementation
is fed into the mincer.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
kernel/rseq.c | 119 ++++++++++++++++++++++++++++------------------------------
1 file changed, 59 insertions(+), 60 deletions(-)
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -8,6 +8,65 @@
* Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
*/
+/*
+ * Restartable sequences are a lightweight interface that allows
+ * user-level code to be executed atomically relative to scheduler
+ * preemption and signal delivery. Typically used for implementing
+ * per-cpu operations.
+ *
+ * It allows user-space to perform update operations on per-cpu data
+ * without requiring heavy-weight atomic operations.
+ *
+ * Detailed algorithm of rseq user-space assembly sequences:
+ *
+ * init(rseq_cs)
+ * cpu = TLS->rseq::cpu_id_start
+ * [1] TLS->rseq::rseq_cs = rseq_cs
+ * [start_ip] ----------------------------
+ * [2] if (cpu != TLS->rseq::cpu_id)
+ * goto abort_ip;
+ * [3] <last_instruction_in_cs>
+ * [post_commit_ip] ----------------------------
+ *
+ * The address of jump target abort_ip must be outside the critical
+ * region, i.e.:
+ *
+ * [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip]
+ *
+ * Steps [2]-[3] (inclusive) need to be a sequence of instructions in
+ * userspace that can handle being interrupted between any of those
+ * instructions, and then resumed to the abort_ip.
+ *
+ * 1. Userspace stores the address of the struct rseq_cs assembly
+ * block descriptor into the rseq_cs field of the registered
+ * struct rseq TLS area. This update is performed through a single
+ * store within the inline assembly instruction sequence.
+ * [start_ip]
+ *
+ * 2. Userspace tests to check whether the current cpu_id field match
+ * the cpu number loaded before start_ip, branching to abort_ip
+ * in case of a mismatch.
+ *
+ * If the sequence is preempted or interrupted by a signal
+ * at or after start_ip and before post_commit_ip, then the kernel
+ * clears TLS->__rseq_abi::rseq_cs, and sets the user-space return
+ * ip to abort_ip before returning to user-space, so the preempted
+ * execution resumes at abort_ip.
+ *
+ * 3. Userspace critical section final instruction before
+ * post_commit_ip is the commit. The critical section is
+ * self-terminating.
+ * [post_commit_ip]
+ *
+ * 4. <success>
+ *
+ * On failure at [2], or if interrupted by preempt or signal delivery
+ * between [1] and [3]:
+ *
+ * [abort_ip]
+ * F1. <failure>
+ */
+
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
@@ -98,66 +157,6 @@ static int rseq_validate_ro_fields(struc
unsafe_put_user(value, &t->rseq->field, error_label)
#endif
-/*
- *
- * Restartable sequences are a lightweight interface that allows
- * user-level code to be executed atomically relative to scheduler
- * preemption and signal delivery. Typically used for implementing
- * per-cpu operations.
- *
- * It allows user-space to perform update operations on per-cpu data
- * without requiring heavy-weight atomic operations.
- *
- * Detailed algorithm of rseq user-space assembly sequences:
- *
- * init(rseq_cs)
- * cpu = TLS->rseq::cpu_id_start
- * [1] TLS->rseq::rseq_cs = rseq_cs
- * [start_ip] ----------------------------
- * [2] if (cpu != TLS->rseq::cpu_id)
- * goto abort_ip;
- * [3] <last_instruction_in_cs>
- * [post_commit_ip] ----------------------------
- *
- * The address of jump target abort_ip must be outside the critical
- * region, i.e.:
- *
- * [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip]
- *
- * Steps [2]-[3] (inclusive) need to be a sequence of instructions in
- * userspace that can handle being interrupted between any of those
- * instructions, and then resumed to the abort_ip.
- *
- * 1. Userspace stores the address of the struct rseq_cs assembly
- * block descriptor into the rseq_cs field of the registered
- * struct rseq TLS area. This update is performed through a single
- * store within the inline assembly instruction sequence.
- * [start_ip]
- *
- * 2. Userspace tests to check whether the current cpu_id field match
- * the cpu number loaded before start_ip, branching to abort_ip
- * in case of a mismatch.
- *
- * If the sequence is preempted or interrupted by a signal
- * at or after start_ip and before post_commit_ip, then the kernel
- * clears TLS->__rseq_abi::rseq_cs, and sets the user-space return
- * ip to abort_ip before returning to user-space, so the preempted
- * execution resumes at abort_ip.
- *
- * 3. Userspace critical section final instruction before
- * post_commit_ip is the commit. The critical section is
- * self-terminating.
- * [post_commit_ip]
- *
- * 4. <success>
- *
- * On failure at [2], or if interrupted by preempt or signal delivery
- * between [1] and [3]:
- *
- * [abort_ip]
- * F1. <failure>
- */
-
static int rseq_update_cpu_node_id(struct task_struct *t)
{
struct rseq __user *rseq = t->rseq;
There is no point in this being visible in the resume_to_user_mode()
handling.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
include/linux/resume_user_mode.h | 2 +-
include/linux/rseq.h | 13 +++++++------
2 files changed, 8 insertions(+), 7 deletions(-)
--- a/include/linux/resume_user_mode.h
+++ b/include/linux/resume_user_mode.h
@@ -59,7 +59,7 @@ static inline void resume_user_mode_work
mem_cgroup_handle_over_high(GFP_KERNEL);
blkcg_maybe_throttle_current();
- rseq_handle_notify_resume(NULL, regs);
+ rseq_handle_notify_resume(regs);
}
#endif /* LINUX_RESUME_USER_MODE_H */
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -37,19 +37,20 @@ static inline void rseq_set_notify_resum
void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
- struct pt_regs *regs)
+static inline void rseq_handle_notify_resume(struct pt_regs *regs)
{
if (current->rseq)
- __rseq_handle_notify_resume(ksig, regs);
+ __rseq_handle_notify_resume(NULL, regs);
}
static inline void rseq_signal_deliver(struct ksignal *ksig,
struct pt_regs *regs)
{
- scoped_guard(RSEQ_EVENT_GUARD)
- __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
- rseq_handle_notify_resume(ksig, regs);
+ if (current->rseq) {
+ scoped_guard(RSEQ_EVENT_GUARD)
+ __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
+ __rseq_handle_notify_resume(ksig, regs);
+ }
}
/* rseq_preempt() requires preemption to be disabled. */
There is no point in reading the critical section element in the newly
registered user space RSEQ struct first in order to clear it.
Just clear it and be done with it.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
kernel/rseq.c | 10 +++-------
1 file changed, 3 insertions(+), 7 deletions(-)
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -492,11 +492,9 @@ void rseq_syscall(struct pt_regs *regs)
/*
* sys_rseq - setup restartable sequences for caller thread.
*/
-SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
- int, flags, u32, sig)
+SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
{
int ret;
- u64 rseq_cs;
if (flags & RSEQ_FLAG_UNREGISTER) {
if (flags & ~RSEQ_FLAG_UNREGISTER)
@@ -557,11 +555,9 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
* avoid a potential segfault on return to user-space. The proper thing
* to do would have been to fail the registration but this would break
* older libcs that reuse the rseq area for new threads without
- * clearing the fields.
+ * clearing the fields. Don't bother reading it, just reset it.
*/
- if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs))
- return -EFAULT;
- if (rseq_cs && clear_rseq_cs(rseq))
+ if (put_user_masked_u64(0UL, &rseq->rseq_cs))
return -EFAULT;
#ifdef CONFIG_DEBUG_RSEQ
Since commit 0190e4198e47 ("rseq: Deprecate RSEQ_CS_FLAG_NO_RESTART_ON_*
flags") the bits in task::rseq_event_mask are meaningless and just extra
work in terms of setting them individually.
Aside from that, the only relevant point where an event has to be raised is
context switch. Neither the CPU nor MM CID can change without going through
a context switch.
Collapse them all into a single boolean which simplifies the code a lot and
remove the pointless invocations which have been sprinkled all over the
place for no value.
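The resulting helper (from the diff below) replaces rseq_preempt() and
rseq_migrate():

static inline void rseq_sched_switch_event(struct task_struct *t)
{
        if (t->rseq) {
                /* Single flag, read and cleared in __rseq_handle_notify_resume() */
                t->rseq_event_pending = true;
                set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
        }
}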
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
---
V2: Reduce it to the sched switch event.
---
fs/exec.c | 2 -
include/linux/rseq.h | 66 +++++++++-------------------------------------
include/linux/sched.h | 10 +++---
include/uapi/linux/rseq.h | 21 ++++----------
kernel/rseq.c | 28 +++++++++++--------
kernel/sched/core.c | 5 ---
kernel/sched/membarrier.c | 8 ++---
7 files changed, 48 insertions(+), 92 deletions(-)
---
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1775,7 +1775,7 @@ static int bprm_execve(struct linux_binp
force_fatal_sig(SIGSEGV);
sched_mm_cid_after_execve(current);
- rseq_set_notify_resume(current);
+ rseq_sched_switch_event(current);
current->in_execve = 0;
return retval;
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -3,38 +3,8 @@
#define _LINUX_RSEQ_H
#ifdef CONFIG_RSEQ
-
-#include <linux/preempt.h>
#include <linux/sched.h>
-#ifdef CONFIG_MEMBARRIER
-# define RSEQ_EVENT_GUARD irq
-#else
-# define RSEQ_EVENT_GUARD preempt
-#endif
-
-/*
- * Map the event mask on the user-space ABI enum rseq_cs_flags
- * for direct mask checks.
- */
-enum rseq_event_mask_bits {
- RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
- RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
- RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
-};
-
-enum rseq_event_mask {
- RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT),
- RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT),
- RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT),
-};
-
-static inline void rseq_set_notify_resume(struct task_struct *t)
-{
- if (t->rseq)
- set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
-}
-
void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
static inline void rseq_handle_notify_resume(struct pt_regs *regs)
@@ -43,35 +13,27 @@ static inline void rseq_handle_notify_re
__rseq_handle_notify_resume(NULL, regs);
}
-static inline void rseq_signal_deliver(struct ksignal *ksig,
- struct pt_regs *regs)
+static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
{
if (current->rseq) {
- scoped_guard(RSEQ_EVENT_GUARD)
- __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
+ current->rseq_event_pending = true;
__rseq_handle_notify_resume(ksig, regs);
}
}
-/* rseq_preempt() requires preemption to be disabled. */
-static inline void rseq_preempt(struct task_struct *t)
+static inline void rseq_sched_switch_event(struct task_struct *t)
{
- __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
- rseq_set_notify_resume(t);
-}
-
-/* rseq_migrate() requires preemption to be disabled. */
-static inline void rseq_migrate(struct task_struct *t)
-{
- __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
- rseq_set_notify_resume(t);
+ if (t->rseq) {
+ t->rseq_event_pending = true;
+ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+ }
}
static __always_inline void rseq_exit_to_user_mode(void)
{
if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
- if (WARN_ON_ONCE(current->rseq && current->rseq_event_mask))
- current->rseq_event_mask = 0;
+ if (WARN_ON_ONCE(current->rseq && current->rseq_event_pending))
+ current->rseq_event_pending = false;
}
}
@@ -85,12 +47,12 @@ static inline void rseq_fork(struct task
t->rseq = NULL;
t->rseq_len = 0;
t->rseq_sig = 0;
- t->rseq_event_mask = 0;
+ t->rseq_event_pending = false;
} else {
t->rseq = current->rseq;
t->rseq_len = current->rseq_len;
t->rseq_sig = current->rseq_sig;
- t->rseq_event_mask = current->rseq_event_mask;
+ t->rseq_event_pending = current->rseq_event_pending;
}
}
@@ -99,15 +61,13 @@ static inline void rseq_execve(struct ta
t->rseq = NULL;
t->rseq_len = 0;
t->rseq_sig = 0;
- t->rseq_event_mask = 0;
+ t->rseq_event_pending = false;
}
#else /* CONFIG_RSEQ */
-static inline void rseq_set_notify_resume(struct task_struct *t) { }
static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
-static inline void rseq_preempt(struct task_struct *t) { }
-static inline void rseq_migrate(struct task_struct *t) { }
+static inline void rseq_sched_switch_event(struct task_struct *t) { }
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { }
static inline void rseq_execve(struct task_struct *t) { }
static inline void rseq_exit_to_user_mode(void) { }
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1401,14 +1401,14 @@ struct task_struct {
#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_RSEQ
- struct rseq __user *rseq;
- u32 rseq_len;
- u32 rseq_sig;
+ struct rseq __user *rseq;
+ u32 rseq_len;
+ u32 rseq_sig;
/*
- * RmW on rseq_event_mask must be performed atomically
+ * RmW on rseq_event_pending must be performed atomically
* with respect to preemption.
*/
- unsigned long rseq_event_mask;
+ bool rseq_event_pending;
# ifdef CONFIG_DEBUG_RSEQ
/*
* This is a place holder to save a copy of the rseq fields for
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -114,20 +114,13 @@ struct rseq {
/*
* Restartable sequences flags field.
*
- * This field should only be updated by the thread which
- * registered this data structure. Read by the kernel.
- * Mainly used for single-stepping through rseq critical sections
- * with debuggers.
- *
- * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
- * Inhibit instruction sequence block restart on preemption
- * for this thread.
- * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
- * Inhibit instruction sequence block restart on signal
- * delivery for this thread.
- * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
- * Inhibit instruction sequence block restart on migration for
- * this thread.
+ * This field was initialy intended to allow event masking for for
+ * single-stepping through rseq critical sections with debuggers.
+ * The kernel does not support this anymore and the relevant bits
+ * are checked for being always false:
+ * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
+ * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
+ * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
*/
__u32 flags;
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -78,6 +78,12 @@
#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>
+#ifdef CONFIG_MEMBARRIER
+# define RSEQ_EVENT_GUARD irq
+#else
+# define RSEQ_EVENT_GUARD preempt
+#endif
+
/* The original rseq structure size (including padding) is 32 bytes. */
#define ORIG_RSEQ_SIZE 32
@@ -430,11 +436,11 @@ void __rseq_handle_notify_resume(struct
*/
if (regs) {
/*
- * Read and clear the event mask first. If the task was not
- * preempted or migrated or a signal is on the way, there
- * is no point in doing any of the heavy lifting here on
- * production kernels. In that case TIF_NOTIFY_RESUME was
- * raised by some other functionality.
+ * Read and clear the event pending bit first. If the task
+ * was not preempted or migrated or a signal is on the way,
+ * there is no point in doing any of the heavy lifting here
+ * on production kernels. In that case TIF_NOTIFY_RESUME
+ * was raised by some other functionality.
*
* This is correct because the read/clear operation is
* guarded against scheduler preemption, which makes it CPU
@@ -447,15 +453,15 @@ void __rseq_handle_notify_resume(struct
* with the result handed in to allow the detection of
* inconsistencies.
*/
- u32 event_mask;
+ bool event;
scoped_guard(RSEQ_EVENT_GUARD) {
- event_mask = t->rseq_event_mask;
- t->rseq_event_mask = 0;
+ event = t->rseq_event_pending;
+ t->rseq_event_pending = false;
}
- if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event_mask) {
- ret = rseq_ip_fixup(regs, !!event_mask);
+ if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) {
+ ret = rseq_ip_fixup(regs, event);
if (unlikely(ret < 0))
goto error;
}
@@ -584,7 +590,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
* registered, ensure the cpu_id_start and cpu_id fields
* are updated before returning to user-space.
*/
- rseq_set_notify_resume(current);
+ rseq_sched_switch_event(current);
return 0;
}
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3364,7 +3364,6 @@ void set_task_cpu(struct task_struct *p,
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
- rseq_migrate(p);
sched_mm_cid_migrate_from(p);
perf_event_task_migrate(p);
}
@@ -4795,7 +4794,6 @@ int sched_cgroup_fork(struct task_struct
p->sched_task_group = tg;
}
#endif
- rseq_migrate(p);
/*
* We're setting the CPU for the first time, we don't migrate,
* so use __set_task_cpu().
@@ -4859,7 +4857,6 @@ void wake_up_new_task(struct task_struct
* as we're not fully set-up yet.
*/
p->recent_used_cpu = task_cpu(p);
- rseq_migrate(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
@@ -5153,7 +5150,7 @@ prepare_task_switch(struct rq *rq, struc
kcov_prepare_switch(prev);
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
- rseq_preempt(prev);
+ rseq_sched_switch_event(prev);
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -199,7 +199,7 @@ static void ipi_rseq(void *info)
* is negligible.
*/
smp_mb();
- rseq_preempt(current);
+ rseq_sched_switch_event(current);
}
static void ipi_sync_rq_state(void *info)
@@ -407,9 +407,9 @@ static int membarrier_private_expedited(
* membarrier, we will end up with some thread in the mm
* running without a core sync.
*
- * For RSEQ, don't rseq_preempt() the caller. User code
- * is not supposed to issue syscalls at all from inside an
- * rseq critical section.
+ * For RSEQ, don't invoke rseq_sched_switch_event() on the
+ * caller. User code is not supposed to issue syscalls at
+ * all from inside an rseq critical section.
*/
if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
preempt_disable();
On 2025-09-08 17:31, Thomas Gleixner wrote: > Since commit 0190e4198e47 ("rseq: Deprecate RSEQ_CS_FLAG_NO_RESTART_ON_* > flags") the bits in task::rseq_event_mask are meaningless and just extra > work in terms of setting them individually. > > Aside of that the only relevant point where an event has to be raised is > context switch. Neither the CPU nor MM CID can change without going through > a context switch. > > Collapse them all into a single boolean which simplifies the code a lot and > remove the pointless invocations which have been sprinkled all over the > place for no value. > > Signed-off-by: Thomas Gleixner <tglx@linutronix.de> > Cc: Peter Zijlstra <peterz@infradead.org> > Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> > Cc: "Paul E. McKenney" <paulmck@kernel.org> > Cc: Boqun Feng <boqun.feng@gmail.com> > > --- > V2: Reduce it to the sched switch event. > --- > fs/exec.c | 2 - > include/linux/rseq.h | 66 +++++++++------------------------------------- > include/linux/sched.h | 10 +++--- > include/uapi/linux/rseq.h | 21 ++++---------- > kernel/rseq.c | 28 +++++++++++-------- > kernel/sched/core.c | 5 --- > kernel/sched/membarrier.c | 8 ++--- > 7 files changed, 48 insertions(+), 92 deletions(-) > --- > [...] > --- a/include/uapi/linux/rseq.h > +++ b/include/uapi/linux/rseq.h > @@ -114,20 +114,13 @@ struct rseq { > /* > * Restartable sequences flags field. > * > - * This field should only be updated by the thread which > - * registered this data structure. Read by the kernel. > - * Mainly used for single-stepping through rseq critical sections > - * with debuggers. > - * > - * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT > - * Inhibit instruction sequence block restart on preemption > - * for this thread. > - * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL > - * Inhibit instruction sequence block restart on signal > - * delivery for this thread. > - * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE > - * Inhibit instruction sequence block restart on migration for > - * this thread. > + * This field was initialy intended to allow event masking for for initialy -> initially for for -> for Other than those nits: Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com
Hypervisors invoke resume_user_mode_work() before entering the guest, which
clears TIF_NOTIFY_RESUME. The @regs argument is NULL as there is no user
space context available to them, so the rseq notify handler skips
inspecting the critical section, but updates the CPU/MM CID values
unconditionally so that a potentially pending rseq event is not lost on the
way to user space.
This is a pointless exercise as the task might be rescheduled before
actually returning to user space and it creates unnecessary work in the
vcpu_run() loops.
It's way more efficient to ignore that invocation based on @regs == NULL
and let the hypervisors re-raise TIF_NOTIFY_RESUME after returning from the
vcpu_run() loop before returning from the ioctl().
This ensures that a pending RSEQ update is not lost and the IDs are updated
before returning to user space.
Once the RSEQ handling is decoupled from TIF_NOTIFY_RESUME, this turns into
a NOOP.
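A rough sketch of the intended flow; rseq_virt_userspace_exit() is the
helper added below, while the loop and the other helpers in it are
placeholders rather than real KVM code:

	static long vcpu_run_loop(struct kvm_vcpu *vcpu)
	{
		long r;

		do {
			/*
			 * Pre guest entry work may call resume_user_mode_work()
			 * with a NULL regs pointer, which now skips the rseq
			 * handling entirely.
			 */
			handle_pre_entry_work(vcpu);
			r = enter_guest(vcpu);
		} while (vcpu_should_keep_running(vcpu, r));

		/* Ensure a still pending rseq event reaches user space */
		rseq_virt_userspace_exit();
		return r;
	}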
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Wei Liu <wei.liu@kernel.org>
Cc: Dexuan Cui <decui@microsoft.com>
---
V3: Add the missing rseq.h include for HV - 0-day
---
drivers/hv/mshv_root_main.c | 3 +
include/linux/rseq.h | 17 +++++++++
kernel/rseq.c | 76 +++++++++++++++++++++++---------------------
virt/kvm/kvm_main.c | 3 +
4 files changed, 63 insertions(+), 36 deletions(-)
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -28,6 +28,7 @@
#include <linux/crash_dump.h>
#include <linux/panic_notifier.h>
#include <linux/vmalloc.h>
+#include <linux/rseq.h>
#include "mshv_eventfd.h"
#include "mshv.h"
@@ -585,6 +586,8 @@ static long mshv_run_vp_with_root_schedu
}
} while (!vp->run.flags.intercept_suspend);
+ rseq_virt_userspace_exit();
+
return ret;
}
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -38,6 +38,22 @@ static __always_inline void rseq_exit_to
}
/*
+ * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
+ * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in
+ * that case just to do it eventually again before returning to user space,
+ * the entry resume_user_mode_work() invocation is ignored as the register
+ * argument is NULL.
+ *
+ * After returning from guest mode, they have to invoke this function to
+ * re-raise TIF_NOTIFY_RESUME if necessary.
+ */
+static inline void rseq_virt_userspace_exit(void)
+{
+ if (current->rseq_event_pending)
+ set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+}
+
+/*
* If parent process has a registered restartable sequences area, the
* child inherits. Unregister rseq for a clone with CLONE_VM set.
*/
@@ -68,6 +84,7 @@ static inline void rseq_execve(struct ta
static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
+static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { }
static inline void rseq_execve(struct task_struct *t) { }
static inline void rseq_exit_to_user_mode(void) { }
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -422,50 +422,54 @@ void __rseq_handle_notify_resume(struct
{
struct task_struct *t = current;
int ret, sig;
+ bool event;
+
+ /*
+ * If invoked from hypervisors before entering the guest via
+ * resume_user_mode_work(), then @regs is a NULL pointer.
+ *
+ * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
+ * it before returning from the ioctl() to user space when
+ * rseq_event.sched_switch is set.
+ *
+ * So it's safe to ignore here instead of pointlessly updating it
+ * in the vcpu_run() loop.
+ */
+ if (!regs)
+ return;
if (unlikely(t->flags & PF_EXITING))
return;
/*
- * If invoked from hypervisors or IO-URING, then @regs is a NULL
- * pointer, so fixup cannot be done. If the syscall which led to
- * this invocation was invoked inside a critical section, then it
- * will either end up in this code again or a possible violation of
- * a syscall inside a critical region can only be detected by the
- * debug code in rseq_syscall() in a debug enabled kernel.
+ * Read and clear the event pending bit first. If the task
+ * was not preempted or migrated or a signal is on the way,
+ * there is no point in doing any of the heavy lifting here
+ * on production kernels. In that case TIF_NOTIFY_RESUME
+ * was raised by some other functionality.
+ *
+ * This is correct because the read/clear operation is
+ * guarded against scheduler preemption, which makes it CPU
+ * local atomic. If the task is preempted right after
+ * re-enabling preemption then TIF_NOTIFY_RESUME is set
+ * again and this function is invoked another time _before_
+ * the task is able to return to user mode.
+ *
+ * On a debug kernel, invoke the fixup code unconditionally
+ * with the result handed in to allow the detection of
+ * inconsistencies.
*/
- if (regs) {
- /*
- * Read and clear the event pending bit first. If the task
- * was not preempted or migrated or a signal is on the way,
- * there is no point in doing any of the heavy lifting here
- * on production kernels. In that case TIF_NOTIFY_RESUME
- * was raised by some other functionality.
- *
- * This is correct because the read/clear operation is
- * guarded against scheduler preemption, which makes it CPU
- * local atomic. If the task is preempted right after
- * re-enabling preemption then TIF_NOTIFY_RESUME is set
- * again and this function is invoked another time _before_
- * the task is able to return to user mode.
- *
- * On a debug kernel, invoke the fixup code unconditionally
- * with the result handed in to allow the detection of
- * inconsistencies.
- */
- bool event;
-
- scoped_guard(RSEQ_EVENT_GUARD) {
- event = t->rseq_event_pending;
- t->rseq_event_pending = false;
- }
+ scoped_guard(RSEQ_EVENT_GUARD) {
+ event = t->rseq_event_pending;
+ t->rseq_event_pending = false;
+ }
- if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) {
- ret = rseq_ip_fixup(regs, event);
- if (unlikely(ret < 0))
- goto error;
- }
+ if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) {
+ ret = rseq_ip_fixup(regs, event);
+ if (unlikely(ret < 0))
+ goto error;
}
+
if (unlikely(rseq_update_cpu_node_id(t)))
goto error;
return;
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -49,6 +49,7 @@
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/suspend.h>
+#include <linux/rseq.h>
#include <asm/processor.h>
#include <asm/ioctl.h>
@@ -4466,6 +4467,8 @@ static long kvm_vcpu_ioctl(struct file *
r = kvm_arch_vcpu_ioctl_run(vcpu);
vcpu->wants_to_run = false;
+ rseq_virt_userspace_exit();
+
trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
break;
}
On 2025-09-08 17:31, Thomas Gleixner wrote: > Hypervisors invoke resume_user_mode_work() before entering the guest, which > clears TIF_NOTIFY_RESUME. The @regs argument is NULL as there is no user > space context available to them, so the rseq notify handler skips > inspecting the critical section, but updates the CPU/MM CID values > unconditionally so that the eventual pending rseq event is not lost on the > way to user space. > > This is a pointless exercise as the task might be rescheduled before > actually returning to user space and it creates unnecessary work in the > vcpu_run() loops. > > It's way more efficient to ignore that invocation based on @regs == NULL > and let the hypervisors re-raise TIF_NOTIFY_RESUME after returning from the > vcpu_run() loop before returning from the ioctl(). > > This ensures that a pending RSEQ update is not lost and the IDs are updated > before returning to user space. > > Once the RSEQ handling is decoupled from TIF_NOTIFY_RESUME, this turns into > a NOOP. Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> > > Signed-off-by: Thomas Gleixner <tglx@linutronix.de> > Cc: Paolo Bonzini <pbonzini@redhat.com> > Cc: Sean Christopherson <seanjc@google.com> > Cc: Wei Liu <wei.liu@kernel.org> > Cc: Dexuan Cui <decui@microsoft.com> > --- > V3: Add the missing rseq.h include for HV - 0-day > --- > drivers/hv/mshv_root_main.c | 3 + > include/linux/rseq.h | 17 +++++++++ > kernel/rseq.c | 76 +++++++++++++++++++++++--------------------- > virt/kvm/kvm_main.c | 3 + > 4 files changed, 63 insertions(+), 36 deletions(-) > > --- a/drivers/hv/mshv_root_main.c > +++ b/drivers/hv/mshv_root_main.c > @@ -28,6 +28,7 @@ > #include <linux/crash_dump.h> > #include <linux/panic_notifier.h> > #include <linux/vmalloc.h> > +#include <linux/rseq.h> > > #include "mshv_eventfd.h" > #include "mshv.h" > @@ -585,6 +586,8 @@ static long mshv_run_vp_with_root_schedu > } > } while (!vp->run.flags.intercept_suspend); > > + rseq_virt_userspace_exit(); > + > return ret; > } > > --- a/include/linux/rseq.h > +++ b/include/linux/rseq.h > @@ -38,6 +38,22 @@ static __always_inline void rseq_exit_to > } > > /* > + * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode, > + * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in > + * that case just to do it eventually again before returning to user space, > + * the entry resume_user_mode_work() invocation is ignored as the register > + * argument is NULL. > + * > + * After returning from guest mode, they have to invoke this function to > + * re-raise TIF_NOTIFY_RESUME if necessary. > + */ > +static inline void rseq_virt_userspace_exit(void) > +{ > + if (current->rseq_event_pending) > + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); > +} > + > +/* > * If parent process has a registered restartable sequences area, the > * child inherits. Unregister rseq for a clone with CLONE_VM set. 
> */ > @@ -68,6 +84,7 @@ static inline void rseq_execve(struct ta > static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { } > static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } > static inline void rseq_sched_switch_event(struct task_struct *t) { } > +static inline void rseq_virt_userspace_exit(void) { } > static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { } > static inline void rseq_execve(struct task_struct *t) { } > static inline void rseq_exit_to_user_mode(void) { } > --- a/kernel/rseq.c > +++ b/kernel/rseq.c > @@ -422,50 +422,54 @@ void __rseq_handle_notify_resume(struct > { > struct task_struct *t = current; > int ret, sig; > + bool event; > + > + /* > + * If invoked from hypervisors before entering the guest via > + * resume_user_mode_work(), then @regs is a NULL pointer. > + * > + * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises > + * it before returning from the ioctl() to user space when > + * rseq_event.sched_switch is set. > + * > + * So it's safe to ignore here instead of pointlessly updating it > + * in the vcpu_run() loop. > + */ > + if (!regs) > + return; > > if (unlikely(t->flags & PF_EXITING)) > return; > > /* > - * If invoked from hypervisors or IO-URING, then @regs is a NULL > - * pointer, so fixup cannot be done. If the syscall which led to > - * this invocation was invoked inside a critical section, then it > - * will either end up in this code again or a possible violation of > - * a syscall inside a critical region can only be detected by the > - * debug code in rseq_syscall() in a debug enabled kernel. > + * Read and clear the event pending bit first. If the task > + * was not preempted or migrated or a signal is on the way, > + * there is no point in doing any of the heavy lifting here > + * on production kernels. In that case TIF_NOTIFY_RESUME > + * was raised by some other functionality. > + * > + * This is correct because the read/clear operation is > + * guarded against scheduler preemption, which makes it CPU > + * local atomic. If the task is preempted right after > + * re-enabling preemption then TIF_NOTIFY_RESUME is set > + * again and this function is invoked another time _before_ > + * the task is able to return to user mode. > + * > + * On a debug kernel, invoke the fixup code unconditionally > + * with the result handed in to allow the detection of > + * inconsistencies. > */ > - if (regs) { > - /* > - * Read and clear the event pending bit first. If the task > - * was not preempted or migrated or a signal is on the way, > - * there is no point in doing any of the heavy lifting here > - * on production kernels. In that case TIF_NOTIFY_RESUME > - * was raised by some other functionality. > - * > - * This is correct because the read/clear operation is > - * guarded against scheduler preemption, which makes it CPU > - * local atomic. If the task is preempted right after > - * re-enabling preemption then TIF_NOTIFY_RESUME is set > - * again and this function is invoked another time _before_ > - * the task is able to return to user mode. > - * > - * On a debug kernel, invoke the fixup code unconditionally > - * with the result handed in to allow the detection of > - * inconsistencies. 
> - */ > - bool event; > - > - scoped_guard(RSEQ_EVENT_GUARD) { > - event = t->rseq_event_pending; > - t->rseq_event_pending = false; > - } > + scoped_guard(RSEQ_EVENT_GUARD) { > + event = t->rseq_event_pending; > + t->rseq_event_pending = false; > + } > > - if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) { > - ret = rseq_ip_fixup(regs, event); > - if (unlikely(ret < 0)) > - goto error; > - } > + if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) { > + ret = rseq_ip_fixup(regs, event); > + if (unlikely(ret < 0)) > + goto error; > } > + > if (unlikely(rseq_update_cpu_node_id(t))) > goto error; > return; > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -49,6 +49,7 @@ > #include <linux/lockdep.h> > #include <linux/kthread.h> > #include <linux/suspend.h> > +#include <linux/rseq.h> > > #include <asm/processor.h> > #include <asm/ioctl.h> > @@ -4466,6 +4467,8 @@ static long kvm_vcpu_ioctl(struct file * > r = kvm_arch_vcpu_ioctl_run(vcpu); > vcpu->wants_to_run = false; > > + rseq_virt_userspace_exit(); > + > trace_kvm_userspace_exit(vcpu->run->exit_reason, r); > break; > } > -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com
On Mon, Sep 08, 2025, Thomas Gleixner wrote: > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -49,6 +49,7 @@ > #include <linux/lockdep.h> > #include <linux/kthread.h> > #include <linux/suspend.h> > +#include <linux/rseq.h> > > #include <asm/processor.h> > #include <asm/ioctl.h> > @@ -4466,6 +4467,8 @@ static long kvm_vcpu_ioctl(struct file * > r = kvm_arch_vcpu_ioctl_run(vcpu); > vcpu->wants_to_run = false; > Finally had a lightbulb moment as to how to eat this hack while not stonewalling the entire series. Can you add something like: /* * FIXME: Remove this hack once all KVM architectures support * the generic TIF bits, i.e. a dedicated TIF_RSEQ. */ to discourage further abuse, and to make it clear that such ugliness isn't anyone's first choice. With that, Acked-by: Sean Christopherson <seanjc@google.com> > + rseq_virt_userspace_exit(); > + > trace_kvm_userspace_exit(vcpu->run->exit_reason, r); > break; > } >
On Mon, Sep 08 2025 at 17:00, Sean Christopherson wrote: > On Mon, Sep 08, 2025, Thomas Gleixner wrote: >> --- a/virt/kvm/kvm_main.c >> +++ b/virt/kvm/kvm_main.c >> @@ -49,6 +49,7 @@ >> #include <linux/lockdep.h> >> #include <linux/kthread.h> >> #include <linux/suspend.h> >> +#include <linux/rseq.h> >> >> #include <asm/processor.h> >> #include <asm/ioctl.h> >> @@ -4466,6 +4467,8 @@ static long kvm_vcpu_ioctl(struct file * >> r = kvm_arch_vcpu_ioctl_run(vcpu); >> vcpu->wants_to_run = false; >> > > Finally had a lightbulb moment as to how to eat this hack while not stonewalling > the entire series. Can you add something like: > > /* > * FIXME: Remove this hack once all KVM architectures support > * the generic TIF bits, i.e. a dedicated TIF_RSEQ. > */ > > to discourage further abuse, and to make it clear that such ugliness isn't anyone's > first choice. With that, Fair enough.
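With that folded in, the KVM hunk presumably ends up looking like this
(a sketch combining the hunk above with the suggested comment, not a
verbatim quote of a later revision):

	r = kvm_arch_vcpu_ioctl_run(vcpu);
	vcpu->wants_to_run = false;

	/*
	 * FIXME: Remove this hack once all KVM architectures support
	 * the generic TIF bits, i.e. a dedicated TIF_RSEQ.
	 */
	rseq_virt_userspace_exit();

	trace_kvm_userspace_exit(vcpu->run->exit_reason, r);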
There is no need to update these values unconditionally if there is no
event pending.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/rseq.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -464,11 +464,12 @@ void __rseq_handle_notify_resume(struct
t->rseq_event_pending = false;
}
- if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) {
- ret = rseq_ip_fixup(regs, event);
- if (unlikely(ret < 0))
- goto error;
- }
+ if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event)
+ return;
+
+ ret = rseq_ip_fixup(regs, event);
+ if (unlikely(ret < 0))
+ goto error;
if (unlikely(rseq_update_cpu_node_id(t)))
goto error;
On 2025-09-08 17:31, Thomas Gleixner wrote: > There is no need to update these values unconditionally if there is no > event pending. > > Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> > > --- > kernel/rseq.c | 11 ++++++----- > 1 file changed, 6 insertions(+), 5 deletions(-) > > --- a/kernel/rseq.c > +++ b/kernel/rseq.c > @@ -464,11 +464,12 @@ void __rseq_handle_notify_resume(struct > t->rseq_event_pending = false; > } > > - if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) { > - ret = rseq_ip_fixup(regs, event); > - if (unlikely(ret < 0)) > - goto error; > - } > + if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event) > + return; > + > + ret = rseq_ip_fixup(regs, event); > + if (unlikely(ret < 0)) > + goto error; > > if (unlikely(rseq_update_cpu_node_id(t))) > goto error; > > -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com
In preparation for a major rewrite of this code, provide a data structure
for rseq management.
Put all the rseq related data into it (except for the debug part), which
allows simplifying fork/execve by using memset() and memcpy() instead of
adding new fields to initialize over and over.
Create a storage struct for event management as well and put the
sched_switch event and an indicator for RSEQ on a task into it as a
start. That uses a union, which allows masking and clearing the whole lot
efficiently.
The indicators are explicitly not a bit field. Bit fields generate abysmal
code.
The boolean members are defined as u8, which actually guarantees that they
fit into a single byte. There seem to be strange architecture ABIs which
need more than 8 bits for a boolean.
The has_rseq member is redundant vs. task::rseq, but it turns out that
boolean operations and quick checks on the union generate better code than
fiddling with separate entities and data types.
This struct will be extended over time to carry more information.
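As a small illustration of why the compound members pay off (field names
as in struct rseq_event below; rseq_slowpath() is a made-up placeholder):

	/* A single 16-bit load tests all event bits at once */
	if (current->rseq.event.events)
		rseq_slowpath(current);

	/*
	 * A single 32-bit store wipes the events and has_rseq together,
	 * as done when the state is reset on unregister/execve.
	 */
	current->rseq.event.all = 0;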
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
V4: Move all rseq related data into a dedicated umbrella struct
---
include/linux/rseq.h | 48 +++++++++++++++-------------------
include/linux/rseq_types.h | 51 ++++++++++++++++++++++++++++++++++++
include/linux/sched.h | 14 ++--------
kernel/ptrace.c | 6 ++--
kernel/rseq.c | 63 ++++++++++++++++++++++-----------------------
5 files changed, 110 insertions(+), 72 deletions(-)
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -9,22 +9,22 @@ void __rseq_handle_notify_resume(struct
static inline void rseq_handle_notify_resume(struct pt_regs *regs)
{
- if (current->rseq)
+ if (current->rseq.event.has_rseq)
__rseq_handle_notify_resume(NULL, regs);
}
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
{
- if (current->rseq) {
- current->rseq_event_pending = true;
+ if (current->rseq.event.has_rseq) {
+ current->rseq.event.sched_switch = true;
__rseq_handle_notify_resume(ksig, regs);
}
}
static inline void rseq_sched_switch_event(struct task_struct *t)
{
- if (t->rseq) {
- t->rseq_event_pending = true;
+ if (t->rseq.event.has_rseq) {
+ t->rseq.event.sched_switch = true;
set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
}
}
@@ -32,8 +32,9 @@ static inline void rseq_sched_switch_eve
static __always_inline void rseq_exit_to_user_mode(void)
{
if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
- if (WARN_ON_ONCE(current->rseq && current->rseq_event_pending))
- current->rseq_event_pending = false;
+ if (WARN_ON_ONCE(current->rseq.event.has_rseq &&
+ current->rseq.event.events))
+ current->rseq.event.events = 0;
}
}
@@ -49,35 +50,30 @@ static __always_inline void rseq_exit_to
*/
static inline void rseq_virt_userspace_exit(void)
{
- if (current->rseq_event_pending)
+ if (current->rseq.event.sched_switch)
set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
}
+static inline void rseq_reset(struct task_struct *t)
+{
+ memset(&t->rseq, 0, sizeof(t->rseq));
+}
+
+static inline void rseq_execve(struct task_struct *t)
+{
+ rseq_reset(t);
+}
+
/*
* If parent process has a registered restartable sequences area, the
* child inherits. Unregister rseq for a clone with CLONE_VM set.
*/
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
- if (clone_flags & CLONE_VM) {
- t->rseq = NULL;
- t->rseq_len = 0;
- t->rseq_sig = 0;
- t->rseq_event_pending = false;
- } else {
+ if (clone_flags & CLONE_VM)
+ rseq_reset(t);
+ else
t->rseq = current->rseq;
- t->rseq_len = current->rseq_len;
- t->rseq_sig = current->rseq_sig;
- t->rseq_event_pending = current->rseq_event_pending;
- }
-}
-
-static inline void rseq_execve(struct task_struct *t)
-{
- t->rseq = NULL;
- t->rseq_len = 0;
- t->rseq_sig = 0;
- t->rseq_event_pending = false;
}
#else /* CONFIG_RSEQ */
--- /dev/null
+++ b/include/linux/rseq_types.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RSEQ_TYPES_H
+#define _LINUX_RSEQ_TYPES_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_RSEQ
+struct rseq;
+
+/**
+ * struct rseq_event - Storage for rseq related event management
+ * @all: Compound to initialize and clear the data efficiently
+ * @events: Compound to access events with a single load/store
+ * @sched_switch: True if the task was scheduled out
+ * @has_rseq: True if the task has a rseq pointer installed
+ */
+struct rseq_event {
+ union {
+ u32 all;
+ struct {
+ union {
+ u16 events;
+ struct {
+ u8 sched_switch;
+ };
+ };
+
+ u8 has_rseq;
+ };
+ };
+};
+
+/**
+ * struct rseq_data - Storage for all rseq related data
+ * @usrptr: Pointer to the registered user space RSEQ memory
+ * @len: Length of the RSEQ region
+ * @sig: Signature of critical section abort IPs
+ * @event: Storage for event management
+ */
+struct rseq_data {
+ struct rseq __user *usrptr;
+ u32 len;
+ u32 sig;
+ struct rseq_event event;
+};
+
+#else /* CONFIG_RSEQ */
+struct rseq_data { };
+#endif /* !CONFIG_RSEQ */
+
+#endif
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -41,6 +41,7 @@
#include <linux/task_io_accounting.h>
#include <linux/posix-timers_types.h>
#include <linux/restart_block.h>
+#include <linux/rseq_types.h>
#include <uapi/linux/rseq.h>
#include <linux/seqlock_types.h>
#include <linux/kcsan.h>
@@ -1400,16 +1401,8 @@ struct task_struct {
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
-#ifdef CONFIG_RSEQ
- struct rseq __user *rseq;
- u32 rseq_len;
- u32 rseq_sig;
- /*
- * RmW on rseq_event_pending must be performed atomically
- * with respect to preemption.
- */
- bool rseq_event_pending;
-# ifdef CONFIG_DEBUG_RSEQ
+ struct rseq_data rseq;
+#ifdef CONFIG_DEBUG_RSEQ
/*
* This is a place holder to save a copy of the rseq fields for
* validation of read-only fields. The struct rseq has a
@@ -1417,7 +1410,6 @@ struct task_struct {
* directly. Reserve a size large enough for the known fields.
*/
char rseq_fields[sizeof(struct rseq)];
-# endif
#endif
#ifdef CONFIG_SCHED_MM_CID
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -793,9 +793,9 @@ static long ptrace_get_rseq_configuratio
unsigned long size, void __user *data)
{
struct ptrace_rseq_configuration conf = {
- .rseq_abi_pointer = (u64)(uintptr_t)task->rseq,
- .rseq_abi_size = task->rseq_len,
- .signature = task->rseq_sig,
+ .rseq_abi_pointer = (u64)(uintptr_t)task->rseq.usrptr,
+ .rseq_abi_size = task->rseq.len,
+ .signature = task->rseq.sig,
.flags = 0,
};
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -103,13 +103,13 @@ static int rseq_validate_ro_fields(struc
DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
u32 cpu_id_start, cpu_id, node_id, mm_cid;
- struct rseq __user *rseq = t->rseq;
+ struct rseq __user *rseq = t->rseq.usrptr;
/*
* Validate fields which are required to be read-only by
* user-space.
*/
- if (!user_read_access_begin(rseq, t->rseq_len))
+ if (!user_read_access_begin(rseq, t->rseq.len))
goto efault;
unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
@@ -147,10 +147,10 @@ static int rseq_validate_ro_fields(struc
* Update an rseq field and its in-kernel copy in lock-step to keep a coherent
* state.
*/
-#define rseq_unsafe_put_user(t, value, field, error_label) \
- do { \
- unsafe_put_user(value, &t->rseq->field, error_label); \
- rseq_kernel_fields(t)->field = value; \
+#define rseq_unsafe_put_user(t, value, field, error_label) \
+ do { \
+ unsafe_put_user(value, &t->rseq.usrptr->field, error_label); \
+ rseq_kernel_fields(t)->field = value; \
} while (0)
#else
@@ -160,12 +160,12 @@ static int rseq_validate_ro_fields(struc
}
#define rseq_unsafe_put_user(t, value, field, error_label) \
- unsafe_put_user(value, &t->rseq->field, error_label)
+ unsafe_put_user(value, &t->rseq.usrptr->field, error_label)
#endif
static int rseq_update_cpu_node_id(struct task_struct *t)
{
- struct rseq __user *rseq = t->rseq;
+ struct rseq __user *rseq = t->rseq.usrptr;
u32 cpu_id = raw_smp_processor_id();
u32 node_id = cpu_to_node(cpu_id);
u32 mm_cid = task_mm_cid(t);
@@ -176,7 +176,7 @@ static int rseq_update_cpu_node_id(struc
if (rseq_validate_ro_fields(t))
goto efault;
WARN_ON_ONCE((int) mm_cid < 0);
- if (!user_write_access_begin(rseq, t->rseq_len))
+ if (!user_write_access_begin(rseq, t->rseq.len))
goto efault;
rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end);
@@ -201,7 +201,7 @@ static int rseq_update_cpu_node_id(struc
static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
{
- struct rseq __user *rseq = t->rseq;
+ struct rseq __user *rseq = t->rseq.usrptr;
u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
mm_cid = 0;
@@ -211,7 +211,7 @@ static int rseq_reset_rseq_cpu_node_id(s
if (rseq_validate_ro_fields(t))
goto efault;
- if (!user_write_access_begin(rseq, t->rseq_len))
+ if (!user_write_access_begin(rseq, t->rseq.len))
goto efault;
/*
@@ -272,7 +272,7 @@ static int rseq_get_rseq_cs(struct task_
u32 sig;
int ret;
- ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr);
+ ret = rseq_get_rseq_cs_ptr_val(t->rseq.usrptr, &ptr);
if (ret)
return ret;
@@ -305,10 +305,10 @@ static int rseq_get_rseq_cs(struct task_
if (ret)
return ret;
- if (current->rseq_sig != sig) {
+ if (current->rseq.sig != sig) {
printk_ratelimited(KERN_WARNING
"Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
- sig, current->rseq_sig, current->pid, usig);
+ sig, current->rseq.sig, current->pid, usig);
return -EINVAL;
}
return 0;
@@ -338,7 +338,7 @@ static int rseq_check_flags(struct task_
return -EINVAL;
/* Get thread flags. */
- ret = get_user(flags, &t->rseq->flags);
+ ret = get_user(flags, &t->rseq.usrptr->flags);
if (ret)
return ret;
@@ -392,13 +392,13 @@ static int rseq_ip_fixup(struct pt_regs
* Clear the rseq_cs pointer and return.
*/
if (!in_rseq_cs(ip, &rseq_cs))
- return clear_rseq_cs(t->rseq);
+ return clear_rseq_cs(t->rseq.usrptr);
ret = rseq_check_flags(t, rseq_cs.flags);
if (ret < 0)
return ret;
if (!abort)
return 0;
- ret = clear_rseq_cs(t->rseq);
+ ret = clear_rseq_cs(t->rseq.usrptr);
if (ret)
return ret;
trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
@@ -460,8 +460,8 @@ void __rseq_handle_notify_resume(struct
* inconsistencies.
*/
scoped_guard(RSEQ_EVENT_GUARD) {
- event = t->rseq_event_pending;
- t->rseq_event_pending = false;
+ event = t->rseq.event.sched_switch;
+ t->rseq.event.sched_switch = false;
}
if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event)
@@ -492,7 +492,7 @@ void rseq_syscall(struct pt_regs *regs)
struct task_struct *t = current;
struct rseq_cs rseq_cs;
- if (!t->rseq)
+ if (!t->rseq.usrptr)
return;
if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
force_sig(SIGSEGV);
@@ -511,33 +511,31 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
if (flags & ~RSEQ_FLAG_UNREGISTER)
return -EINVAL;
/* Unregister rseq for current thread. */
- if (current->rseq != rseq || !current->rseq)
+ if (current->rseq.usrptr != rseq || !current->rseq.usrptr)
return -EINVAL;
- if (rseq_len != current->rseq_len)
+ if (rseq_len != current->rseq.len)
return -EINVAL;
- if (current->rseq_sig != sig)
+ if (current->rseq.sig != sig)
return -EPERM;
ret = rseq_reset_rseq_cpu_node_id(current);
if (ret)
return ret;
- current->rseq = NULL;
- current->rseq_sig = 0;
- current->rseq_len = 0;
+ rseq_reset(current);
return 0;
}
if (unlikely(flags))
return -EINVAL;
- if (current->rseq) {
+ if (current->rseq.usrptr) {
/*
* If rseq is already registered, check whether
* the provided address differs from the prior
* one.
*/
- if (current->rseq != rseq || rseq_len != current->rseq_len)
+ if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len)
return -EINVAL;
- if (current->rseq_sig != sig)
+ if (current->rseq.sig != sig)
return -EPERM;
/* Already registered. */
return -EBUSY;
@@ -586,15 +584,16 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
* Activate the registration by setting the rseq area address, length
* and signature in the task struct.
*/
- current->rseq = rseq;
- current->rseq_len = rseq_len;
- current->rseq_sig = sig;
+ current->rseq.usrptr = rseq;
+ current->rseq.len = rseq_len;
+ current->rseq.sig = sig;
/*
* If rseq was previously inactive, and has just been
* registered, ensure the cpu_id_start and cpu_id fields
* are updated before returning to user-space.
*/
+ current->rseq.event.has_rseq = true;
rseq_sched_switch_event(current);
return 0;
On 2025-09-08 17:31, Thomas Gleixner wrote: > In preparation for a major rewrite of this code, provide a data structure > for rseq management. > > Put all the rseq related data into it (except for the debug part), which > allows to simplify fork/execve by using memset() and memcpy() instead of > adding new fields to initialize over and over. > > Create a storage struct for event management as well and put the > sched_switch event and a indicator for RSEQ on a task into it as a > start. That uses a union, which allows to mask and clear the whole lot > efficiently. > > The indicators are explicitly not a bit field. Bit fields generate abysmal > code. > > The boolean members are defined as u8 as that actually guarantees that it > fits. There seem to be strange architecture ABIs which need more than 8 bits seem -> seems Other than this nit: Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> > for a boolean. > > The has_rseq member is redundant vs. task::rseq, but it turns out that > boolean operations and quick checks on the union generate better code than > fiddling with separate entities and data types. > > This struct will be extended over time to carry more information. > > Signed-off-by: Thomas Gleixner <tglx@linutronix.de> > --- > V4: Move all rseq related data into a dedicated umbrella struct > --- > include/linux/rseq.h | 48 +++++++++++++++------------------- > include/linux/rseq_types.h | 51 ++++++++++++++++++++++++++++++++++++ > include/linux/sched.h | 14 ++-------- > kernel/ptrace.c | 6 ++-- > kernel/rseq.c | 63 ++++++++++++++++++++++----------------------- > 5 files changed, 110 insertions(+), 72 deletions(-) > > --- a/include/linux/rseq.h > +++ b/include/linux/rseq.h > @@ -9,22 +9,22 @@ void __rseq_handle_notify_resume(struct > > static inline void rseq_handle_notify_resume(struct pt_regs *regs) > { > - if (current->rseq) > + if (current->rseq.event.has_rseq) > __rseq_handle_notify_resume(NULL, regs); > } > > static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) > { > - if (current->rseq) { > - current->rseq_event_pending = true; > + if (current->rseq.event.has_rseq) { > + current->rseq.event.sched_switch = true; > __rseq_handle_notify_resume(ksig, regs); > } > } > > static inline void rseq_sched_switch_event(struct task_struct *t) > { > - if (t->rseq) { > - t->rseq_event_pending = true; > + if (t->rseq.event.has_rseq) { > + t->rseq.event.sched_switch = true; > set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); > } > } > @@ -32,8 +32,9 @@ static inline void rseq_sched_switch_eve > static __always_inline void rseq_exit_to_user_mode(void) > { > if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) { > - if (WARN_ON_ONCE(current->rseq && current->rseq_event_pending)) > - current->rseq_event_pending = false; > + if (WARN_ON_ONCE(current->rseq.event.has_rseq && > + current->rseq.event.events)) > + current->rseq.event.events = 0; > } > } > > @@ -49,35 +50,30 @@ static __always_inline void rseq_exit_to > */ > static inline void rseq_virt_userspace_exit(void) > { > - if (current->rseq_event_pending) > + if (current->rseq.event.sched_switch) > set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); > } > > +static inline void rseq_reset(struct task_struct *t) > +{ > + memset(&t->rseq, 0, sizeof(t->rseq)); > +} > + > +static inline void rseq_execve(struct task_struct *t) > +{ > + rseq_reset(t); > +} > + > /* > * If parent process has a registered restartable sequences area, the > * child inherits. Unregister rseq for a clone with CLONE_VM set. 
> */ > static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) > { > - if (clone_flags & CLONE_VM) { > - t->rseq = NULL; > - t->rseq_len = 0; > - t->rseq_sig = 0; > - t->rseq_event_pending = false; > - } else { > + if (clone_flags & CLONE_VM) > + rseq_reset(t); > + else > t->rseq = current->rseq; > - t->rseq_len = current->rseq_len; > - t->rseq_sig = current->rseq_sig; > - t->rseq_event_pending = current->rseq_event_pending; > - } > -} > - > -static inline void rseq_execve(struct task_struct *t) > -{ > - t->rseq = NULL; > - t->rseq_len = 0; > - t->rseq_sig = 0; > - t->rseq_event_pending = false; > } > > #else /* CONFIG_RSEQ */ > --- /dev/null > +++ b/include/linux/rseq_types.h > @@ -0,0 +1,51 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#ifndef _LINUX_RSEQ_TYPES_H > +#define _LINUX_RSEQ_TYPES_H > + > +#include <linux/types.h> > + > +#ifdef CONFIG_RSEQ > +struct rseq; > + > +/** > + * struct rseq_event - Storage for rseq related event management > + * @all: Compound to initialize and clear the data efficiently > + * @events: Compound to access events with a single load/store > + * @sched_switch: True if the task was scheduled out > + * @has_rseq: True if the task has a rseq pointer installed > + */ > +struct rseq_event { > + union { > + u32 all; > + struct { > + union { > + u16 events; > + struct { > + u8 sched_switch; > + }; > + }; > + > + u8 has_rseq; > + }; > + }; > +}; > + > +/** > + * struct rseq_data - Storage for all rseq related data > + * @usrptr: Pointer to the registered user space RSEQ memory > + * @len: Length of the RSEQ region > + * @sig: Signature of critial section abort IPs > + * @event: Storage for event management > + */ > +struct rseq_data { > + struct rseq __user *usrptr; > + u32 len; > + u32 sig; > + struct rseq_event event; > +}; > + > +#else /* CONFIG_RSEQ */ > +struct rseq_data { }; > +#endif /* !CONFIG_RSEQ */ > + > +#endif > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -41,6 +41,7 @@ > #include <linux/task_io_accounting.h> > #include <linux/posix-timers_types.h> > #include <linux/restart_block.h> > +#include <linux/rseq_types.h> > #include <uapi/linux/rseq.h> > #include <linux/seqlock_types.h> > #include <linux/kcsan.h> > @@ -1400,16 +1401,8 @@ struct task_struct { > unsigned long numa_pages_migrated; > #endif /* CONFIG_NUMA_BALANCING */ > > -#ifdef CONFIG_RSEQ > - struct rseq __user *rseq; > - u32 rseq_len; > - u32 rseq_sig; > - /* > - * RmW on rseq_event_pending must be performed atomically > - * with respect to preemption. > - */ > - bool rseq_event_pending; > -# ifdef CONFIG_DEBUG_RSEQ > + struct rseq_data rseq; > +#ifdef CONFIG_DEBUG_RSEQ > /* > * This is a place holder to save a copy of the rseq fields for > * validation of read-only fields. The struct rseq has a > @@ -1417,7 +1410,6 @@ struct task_struct { > * directly. Reserve a size large enough for the known fields. 
> */ > char rseq_fields[sizeof(struct rseq)]; > -# endif > #endif > > #ifdef CONFIG_SCHED_MM_CID > --- a/kernel/ptrace.c > +++ b/kernel/ptrace.c > @@ -793,9 +793,9 @@ static long ptrace_get_rseq_configuratio > unsigned long size, void __user *data) > { > struct ptrace_rseq_configuration conf = { > - .rseq_abi_pointer = (u64)(uintptr_t)task->rseq, > - .rseq_abi_size = task->rseq_len, > - .signature = task->rseq_sig, > + .rseq_abi_pointer = (u64)(uintptr_t)task->rseq.usrptr, > + .rseq_abi_size = task->rseq.len, > + .signature = task->rseq.sig, > .flags = 0, > }; > > --- a/kernel/rseq.c > +++ b/kernel/rseq.c > @@ -103,13 +103,13 @@ static int rseq_validate_ro_fields(struc > DEFAULT_RATELIMIT_INTERVAL, > DEFAULT_RATELIMIT_BURST); > u32 cpu_id_start, cpu_id, node_id, mm_cid; > - struct rseq __user *rseq = t->rseq; > + struct rseq __user *rseq = t->rseq.usrptr; > > /* > * Validate fields which are required to be read-only by > * user-space. > */ > - if (!user_read_access_begin(rseq, t->rseq_len)) > + if (!user_read_access_begin(rseq, t->rseq.len)) > goto efault; > unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end); > unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end); > @@ -147,10 +147,10 @@ static int rseq_validate_ro_fields(struc > * Update an rseq field and its in-kernel copy in lock-step to keep a coherent > * state. > */ > -#define rseq_unsafe_put_user(t, value, field, error_label) \ > - do { \ > - unsafe_put_user(value, &t->rseq->field, error_label); \ > - rseq_kernel_fields(t)->field = value; \ > +#define rseq_unsafe_put_user(t, value, field, error_label) \ > + do { \ > + unsafe_put_user(value, &t->rseq.usrptr->field, error_label); \ > + rseq_kernel_fields(t)->field = value; \ > } while (0) > > #else > @@ -160,12 +160,12 @@ static int rseq_validate_ro_fields(struc > } > > #define rseq_unsafe_put_user(t, value, field, error_label) \ > - unsafe_put_user(value, &t->rseq->field, error_label) > + unsafe_put_user(value, &t->rseq.usrptr->field, error_label) > #endif > > static int rseq_update_cpu_node_id(struct task_struct *t) > { > - struct rseq __user *rseq = t->rseq; > + struct rseq __user *rseq = t->rseq.usrptr; > u32 cpu_id = raw_smp_processor_id(); > u32 node_id = cpu_to_node(cpu_id); > u32 mm_cid = task_mm_cid(t); > @@ -176,7 +176,7 @@ static int rseq_update_cpu_node_id(struc > if (rseq_validate_ro_fields(t)) > goto efault; > WARN_ON_ONCE((int) mm_cid < 0); > - if (!user_write_access_begin(rseq, t->rseq_len)) > + if (!user_write_access_begin(rseq, t->rseq.len)) > goto efault; > > rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end); > @@ -201,7 +201,7 @@ static int rseq_update_cpu_node_id(struc > > static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) > { > - struct rseq __user *rseq = t->rseq; > + struct rseq __user *rseq = t->rseq.usrptr; > u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, > mm_cid = 0; > > @@ -211,7 +211,7 @@ static int rseq_reset_rseq_cpu_node_id(s > if (rseq_validate_ro_fields(t)) > goto efault; > > - if (!user_write_access_begin(rseq, t->rseq_len)) > + if (!user_write_access_begin(rseq, t->rseq.len)) > goto efault; > > /* > @@ -272,7 +272,7 @@ static int rseq_get_rseq_cs(struct task_ > u32 sig; > int ret; > > - ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr); > + ret = rseq_get_rseq_cs_ptr_val(t->rseq.usrptr, &ptr); > if (ret) > return ret; > > @@ -305,10 +305,10 @@ static int rseq_get_rseq_cs(struct task_ > if (ret) > return ret; > > - if (current->rseq_sig != sig) { > + if (current->rseq.sig != sig) { > 
printk_ratelimited(KERN_WARNING > "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", > - sig, current->rseq_sig, current->pid, usig); > + sig, current->rseq.sig, current->pid, usig); > return -EINVAL; > } > return 0; > @@ -338,7 +338,7 @@ static int rseq_check_flags(struct task_ > return -EINVAL; > > /* Get thread flags. */ > - ret = get_user(flags, &t->rseq->flags); > + ret = get_user(flags, &t->rseq.usrptr->flags); > if (ret) > return ret; > > @@ -392,13 +392,13 @@ static int rseq_ip_fixup(struct pt_regs > * Clear the rseq_cs pointer and return. > */ > if (!in_rseq_cs(ip, &rseq_cs)) > - return clear_rseq_cs(t->rseq); > + return clear_rseq_cs(t->rseq.usrptr); > ret = rseq_check_flags(t, rseq_cs.flags); > if (ret < 0) > return ret; > if (!abort) > return 0; > - ret = clear_rseq_cs(t->rseq); > + ret = clear_rseq_cs(t->rseq.usrptr); > if (ret) > return ret; > trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, > @@ -460,8 +460,8 @@ void __rseq_handle_notify_resume(struct > * inconsistencies. > */ > scoped_guard(RSEQ_EVENT_GUARD) { > - event = t->rseq_event_pending; > - t->rseq_event_pending = false; > + event = t->rseq.event.sched_switch; > + t->rseq.event.sched_switch = false; > } > > if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event) > @@ -492,7 +492,7 @@ void rseq_syscall(struct pt_regs *regs) > struct task_struct *t = current; > struct rseq_cs rseq_cs; > > - if (!t->rseq) > + if (!t->rseq.usrptr) > return; > if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) > force_sig(SIGSEGV); > @@ -511,33 +511,31 @@ SYSCALL_DEFINE4(rseq, struct rseq __user > if (flags & ~RSEQ_FLAG_UNREGISTER) > return -EINVAL; > /* Unregister rseq for current thread. */ > - if (current->rseq != rseq || !current->rseq) > + if (current->rseq.usrptr != rseq || !current->rseq.usrptr) > return -EINVAL; > - if (rseq_len != current->rseq_len) > + if (rseq_len != current->rseq.len) > return -EINVAL; > - if (current->rseq_sig != sig) > + if (current->rseq.sig != sig) > return -EPERM; > ret = rseq_reset_rseq_cpu_node_id(current); > if (ret) > return ret; > - current->rseq = NULL; > - current->rseq_sig = 0; > - current->rseq_len = 0; > + rseq_reset(current); > return 0; > } > > if (unlikely(flags)) > return -EINVAL; > > - if (current->rseq) { > + if (current->rseq.usrptr) { > /* > * If rseq is already registered, check whether > * the provided address differs from the prior > * one. > */ > - if (current->rseq != rseq || rseq_len != current->rseq_len) > + if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) > return -EINVAL; > - if (current->rseq_sig != sig) > + if (current->rseq.sig != sig) > return -EPERM; > /* Already registered. */ > return -EBUSY; > @@ -586,15 +584,16 @@ SYSCALL_DEFINE4(rseq, struct rseq __user > * Activate the registration by setting the rseq area address, length > * and signature in the task struct. > */ > - current->rseq = rseq; > - current->rseq_len = rseq_len; > - current->rseq_sig = sig; > + current->rseq.usrptr = rseq; > + current->rseq.len = rseq_len; > + current->rseq.sig = sig; > > /* > * If rseq was previously inactive, and has just been > * registered, ensure the cpu_id_start and cpu_id fields > * are updated before returning to user-space. > */ > + current->rseq.event.has_rseq = true; > rseq_sched_switch_event(current); > > return 0; > -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com
From: Thomas Gleixner <tglx@linutronix.de>
Clean up the include ordering, kernel-doc and other trivialities before
making further changes.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
include/linux/entry-common.h | 8 ++++----
include/linux/irq-entry-common.h | 2 ++
2 files changed, 6 insertions(+), 4 deletions(-)
---
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -3,11 +3,11 @@
#define __LINUX_ENTRYCOMMON_H
#include <linux/irq-entry-common.h>
+#include <linux/livepatch.h>
#include <linux/ptrace.h>
+#include <linux/resume_user_mode.h>
#include <linux/seccomp.h>
#include <linux/sched.h>
-#include <linux/livepatch.h>
-#include <linux/resume_user_mode.h>
#include <asm/entry-common.h>
#include <asm/syscall.h>
@@ -37,6 +37,7 @@
SYSCALL_WORK_SYSCALL_AUDIT | \
SYSCALL_WORK_SYSCALL_USER_DISPATCH | \
ARCH_SYSCALL_WORK_ENTER)
+
#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \
SYSCALL_WORK_SYSCALL_TRACE | \
SYSCALL_WORK_SYSCALL_AUDIT | \
@@ -61,8 +62,7 @@
*/
void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);
-long syscall_trace_enter(struct pt_regs *regs, long syscall,
- unsigned long work);
+long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work);
/**
* syscall_enter_from_user_mode_work - Check and handle work before invoking
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -68,6 +68,7 @@ static __always_inline bool arch_in_rcu_
/**
* enter_from_user_mode - Establish state when coming from user mode
+ * @regs: Pointer to current's pt_regs
*
* Syscall/interrupt entry disables interrupts, but user mode is traced as
* interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
@@ -357,6 +358,7 @@ irqentry_state_t noinstr irqentry_enter(
* Conditional reschedule with additional sanity checks.
*/
void raw_irqentry_exit_cond_resched(void);
+
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched
Open code syscall_enter_from_user_mode_prepare() in its only user in the
x86 syscall code and reduce the zoo of functions.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
---
arch/x86/entry/syscall_32.c | 3 ++-
include/linux/entry-common.h | 26 +++++---------------------
kernel/entry/syscall-common.c | 8 --------
3 files changed, 7 insertions(+), 30 deletions(-)
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -274,9 +274,10 @@ static noinstr bool __do_fast_syscall_32
* fetch EBP before invoking any of the syscall entry work
* functions.
*/
- syscall_enter_from_user_mode_prepare(regs);
+ enter_from_user_mode(regs);
instrumentation_begin();
+ local_irq_enable();
/* Fetch EBP from where the vDSO stashed it. */
if (IS_ENABLED(CONFIG_X86_64)) {
/*
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -45,23 +45,6 @@
SYSCALL_WORK_SYSCALL_EXIT_TRAP | \
ARCH_SYSCALL_WORK_EXIT)
-/**
- * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts
- * @regs: Pointer to currents pt_regs
- *
- * Invoked from architecture specific syscall entry code with interrupts
- * disabled. The calling code has to be non-instrumentable. When the
- * function returns all state is correct, interrupts are enabled and the
- * subsequent functions can be instrumented.
- *
- * This handles lockdep, RCU (context tracking) and tracing state, i.e.
- * the functionality provided by enter_from_user_mode().
- *
- * This is invoked when there is extra architecture specific functionality
- * to be done between establishing state and handling user mode entry work.
- */
-void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);
-
long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work);
/**
@@ -71,8 +54,8 @@ long syscall_trace_enter(struct pt_regs
* @syscall: The syscall number
*
* Invoked from architecture specific syscall entry code with interrupts
- * enabled after invoking syscall_enter_from_user_mode_prepare() and extra
- * architecture specific work.
+ * enabled after invoking enter_from_user_mode(), enabling interrupts and
+ * extra architecture specific work.
*
* Returns: The original or a modified syscall number
*
@@ -108,8 +91,9 @@ static __always_inline long syscall_ente
* function returns all state is correct, interrupts are enabled and the
* subsequent functions can be instrumented.
*
- * This is combination of syscall_enter_from_user_mode_prepare() and
- * syscall_enter_from_user_mode_work().
+ * This is the combination of enter_from_user_mode() and
+ * syscall_enter_from_user_mode_work() to be used when there is no
+ * architecture specific work to be done between the two.
*
* Returns: The original or a modified syscall number. See
* syscall_enter_from_user_mode_work() for further explanation.
--- a/kernel/entry/syscall-common.c
+++ b/kernel/entry/syscall-common.c
@@ -63,14 +63,6 @@ long syscall_trace_enter(struct pt_regs
return ret ? : syscall;
}
-noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
-{
- enter_from_user_mode(regs);
- instrumentation_begin();
- local_irq_enable();
- instrumentation_end();
-}
-
/*
* If SYSCALL_EMU is set, then the only reason to report is when
* SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
On 2025-09-08 17:31, Thomas Gleixner wrote: > Open code the only user in the x86 syscall code and reduce the zoo of > functions. > > Signed-off-by: Thomas Gleixner <tglx@linutronix.de> > Cc: x86@kernel.org Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> > > --- > arch/x86/entry/syscall_32.c | 3 ++- > include/linux/entry-common.h | 26 +++++--------------------- > kernel/entry/syscall-common.c | 8 -------- > 3 files changed, 7 insertions(+), 30 deletions(-) > > --- a/arch/x86/entry/syscall_32.c > +++ b/arch/x86/entry/syscall_32.c > @@ -274,9 +274,10 @@ static noinstr bool __do_fast_syscall_32 > * fetch EBP before invoking any of the syscall entry work > * functions. > */ > - syscall_enter_from_user_mode_prepare(regs); > + enter_from_user_mode(regs); > > instrumentation_begin(); > + local_irq_enable(); > /* Fetch EBP from where the vDSO stashed it. */ > if (IS_ENABLED(CONFIG_X86_64)) { > /* > --- a/include/linux/entry-common.h > +++ b/include/linux/entry-common.h > @@ -45,23 +45,6 @@ > SYSCALL_WORK_SYSCALL_EXIT_TRAP | \ > ARCH_SYSCALL_WORK_EXIT) > > -/** > - * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts > - * @regs: Pointer to currents pt_regs > - * > - * Invoked from architecture specific syscall entry code with interrupts > - * disabled. The calling code has to be non-instrumentable. When the > - * function returns all state is correct, interrupts are enabled and the > - * subsequent functions can be instrumented. > - * > - * This handles lockdep, RCU (context tracking) and tracing state, i.e. > - * the functionality provided by enter_from_user_mode(). > - * > - * This is invoked when there is extra architecture specific functionality > - * to be done between establishing state and handling user mode entry work. > - */ > -void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); > - > long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work); > > /** > @@ -71,8 +54,8 @@ long syscall_trace_enter(struct pt_regs > * @syscall: The syscall number > * > * Invoked from architecture specific syscall entry code with interrupts > - * enabled after invoking syscall_enter_from_user_mode_prepare() and extra > - * architecture specific work. > + * enabled after invoking enter_from_user_mode(), enabling interrupts and > + * extra architecture specific work. > * > * Returns: The original or a modified syscall number > * > @@ -108,8 +91,9 @@ static __always_inline long syscall_ente > * function returns all state is correct, interrupts are enabled and the > * subsequent functions can be instrumented. > * > - * This is combination of syscall_enter_from_user_mode_prepare() and > - * syscall_enter_from_user_mode_work(). > + * This is the combination of enter_from_user_mode() and > + * syscall_enter_from_user_mode_work() to be used when there is no > + * architecture specific work to be done between the two. > * > * Returns: The original or a modified syscall number. See > * syscall_enter_from_user_mode_work() for further explanation. > --- a/kernel/entry/syscall-common.c > +++ b/kernel/entry/syscall-common.c > @@ -63,14 +63,6 @@ long syscall_trace_enter(struct pt_regs > return ret ? : syscall; > } > > -noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) > -{ > - enter_from_user_mode(regs); > - instrumentation_begin(); > - local_irq_enable(); > - instrumentation_end(); > -} > - > /* > * If SYSCALL_EMU is set, then the only reason to report is when > * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). 
This syscall > > -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com
There is no point in having this as an out-of-line function which just
inlines enter_from_user_mode(). The function call overhead is larger than
the function itself.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/irq-entry-common.h | 13 +++++++++++--
kernel/entry/common.c | 13 -------------
2 files changed, 11 insertions(+), 15 deletions(-)
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -278,7 +278,10 @@ static __always_inline void exit_to_user
*
* The function establishes state (lockdep, RCU (context tracking), tracing)
*/
-void irqentry_enter_from_user_mode(struct pt_regs *regs);
+static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs)
+{
+ enter_from_user_mode(regs);
+}
/**
* irqentry_exit_to_user_mode - Interrupt exit work
@@ -293,7 +296,13 @@ void irqentry_enter_from_user_mode(struc
* Interrupt exit is not invoking #1 which is the syscall specific one time
* work.
*/
-void irqentry_exit_to_user_mode(struct pt_regs *regs);
+static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ exit_to_user_mode_prepare(regs);
+ instrumentation_end();
+ exit_to_user_mode();
+}
#ifndef irqentry_state
/**
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -62,19 +62,6 @@ void __weak arch_do_signal_or_restart(st
return ti_work;
}
-noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
-{
- enter_from_user_mode(regs);
-}
-
-noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
-{
- instrumentation_begin();
- exit_to_user_mode_prepare(regs);
- instrumentation_end();
- exit_to_user_mode();
-}
-
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
irqentry_state_t ret = {
On 2025-09-08 17:31, Thomas Gleixner wrote: > There is no point to have this as a function which just inlines > enter_from_user_mode(). The function call overhead is larger than the > function itself. I'm wondering if there is a reason for making those actual functions and not inlines. The functions sit in kernel/entry/common.c, which are built with specific compiler flags in kernel/entry/Makefile: # Prevent the noinstr section from being pestered by sanitizer and other goodies # as long as these things cannot be disabled per function. KASAN_SANITIZE := n UBSAN_SANITIZE := n KCOV_INSTRUMENT := n # Branch profiling isn't noinstr-safe ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong CFLAGS_common.o += -fno-stack-protector So I wonder if we're not breaking something in the area of "noinstr" tagging by inlining those into their caller ? Thanks, Mathieu > > Signed-off-by: Thomas Gleixner <tglx@linutronix.de> > > --- > include/linux/irq-entry-common.h | 13 +++++++++++-- > kernel/entry/common.c | 13 ------------- > 2 files changed, 11 insertions(+), 15 deletions(-) > > --- a/include/linux/irq-entry-common.h > +++ b/include/linux/irq-entry-common.h > @@ -278,7 +278,10 @@ static __always_inline void exit_to_user > * > * The function establishes state (lockdep, RCU (context tracking), tracing) > */ > -void irqentry_enter_from_user_mode(struct pt_regs *regs); > +static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs) > +{ > + enter_from_user_mode(regs); > +} > > /** > * irqentry_exit_to_user_mode - Interrupt exit work > @@ -293,7 +296,13 @@ void irqentry_enter_from_user_mode(struc > * Interrupt exit is not invoking #1 which is the syscall specific one time > * work. > */ > -void irqentry_exit_to_user_mode(struct pt_regs *regs); > +static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs) > +{ > + instrumentation_begin(); > + exit_to_user_mode_prepare(regs); > + instrumentation_end(); > + exit_to_user_mode(); > +} > > #ifndef irqentry_state > /** > --- a/kernel/entry/common.c > +++ b/kernel/entry/common.c > @@ -62,19 +62,6 @@ void __weak arch_do_signal_or_restart(st > return ti_work; > } > > -noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) > -{ > - enter_from_user_mode(regs); > -} > - > -noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) > -{ > - instrumentation_begin(); > - exit_to_user_mode_prepare(regs); > - instrumentation_end(); > - exit_to_user_mode(); > -} > - > noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) > { > irqentry_state_t ret = { > > -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com
On Tue, Sep 09 2025 at 09:38, Mathieu Desnoyers wrote: > On 2025-09-08 17:31, Thomas Gleixner wrote: >> There is no point to have this as a function which just inlines >> enter_from_user_mode(). The function call overhead is larger than the >> function itself. > > I'm wondering if there is a reason for making those actual functions and > not inlines. > > The functions sit in kernel/entry/common.c, which are built with > specific compiler flags in kernel/entry/Makefile: > > # Prevent the noinstr section from being pestered by sanitizer and other > goodies > # as long as these things cannot be disabled per function. > KASAN_SANITIZE := n > UBSAN_SANITIZE := n > KCOV_INSTRUMENT := n > > # Branch profiling isn't noinstr-safe > ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING > > CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong > CFLAGS_common.o += -fno-stack-protector > > So I wonder if we're not breaking something in the area of "noinstr" > tagging by inlining those into their caller ? No, because the call sites have to be non-instrumented as well.
On 2025-09-09 10:10, Thomas Gleixner wrote: > On Tue, Sep 09 2025 at 09:38, Mathieu Desnoyers wrote: > >> On 2025-09-08 17:31, Thomas Gleixner wrote: >>> There is no point to have this as a function which just inlines >>> enter_from_user_mode(). The function call overhead is larger than the >>> function itself. >> >> I'm wondering if there is a reason for making those actual functions and >> not inlines. >> >> The functions sit in kernel/entry/common.c, which are built with >> specific compiler flags in kernel/entry/Makefile: >> >> # Prevent the noinstr section from being pestered by sanitizer and other >> goodies >> # as long as these things cannot be disabled per function. >> KASAN_SANITIZE := n >> UBSAN_SANITIZE := n >> KCOV_INSTRUMENT := n >> >> # Branch profiling isn't noinstr-safe >> ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING >> >> CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong >> CFLAGS_common.o += -fno-stack-protector >> >> So I wonder if we're not breaking something in the area of "noinstr" >> tagging by inlining those into their caller ? > > No, because the call sites have to be non-instrumented as well. OK. Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com
There is nothing mm specific about the MM CID helpers, and including mm.h
can cause header recursion hell.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
include/linux/mm.h | 25 -------------------------
include/linux/sched.h | 26 ++++++++++++++++++++++++++
2 files changed, 26 insertions(+), 25 deletions(-)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2310,31 +2310,6 @@ struct zap_details {
/* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */
#define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1))
-#ifdef CONFIG_SCHED_MM_CID
-void sched_mm_cid_before_execve(struct task_struct *t);
-void sched_mm_cid_after_execve(struct task_struct *t);
-void sched_mm_cid_fork(struct task_struct *t);
-void sched_mm_cid_exit_signals(struct task_struct *t);
-static inline int task_mm_cid(struct task_struct *t)
-{
- return t->mm_cid;
-}
-#else
-static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
-static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
-static inline void sched_mm_cid_fork(struct task_struct *t) { }
-static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
-static inline int task_mm_cid(struct task_struct *t)
-{
- /*
- * Use the processor id as a fall-back when the mm cid feature is
- * disabled. This provides functional per-cpu data structure accesses
- * in user-space, althrough it won't provide the memory usage benefits.
- */
- return raw_smp_processor_id();
-}
-#endif
-
#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
#else
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2304,4 +2304,30 @@ static __always_inline void alloc_tag_re
#define alloc_tag_restore(_tag, _old) do {} while (0)
#endif
+/* Avoids recursive inclusion hell */
+#ifdef CONFIG_SCHED_MM_CID
+void sched_mm_cid_before_execve(struct task_struct *t);
+void sched_mm_cid_after_execve(struct task_struct *t);
+void sched_mm_cid_fork(struct task_struct *t);
+void sched_mm_cid_exit_signals(struct task_struct *t);
+static inline int task_mm_cid(struct task_struct *t)
+{
+ return t->mm_cid;
+}
+#else
+static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_fork(struct task_struct *t) { }
+static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
+static inline int task_mm_cid(struct task_struct *t)
+{
+ /*
+ * Use the processor id as a fall-back when the mm cid feature is
+ * disabled. This provides functional per-cpu data structure accesses
+ * in user-space, althrough it won't provide the memory usage benefits.
+ */
+ return task_cpu(t);
+}
+#endif
+
#endif
In preparation for rewriting the RSEQ exit to user space handling, provide
storage to cache the CPU ID and MM CID values which were last written to
user space. That allows a quick check on exit, which avoids the update when
nothing changed.
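To illustrate the intent, a minimal sketch of the quick check this enables.
The helper name rseq_ids_uptodate() is made up for illustration and is not
part of this patch:

	/*
	 * The union in struct rseq_ids lets the compiler compare the cached
	 * CPU ID and MM CID against the current values with a single 64-bit
	 * compare on 64-bit architectures. Only when the comparison fails
	 * does the user space rseq area need to be rewritten.
	 */
	static inline bool rseq_ids_uptodate(struct task_struct *t, u32 cpu_id, u32 mm_cid)
	{
		struct rseq_ids cur = { .cpu_id = cpu_id, .mm_cid = mm_cid };

		return cur.cpu_cid == t->rseq.ids.cpu_cid;
	}

The ~0ULL initialization in rseq_reset() and rseq_fork() ensures that the
first comparison after fork or exec fails and forces an initial update of
the user space values.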
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/rseq.h | 7 +++++--
include/linux/rseq_types.h | 21 +++++++++++++++++++++
include/trace/events/rseq.h | 4 ++--
kernel/rseq.c | 4 ++++
4 files changed, 32 insertions(+), 4 deletions(-)
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -57,6 +57,7 @@ static inline void rseq_virt_userspace_e
static inline void rseq_reset(struct task_struct *t)
{
memset(&t->rseq, 0, sizeof(t->rseq));
+ t->rseq.ids.cpu_cid = ~0ULL;
}
static inline void rseq_execve(struct task_struct *t)
@@ -70,10 +71,12 @@ static inline void rseq_execve(struct ta
*/
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
- if (clone_flags & CLONE_VM)
+ if (clone_flags & CLONE_VM) {
rseq_reset(t);
- else
+ } else {
t->rseq = current->rseq;
+ t->rseq.ids.cpu_cid = ~0ULL;
+ }
}
#else /* CONFIG_RSEQ */
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -31,17 +31,38 @@ struct rseq_event {
};
/**
+ * struct rseq_ids - Cache for ids, which need to be updated
+ * @cpu_cid: Compound of @cpu_id and @mm_cid to make the
+ * compiler emit a single compare on 64-bit
+ * @cpu_id: The CPU ID which was written last to user space
+ * @mm_cid: The MM CID which was written last to user space
+ *
+ * @cpu_id and @mm_cid are updated when the data is written to user space.
+ */
+struct rseq_ids {
+ union {
+ u64 cpu_cid;
+ struct {
+ u32 cpu_id;
+ u32 mm_cid;
+ };
+ };
+};
+
+/**
* struct rseq_data - Storage for all rseq related data
* @usrptr: Pointer to the registered user space RSEQ memory
* @len: Length of the RSEQ region
* @sig: Signature of critial section abort IPs
* @event: Storage for event management
+ * @ids: Storage for cached CPU ID and MM CID
*/
struct rseq_data {
struct rseq __user *usrptr;
u32 len;
u32 sig;
struct rseq_event event;
+ struct rseq_ids ids;
};
#else /* CONFIG_RSEQ */
--- a/include/trace/events/rseq.h
+++ b/include/trace/events/rseq.h
@@ -21,9 +21,9 @@ TRACE_EVENT(rseq_update,
),
TP_fast_assign(
- __entry->cpu_id = raw_smp_processor_id();
+ __entry->cpu_id = t->rseq.ids.cpu_id;
__entry->node_id = cpu_to_node(__entry->cpu_id);
- __entry->mm_cid = task_mm_cid(t);
+ __entry->mm_cid = t->rseq.ids.mm_cid;
),
TP_printk("cpu_id=%d node_id=%d mm_cid=%d", __entry->cpu_id,
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -184,6 +184,10 @@ static int rseq_update_cpu_node_id(struc
rseq_unsafe_put_user(t, node_id, node_id, efault_end);
rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
+ /* Cache the user space values */
+ t->rseq.ids.cpu_id = cpu_id;
+ t->rseq.ids.mm_cid = mm_cid;
+
/*
* Additional feature fields added after ORIG_RSEQ_SIZE
* need to be conditionally updated only if
On 2025-09-08 17:31, Thomas Gleixner wrote: > In preparation for rewriting RSEQ exit to user space handling provide > storage to cache the CPU ID and MM CID values which were written to user > space. That prepares for a quick check, which avoids the update when > nothing changed. > > Signed-off-by: Thomas Gleixner <tglx@linutronix.de> > > --- > include/linux/rseq.h | 7 +++++-- > include/linux/rseq_types.h | 21 +++++++++++++++++++++ > include/trace/events/rseq.h | 4 ++-- > kernel/rseq.c | 4 ++++ > 4 files changed, 32 insertions(+), 4 deletions(-) > > --- a/include/linux/rseq.h > +++ b/include/linux/rseq.h > @@ -57,6 +57,7 @@ static inline void rseq_virt_userspace_e > static inline void rseq_reset(struct task_struct *t) > { > memset(&t->rseq, 0, sizeof(t->rseq)); > + t->rseq.ids.cpu_cid = ~0ULL; > } > > static inline void rseq_execve(struct task_struct *t) > @@ -70,10 +71,12 @@ static inline void rseq_execve(struct ta > */ > static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) > { > - if (clone_flags & CLONE_VM) > + if (clone_flags & CLONE_VM) { > rseq_reset(t); > - else > + } else { > t->rseq = current->rseq; > + t->rseq.ids.cpu_cid = ~0ULL; > + } > } > > #else /* CONFIG_RSEQ */ > --- a/include/linux/rseq_types.h > +++ b/include/linux/rseq_types.h > @@ -31,17 +31,38 @@ struct rseq_event { > }; > > /** > + * struct rseq_ids - Cache for ids, which need to be updated need -> needs > + * @cpu_cid: Compound of @cpu_id and @mm_cid to make the > + * compiler emit a single compare on 64-bit > + * @cpu_id: The CPU ID which was written last to user space > + * @mm_cid: The MM CID which was written last to user space > + * > + * @cpu_id and @mm_cid are updated when the data is written to user space. > + */ > +struct rseq_ids { > + union { > + u64 cpu_cid; > + struct { > + u32 cpu_id; > + u32 mm_cid; > + }; > + }; > +}; > + > +/** > * struct rseq_data - Storage for all rseq related data > * @usrptr: Pointer to the registered user space RSEQ memory > * @len: Length of the RSEQ region > * @sig: Signature of critial section abort IPs > * @event: Storage for event management > + * @ids: Storage for cached CPU ID and MM CID It's far from clear from the diff, but is there a missing space at the beginning of the line above ? Other than that: Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> > */ > struct rseq_data { > struct rseq __user *usrptr; > u32 len; > u32 sig; > struct rseq_event event; > + struct rseq_ids ids; > }; > > #else /* CONFIG_RSEQ */ > --- a/include/trace/events/rseq.h > +++ b/include/trace/events/rseq.h > @@ -21,9 +21,9 @@ TRACE_EVENT(rseq_update, > ), > > TP_fast_assign( > - __entry->cpu_id = raw_smp_processor_id(); > + __entry->cpu_id = t->rseq.ids.cpu_id; > __entry->node_id = cpu_to_node(__entry->cpu_id); > - __entry->mm_cid = task_mm_cid(t); > + __entry->mm_cid = t->rseq.ids.mm_cid; > ), > > TP_printk("cpu_id=%d node_id=%d mm_cid=%d", __entry->cpu_id, > --- a/kernel/rseq.c > +++ b/kernel/rseq.c > @@ -184,6 +184,10 @@ static int rseq_update_cpu_node_id(struc > rseq_unsafe_put_user(t, node_id, node_id, efault_end); > rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); > > + /* Cache the user space values */ > + t->rseq.ids.cpu_id = cpu_id; > + t->rseq.ids.mm_cid = mm_cid; > + > /* > * Additional feature fields added after ORIG_RSEQ_SIZE > * need to be conditionally updated only if > -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com
On Tue, Sep 09 2025 at 09:43, Mathieu Desnoyers wrote: > On 2025-09-08 17:31, Thomas Gleixner wrote: >> /** >> + * struct rseq_ids - Cache for ids, which need to be updated > > need -> needs ids is plural, so 'need' is correct, no? >> + * @cpu_cid: Compound of @cpu_id and @mm_cid to make the >> + * compiler emit a single compare on 64-bit >> + * @cpu_id: The CPU ID which was written last to user space >> + * @mm_cid: The MM CID which was written last to user space >> + * >> + * @cpu_id and @mm_cid are updated when the data is written to user space. >> + */ >> +struct rseq_ids { >> + union { >> + u64 cpu_cid; >> + struct { >> + u32 cpu_id; >> + u32 mm_cid; >> + }; >> + }; >> +}; >> + >> +/** >> * struct rseq_data - Storage for all rseq related data >> * @usrptr: Pointer to the registered user space RSEQ memory >> * @len: Length of the RSEQ region >> * @sig: Signature of critial section abort IPs >> * @event: Storage for event management >> + * @ids: Storage for cached CPU ID and MM CID > > It's far from clear from the diff, but is there a missing space at the > beginning of the line above ? No. The actual diff is: * @event: Storage for event management + * @ids: Storage for cached CPU ID and MM CID */ It's just the reply quoting which makes it ugly.
On 2025-09-09 10:13, Thomas Gleixner wrote: > On Tue, Sep 09 2025 at 09:43, Mathieu Desnoyers wrote: >> On 2025-09-08 17:31, Thomas Gleixner wrote: >>> /** >>> + * struct rseq_ids - Cache for ids, which need to be updated >> >> need -> needs > > ids is plural, so 'need' is correct, no? It's the cache that needs to be updated (cache for ids). So technically the verb conjugates with "cache" (singular) and not "ids". > >>> + * @cpu_cid: Compound of @cpu_id and @mm_cid to make the >>> + * compiler emit a single compare on 64-bit >>> + * @cpu_id: The CPU ID which was written last to user space >>> + * @mm_cid: The MM CID which was written last to user space >>> + * >>> + * @cpu_id and @mm_cid are updated when the data is written to user space. >>> + */ >>> +struct rseq_ids { >>> + union { >>> + u64 cpu_cid; >>> + struct { >>> + u32 cpu_id; >>> + u32 mm_cid; >>> + }; >>> + }; >>> +}; >>> + >>> +/** >>> * struct rseq_data - Storage for all rseq related data >>> * @usrptr: Pointer to the registered user space RSEQ memory >>> * @len: Length of the RSEQ region >>> * @sig: Signature of critial section abort IPs >>> * @event: Storage for event management >>> + * @ids: Storage for cached CPU ID and MM CID >> >> It's far from clear from the diff, but is there a missing space at the >> beginning of the line above ? > > No. The actual diff is: > > * @event: Storage for event management > + * @ids: Storage for cached CPU ID and MM CID > */ > > It's just the reply quoting which makes it ugly. Sounds good. Thanks, Mathieu > -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com
For RSEQ the only relevant reason to inspect and eventually fix up (abort)
a user space critical section is when user space was interrupted and the
task was scheduled out.
If the user to kernel entry was a syscall, no fixup is required. If user
space invokes a syscall from within a critical section, it can keep the
pieces, as documented.
This is only supported on architectures which utilize the generic entry
code. If your architecture does not use it, bad luck.
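A rough sketch of how a later consumer of this information can use the new
flag; the helper below is purely illustrative and not part of this patch:

	/*
	 * A critical section abort check is only needed when the kernel was
	 * entered via an interrupt from user mode (user_irq) and the task
	 * was scheduled out afterwards. A plain syscall entry never
	 * requires an abort.
	 */
	static inline bool rseq_cs_check_needed(struct task_struct *t)
	{
		return t->rseq.event.user_irq && t->rseq.event.sched_switch;
	}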
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/irq-entry-common.h | 3 ++-
include/linux/rseq.h | 16 +++++++++++-----
include/linux/rseq_entry.h | 18 ++++++++++++++++++
include/linux/rseq_types.h | 2 ++
4 files changed, 33 insertions(+), 6 deletions(-)
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -4,7 +4,7 @@
#include <linux/context_tracking.h>
#include <linux/kmsan.h>
-#include <linux/rseq.h>
+#include <linux/rseq_entry.h>
#include <linux/static_call_types.h>
#include <linux/syscalls.h>
#include <linux/tick.h>
@@ -281,6 +281,7 @@ static __always_inline void exit_to_user
static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
enter_from_user_mode(regs);
+ rseq_note_user_irq_entry();
}
/**
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -31,11 +31,17 @@ static inline void rseq_sched_switch_eve
static __always_inline void rseq_exit_to_user_mode(void)
{
- if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
- if (WARN_ON_ONCE(current->rseq.event.has_rseq &&
- current->rseq.event.events))
- current->rseq.event.events = 0;
- }
+ struct rseq_event *ev = &current->rseq.event;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
+ WARN_ON_ONCE(ev->sched_switch);
+
+ /*
+ * Ensure that event (especially user_irq) is cleared when the
+ * interrupt did not result in a schedule and therefore the
+ * rseq processing did not clear it.
+ */
+ ev->events = 0;
}
/*
--- /dev/null
+++ b/include/linux/rseq_entry.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RSEQ_ENTRY_H
+#define _LINUX_RSEQ_ENTRY_H
+
+#ifdef CONFIG_RSEQ
+#include <linux/rseq.h>
+
+static __always_inline void rseq_note_user_irq_entry(void)
+{
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
+ current->rseq.event.user_irq = true;
+}
+
+#else /* CONFIG_RSEQ */
+static inline void rseq_note_user_irq_entry(void) { }
+#endif /* !CONFIG_RSEQ */
+
+#endif /* _LINUX_RSEQ_ENTRY_H */
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -12,6 +12,7 @@ struct rseq;
* @all: Compound to initialize and clear the data efficiently
* @events: Compound to access events with a single load/store
* @sched_switch: True if the task was scheduled out
+ * @user_irq: True on interrupt entry from user mode
* @has_rseq: True if the task has a rseq pointer installed
*/
struct rseq_event {
@@ -22,6 +23,7 @@ struct rseq_event {
u16 events;
struct {
u8 sched_switch;
+ u8 user_irq;
};
};
On 2025-09-08 17:31, Thomas Gleixner wrote: > For RSEQ the only relevant reason to inspect and eventually fixup (abort) > user space critical sections is when user space was interrupted and the > task was scheduled out. > > If the user to kernel entry was from a syscall no fixup is required. If > user space invokes a syscall from a critical section it can keep the > pieces as documented. > > This is only supported on architectures which utilize the generic entry > code. If your architecture does not use it, bad luck. > > Signed-off-by: Thomas Gleixner <tglx@linutronix.de> > > --- > include/linux/irq-entry-common.h | 3 ++- > include/linux/rseq.h | 16 +++++++++++----- > include/linux/rseq_entry.h | 18 ++++++++++++++++++ > include/linux/rseq_types.h | 2 ++ > 4 files changed, 33 insertions(+), 6 deletions(-) > > --- a/include/linux/irq-entry-common.h > +++ b/include/linux/irq-entry-common.h > @@ -4,7 +4,7 @@ > > #include <linux/context_tracking.h> > #include <linux/kmsan.h> > -#include <linux/rseq.h> > +#include <linux/rseq_entry.h> > #include <linux/static_call_types.h> > #include <linux/syscalls.h> > #include <linux/tick.h> > @@ -281,6 +281,7 @@ static __always_inline void exit_to_user > static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs) > { > enter_from_user_mode(regs); > + rseq_note_user_irq_entry(); > } > > /** > --- a/include/linux/rseq.h > +++ b/include/linux/rseq.h > @@ -31,11 +31,17 @@ static inline void rseq_sched_switch_eve > > static __always_inline void rseq_exit_to_user_mode(void) > { > - if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) { > - if (WARN_ON_ONCE(current->rseq.event.has_rseq && > - current->rseq.event.events)) > - current->rseq.event.events = 0; > - } > + struct rseq_event *ev = ¤t->rseq.event; > + > + if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) > + WARN_ON_ONCE(ev->sched_switch); OK. Now I'm confused. It is perfectly legal to issue a system call from userspace as long as it's not from within an rseq critical section. That system call can be scheduled out, and can set the ev->sched_switch. This would cause the rseq_exit_to_user_mode from system call to hit this. What is disallowed is only issuing a system call from a rseq critical section. The other parts of rseq (updates of cpu id and mm cid) still have to happen when returning from a system call. What am I missing ? Thanks, Mathieu > + > + /* > + * Ensure that event (especially user_irq) is cleared when the > + * interrupt did not result in a schedule and therefore the > + * rseq processing did not clear it. 
> + */ > + ev->events = 0; > } > > /* > --- /dev/null > +++ b/include/linux/rseq_entry.h > @@ -0,0 +1,18 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#ifndef _LINUX_RSEQ_ENTRY_H > +#define _LINUX_RSEQ_ENTRY_H > + > +#ifdef CONFIG_RSEQ > +#include <linux/rseq.h> > + > +static __always_inline void rseq_note_user_irq_entry(void) > +{ > + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) > + current->rseq.event.user_irq = true; > +} > + > +#else /* CONFIG_RSEQ */ > +static inline void rseq_note_user_irq_entry(void) { } > +#endif /* !CONFIG_RSEQ */ > + > +#endif /* _LINUX_RSEQ_ENTRY_H */ > --- a/include/linux/rseq_types.h > +++ b/include/linux/rseq_types.h > @@ -12,6 +12,7 @@ struct rseq; > * @all: Compound to initialize and clear the data efficiently > * @events: Compound to access events with a single load/store > * @sched_switch: True if the task was scheduled out > + * @user_irq: True on interrupt entry from user mode > * @has_rseq: True if the task has a rseq pointer installed > */ > struct rseq_event { > @@ -22,6 +23,7 @@ struct rseq_event { > u16 events; > struct { > u8 sched_switch; > + u8 user_irq; > }; > }; > > -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com
On Tue, Sep 09 2025 at 09:53, Mathieu Desnoyers wrote: >> static __always_inline void rseq_exit_to_user_mode(void) >> { >> - if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) { >> - if (WARN_ON_ONCE(current->rseq.event.has_rseq && >> - current->rseq.event.events)) >> - current->rseq.event.events = 0; >> - } >> + struct rseq_event *ev = ¤t->rseq.event; >> + >> + if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) >> + WARN_ON_ONCE(ev->sched_switch); > > OK. Now I'm confused. > > It is perfectly legal to issue a system call from userspace as long > as it's not from within an rseq critical section. > > That system call can be scheduled out, and can set the ev->sched_switch. > > This would cause the rseq_exit_to_user_mode from system call to > hit this. > > What is disallowed is only issuing a system call from a rseq critical > section. The other parts of rseq (updates of cpu id and mm cid) still > have to happen when returning from a system call. > > What am I missing ? The fact that any setting of ev->sched_switch has to be handled on the way out independent of user interrupt or not as MM CID can change obviously. This is not any different from the state before this patch. Just that it now only looks at sched_switch instead of the full event as that might contain a set user_irq bit w/o sched_switch being set, no? Thanks, tglx
On 2025-09-09 10:17, Thomas Gleixner wrote: > On Tue, Sep 09 2025 at 09:53, Mathieu Desnoyers wrote: >>> static __always_inline void rseq_exit_to_user_mode(void) >>> { >>> - if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) { >>> - if (WARN_ON_ONCE(current->rseq.event.has_rseq && >>> - current->rseq.event.events)) >>> - current->rseq.event.events = 0; >>> - } >>> + struct rseq_event *ev = ¤t->rseq.event; >>> + >>> + if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) >>> + WARN_ON_ONCE(ev->sched_switch); >> >> OK. Now I'm confused. >> >> It is perfectly legal to issue a system call from userspace as long >> as it's not from within an rseq critical section. >> >> That system call can be scheduled out, and can set the ev->sched_switch. >> >> This would cause the rseq_exit_to_user_mode from system call to >> hit this. >> >> What is disallowed is only issuing a system call from a rseq critical >> section. The other parts of rseq (updates of cpu id and mm cid) still >> have to happen when returning from a system call. >> >> What am I missing ? > > The fact that any setting of ev->sched_switch has to be handled on the > way out independent of user interrupt or not as MM CID can change > obviously. > > This is not any different from the state before this patch. Just that it > now only looks at sched_switch instead of the full event as that might > contain a set user_irq bit w/o sched_switch being set, no? OK, so this is called after the events were handled, and this is just a sanity check. Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> > > Thanks, > > tglx > > -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com
Provide tracepoint wrappers for the upcoming inline RSEQ exit to user space
fast path, so that the header can be safely included by code which defines
the actual tracepoints.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
V4: Fix the fallback stub
V3: Get rid of one indentation level - Mathieu
---
include/linux/rseq_entry.h | 28 ++++++++++++++++++++++++++++
kernel/rseq.c | 17 +++++++++++++++++
2 files changed, 45 insertions(+)
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -5,6 +5,34 @@
#ifdef CONFIG_RSEQ
#include <linux/rseq.h>
+#include <linux/tracepoint-defs.h>
+
+#ifdef CONFIG_TRACEPOINTS
+DECLARE_TRACEPOINT(rseq_update);
+DECLARE_TRACEPOINT(rseq_ip_fixup);
+void __rseq_trace_update(struct task_struct *t);
+void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
+ unsigned long offset, unsigned long abort_ip);
+
+static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids)
+{
+ if (tracepoint_enabled(rseq_update) && ids)
+ __rseq_trace_update(t);
+}
+
+static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
+ unsigned long offset, unsigned long abort_ip)
+{
+ if (tracepoint_enabled(rseq_ip_fixup))
+ __rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
+}
+
+#else /* CONFIG_TRACEPOINT */
+static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { }
+static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
+ unsigned long offset, unsigned long abort_ip) { }
+#endif /* !CONFIG_TRACEPOINT */
+
static __always_inline void rseq_note_user_irq_entry(void)
{
if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -91,6 +91,23 @@
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * Out of line, so the actual update functions can be in a header to be
+ * inlined into the exit to user code.
+ */
+void __rseq_trace_update(struct task_struct *t)
+{
+ trace_rseq_update(t);
+}
+
+void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
+ unsigned long offset, unsigned long abort_ip)
+{
+ trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip);
+}
+#endif /* CONFIG_TRACEPOINTS */
+
#ifdef CONFIG_DEBUG_RSEQ
static struct rseq *rseq_kernel_fields(struct task_struct *t)
{
Being able to analyze the call frequency without having to use tracing is
helpful for evaluating this infrastructure. The overhead is minimal as it
just increments a per-CPU counter associated with each operation.
The debugfs readout provides a racy sum of all counters.
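For reference, with CONFIG_RSEQ_STATS=y the counters show up in debugfs
(assuming the default mount point) and map roughly to the following code
paths:

	# cat /sys/kernel/debug/rseq/stats
	exit:    exit to user mode transitions seen by rseq_exit_to_user_mode()
	signal:  rseq handling invoked from signal delivery
	slowp:   rseq handling invoked from the TIF_NOTIFY_RESUME slow path
	ids:     CPU ID / MM CID / node ID updates written to user space
	cs:      critical section descriptor checks
	clear:   lazy clears of the critical section pointer
	fixup:   critical section aborts (instruction pointer fixups)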
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
include/linux/rseq.h | 16 ---------
include/linux/rseq_entry.h | 49 +++++++++++++++++++++++++++
init/Kconfig | 12 ++++++
kernel/rseq.c | 79 +++++++++++++++++++++++++++++++++++++++++----
4 files changed, 133 insertions(+), 23 deletions(-)
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -29,21 +29,6 @@ static inline void rseq_sched_switch_eve
}
}
-static __always_inline void rseq_exit_to_user_mode(void)
-{
- struct rseq_event *ev = &current->rseq.event;
-
- if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
- WARN_ON_ONCE(ev->sched_switch);
-
- /*
- * Ensure that event (especially user_irq) is cleared when the
- * interrupt did not result in a schedule and therefore the
- * rseq processing did not clear it.
- */
- ev->events = 0;
-}
-
/*
* KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
* which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in
@@ -92,7 +77,6 @@ static inline void rseq_sched_switch_eve
static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { }
static inline void rseq_execve(struct task_struct *t) { }
-static inline void rseq_exit_to_user_mode(void) { }
#endif /* !CONFIG_RSEQ */
#ifdef CONFIG_DEBUG_RSEQ
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -2,6 +2,37 @@
#ifndef _LINUX_RSEQ_ENTRY_H
#define _LINUX_RSEQ_ENTRY_H
+/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
+#ifdef CONFIG_RSEQ_STATS
+#include <linux/percpu.h>
+
+struct rseq_stats {
+ unsigned long exit;
+ unsigned long signal;
+ unsigned long slowpath;
+ unsigned long ids;
+ unsigned long cs;
+ unsigned long clear;
+ unsigned long fixup;
+};
+
+DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
+
+/*
+ * Slow path has interrupts and preemption enabled, but the fast path
+ * runs with interrupts disabled so there is no point in having the
+ * preemption checks implied in __this_cpu_inc() for every operation.
+ */
+#ifdef RSEQ_BUILD_SLOW_PATH
+#define rseq_stat_inc(which) this_cpu_inc((which))
+#else
+#define rseq_stat_inc(which) raw_cpu_inc((which))
+#endif
+
+#else /* CONFIG_RSEQ_STATS */
+#define rseq_stat_inc(x) do { } while (0)
+#endif /* !CONFIG_RSEQ_STATS */
+
#ifdef CONFIG_RSEQ
#include <linux/rseq.h>
@@ -39,8 +70,26 @@ static __always_inline void rseq_note_us
current->rseq.event.user_irq = true;
}
+static __always_inline void rseq_exit_to_user_mode(void)
+{
+ struct rseq_event *ev = &current->rseq.event;
+
+ rseq_stat_inc(rseq_stats.exit);
+
+ if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
+ WARN_ON_ONCE(ev->sched_switch);
+
+ /*
+ * Ensure that event (especially user_irq) is cleared when the
+ * interrupt did not result in a schedule and therefore the
+ * rseq processing did not clear it.
+ */
+ ev->events = 0;
+}
+
#else /* CONFIG_RSEQ */
static inline void rseq_note_user_irq_entry(void) { }
+static inline void rseq_exit_to_user_mode(void) { }
#endif /* !CONFIG_RSEQ */
#endif /* _LINUX_RSEQ_ENTRY_H */
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1883,6 +1883,18 @@ config RSEQ
If unsure, say Y.
+config RSEQ_STATS
+ default n
+ bool "Enable lightweight statistics of restartable sequences" if EXPERT
+ depends on RSEQ && DEBUG_FS
+ help
+ Enable lightweight counters which expose information about the
+ frequency of RSEQ operations via debugfs. Mostly interesting for
+ kernel debugging or performance analysis. While lightweight it's
+ still adding code into the user/kernel mode transitions.
+
+ If unsure, say N.
+
config DEBUG_RSEQ
default n
bool "Enable debugging of rseq() system call" if EXPERT
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -67,12 +67,16 @@
* F1. <failure>
*/
+/* Required to select the proper per_cpu ops for rseq_stats_inc() */
+#define RSEQ_BUILD_SLOW_PATH
+
+#include <linux/debugfs.h>
+#include <linux/ratelimit.h>
+#include <linux/rseq_entry.h>
#include <linux/sched.h>
-#include <linux/uaccess.h>
#include <linux/syscalls.h>
-#include <linux/rseq.h>
+#include <linux/uaccess.h>
#include <linux/types.h>
-#include <linux/ratelimit.h>
#include <asm/ptrace.h>
#define CREATE_TRACE_POINTS
@@ -108,6 +112,56 @@ void __rseq_trace_ip_fixup(unsigned long
}
#endif /* CONFIG_TRACEPOINTS */
+#ifdef CONFIG_RSEQ_STATS
+DEFINE_PER_CPU(struct rseq_stats, rseq_stats);
+
+static int rseq_debug_show(struct seq_file *m, void *p)
+{
+ struct rseq_stats stats = { };
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ stats.exit += data_race(per_cpu(rseq_stats.exit, cpu));
+ stats.signal += data_race(per_cpu(rseq_stats.signal, cpu));
+ stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu));
+ stats.ids += data_race(per_cpu(rseq_stats.ids, cpu));
+ stats.cs += data_race(per_cpu(rseq_stats.cs, cpu));
+ stats.clear += data_race(per_cpu(rseq_stats.clear, cpu));
+ stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu));
+ }
+
+ seq_printf(m, "exit: %16lu\n", stats.exit);
+ seq_printf(m, "signal: %16lu\n", stats.signal);
+ seq_printf(m, "slowp: %16lu\n", stats.slowpath);
+ seq_printf(m, "ids: %16lu\n", stats.ids);
+ seq_printf(m, "cs: %16lu\n", stats.cs);
+ seq_printf(m, "clear: %16lu\n", stats.clear);
+ seq_printf(m, "fixup: %16lu\n", stats.fixup);
+ return 0;
+}
+
+static int rseq_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rseq_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_ops = {
+ .open = rseq_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init rseq_debugfs_init(void)
+{
+ struct dentry *root_dir = debugfs_create_dir("rseq", NULL);
+
+ debugfs_create_file("stats", 0444, root_dir, NULL, &dfs_ops);
+ return 0;
+}
+__initcall(rseq_debugfs_init);
+#endif /* CONFIG_RSEQ_STATS */
+
#ifdef CONFIG_DEBUG_RSEQ
static struct rseq *rseq_kernel_fields(struct task_struct *t)
{
@@ -187,12 +241,13 @@ static int rseq_update_cpu_node_id(struc
u32 node_id = cpu_to_node(cpu_id);
u32 mm_cid = task_mm_cid(t);
- /*
- * Validate read-only rseq fields.
- */
+ rseq_stat_inc(rseq_stats.ids);
+
+ /* Validate read-only rseq fields on debug kernels */
if (rseq_validate_ro_fields(t))
goto efault;
WARN_ON_ONCE((int) mm_cid < 0);
+
if (!user_write_access_begin(rseq, t->rseq.len))
goto efault;
@@ -403,6 +458,8 @@ static int rseq_ip_fixup(struct pt_regs
struct rseq_cs rseq_cs;
int ret;
+ rseq_stat_inc(rseq_stats.cs);
+
ret = rseq_get_rseq_cs(t, &rseq_cs);
if (ret)
return ret;
@@ -412,8 +469,10 @@ static int rseq_ip_fixup(struct pt_regs
* If not nested over a rseq critical section, restart is useless.
* Clear the rseq_cs pointer and return.
*/
- if (!in_rseq_cs(ip, &rseq_cs))
+ if (!in_rseq_cs(ip, &rseq_cs)) {
+ rseq_stat_inc(rseq_stats.clear);
return clear_rseq_cs(t->rseq.usrptr);
+ }
ret = rseq_check_flags(t, rseq_cs.flags);
if (ret < 0)
return ret;
@@ -422,6 +481,7 @@ static int rseq_ip_fixup(struct pt_regs
ret = clear_rseq_cs(t->rseq.usrptr);
if (ret)
return ret;
+ rseq_stat_inc(rseq_stats.fixup);
trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
rseq_cs.abort_ip);
instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
@@ -462,6 +522,11 @@ void __rseq_handle_notify_resume(struct
if (unlikely(t->flags & PF_EXITING))
return;
+ if (ksig)
+ rseq_stat_inc(rseq_stats.signal);
+ else
+ rseq_stat_inc(rseq_stats.slowpath);
+
/*
* Read and clear the event pending bit first. If the task
* was not preempted or migrated or a signal is on the way,
Config-based debug code is rarely enabled and not easily available when
things go wrong.
Provide a static branch to allow permanent integration of the debug
mechanisms, along with the usual toggles in Kconfig, on the command line
and in debugfs.
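A quick usage sketch; the debugfs path assumes the default mount point:

	# Enable debug mode at boot via the kernel command line
	rseq_debug=1

	# Inspect and toggle it at run time
	cat /sys/kernel/debug/rseq/debug
	echo 1 > /sys/kernel/debug/rseq/debug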
Requested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
V3: Fix __setup() return value - Michael
---
Documentation/admin-guide/kernel-parameters.txt | 4 +
include/linux/rseq_entry.h | 3
init/Kconfig | 14 ++++
kernel/rseq.c | 73 ++++++++++++++++++++++--
4 files changed, 90 insertions(+), 4 deletions(-)
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6443,6 +6443,10 @@
Memory area to be used by remote processor image,
managed by CMA.
+ rseq_debug= [KNL] Enable or disable restartable sequence
+ debug mode. Defaults to CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE.
+ Format: <bool>
+
rt_group_sched= [KNL] Enable or disable SCHED_RR/FIFO group scheduling
when CONFIG_RT_GROUP_SCHED=y. Defaults to
!CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED.
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -34,6 +34,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_
#endif /* !CONFIG_RSEQ_STATS */
#ifdef CONFIG_RSEQ
+#include <linux/jump_label.h>
#include <linux/rseq.h>
#include <linux/tracepoint-defs.h>
@@ -64,6 +65,8 @@ static inline void rseq_trace_ip_fixup(u
unsigned long offset, unsigned long abort_ip) { }
#endif /* !CONFIG_TRACEPOINT */
+DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
+
static __always_inline void rseq_note_user_irq_entry(void)
{
if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1895,10 +1895,24 @@ config RSEQ_STATS
If unsure, say N.
+config RSEQ_DEBUG_DEFAULT_ENABLE
+ default n
+ bool "Enable restartable sequences debug mode by default" if EXPERT
+ depends on RSEQ
+ help
+ This enables the static branch for debug mode of restartable
+ sequences.
+
+ This also can be controlled on the kernel command line via the
+ command line parameter "rseq_debug=0/1" and through debugfs.
+
+ If unsure, say N.
+
config DEBUG_RSEQ
default n
bool "Enable debugging of rseq() system call" if EXPERT
depends on RSEQ && DEBUG_KERNEL
+ select RSEQ_DEBUG_DEFAULT_ENABLE
help
Enable extra debugging checks for the rseq system call.
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -95,6 +95,27 @@
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
+DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
+
+static inline void rseq_control_debug(bool on)
+{
+ if (on)
+ static_branch_enable(&rseq_debug_enabled);
+ else
+ static_branch_disable(&rseq_debug_enabled);
+}
+
+static int __init rseq_setup_debug(char *str)
+{
+ bool on;
+
+ if (kstrtobool(str, &on))
+ return -EINVAL;
+ rseq_control_debug(on);
+ return 1;
+}
+__setup("rseq_debug=", rseq_setup_debug);
+
#ifdef CONFIG_TRACEPOINTS
/*
* Out of line, so the actual update functions can be in a header to be
@@ -112,10 +133,11 @@ void __rseq_trace_ip_fixup(unsigned long
}
#endif /* CONFIG_TRACEPOINTS */
+#ifdef CONFIG_DEBUG_FS
#ifdef CONFIG_RSEQ_STATS
DEFINE_PER_CPU(struct rseq_stats, rseq_stats);
-static int rseq_debug_show(struct seq_file *m, void *p)
+static int rseq_stats_show(struct seq_file *m, void *p)
{
struct rseq_stats stats = { };
unsigned int cpu;
@@ -140,14 +162,56 @@ static int rseq_debug_show(struct seq_fi
return 0;
}
+static int rseq_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rseq_stats_show, inode->i_private);
+}
+
+static const struct file_operations stat_ops = {
+ .open = rseq_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init rseq_stats_init(struct dentry *root_dir)
+{
+ debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops);
+ return 0;
+}
+#else
+static inline void rseq_stats_init(struct dentry *root_dir) { }
+#endif /* CONFIG_RSEQ_STATS */
+
+static int rseq_debug_show(struct seq_file *m, void *p)
+{
+ bool on = static_branch_unlikely(&rseq_debug_enabled);
+
+ seq_printf(m, "%d\n", on);
+ return 0;
+}
+
+static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ bool on;
+
+ if (kstrtobool_from_user(ubuf, count, &on))
+ return -EINVAL;
+
+ rseq_control_debug(on);
+ return count;
+}
+
static int rseq_debug_open(struct inode *inode, struct file *file)
{
return single_open(file, rseq_debug_show, inode->i_private);
}
-static const struct file_operations dfs_ops = {
+static const struct file_operations debug_ops = {
.open = rseq_debug_open,
.read = seq_read,
+ .write = rseq_debug_write,
.llseek = seq_lseek,
.release = single_release,
};
@@ -156,11 +220,12 @@ static int __init rseq_debugfs_init(void
{
struct dentry *root_dir = debugfs_create_dir("rseq", NULL);
- debugfs_create_file("stats", 0444, root_dir, NULL, &dfs_ops);
+ debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops);
+ rseq_stats_init(root_dir);
return 0;
}
__initcall(rseq_debugfs_init);
-#endif /* CONFIG_RSEQ_STATS */
+#endif /* CONFIG_DEBUG_FS */
#ifdef CONFIG_DEBUG_RSEQ
static struct rseq *rseq_kernel_fields(struct task_struct *t)
Provide a straightforward implementation to check for and eventually
clear or fix up critical sections in user space.
The non-debug version only does minimal sanity checks and aims for
efficiency.
There are two attack vectors, which are checked for:
1) An abort IP which is in the kernel address space. That would cause at
least x86 to return to kernel space via IRET.
2) A rogue critical section descriptor with an abort IP pointing to some
arbitrary address, which is not preceded by the RSEQ signature.
If the section descriptors are invalid, then the resulting misbehaviour of
the user space application is not the kernel's problem.
The kernel provides a run-time switchable debug slow path, which implements
the full zoo of checks including termination of the task when one of the
gazillion conditions is not met.
Replace the zoo in rseq.c with it and invoke it from the TIF_NOTIFY_RESUME
handler. Move the remainders into the CONFIG_DEBUG_RSEQ section, which will
be replaced and removed in a subsequent step.
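For orientation, a condensed sketch of the abort decision implemented by
the fast path below. It assumes the descriptor has already been copied from
user space into 'cs' and the signature word preceding the abort IP into
'usig'; the function is illustrative only and not the patch code:

	static bool rseq_abort_decision(struct task_struct *t, struct pt_regs *regs,
					const struct rseq_cs *cs, u32 usig)
	{
		unsigned long ip = instruction_pointer(regs);

		/* Not inside the critical section: just clear the pointer */
		if (ip - cs->start_ip >= cs->post_commit_offset)
			return true;

		/* Rogue descriptor: abort IP in kernel space or not signed */
		if (cs->abort_ip >= TASK_SIZE || usig != t->rseq.sig)
			return false;	/* caller terminates the task */

		/* Divert user space to the abort handler */
		instruction_pointer_set(regs, (unsigned long)cs->abort_ip);
		return true;
	}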
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
V3: Brought back the signature check along with a comment - Mathieu
---
include/linux/rseq_entry.h | 209 +++++++++++++++++++++++++++++++++++++++
include/linux/rseq_types.h | 11 +-
kernel/rseq.c | 238 +++++++++++++--------------------------------
3 files changed, 288 insertions(+), 170 deletions(-)
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -36,6 +36,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_
#ifdef CONFIG_RSEQ
#include <linux/jump_label.h>
#include <linux/rseq.h>
+#include <linux/uaccess.h>
#include <linux/tracepoint-defs.h>
@@ -67,12 +68,220 @@ static inline void rseq_trace_ip_fixup(u
DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
+#ifdef RSEQ_BUILD_SLOW_PATH
+#define rseq_inline
+#else
+#define rseq_inline __always_inline
+#endif
+
+bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
+
static __always_inline void rseq_note_user_irq_entry(void)
{
if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
current->rseq.event.user_irq = true;
}
+/*
+ * Check whether there is a valid critical section and whether the
+ * instruction pointer in @regs is inside the critical section.
+ *
+ * - If the critical section is invalid, terminate the task.
+ *
+ * - If valid and the instruction pointer is inside, set it to the abort IP
+ *
+ * - If valid and the instruction pointer is outside, clear the critical
+ * section address.
+ *
+ * Returns true, if the section was valid and either fixup or clear was
+ * done, false otherwise.
+ *
+ * In the failure case task::rseq_event::fatal is set when a invalid
+ * section was found. It's clear when the failure was an unresolved page
+ * fault.
+ *
+ * If inlined into the exit to user path with interrupts disabled, the
+ * caller has to protect against page faults with pagefault_disable().
+ *
+ * In preemptible task context this would be counterproductive as the page
+ * faults could not be fully resolved. As a consequence unresolved page
+ * faults in task context are fatal too.
+ */
+
+#ifdef RSEQ_BUILD_SLOW_PATH
+/*
+ * The debug version is put out of line, but kept here so the code stays
+ * together.
+ *
+ * @csaddr has already been checked by the caller to be in user space
+ */
+bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr)
+{
+ struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
+ u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE;
+ unsigned long ip = instruction_pointer(regs);
+ u64 __user *uc_head = (u64 __user *) ucs;
+ u32 usig, __user *uc_sig;
+
+ if (!user_rw_masked_begin(ucs))
+ return false;
+
+ /*
+ * Evaluate the user pile and exit if one of the conditions is not
+ * fulfilled.
+ */
+ unsafe_get_user(start_ip, &ucs->start_ip, fail);
+ if (unlikely(start_ip >= tasksize))
+ goto die;
+ /* If outside, just clear the critical section. */
+ if (ip < start_ip)
+ goto clear;
+
+ unsafe_get_user(offset, &ucs->post_commit_offset, fail);
+ cs_end = start_ip + offset;
+ /* Check for overflow and wraparound */
+ if (unlikely(cs_end >= tasksize || cs_end < start_ip))
+ goto die;
+
+ /* If not inside, clear it. */
+ if (ip >= cs_end)
+ goto clear;
+
+ unsafe_get_user(abort_ip, &ucs->abort_ip, fail);
+ /* Ensure it's "valid" */
+ if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
+ goto die;
+ /* Validate that the abort IP is not in the critical section */
+ if (unlikely(abort_ip - start_ip < offset))
+ goto die;
+
+ /*
+ * Check version and flags for 0. No point in emitting deprecated
+ * warnings before dying. That could be done in the slow path
+ * eventually, but *shrug*.
+ */
+ unsafe_get_user(head, uc_head, fail);
+ if (unlikely(head))
+ goto die;
+
+ /* abort_ip - 4 is >= 0. See abort_ip check above */
+ uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
+ unsafe_get_user(usig, uc_sig, fail);
+ if (unlikely(usig != t->rseq.sig))
+ goto die;
+
+ /* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
+ /* If not in interrupt from user context, let it die */
+ if (unlikely(!t->rseq.event.user_irq))
+ goto die;
+ }
+
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, fail);
+ user_access_end();
+
+ instruction_pointer_set(regs, (unsigned long)abort_ip);
+
+ rseq_stat_inc(rseq_stats.fixup);
+ rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
+ return true;
+clear:
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, fail);
+ user_access_end();
+ rseq_stat_inc(rseq_stats.clear);
+ return true;
+die:
+ t->rseq.event.fatal = true;
+fail:
+ user_access_end();
+ return false;
+}
+#endif /* RSEQ_BUILD_SLOW_PATH */
+
+/*
+ * This only ensures that abort_ip is in the user address space by masking it.
+ * No other sanity checks are done here, that's what the debug code is for.
+ */
+static rseq_inline bool
+rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr)
+{
+ struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
+ unsigned long ip = instruction_pointer(regs);
+ u64 start_ip, abort_ip, offset;
+ u32 usig, __user *uc_sig;
+
+ rseq_stat_inc(rseq_stats.cs);
+
+ if (unlikely(csaddr >= TASK_SIZE)) {
+ t->rseq.event.fatal = true;
+ return false;
+ }
+
+ if (static_branch_unlikely(&rseq_debug_enabled))
+ return rseq_debug_update_user_cs(t, regs, csaddr);
+
+ if (!user_rw_masked_begin(ucs))
+ return false;
+
+ unsafe_get_user(start_ip, &ucs->start_ip, fail);
+ unsafe_get_user(offset, &ucs->post_commit_offset, fail);
+ unsafe_get_user(abort_ip, &ucs->abort_ip, fail);
+
+ /*
+ * No sanity checks. If user space screwed it up, it can
+ * keep the pieces. That's what debug code is for.
+ *
+ * If outside, just clear the critical section.
+ */
+ if (ip - start_ip >= offset)
+ goto clear;
+
+ /*
+ * Two requirements for @abort_ip:
+ * - Must be in user space as x86 IRET would happily return to
+ * the kernel.
+ * - The four bytes preceeding the instruction at @abort_ip must
+ * contain the signature.
+ *
+ * The latter protects against the following attack vector:
+ *
+ * An attacker with limited abilities to write, creates a critical
+ * section descriptor, sets the abort IP to a library function or
+ * some other ROP gadget and stores the address of the descriptor
+ * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP
+ * protection.
+ */
+ if (abort_ip >= TASK_SIZE || abort_ip < sizeof(*uc_sig))
+ goto die;
+
+ /* The address is guaranteed to be >= 0 and < TASK_SIZE */
+ uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
+ unsafe_get_user(usig, uc_sig, fail);
+ if (unlikely(usig != t->rseq.sig))
+ goto die;
+
+ /* Invalidate the critical section */
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, fail);
+ user_access_end();
+
+ /* Update the instruction pointer */
+ instruction_pointer_set(regs, (unsigned long)abort_ip);
+
+ rseq_stat_inc(rseq_stats.fixup);
+ rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
+ return true;
+clear:
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, fail);
+ user_access_end();
+ rseq_stat_inc(rseq_stats.clear);
+ return true;
+die:
+ t->rseq.event.fatal = true;
+fail:
+ user_access_end();
+ return false;
+}
+
static __always_inline void rseq_exit_to_user_mode(void)
{
struct rseq_event *ev = &current->rseq.event;
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -14,10 +14,12 @@ struct rseq;
* @sched_switch: True if the task was scheduled out
* @user_irq: True on interrupt entry from user mode
* @has_rseq: True if the task has a rseq pointer installed
+ * @error: Compound error code for the slow path to analyze
+ * @fatal: User space data corrupted or invalid
*/
struct rseq_event {
union {
- u32 all;
+ u64 all;
struct {
union {
u16 events;
@@ -28,6 +30,13 @@ struct rseq_event {
};
u8 has_rseq;
+ u8 __pad;
+ union {
+ u16 error;
+ struct {
+ u8 fatal;
+ };
+ };
};
};
};
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -382,175 +382,15 @@ static int rseq_reset_rseq_cpu_node_id(s
return -EFAULT;
}
-/*
- * Get the user-space pointer value stored in the 'rseq_cs' field.
- */
-static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs)
-{
- if (!rseq_cs)
- return -EFAULT;
-
-#ifdef CONFIG_64BIT
- if (get_user(*rseq_cs, &rseq->rseq_cs))
- return -EFAULT;
-#else
- if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs)))
- return -EFAULT;
-#endif
-
- return 0;
-}
-
-/*
- * If the rseq_cs field of 'struct rseq' contains a valid pointer to
- * user-space, copy 'struct rseq_cs' from user-space and validate its fields.
- */
-static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
-{
- struct rseq_cs __user *urseq_cs;
- u64 ptr;
- u32 __user *usig;
- u32 sig;
- int ret;
-
- ret = rseq_get_rseq_cs_ptr_val(t->rseq.usrptr, &ptr);
- if (ret)
- return ret;
-
- /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */
- if (!ptr) {
- memset(rseq_cs, 0, sizeof(*rseq_cs));
- return 0;
- }
- /* Check that the pointer value fits in the user-space process space. */
- if (ptr >= TASK_SIZE)
- return -EINVAL;
- urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
- if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
- return -EFAULT;
-
- if (rseq_cs->start_ip >= TASK_SIZE ||
- rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE ||
- rseq_cs->abort_ip >= TASK_SIZE ||
- rseq_cs->version > 0)
- return -EINVAL;
- /* Check for overflow. */
- if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip)
- return -EINVAL;
- /* Ensure that abort_ip is not in the critical section. */
- if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
- return -EINVAL;
-
- usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
- ret = get_user(sig, usig);
- if (ret)
- return ret;
-
- if (current->rseq.sig != sig) {
- printk_ratelimited(KERN_WARNING
- "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
- sig, current->rseq.sig, current->pid, usig);
- return -EINVAL;
- }
- return 0;
-}
-
-static bool rseq_warn_flags(const char *str, u32 flags)
+static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
{
- u32 test_flags;
+ u64 csaddr;
- if (!flags)
+ if (get_user_masked_u64(&csaddr, &t->rseq.usrptr->rseq_cs))
return false;
- test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS;
- if (test_flags)
- pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str);
- test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS;
- if (test_flags)
- pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str);
- return true;
-}
-
-static int rseq_check_flags(struct task_struct *t, u32 cs_flags)
-{
- u32 flags;
- int ret;
-
- if (rseq_warn_flags("rseq_cs", cs_flags))
- return -EINVAL;
-
- /* Get thread flags. */
- ret = get_user(flags, &t->rseq.usrptr->flags);
- if (ret)
- return ret;
-
- if (rseq_warn_flags("rseq", flags))
- return -EINVAL;
- return 0;
-}
-
-static int clear_rseq_cs(struct rseq __user *rseq)
-{
- /*
- * The rseq_cs field is set to NULL on preemption or signal
- * delivery on top of rseq assembly block, as well as on top
- * of code outside of the rseq assembly block. This performs
- * a lazy clear of the rseq_cs field.
- *
- * Set rseq_cs to NULL.
- */
-#ifdef CONFIG_64BIT
- return put_user(0UL, &rseq->rseq_cs);
-#else
- if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs)))
- return -EFAULT;
- return 0;
-#endif
-}
-
-/*
- * Unsigned comparison will be true when ip >= start_ip, and when
- * ip < start_ip + post_commit_offset.
- */
-static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
-{
- return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
-}
-
-static int rseq_ip_fixup(struct pt_regs *regs, bool abort)
-{
- unsigned long ip = instruction_pointer(regs);
- struct task_struct *t = current;
- struct rseq_cs rseq_cs;
- int ret;
-
- rseq_stat_inc(rseq_stats.cs);
-
- ret = rseq_get_rseq_cs(t, &rseq_cs);
- if (ret)
- return ret;
-
- /*
- * Handle potentially not being within a critical section.
- * If not nested over a rseq critical section, restart is useless.
- * Clear the rseq_cs pointer and return.
- */
- if (!in_rseq_cs(ip, &rseq_cs)) {
- rseq_stat_inc(rseq_stats.clear);
- return clear_rseq_cs(t->rseq.usrptr);
- }
- ret = rseq_check_flags(t, rseq_cs.flags);
- if (ret < 0)
- return ret;
- if (!abort)
- return 0;
- ret = clear_rseq_cs(t->rseq.usrptr);
- if (ret)
- return ret;
- rseq_stat_inc(rseq_stats.fixup);
- trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
- rseq_cs.abort_ip);
- instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
- return 0;
+ if (likely(!csaddr))
+ return true;
+ return rseq_update_user_cs(t, regs, csaddr);
}
/*
@@ -567,8 +407,8 @@ static int rseq_ip_fixup(struct pt_regs
void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
{
struct task_struct *t = current;
- int ret, sig;
bool event;
+ int sig;
/*
* If invoked from hypervisors before entering the guest via
@@ -618,8 +458,7 @@ void __rseq_handle_notify_resume(struct
if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event)
return;
- ret = rseq_ip_fixup(regs, event);
- if (unlikely(ret < 0))
+ if (!rseq_handle_cs(t, regs))
goto error;
if (unlikely(rseq_update_cpu_node_id(t)))
@@ -632,6 +471,67 @@ void __rseq_handle_notify_resume(struct
}
#ifdef CONFIG_DEBUG_RSEQ
+/*
+ * Unsigned comparison will be true when ip >= start_ip, and when
+ * ip < start_ip + post_commit_offset.
+ */
+static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
+{
+ return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
+}
+
+/*
+ * If the rseq_cs field of 'struct rseq' contains a valid pointer to
+ * user-space, copy 'struct rseq_cs' from user-space and validate its fields.
+ */
+static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
+{
+ struct rseq_cs __user *urseq_cs;
+ u64 ptr;
+ u32 __user *usig;
+ u32 sig;
+ int ret;
+
+ if (get_user_masked_u64(&ptr, &t->rseq.usrptr->rseq_cs))
+ return -EFAULT;
+
+ /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */
+ if (!ptr) {
+ memset(rseq_cs, 0, sizeof(*rseq_cs));
+ return 0;
+ }
+ /* Check that the pointer value fits in the user-space process space. */
+ if (ptr >= TASK_SIZE)
+ return -EINVAL;
+ urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
+ if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
+ return -EFAULT;
+
+ if (rseq_cs->start_ip >= TASK_SIZE ||
+ rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE ||
+ rseq_cs->abort_ip >= TASK_SIZE ||
+ rseq_cs->version > 0)
+ return -EINVAL;
+ /* Check for overflow. */
+ if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip)
+ return -EINVAL;
+ /* Ensure that abort_ip is not in the critical section. */
+ if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
+ return -EINVAL;
+
+ usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
+ ret = get_user(sig, usig);
+ if (ret)
+ return ret;
+
+ if (current->rseq.sig != sig) {
+ printk_ratelimited(KERN_WARNING
+ "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
+ sig, current->rseq.sig, current->pid, usig);
+ return -EINVAL;
+ }
+ return 0;
+}
/*
* Terminate the process if a syscall is issued within a restartable
On 2025-09-08 17:32, Thomas Gleixner wrote: > Provide a straight forward implementation to check for and eventually > clear/fixup critical sections in user space. > > The non-debug version does only the minimal sanity checks and aims for > efficiency. > > There are two attack vectors, which are checked for: > > 1) An abort IP which is in the kernel address space. That would cause at > least x86 to return to kernel space via IRET. > > 2) A rogue critical section descriptor with an abort IP pointing to some > arbitrary address, which is not preceeded by the RSEQ signature. preceeded -> preceded > > If the section descriptors are invalid then the resulting misbehaviour of > the user space application is not the kernels problem. > > The kernel provides a run-time switchable debug slow path, which implements > the full zoo of checks including termination of the task when one of the > gazillion conditions is not met. > > Replace the zoo in rseq.c with it and invoke it from the TIF_NOTIFY_RESUME > handler. Move the reminders into the CONFIG_DEBUG_RSEQ section, which will reminders -> remainders > be replaced and removed in a subsequent step. > > Signed-off-by: Thomas Gleixner <tglx@linutronix.de> > --- > V3: Brought back the signature check along with a comment - Mathieu > --- > include/linux/rseq_entry.h | 209 +++++++++++++++++++++++++++++++++++++++ > include/linux/rseq_types.h | 11 +- > kernel/rseq.c | 238 +++++++++++++-------------------------------- > 3 files changed, 288 insertions(+), 170 deletions(-) > > --- a/include/linux/rseq_entry.h > +++ b/include/linux/rseq_entry.h > @@ -36,6 +36,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_ > #ifdef CONFIG_RSEQ > #include <linux/jump_label.h> > #include <linux/rseq.h> > +#include <linux/uaccess.h> > > #include <linux/tracepoint-defs.h> > > @@ -67,12 +68,220 @@ static inline void rseq_trace_ip_fixup(u > > DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); > > +#ifdef RSEQ_BUILD_SLOW_PATH > +#define rseq_inline > +#else > +#define rseq_inline __always_inline > +#endif > + > +bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); > + > static __always_inline void rseq_note_user_irq_entry(void) > { > if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) > current->rseq.event.user_irq = true; > } > > +/* > + * Check whether there is a valid critical section and whether the > + * instruction pointer in @regs is inside the critical section. > + * > + * - If the critical section is invalid, terminate the task. > + * > + * - If valid and the instruction pointer is inside, set it to the abort IP > + * > + * - If valid and the instruction pointer is outside, clear the critical > + * section address. > + * > + * Returns true, if the section was valid and either fixup or clear was > + * done, false otherwise. > + * > + * In the failure case task::rseq_event::fatal is set when a invalid > + * section was found. It's clear when the failure was an unresolved page > + * fault. > + * > + * If inlined into the exit to user path with interrupts disabled, the > + * caller has to protect against page faults with pagefault_disable(). > + * > + * In preemptible task context this would be counterproductive as the page > + * faults could not be fully resolved. As a consequence unresolved page > + * faults in task context are fatal too. > + */ > + > +#ifdef RSEQ_BUILD_SLOW_PATH > +/* > + * The debug version is put out of line, but kept here so the code stays > + * together. 
> + * > + * @csaddr has already been checked by the caller to be in user space > + */ > +bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr) > +{ > + struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; > + u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE; > + unsigned long ip = instruction_pointer(regs); > + u64 __user *uc_head = (u64 __user *) ucs; > + u32 usig, __user *uc_sig; > + > + if (!user_rw_masked_begin(ucs)) > + return false; > + > + /* > + * Evaluate the user pile and exit if one of the conditions is not > + * fulfilled. > + */ > + unsafe_get_user(start_ip, &ucs->start_ip, fail); > + if (unlikely(start_ip >= tasksize)) > + goto die; > + /* If outside, just clear the critical section. */ > + if (ip < start_ip) > + goto clear; > + > + unsafe_get_user(offset, &ucs->post_commit_offset, fail); > + cs_end = start_ip + offset; > + /* Check for overflow and wraparound */ > + if (unlikely(cs_end >= tasksize || cs_end < start_ip)) > + goto die; > + > + /* If not inside, clear it. */ > + if (ip >= cs_end) > + goto clear; > + > + unsafe_get_user(abort_ip, &ucs->abort_ip, fail); > + /* Ensure it's "valid" */ > + if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig))) > + goto die; > + /* Validate that the abort IP is not in the critical section */ > + if (unlikely(abort_ip - start_ip < offset)) > + goto die; > + > + /* > + * Check version and flags for 0. No point in emitting deprecated > + * warnings before dying. That could be done in the slow path > + * eventually, but *shrug*. > + */ > + unsafe_get_user(head, uc_head, fail); > + if (unlikely(head)) > + goto die; > + > + /* abort_ip - 4 is >= 0. See abort_ip check above */ > + uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig)); > + unsafe_get_user(usig, uc_sig, fail); > + if (unlikely(usig != t->rseq.sig)) > + goto die; > + > + /* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */ > + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { > + /* If not in interrupt from user context, let it die */ > + if (unlikely(!t->rseq.event.user_irq)) > + goto die; > + } > + > + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, fail); > + user_access_end(); > + > + instruction_pointer_set(regs, (unsigned long)abort_ip); > + > + rseq_stat_inc(rseq_stats.fixup); > + rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); > + return true; > +clear: > + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, fail); > + user_access_end(); > + rseq_stat_inc(rseq_stats.clear); > + return true; > +die: > + t->rseq.event.fatal = true; > +fail: > + user_access_end(); > + return false; > +} > +#endif /* RSEQ_BUILD_SLOW_PATH */ > + > +/* > + * This only ensures that abort_ip is in the user address space by masking it. > + * No other sanity checks are done here, that's what the debug code is for. This comment should be updated to state that the signature is also checked. 
> + */ > +static rseq_inline bool > +rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr) > +{ > + struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; > + unsigned long ip = instruction_pointer(regs); > + u64 start_ip, abort_ip, offset; > + u32 usig, __user *uc_sig; > + > + rseq_stat_inc(rseq_stats.cs); > + > + if (unlikely(csaddr >= TASK_SIZE)) { > + t->rseq.event.fatal = true; > + return false; > + } > + > + if (static_branch_unlikely(&rseq_debug_enabled)) > + return rseq_debug_update_user_cs(t, regs, csaddr); > + > + if (!user_rw_masked_begin(ucs)) > + return false; > + > + unsafe_get_user(start_ip, &ucs->start_ip, fail); > + unsafe_get_user(offset, &ucs->post_commit_offset, fail); > + unsafe_get_user(abort_ip, &ucs->abort_ip, fail); > + > + /* > + * No sanity checks. If user space screwed it up, it can > + * keep the pieces. That's what debug code is for. > + * > + * If outside, just clear the critical section. > + */ > + if (ip - start_ip >= offset) > + goto clear; > + > + /* > + * Two requirements for @abort_ip: > + * - Must be in user space as x86 IRET would happily return to > + * the kernel. > + * - The four bytes preceeding the instruction at @abort_ip must preceeding -> preceding > + * contain the signature. > + * > + * The latter protects against the following attack vector: > + * > + * An attacker with limited abilities to write, creates a critical > + * section descriptor, sets the abort IP to a library function or > + * some other ROP gadget and stores the address of the descriptor > + * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP > + * protection. Thanks for documenting this. Mathieu > + */ > + if (abort_ip >= TASK_SIZE || abort_ip < sizeof(*uc_sig)) > + goto die; > + > + /* The address is guaranteed to be >= 0 and < TASK_SIZE */ > + uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig)); > + unsafe_get_user(usig, uc_sig, fail); > + if (unlikely(usig != t->rseq.sig)) > + goto die; > + > + /* Invalidate the critical section */ > + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, fail); > + user_access_end(); > + > + /* Update the instruction pointer */ > + instruction_pointer_set(regs, (unsigned long)abort_ip); > + > + rseq_stat_inc(rseq_stats.fixup); > + rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); > + return true; > +clear: > + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, fail); > + user_access_end(); > + rseq_stat_inc(rseq_stats.clear); > + return true; > +die: > + t->rseq.event.fatal = true; > +fail: > + user_access_end(); > + return false; > +} > + > static __always_inline void rseq_exit_to_user_mode(void) > { > struct rseq_event *ev = ¤t->rseq.event; > --- a/include/linux/rseq_types.h > +++ b/include/linux/rseq_types.h > @@ -14,10 +14,12 @@ struct rseq; > * @sched_switch: True if the task was scheduled out > * @user_irq: True on interrupt entry from user mode > * @has_rseq: True if the task has a rseq pointer installed > + * @error: Compound error code for the slow path to analyze > + * @fatal: User space data corrupted or invalid > */ > struct rseq_event { > union { > - u32 all; > + u64 all; > struct { > union { > u16 events; > @@ -28,6 +30,13 @@ struct rseq_event { > }; > > u8 has_rseq; > + u8 __pad; > + union { > + u16 error; > + struct { > + u8 fatal; > + }; > + }; > }; > }; > }; > --- a/kernel/rseq.c > +++ b/kernel/rseq.c > @@ -382,175 +382,15 @@ static int rseq_reset_rseq_cpu_node_id(s > return -EFAULT; > } > > -/* > - * Get the user-space pointer value stored in the 
'rseq_cs' field. > - */ > -static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs) > -{ > - if (!rseq_cs) > - return -EFAULT; > - > -#ifdef CONFIG_64BIT > - if (get_user(*rseq_cs, &rseq->rseq_cs)) > - return -EFAULT; > -#else > - if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs))) > - return -EFAULT; > -#endif > - > - return 0; > -} > - > -/* > - * If the rseq_cs field of 'struct rseq' contains a valid pointer to > - * user-space, copy 'struct rseq_cs' from user-space and validate its fields. > - */ > -static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) > -{ > - struct rseq_cs __user *urseq_cs; > - u64 ptr; > - u32 __user *usig; > - u32 sig; > - int ret; > - > - ret = rseq_get_rseq_cs_ptr_val(t->rseq.usrptr, &ptr); > - if (ret) > - return ret; > - > - /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */ > - if (!ptr) { > - memset(rseq_cs, 0, sizeof(*rseq_cs)); > - return 0; > - } > - /* Check that the pointer value fits in the user-space process space. */ > - if (ptr >= TASK_SIZE) > - return -EINVAL; > - urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; > - if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) > - return -EFAULT; > - > - if (rseq_cs->start_ip >= TASK_SIZE || > - rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || > - rseq_cs->abort_ip >= TASK_SIZE || > - rseq_cs->version > 0) > - return -EINVAL; > - /* Check for overflow. */ > - if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) > - return -EINVAL; > - /* Ensure that abort_ip is not in the critical section. */ > - if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) > - return -EINVAL; > - > - usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); > - ret = get_user(sig, usig); > - if (ret) > - return ret; > - > - if (current->rseq.sig != sig) { > - printk_ratelimited(KERN_WARNING > - "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", > - sig, current->rseq.sig, current->pid, usig); > - return -EINVAL; > - } > - return 0; > -} > - > -static bool rseq_warn_flags(const char *str, u32 flags) > +static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) > { > - u32 test_flags; > + u64 csaddr; > > - if (!flags) > + if (get_user_masked_u64(&csaddr, &t->rseq.usrptr->rseq_cs)) > return false; > - test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS; > - if (test_flags) > - pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str); > - test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS; > - if (test_flags) > - pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str); > - return true; > -} > - > -static int rseq_check_flags(struct task_struct *t, u32 cs_flags) > -{ > - u32 flags; > - int ret; > - > - if (rseq_warn_flags("rseq_cs", cs_flags)) > - return -EINVAL; > - > - /* Get thread flags. */ > - ret = get_user(flags, &t->rseq.usrptr->flags); > - if (ret) > - return ret; > - > - if (rseq_warn_flags("rseq", flags)) > - return -EINVAL; > - return 0; > -} > - > -static int clear_rseq_cs(struct rseq __user *rseq) > -{ > - /* > - * The rseq_cs field is set to NULL on preemption or signal > - * delivery on top of rseq assembly block, as well as on top > - * of code outside of the rseq assembly block. This performs > - * a lazy clear of the rseq_cs field. > - * > - * Set rseq_cs to NULL. 
> - */ > -#ifdef CONFIG_64BIT > - return put_user(0UL, &rseq->rseq_cs); > -#else > - if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs))) > - return -EFAULT; > - return 0; > -#endif > -} > - > -/* > - * Unsigned comparison will be true when ip >= start_ip, and when > - * ip < start_ip + post_commit_offset. > - */ > -static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) > -{ > - return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; > -} > - > -static int rseq_ip_fixup(struct pt_regs *regs, bool abort) > -{ > - unsigned long ip = instruction_pointer(regs); > - struct task_struct *t = current; > - struct rseq_cs rseq_cs; > - int ret; > - > - rseq_stat_inc(rseq_stats.cs); > - > - ret = rseq_get_rseq_cs(t, &rseq_cs); > - if (ret) > - return ret; > - > - /* > - * Handle potentially not being within a critical section. > - * If not nested over a rseq critical section, restart is useless. > - * Clear the rseq_cs pointer and return. > - */ > - if (!in_rseq_cs(ip, &rseq_cs)) { > - rseq_stat_inc(rseq_stats.clear); > - return clear_rseq_cs(t->rseq.usrptr); > - } > - ret = rseq_check_flags(t, rseq_cs.flags); > - if (ret < 0) > - return ret; > - if (!abort) > - return 0; > - ret = clear_rseq_cs(t->rseq.usrptr); > - if (ret) > - return ret; > - rseq_stat_inc(rseq_stats.fixup); > - trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, > - rseq_cs.abort_ip); > - instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); > - return 0; > + if (likely(!csaddr)) > + return true; > + return rseq_update_user_cs(t, regs, csaddr); > } > > /* > @@ -567,8 +407,8 @@ static int rseq_ip_fixup(struct pt_regs > void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) > { > struct task_struct *t = current; > - int ret, sig; > bool event; > + int sig; > > /* > * If invoked from hypervisors before entering the guest via > @@ -618,8 +458,7 @@ void __rseq_handle_notify_resume(struct > if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event) > return; > > - ret = rseq_ip_fixup(regs, event); > - if (unlikely(ret < 0)) > + if (!rseq_handle_cs(t, regs)) > goto error; > > if (unlikely(rseq_update_cpu_node_id(t))) > @@ -632,6 +471,67 @@ void __rseq_handle_notify_resume(struct > } > > #ifdef CONFIG_DEBUG_RSEQ > +/* > + * Unsigned comparison will be true when ip >= start_ip, and when > + * ip < start_ip + post_commit_offset. > + */ > +static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) > +{ > + return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; > +} > + > +/* > + * If the rseq_cs field of 'struct rseq' contains a valid pointer to > + * user-space, copy 'struct rseq_cs' from user-space and validate its fields. > + */ > +static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) > +{ > + struct rseq_cs __user *urseq_cs; > + u64 ptr; > + u32 __user *usig; > + u32 sig; > + int ret; > + > + if (get_user_masked_u64(&ptr, &t->rseq.usrptr->rseq_cs)) > + return -EFAULT; > + > + /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */ > + if (!ptr) { > + memset(rseq_cs, 0, sizeof(*rseq_cs)); > + return 0; > + } > + /* Check that the pointer value fits in the user-space process space. 
*/ > + if (ptr >= TASK_SIZE) > + return -EINVAL; > + urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; > + if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) > + return -EFAULT; > + > + if (rseq_cs->start_ip >= TASK_SIZE || > + rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || > + rseq_cs->abort_ip >= TASK_SIZE || > + rseq_cs->version > 0) > + return -EINVAL; > + /* Check for overflow. */ > + if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) > + return -EINVAL; > + /* Ensure that abort_ip is not in the critical section. */ > + if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) > + return -EINVAL; > + > + usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); > + ret = get_user(sig, usig); > + if (ret) > + return ret; > + > + if (current->rseq.sig != sig) { > + printk_ratelimited(KERN_WARNING > + "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", > + sig, current->rseq.sig, current->pid, usig); > + return -EINVAL; > + } > + return 0; > +} > > /* > * Terminate the process if a syscall is issued within a restartable > -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com
Just utilize the new infrastructure and put the original one to rest.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
kernel/rseq.c | 80 ++++++++--------------------------------------------------
1 file changed, 12 insertions(+), 68 deletions(-)
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -472,83 +472,27 @@ void __rseq_handle_notify_resume(struct
#ifdef CONFIG_DEBUG_RSEQ
/*
- * Unsigned comparison will be true when ip >= start_ip, and when
- * ip < start_ip + post_commit_offset.
- */
-static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
-{
- return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
-}
-
-/*
- * If the rseq_cs field of 'struct rseq' contains a valid pointer to
- * user-space, copy 'struct rseq_cs' from user-space and validate its fields.
- */
-static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
-{
- struct rseq_cs __user *urseq_cs;
- u64 ptr;
- u32 __user *usig;
- u32 sig;
- int ret;
-
- if (get_user_masked_u64(&ptr, &t->rseq.usrptr->rseq_cs))
- return -EFAULT;
-
- /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */
- if (!ptr) {
- memset(rseq_cs, 0, sizeof(*rseq_cs));
- return 0;
- }
- /* Check that the pointer value fits in the user-space process space. */
- if (ptr >= TASK_SIZE)
- return -EINVAL;
- urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
- if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
- return -EFAULT;
-
- if (rseq_cs->start_ip >= TASK_SIZE ||
- rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE ||
- rseq_cs->abort_ip >= TASK_SIZE ||
- rseq_cs->version > 0)
- return -EINVAL;
- /* Check for overflow. */
- if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip)
- return -EINVAL;
- /* Ensure that abort_ip is not in the critical section. */
- if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
- return -EINVAL;
-
- usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
- ret = get_user(sig, usig);
- if (ret)
- return ret;
-
- if (current->rseq.sig != sig) {
- printk_ratelimited(KERN_WARNING
- "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
- sig, current->rseq.sig, current->pid, usig);
- return -EINVAL;
- }
- return 0;
-}
-
-/*
* Terminate the process if a syscall is issued within a restartable
* sequence.
*/
void rseq_syscall(struct pt_regs *regs)
{
- unsigned long ip = instruction_pointer(regs);
struct task_struct *t = current;
- struct rseq_cs rseq_cs;
+ u64 csaddr;
- if (!t->rseq.usrptr)
+ if (!t->rseq.event.has_rseq)
+ return;
+ if (get_user_masked_u64(&csaddr, &t->rseq.usrptr->rseq_cs))
+ goto fail;
+ if (likely(!csaddr))
return;
- if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
- force_sig(SIGSEGV);
+ if (unlikely(csaddr >= TASK_SIZE))
+ goto fail;
+ if (rseq_debug_update_user_cs(t, regs, csaddr))
+ return;
+fail:
+ force_sig(SIGSEGV);
}
-
#endif
/*
Disconnect it from the config switch and use the static debug branch. This
is a temporary measure for validating the rework. Eventually this check
needs to be hidden behind lockdep as it has nothing to do with the other
debug infrastructure, which mainly aids user space debugging by enabling a
zoo of checks that terminate misbehaving tasks instead of letting them keep
the hard-to-diagnose pieces.
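For reference, the runtime switch is the static key the series defines in
kernel/rseq.c; a minimal sketch of the pattern (the helper name below is
made up for illustration):
DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
/* Compiled in, but patched to a NOP unless the key is enabled at runtime */
static __always_inline void rseq_debug_check_sched_switch(struct rseq_event *ev)
{
	if (static_branch_unlikely(&rseq_debug_enabled))
		WARN_ON_ONCE(ev->sched_switch);
}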
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
include/linux/rseq_entry.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -288,7 +288,7 @@ static __always_inline void rseq_exit_to
rseq_stat_inc(rseq_stats.exit);
- if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
+ if (static_branch_unlikely(&rseq_debug_enabled))
WARN_ON_ONCE(ev->sched_switch);
/*
Make the syscall exit debug mechanism available via the static branch on
architectures which utilize the generic entry code.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
include/linux/entry-common.h | 2 +-
include/linux/rseq_entry.h | 9 +++++++++
kernel/rseq.c | 19 +++++++++++++------
3 files changed, 23 insertions(+), 7 deletions(-)
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -146,7 +146,7 @@ static __always_inline void syscall_exit
local_irq_enable();
}
- rseq_syscall(regs);
+ rseq_debug_syscall_return(regs);
/*
* Do one-time syscall specific work. If these work items are
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -286,9 +286,18 @@ static __always_inline void rseq_exit_to
ev->events = 0;
}
+void __rseq_debug_syscall_return(struct pt_regs *regs);
+
+static inline void rseq_debug_syscall_return(struct pt_regs *regs)
+{
+ if (static_branch_unlikely(&rseq_debug_enabled))
+ __rseq_debug_syscall_return(regs);
+}
+
#else /* CONFIG_RSEQ */
static inline void rseq_note_user_irq_entry(void) { }
static inline void rseq_exit_to_user_mode(void) { }
+static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
#endif /* !CONFIG_RSEQ */
#endif /* _LINUX_RSEQ_ENTRY_H */
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -470,12 +470,7 @@ void __rseq_handle_notify_resume(struct
force_sigsegv(sig);
}
-#ifdef CONFIG_DEBUG_RSEQ
-/*
- * Terminate the process if a syscall is issued within a restartable
- * sequence.
- */
-void rseq_syscall(struct pt_regs *regs)
+void __rseq_debug_syscall_return(struct pt_regs *regs)
{
struct task_struct *t = current;
u64 csaddr;
@@ -493,6 +488,18 @@ void rseq_syscall(struct pt_regs *regs)
fail:
force_sig(SIGSEGV);
}
+
+#ifdef CONFIG_DEBUG_RSEQ
+/*
+ * Kept around to keep GENERIC_ENTRY=n architectures supported.
+ *
+ * Terminate the process if a syscall is issued within a restartable
+ * sequence.
+ */
+void rseq_syscall(struct pt_regs *regs)
+{
+ __rseq_debug_syscall_return(regs);
+}
#endif
/*
Provide a new and straightforward implementation to set the IDs (CPU ID,
Node ID and MM CID), which can later be inlined into the fast path.
It does all operations in one user_rw_masked_begin() section and also
retrieves the critical section member (rseq::rseq_cs) from user space to
avoid another user access begin()/end() pair. This is in preparation for
optimizing the fast path to avoid extra work when it is not required.
Use it to replace the whole related zoo in rseq.c.
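For reference, these are the user space fields involved; an abbreviated
view of the rseq UAPI structure (include/uapi/linux/rseq.h), with comments
added here for illustration:
struct rseq {
	__u32 cpu_id_start;	/* Current CPU, always a valid CPU number */
	__u32 cpu_id;		/* Current CPU or an RSEQ_CPU_ID_* error value */
	__u64 rseq_cs;		/* Address of the active rseq_cs descriptor, or 0 */
	__u32 flags;
	__u32 node_id;		/* NUMA node ID of cpu_id */
	__u32 mm_cid;		/* Concurrency ID within the memory map */
	/* ... */
} __attribute__((aligned(4 * sizeof(__u64))));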
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
V3: Fixed the node ID comparison in the debug path - Mathieu
---
fs/binfmt_elf.c | 2
include/linux/rseq_entry.h | 101 +++++++++++++++++++++
include/linux/sched.h | 10 --
kernel/rseq.c | 208 ++++++---------------------------------------
4 files changed, 134 insertions(+), 187 deletions(-)
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -46,7 +46,7 @@
#include <linux/cred.h>
#include <linux/dax.h>
#include <linux/uaccess.h>
-#include <linux/rseq.h>
+#include <uapi/linux/rseq.h>
#include <asm/param.h>
#include <asm/page.h>
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -38,6 +38,8 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_
#include <linux/rseq.h>
#include <linux/uaccess.h>
+#include <uapi/linux/rseq.h>
+
#include <linux/tracepoint-defs.h>
#ifdef CONFIG_TRACEPOINTS
@@ -75,6 +77,7 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEB
#endif
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
+bool rseq_debug_validate_ids(struct task_struct *t);
static __always_inline void rseq_note_user_irq_entry(void)
{
@@ -196,6 +199,50 @@ bool rseq_debug_update_user_cs(struct ta
user_access_end();
return false;
}
+
+/*
+ * On debug kernels validate that user space did not mess with it if
+ * DEBUG_RSEQ is enabled, but don't on the first exit to user space. In
+ * that case cpu_cid is ~0. See fork/execve.
+ */
+bool rseq_debug_validate_ids(struct task_struct *t)
+{
+ struct rseq __user *rseq = t->rseq.usrptr;
+ u32 cpu_id, uval, node_id;
+
+ if (t->rseq.ids.cpu_cid == ~0)
+ return true;
+
+ /*
+ * Look it up outside of the user access section as cpu_to_node()
+ * can end up in debug code.
+ */
+ node_id = cpu_to_node(t->rseq.ids.cpu_id);
+
+ if (!user_read_masked_begin(rseq))
+ return false;
+
+ unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
+ if (cpu_id != t->rseq.ids.cpu_id)
+ goto die;
+ unsafe_get_user(uval, &rseq->cpu_id, efault);
+ if (uval != cpu_id)
+ goto die;
+ unsafe_get_user(uval, &rseq->node_id, efault);
+ if (uval != node_id)
+ goto die;
+ unsafe_get_user(uval, &rseq->mm_cid, efault);
+ if (uval != t->rseq.ids.mm_cid)
+ goto die;
+ user_access_end();
+ return true;
+die:
+ t->rseq.event.fatal = true;
+efault:
+ user_access_end();
+ return false;
+}
+
#endif /* RSEQ_BUILD_SLOW_PATH */
/*
@@ -281,6 +328,60 @@ rseq_update_user_cs(struct task_struct *
user_access_end();
return false;
}
+
+/*
+ * Updates CPU ID, Node ID and MM CID and reads the critical section
+ * address, when @csaddr != NULL. This allows to put the ID update and the
+ * read under the same uaccess region to spare a separate begin/end.
+ *
+ * As this is either invoked from a C wrapper with @csaddr = NULL or from
+ * the fast path code with a valid pointer, a clever compiler should be
+ * able to optimize the read out. Spares a duplicate implementation.
+ *
+ * Returns true, if the operation was successful, false otherwise.
+ *
+ * In the failure case task::rseq_event::fatal is set when invalid data
+ * was found on debug kernels. It's clear when the failure was an unresolved page
+ * fault.
+ *
+ * If inlined into the exit to user path with interrupts disabled, the
+ * caller has to protect against page faults with pagefault_disable().
+ *
+ * In preemptible task context this would be counterproductive as the page
+ * faults could not be fully resolved. As a consequence unresolved page
+ * faults in task context are fatal too.
+ */
+static rseq_inline
+bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
+ u32 node_id, u64 *csaddr)
+{
+ struct rseq __user *rseq = t->rseq.usrptr;
+
+ if (static_branch_unlikely(&rseq_debug_enabled)) {
+ if (!rseq_debug_validate_ids(t))
+ return false;
+ }
+
+ if (!user_rw_masked_begin(rseq))
+ return false;
+
+ unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
+ unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
+ unsafe_put_user(node_id, &rseq->node_id, efault);
+ unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
+ if (csaddr)
+ unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);
+ user_access_end();
+
+ /* Cache the new values */
+ t->rseq.ids.cpu_cid = ids->cpu_cid;
+ rseq_stat_inc(rseq_stats.ids);
+ rseq_trace_update(t, ids);
+ return true;
+efault:
+ user_access_end();
+ return false;
+}
static __always_inline void rseq_exit_to_user_mode(void)
{
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -42,7 +42,6 @@
#include <linux/posix-timers_types.h>
#include <linux/restart_block.h>
#include <linux/rseq_types.h>
-#include <uapi/linux/rseq.h>
#include <linux/seqlock_types.h>
#include <linux/kcsan.h>
#include <linux/rv.h>
@@ -1402,15 +1401,6 @@ struct task_struct {
#endif /* CONFIG_NUMA_BALANCING */
struct rseq_data rseq;
-#ifdef CONFIG_DEBUG_RSEQ
- /*
- * This is a place holder to save a copy of the rseq fields for
- * validation of read-only fields. The struct rseq has a
- * variable-length array at the end, so it cannot be used
- * directly. Reserve a size large enough for the known fields.
- */
- char rseq_fields[sizeof(struct rseq)];
-#endif
#ifdef CONFIG_SCHED_MM_CID
int mm_cid; /* Current cid in mm */
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -88,13 +88,6 @@
# define RSEQ_EVENT_GUARD preempt
#endif
-/* The original rseq structure size (including padding) is 32 bytes. */
-#define ORIG_RSEQ_SIZE 32
-
-#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \
- RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
- RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
-
DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
static inline void rseq_control_debug(bool on)
@@ -227,159 +220,9 @@ static int __init rseq_debugfs_init(void
__initcall(rseq_debugfs_init);
#endif /* CONFIG_DEBUG_FS */
-#ifdef CONFIG_DEBUG_RSEQ
-static struct rseq *rseq_kernel_fields(struct task_struct *t)
-{
- return (struct rseq *) t->rseq_fields;
-}
-
-static int rseq_validate_ro_fields(struct task_struct *t)
-{
- static DEFINE_RATELIMIT_STATE(_rs,
- DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
- u32 cpu_id_start, cpu_id, node_id, mm_cid;
- struct rseq __user *rseq = t->rseq.usrptr;
-
- /*
- * Validate fields which are required to be read-only by
- * user-space.
- */
- if (!user_read_access_begin(rseq, t->rseq.len))
- goto efault;
- unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
- unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
- unsafe_get_user(node_id, &rseq->node_id, efault_end);
- unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end);
- user_read_access_end();
-
- if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start ||
- cpu_id != rseq_kernel_fields(t)->cpu_id ||
- node_id != rseq_kernel_fields(t)->node_id ||
- mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) {
-
- pr_warn("Detected rseq corruption for pid: %d, name: %s\n"
- "\tcpu_id_start: %u ?= %u\n"
- "\tcpu_id: %u ?= %u\n"
- "\tnode_id: %u ?= %u\n"
- "\tmm_cid: %u ?= %u\n",
- t->pid, t->comm,
- cpu_id_start, rseq_kernel_fields(t)->cpu_id_start,
- cpu_id, rseq_kernel_fields(t)->cpu_id,
- node_id, rseq_kernel_fields(t)->node_id,
- mm_cid, rseq_kernel_fields(t)->mm_cid);
- }
-
- /* For now, only print a console warning on mismatch. */
- return 0;
-
-efault_end:
- user_read_access_end();
-efault:
- return -EFAULT;
-}
-
-/*
- * Update an rseq field and its in-kernel copy in lock-step to keep a coherent
- * state.
- */
-#define rseq_unsafe_put_user(t, value, field, error_label) \
- do { \
- unsafe_put_user(value, &t->rseq.usrptr->field, error_label); \
- rseq_kernel_fields(t)->field = value; \
- } while (0)
-
-#else
-static int rseq_validate_ro_fields(struct task_struct *t)
-{
- return 0;
-}
-
-#define rseq_unsafe_put_user(t, value, field, error_label) \
- unsafe_put_user(value, &t->rseq.usrptr->field, error_label)
-#endif
-
-static int rseq_update_cpu_node_id(struct task_struct *t)
-{
- struct rseq __user *rseq = t->rseq.usrptr;
- u32 cpu_id = raw_smp_processor_id();
- u32 node_id = cpu_to_node(cpu_id);
- u32 mm_cid = task_mm_cid(t);
-
- rseq_stat_inc(rseq_stats.ids);
-
- /* Validate read-only rseq fields on debug kernels */
- if (rseq_validate_ro_fields(t))
- goto efault;
- WARN_ON_ONCE((int) mm_cid < 0);
-
- if (!user_write_access_begin(rseq, t->rseq.len))
- goto efault;
-
- rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end);
- rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
- rseq_unsafe_put_user(t, node_id, node_id, efault_end);
- rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
-
- /* Cache the user space values */
- t->rseq.ids.cpu_id = cpu_id;
- t->rseq.ids.mm_cid = mm_cid;
-
- /*
- * Additional feature fields added after ORIG_RSEQ_SIZE
- * need to be conditionally updated only if
- * t->rseq_len != ORIG_RSEQ_SIZE.
- */
- user_write_access_end();
- trace_rseq_update(t);
- return 0;
-
-efault_end:
- user_write_access_end();
-efault:
- return -EFAULT;
-}
-
-static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
+static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
{
- struct rseq __user *rseq = t->rseq.usrptr;
- u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
- mm_cid = 0;
-
- /*
- * Validate read-only rseq fields.
- */
- if (rseq_validate_ro_fields(t))
- goto efault;
-
- if (!user_write_access_begin(rseq, t->rseq.len))
- goto efault;
-
- /*
- * Reset all fields to their initial state.
- *
- * All fields have an initial state of 0 except cpu_id which is set to
- * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after
- * unregistration can figure out that rseq needs to be registered
- * again.
- */
- rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end);
- rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
- rseq_unsafe_put_user(t, node_id, node_id, efault_end);
- rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
-
- /*
- * Additional feature fields added after ORIG_RSEQ_SIZE
- * need to be conditionally reset only if
- * t->rseq_len != ORIG_RSEQ_SIZE.
- */
- user_write_access_end();
- return 0;
-
-efault_end:
- user_write_access_end();
-efault:
- return -EFAULT;
+ return rseq_set_ids_get_csaddr(t, ids, node_id, NULL);
}
static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
@@ -407,6 +250,8 @@ static bool rseq_handle_cs(struct task_s
void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
{
struct task_struct *t = current;
+ struct rseq_ids ids;
+ u32 node_id;
bool event;
int sig;
@@ -453,6 +298,8 @@ void __rseq_handle_notify_resume(struct
scoped_guard(RSEQ_EVENT_GUARD) {
event = t->rseq.event.sched_switch;
t->rseq.event.sched_switch = false;
+ ids.cpu_id = task_cpu(t);
+ ids.mm_cid = task_mm_cid(t);
}
if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event)
@@ -461,7 +308,8 @@ void __rseq_handle_notify_resume(struct
if (!rseq_handle_cs(t, regs))
goto error;
- if (unlikely(rseq_update_cpu_node_id(t)))
+ node_id = cpu_to_node(ids.cpu_id);
+ if (!rseq_set_ids(t, &ids, node_id))
goto error;
return;
@@ -502,13 +350,33 @@ void rseq_syscall(struct pt_regs *regs)
}
#endif
+static bool rseq_reset_ids(void)
+{
+ struct rseq_ids ids = {
+ .cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
+ .mm_cid = 0,
+ };
+
+ /*
+ * If this fails, terminate the task because this leaves the kernel in a
+ * stupid state as exit to user space will try to fix up the IDs
+ * again.
+ */
+ if (rseq_set_ids(current, &ids, 0))
+ return true;
+
+ force_sig(SIGSEGV);
+ return false;
+}
+
+/* The original rseq structure size (including padding) is 32 bytes. */
+#define ORIG_RSEQ_SIZE 32
+
/*
* sys_rseq - setup restartable sequences for caller thread.
*/
SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
{
- int ret;
-
if (flags & RSEQ_FLAG_UNREGISTER) {
if (flags & ~RSEQ_FLAG_UNREGISTER)
return -EINVAL;
@@ -519,9 +387,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
return -EINVAL;
if (current->rseq.sig != sig)
return -EPERM;
- ret = rseq_reset_rseq_cpu_node_id(current);
- if (ret)
- return ret;
+ if (!rseq_reset_ids())
+ return -EFAULT;
rseq_reset(current);
return 0;
}
@@ -571,17 +438,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
if (put_user_masked_u64(0UL, &rseq->rseq_cs))
return -EFAULT;
-#ifdef CONFIG_DEBUG_RSEQ
- /*
- * Initialize the in-kernel rseq fields copy for validation of
- * read-only fields.
- */
- if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) ||
- get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) ||
- get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) ||
- get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid))
- return -EFAULT;
-#endif
/*
* Activate the registration by setting the rseq area address, length
* and signature in the task struct.
Completely separate the signal delivery path from the notify handler as
they have different semantics with respect to event handling.
Signal delivery only needs to ensure that the interrupted user context
was not in a critical section, or that the section is aborted before it
switches to the signal frame context. The signal frame context does not
have the original instruction pointer anymore, so that can't be handled
on exit to user space.
There is no point in updating the CPU and MM CID values as they might
change again before the task returns to user space for real.
The fast path optimization, which checks for the 'entry from user via
interrupt' condition, is only available for architectures which use the
generic entry code.
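For reference, the critical section descriptor that the abort handling
operates on is the rseq_cs UAPI structure (abbreviated; comments added
here), together with the 32-bit signature that has to sit right before
the abort handler:
struct rseq_cs {
	__u32 version;			/* Must be 0 */
	__u32 flags;			/* Historical restart flags, must be 0 */
	__u64 start_ip;			/* First instruction of the critical section */
	__u64 post_commit_offset;	/* Section is [start_ip, start_ip + post_commit_offset) */
	__u64 abort_ip;			/* Abort handler; *(u32 *)(abort_ip - 4) must match
					 * the signature handed in at registration time */
};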
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
V3: Move rseq_update_usr() to the next patch - Mathieu
---
include/linux/rseq.h | 21 ++++++++++++++++-----
kernel/rseq.c | 30 ++++++++++++++++++++++--------
2 files changed, 38 insertions(+), 13 deletions(-)
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -5,22 +5,33 @@
#ifdef CONFIG_RSEQ
#include <linux/sched.h>
-void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
+void __rseq_handle_notify_resume(struct pt_regs *regs);
static inline void rseq_handle_notify_resume(struct pt_regs *regs)
{
if (current->rseq.event.has_rseq)
- __rseq_handle_notify_resume(NULL, regs);
+ __rseq_handle_notify_resume(regs);
}
+void __rseq_signal_deliver(int sig, struct pt_regs *regs);
+
+/*
+ * Invoked from signal delivery to fix up a critical section based on the
+ * register context before switching to the signal delivery context.
+ */
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
{
- if (current->rseq.event.has_rseq) {
- current->rseq.event.sched_switch = true;
- __rseq_handle_notify_resume(ksig, regs);
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
+ /* '&' is intentional to spare one conditional branch */
+ if (current->rseq.event.has_rseq & current->rseq.event.user_irq)
+ __rseq_signal_deliver(ksig->sig, regs);
+ } else {
+ if (current->rseq.event.has_rseq)
+ __rseq_signal_deliver(ksig->sig, regs);
}
}
+/* Raised from context switch and execve to force evaluation on exit to user */
static inline void rseq_sched_switch_event(struct task_struct *t)
{
if (t->rseq.event.has_rseq) {
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -247,13 +247,12 @@ static bool rseq_handle_cs(struct task_s
* respect to other threads scheduled on the same CPU, and with respect
* to signal handlers.
*/
-void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
+void __rseq_handle_notify_resume(struct pt_regs *regs)
{
struct task_struct *t = current;
struct rseq_ids ids;
u32 node_id;
bool event;
- int sig;
/*
* If invoked from hypervisors before entering the guest via
@@ -272,10 +271,7 @@ void __rseq_handle_notify_resume(struct
if (unlikely(t->flags & PF_EXITING))
return;
- if (ksig)
- rseq_stat_inc(rseq_stats.signal);
- else
- rseq_stat_inc(rseq_stats.slowpath);
+ rseq_stat_inc(rseq_stats.slowpath);
/*
* Read and clear the event pending bit first. If the task
@@ -314,8 +310,26 @@ void __rseq_handle_notify_resume(struct
return;
error:
- sig = ksig ? ksig->sig : 0;
- force_sigsegv(sig);
+ force_sig(SIGSEGV);
+}
+
+void __rseq_signal_deliver(int sig, struct pt_regs *regs)
+{
+ rseq_stat_inc(rseq_stats.signal);
+ /*
+ * Don't update IDs, they are handled on exit to user if
+ * necessary. The important thing is to abort a critical section of
+ * the interrupted context as after this point the instruction
+ * pointer in @regs points to the signal handler.
+ */
+ if (unlikely(!rseq_handle_cs(current, regs))) {
+ /*
+ * Clear the errors just in case this might survive
+ * magically, but leave the rest intact.
+ */
+ current->rseq.event.error = 0;
+ force_sigsegv(sig);
+ }
}
void __rseq_debug_syscall_return(struct pt_regs *regs)
Replace the whole logic with a new implementation, which is shared with
signal delivery and the upcoming exit fast path.
Unlike the original implementation, this ignores invocations from
KVM/io_uring, which invoke resume_user_mode_work() with the @regs argument
set to NULL.
The original implementation updated the CPU/Node/MM CID fields, but that
was just a side effect, which addressed the problem that this invocation
cleared TIF_NOTIFY_RESUME, which in turn could cause an update on return
to user space to be lost.
This problem has been addressed differently, so it is no longer required
to do that update before entering the guest.
That might be considered a user-visible change when the host thread's TLS
memory is mapped into the guest, but as this was never intentionally
supported, this abuse of kernel-internal implementation details is not
considered an ABI break.
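The 'addressed differently' part is the re-raise on the hypervisor exit
path mentioned in the comment further down; roughly, as a sketch of the
helper this series relies on:
/* Invoked when the ioctl() path heads back towards user space proper */
static inline void rseq_virt_userspace_exit(void)
{
	/* Re-raise the TIF bit which resume_user_mode_work() cleared */
	if (current->rseq.event.sched_switch)
		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
}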
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
V3: Moved rseq_update_usr() to this one - Mathieu
Documented the KVM visible change - Sean
---
include/linux/rseq_entry.h | 29 +++++++++++++++++
kernel/rseq.c | 76 +++++++++++++++++++--------------------------
2 files changed, 62 insertions(+), 43 deletions(-)
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -383,6 +383,35 @@ bool rseq_set_ids_get_csaddr(struct task
return false;
}
+/*
+ * Update user space with new IDs and conditionally check whether the task
+ * is in a critical section.
+ */
+static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs,
+ struct rseq_ids *ids, u32 node_id)
+{
+ u64 csaddr;
+
+ if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr))
+ return false;
+
+ /*
+ * On architectures which utilize the generic entry code this
+ * allows skipping the critical section check when the entry was not from
+ * a user space interrupt, unless debug mode is enabled.
+ */
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
+ if (!static_branch_unlikely(&rseq_debug_enabled)) {
+ if (likely(!t->rseq.event.user_irq))
+ return true;
+ }
+ }
+ if (likely(!csaddr))
+ return true;
+ /* Sigh, this really needs to do work */
+ return rseq_update_user_cs(t, regs, csaddr);
+}
+
static __always_inline void rseq_exit_to_user_mode(void)
{
struct rseq_event *ev = ¤t->rseq.event;
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -82,12 +82,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>
-#ifdef CONFIG_MEMBARRIER
-# define RSEQ_EVENT_GUARD irq
-#else
-# define RSEQ_EVENT_GUARD preempt
-#endif
-
DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
static inline void rseq_control_debug(bool on)
@@ -236,38 +230,15 @@ static bool rseq_handle_cs(struct task_s
return rseq_update_user_cs(t, regs, csaddr);
}
-/*
- * This resume handler must always be executed between any of:
- * - preemption,
- * - signal delivery,
- * and return to user-space.
- *
- * This is how we can ensure that the entire rseq critical section
- * will issue the commit instruction only if executed atomically with
- * respect to other threads scheduled on the same CPU, and with respect
- * to signal handlers.
- */
-void __rseq_handle_notify_resume(struct pt_regs *regs)
+static void rseq_slowpath_update_usr(struct pt_regs *regs)
{
+ /* Preserve rseq state and user_irq state for exit to user */
+ const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
struct task_struct *t = current;
struct rseq_ids ids;
u32 node_id;
bool event;
- /*
- * If invoked from hypervisors before entering the guest via
- * resume_user_mode_work(), then @regs is a NULL pointer.
- *
- * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
- * it before returning from the ioctl() to user space when
- * rseq_event.sched_switch is set.
- *
- * So it's safe to ignore here instead of pointlessly updating it
- * in the vcpu_run() loop.
- */
- if (!regs)
- return;
-
if (unlikely(t->flags & PF_EXITING))
return;
@@ -291,26 +262,45 @@ void __rseq_handle_notify_resume(struct
* with the result handed in to allow the detection of
* inconsistencies.
*/
- scoped_guard(RSEQ_EVENT_GUARD) {
+ scoped_guard(irq) {
event = t->rseq.event.sched_switch;
- t->rseq.event.sched_switch = false;
+ t->rseq.event.all &= evt_mask.all;
ids.cpu_id = task_cpu(t);
ids.mm_cid = task_mm_cid(t);
}
- if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event)
+ if (!event)
return;
- if (!rseq_handle_cs(t, regs))
- goto error;
-
node_id = cpu_to_node(ids.cpu_id);
- if (!rseq_set_ids(t, &ids, node_id))
- goto error;
- return;
-error:
- force_sig(SIGSEGV);
+ if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) {
+ /*
+ * Clear the errors just in case this might survive magically, but
+ * leave the rest intact.
+ */
+ t->rseq.event.error = 0;
+ force_sig(SIGSEGV);
+ }
+}
+
+void __rseq_handle_notify_resume(struct pt_regs *regs)
+{
+ /*
+ * If invoked from hypervisors before entering the guest via
+ * resume_user_mode_work(), then @regs is a NULL pointer.
+ *
+ * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
+ * it before returning from the ioctl() to user space when
+ * rseq_event.sched_switch is set.
+ *
+ * So it's safe to ignore here instead of pointlessly updating it
+ * in the vcpu_run() loop.
+ */
+ if (!regs)
+ return;
+
+ rseq_slowpath_update_usr(regs);
}
void __rseq_signal_deliver(int sig, struct pt_regs *regs)
After removing the various condition bits earlier, it turns out that one
extra piece of information is needed to avoid setting event::sched_switch
and TIF_NOTIFY_RESUME unconditionally on every context switch.
The update of the RSEQ user space memory is only required when either
the task was interrupted in user space and schedules
or
the CPU or MM CID changes in schedule(), independent of the entry mode.
Right now only the 'interrupted from user space' information is available.
Add an event flag which is set when the CPU or MM CID or both change.
Evaluate this event in the scheduler to decide whether the sched_switch
event and the TIF bit need to be set.
It's an extra conditional in context_switch(), but the downside of
unconditionally handling RSEQ after a context switch to user space is way
more significant. The boolean logic used minimizes this to a single
conditional branch.
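As a stand-alone illustration of that boolean logic (plain user space C
mirroring the flag bytes, not kernel code), the decision reduces to one OR,
one AND and a single branch:
#include <stdbool.h>
#include <stdio.h>

/* Mirrors the relevant rseq_event flag bytes */
struct ev { bool user_irq, ids_changed, has_rseq; };

static bool must_raise(struct ev e)
{
	/* Raise only if rseq is in use and something relevant happened */
	return (e.user_irq | e.ids_changed) & e.has_rseq;
}

int main(void)
{
	struct ev kernel_preempt = { .user_irq = false, .ids_changed = false, .has_rseq = true };
	struct ev migrated       = { .user_irq = false, .ids_changed = true,  .has_rseq = true };
	struct ev user_irq_sched = { .user_irq = true,  .ids_changed = false, .has_rseq = true };

	printf("%d %d %d\n", must_raise(kernel_preempt), must_raise(migrated),
	       must_raise(user_irq_sched)); /* prints: 0 1 1 */
	return 0;
}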
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
fs/exec.c | 2 -
include/linux/rseq.h | 81 +++++++++++++++++++++++++++++++++++++++++----
include/linux/rseq_types.h | 11 +++++-
kernel/rseq.c | 2 -
kernel/sched/core.c | 7 +++
kernel/sched/sched.h | 5 ++
6 files changed, 95 insertions(+), 13 deletions(-)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1775,7 +1775,7 @@ static int bprm_execve(struct linux_binp
force_fatal_sig(SIGSEGV);
sched_mm_cid_after_execve(current);
- rseq_sched_switch_event(current);
+ rseq_force_update();
current->in_execve = 0;
return retval;
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -9,7 +9,8 @@ void __rseq_handle_notify_resume(struct
static inline void rseq_handle_notify_resume(struct pt_regs *regs)
{
- if (current->rseq.event.has_rseq)
+ /* '&' is intentional to spare one conditional branch */
+ if (current->rseq.event.sched_switch & current->rseq.event.has_rseq)
__rseq_handle_notify_resume(regs);
}
@@ -31,12 +32,75 @@ static inline void rseq_signal_deliver(s
}
}
-/* Raised from context switch and execve to force evaluation on exit to user */
-static inline void rseq_sched_switch_event(struct task_struct *t)
+static inline void rseq_raise_notify_resume(struct task_struct *t)
{
- if (t->rseq.event.has_rseq) {
- t->rseq.event.sched_switch = true;
- set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+}
+
+/* Invoked from context switch to force evaluation on exit to user */
+static __always_inline void rseq_sched_switch_event(struct task_struct *t)
+{
+ struct rseq_event *ev = &t->rseq.event;
+
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
+ /*
+ * Avoid a boat load of conditionals by using simple logic
+ * to determine whether NOTIFY_RESUME needs to be raised.
+ *
+ * It's required when the CPU or MM CID has changed or
+ * the entry was from user space.
+ */
+ bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq;
+
+ if (raise) {
+ ev->sched_switch = true;
+ rseq_raise_notify_resume(t);
+ }
+ } else {
+ if (ev->has_rseq) {
+ t->rseq.event.sched_switch = true;
+ rseq_raise_notify_resume(t);
+ }
+ }
+}
+
+/*
+ * Invoked from __set_task_cpu() when a task migrates to enforce an IDs
+ * update.
+ *
+ * This does not raise TIF_NOTIFY_RESUME as that happens in
+ * rseq_sched_switch_event().
+ */
+static __always_inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu)
+{
+ t->rseq.event.ids_changed = true;
+}
+
+/*
+ * Invoked from switch_mm_cid() in context switch when the task gets a MM
+ * CID assigned.
+ *
+ * This does not raise TIF_NOTIFY_RESUME as that happens in
+ * rseq_sched_switch_event().
+ */
+static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid)
+{
+ /*
+ * Requires a comparison as the switch_mm_cid() code does not
+ * provide a conditional for it readily. So avoid excessive updates
+ * when nothing changes.
+ */
+ if (t->rseq.ids.mm_cid != cid)
+ t->rseq.event.ids_changed = true;
+}
+
+/* Enforce a full update after RSEQ registration and when execve() failed */
+static inline void rseq_force_update(void)
+{
+ if (current->rseq.event.has_rseq) {
+ current->rseq.event.ids_changed = true;
+ current->rseq.event.sched_switch = true;
+ rseq_raise_notify_resume(current);
}
}
@@ -53,7 +117,7 @@ static inline void rseq_sched_switch_eve
static inline void rseq_virt_userspace_exit(void)
{
if (current->rseq.event.sched_switch)
- set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+ rseq_raise_notify_resume(current);
}
static inline void rseq_reset(struct task_struct *t)
@@ -85,6 +149,9 @@ static inline void rseq_fork(struct task
static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
+static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { }
+static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { }
+static inline void rseq_force_update(void) { }
static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { }
static inline void rseq_execve(struct task_struct *t) { }
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -11,20 +11,27 @@ struct rseq;
* struct rseq_event - Storage for rseq related event management
* @all: Compound to initialize and clear the data efficiently
* @events: Compound to access events with a single load/store
- * @sched_switch: True if the task was scheduled out
+ * @sched_switch: True if the task was scheduled and needs update on
+ * exit to user
+ * @ids_changed: Indicator that IDs need to be updated
* @user_irq: True on interrupt entry from user mode
* @has_rseq: True if the task has a rseq pointer installed
* @error: Compound error code for the slow path to analyze
* @fatal: User space data corrupted or invalid
+ *
+ * @sched_switch and @ids_changed must be adjacent and the combo must be
+ * 16bit aligned to allow a single store, when both are set at the same
+ * time in the scheduler.
*/
struct rseq_event {
union {
u64 all;
struct {
union {
- u16 events;
+ u32 events;
struct {
u8 sched_switch;
+ u8 ids_changed;
u8 user_irq;
};
};
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -456,7 +456,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
* are updated before returning to user-space.
*/
current->rseq.event.has_rseq = true;
- rseq_sched_switch_event(current);
+ rseq_force_update();
return 0;
}
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5150,7 +5150,6 @@ prepare_task_switch(struct rq *rq, struc
kcov_prepare_switch(prev);
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
- rseq_sched_switch_event(prev);
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
@@ -5348,6 +5347,12 @@ context_switch(struct rq *rq, struct tas
/* switch_mm_cid() requires the memory barriers above. */
switch_mm_cid(rq, prev, next);
+ /*
+ * Tell rseq that the task was scheduled in. Must be after
+ * switch_mm_cid() to get the TIF flag set.
+ */
+ rseq_sched_switch_event(next);
+
prepare_lock_switch(rq, next, rf);
/* Here we just switch the register state and the stack. */
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2181,6 +2181,7 @@ static inline void __set_task_cpu(struct
smp_wmb();
WRITE_ONCE(task_thread_info(p)->cpu, cpu);
p->wake_cpu = cpu;
+ rseq_sched_set_task_cpu(p, cpu);
#endif /* CONFIG_SMP */
}
@@ -3778,8 +3779,10 @@ static inline void switch_mm_cid(struct
mm_cid_put_lazy(prev);
prev->mm_cid = -1;
}
- if (next->mm_cid_active)
+ if (next->mm_cid_active) {
next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm);
+ rseq_sched_set_task_mm_cid(next, next->mm_cid);
+ }
}
#else /* !CONFIG_SCHED_MM_CID: */
Implement the actual logic for handling RSEQ updates in a fast path after
handling the TIF work and at the point where the task is actually returning
to user space.
This is the right point to do that because here the CPU and the MM CID are
stable and can no longer change due to yet another reschedule. Another
reschedule can still happen when the task handles this via
TIF_NOTIFY_RESUME in resume_user_mode_work(), which is invoked from the
exit to user mode work loop.
The function is invoked after the TIF work is handled and runs with
interrupts disabled, which means it cannot resolve page faults. It
therefore disables page faults, and in case the access to the user space
memory faults, it:
- notes the failure in the event struct
- raises TIF_NOTIFY_RESUME
- tells the caller to loop back into the exit to user mode work
The caller then goes back to the TIF work, which runs with interrupts
enabled and therefore can resolve the page faults. This happens mostly on
fork() when the memory is marked COW. That will be optimized by setting the
failure flag and raising TIF_NOTIFY_RESUME right on fork to avoid the
otherwise unavoidable round trip.
If the user memory inspection finds invalid data, the function takes the
same route: it sets the fatal flag in the event struct along with
TIF_NOTIFY_RESUME and makes the caller loop back. The slow path notify
handler has to evaluate that flag and terminate the task with SIGSEGV as
documented.
The initial decision to invoke any of this is based on two flags in the
event struct: @has_rseq and @sched_switch. The decision is in pseudo ASM:
load tsk::event::has_rseq
and tsk::event::sched_switch
jnz inspect_user_space
mov $0, tsk::event::events
...
leave
So for the common case where the task was not scheduled out, this really
boils down to four instructions before going out if the compiler is not
completely stupid (and yes, some of them are).
If the condition is true, then it checks whether the CPU ID or MM CID have
changed. If so, the CPU/MM IDs have to be updated and are thereby cached
for the next round. The update unconditionally retrieves the user space
critical section address to spare another user access begin()/end() pair.
If that address is not zero and tsk::event::user_irq is set, then the
critical section is analyzed and acted upon. If it is zero or the entry
came via a syscall, the critical section analysis is skipped.
If the IDs did not change, the critical section has to be analyzed in any
case because then the sched_switch event can only be set when the entry
from user space came via an interrupt.
This is provided without the actual hookup to let reviewers focus on the
implementation details. The hookup happens in the next step.
Note: As with quite a few other optimizations this depends on the generic
entry infrastructure and is not enabled to be sucked into random
architecture implementations.
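Restating the pseudo ASM above as C for readability (illustration only;
the real implementation is __rseq_exit_to_user_mode_restart() in the diff
below):
	/* On exit to user space, after all other TIF work has been handled */
	if (likely(!(t->rseq.event.sched_switch & t->rseq.event.has_rseq))) {
		/* Common case: nothing happened, reset the event state and leave */
		t->rseq.event.events = 0;
		return;
	}
	/* Otherwise fall through to inspect/update the user space rseq memory */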
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/rseq_entry.h | 133 +++++++++++++++++++++++++++++++++++++++++++--
include/linux/rseq_types.h | 3 +
kernel/rseq.c | 2
3 files changed, 133 insertions(+), 5 deletions(-)
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -10,6 +10,7 @@ struct rseq_stats {
unsigned long exit;
unsigned long signal;
unsigned long slowpath;
+ unsigned long fastpath;
unsigned long ids;
unsigned long cs;
unsigned long clear;
@@ -202,8 +203,8 @@ bool rseq_debug_update_user_cs(struct ta
/*
* On debug kernels validate that user space did not mess with it if
- * DEBUG_RSEQ is enabled, but don't on the first exit to user space. In
- * that case cpu_cid is ~0. See fork/execve.
+ * debugging is enabled, but don't do that on the first exit to user
+ * space. In that case cpu_cid is ~0. See fork/execve.
*/
bool rseq_debug_validate_ids(struct task_struct *t)
{
@@ -254,12 +255,13 @@ rseq_update_user_cs(struct task_struct *
{
struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
unsigned long ip = instruction_pointer(regs);
+ unsigned long tasksize = TASK_SIZE;
u64 start_ip, abort_ip, offset;
u32 usig, __user *uc_sig;
rseq_stat_inc(rseq_stats.cs);
- if (unlikely(csaddr >= TASK_SIZE)) {
+ if (unlikely(csaddr >= tasksize)) {
t->rseq.event.fatal = true;
return false;
}
@@ -298,7 +300,7 @@ rseq_update_user_cs(struct task_struct *
* in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP
* protection.
*/
- if (abort_ip >= TASK_SIZE || abort_ip < sizeof(*uc_sig))
+ if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
goto die;
/* The address is guaranteed to be >= 0 and < TASK_SIZE */
@@ -412,6 +414,124 @@ static rseq_inline bool rseq_update_usr(
return rseq_update_user_cs(t, regs, csaddr);
}
+/*
+ * If you want to use this then convert your architecture to the generic
+ * entry code. I'm tired of building workarounds for people who can't be
+ * bothered to make the maintenance of generic infrastructure less
+ * burdensome. Just sucking everything into the architecture code and
+ * thereby making others chase the horrible hacks and keep them working is
+ * neither acceptable nor sustainable.
+ */
+#ifdef CONFIG_GENERIC_ENTRY
+
+/*
+ * This is inlined into the exit path because:
+ *
+ * 1) It's a one time comparison in the fast path when there is no event to
+ * handle
+ *
+ * 2) The access to the user space rseq memory (TLS) is unlikely to fault
+ * so the straight inline operation is:
+ *
+ * - Four 32-bit stores only if CPU ID/ MM CID need to be updated
+ * - One 64-bit load to retrieve the critical section address
+ *
+ * 3) In the unlikely case that the critical section address is != NULL:
+ *
+ * - One 64-bit load to retrieve the start IP
+ * - One 64-bit load to retrieve the offset for calculating the end
+ * - One 64-bit load to retrieve the abort IP
+ * - One store to clear the critical section address
+ *
+ * The non-debug case implements only the minimal required checking and
+ * protection against a rogue abort IP in kernel space, which would be
+ * exploitable at least on x86. Any fallout from invalid critical section
+ * descriptors is a user space problem. The debug case provides the full
+ * set of checks and terminates the task if a condition is not met.
+ *
+ * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and
+ * tells the caller to loop back into exit_to_user_mode_loop(). The rseq
+ * slow path there will handle the fail.
+ */
+static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
+{
+ struct task_struct *t = current;
+
+ /*
+ * If the task neither went through schedule nor had the event flag
+ * enforced by the rseq() syscall or execve(), there is nothing to do here.
+ *
+ * CPU ID and MM CID can only change when going through a context
+ * switch.
+ *
+ * This can only be done when rseq_event::has_rseq is true.
+ * rseq_sched_switch_event() sets rseq_event::sched unconditionally
+ * true to avoid a load of rseq_event::has_rseq in the context
+ * switch path.
+ *
+ * This check uses a '&' and not a '&&' to force the compiler to do
+ * an actual AND operation instead of two separate conditionals.
+ *
+ * A sane compiler requires four instructions for the nothing to do
+ * case including clearing the events, but your mileage might vary.
+ */
+ if (likely(!(t->rseq.event.sched_switch & t->rseq.event.has_rseq)))
+ goto done;
+
+ rseq_stat_inc(rseq_stats.fastpath);
+
+ pagefault_disable();
+
+ if (likely(!t->rseq.event.ids_changed)) {
+ /*
+ * If IDs have not changed rseq_event::user_irq must be true
+ * See rseq_sched_switch_event().
+ */
+ u64 csaddr;
+
+ if (unlikely(get_user_masked_u64(&csaddr, &t->rseq.usrptr->rseq_cs)))
+ goto fail;
+
+ if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
+ if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
+ goto fail;
+ }
+ } else {
+ struct rseq_ids ids = {
+ .cpu_id = task_cpu(t),
+ .mm_cid = task_mm_cid(t),
+ };
+ u32 node_id = cpu_to_node(ids.cpu_id);
+
+ if (unlikely(!rseq_update_usr(t, regs, &ids, node_id)))
+ goto fail;
+ }
+
+ pagefault_enable();
+
+done:
+ /* Clear state so next entry starts from a clean slate */
+ t->rseq.event.events = 0;
+ return false;
+
+fail:
+ pagefault_enable();
+ /* Force it into the slow path. Don't clear the state! */
+ t->rseq.event.slowpath = true;
+ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+ return true;
+}
+
+static __always_inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs)
+{
+ if (unlikely(__rseq_exit_to_user_mode_restart(regs)))
+ return true;
+
+ return false;
+}
+
+#endif /* CONFIG_GENERIC_ENTRY */
+
static __always_inline void rseq_exit_to_user_mode(void)
{
struct rseq_event *ev = ¤t->rseq.event;
@@ -436,9 +556,12 @@ static inline void rseq_debug_syscall_re
if (static_branch_unlikely(&rseq_debug_enabled))
__rseq_debug_syscall_return(regs);
}
-
#else /* CONFIG_RSEQ */
static inline void rseq_note_user_irq_entry(void) { }
+static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs)
+{
+ return false;
+}
static inline void rseq_exit_to_user_mode(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
#endif /* !CONFIG_RSEQ */
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -18,6 +18,8 @@ struct rseq;
* @has_rseq: True if the task has a rseq pointer installed
* @error: Compound error code for the slow path to analyze
* @fatal: User space data corrupted or invalid
+ * @slowpath: Indicator that slow path processing via TIF_NOTIFY_RESUME
+ * is required
*
* @sched_switch and @ids_changed must be adjacent and the combo must be
* 16bit aligned to allow a single store, when both are set at the same
@@ -42,6 +44,7 @@ struct rseq_event {
u16 error;
struct {
u8 fatal;
+ u8 slowpath;
};
};
};
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -133,6 +133,7 @@ static int rseq_stats_show(struct seq_fi
stats.exit += data_race(per_cpu(rseq_stats.exit, cpu));
stats.signal += data_race(per_cpu(rseq_stats.signal, cpu));
stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu));
+ stats.fastpath += data_race(per_cpu(rseq_stats.fastpath, cpu));
stats.ids += data_race(per_cpu(rseq_stats.ids, cpu));
stats.cs += data_race(per_cpu(rseq_stats.cs, cpu));
stats.clear += data_race(per_cpu(rseq_stats.clear, cpu));
@@ -142,6 +143,7 @@ static int rseq_stats_show(struct seq_fi
seq_printf(m, "exit: %16lu\n", stats.exit);
seq_printf(m, "signal: %16lu\n", stats.signal);
seq_printf(m, "slowp: %16lu\n", stats.slowpath);
+ seq_printf(m, "fastp: %16lu\n", stats.fastpath);
seq_printf(m, "ids: %16lu\n", stats.ids);
seq_printf(m, "cs: %16lu\n", stats.cs);
seq_printf(m, "clear: %16lu\n", stats.clear);
Now that all bits and pieces are in place, hook the RSEQ handling fast path
function into exit_to_user_mode_prepare() after the TIF work bits have been
handled. In case of a fast path failure, TIF_NOTIFY_RESUME has been raised
and the caller needs to take another turn through the TIF handling slow
path.
This only works for architectures which use the generic entry code.
Architectures that still have their own incomplete hacks are not supported
and won't be.
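For illustration, here is a condensed sketch of the resulting control flow,
pieced together from the exit_to_user_mode_loop() and rseq_entry.h hunks
above; statistics, debug checks and the pagefault_disable() region are
elided, so this is not a literal copy of the code:

__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
						     unsigned long ti_work)
{
	for (;;) {
		/* Handle all pending TIF work (signals, resched, notify, ...) */
		ti_work = __exit_to_user_mode_loop(regs, ti_work);

		/*
		 * RSEQ fast path. On success the task is ready to return to
		 * user space. On failure TIF_NOTIFY_RESUME has been raised,
		 * so take another turn through the TIF work loop and let the
		 * slow path clean up.
		 */
		if (likely(!rseq_exit_to_user_mode_restart(regs)))
			return ti_work;

		ti_work = read_thread_flags();
	}
}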
This results in the following improvements:
Kernel build       Before                After          Reduction
exit to user     80692981             80514451
signal checks:      32581                  121                99%
slowpath runs:    1201408  1.49%           198  0.00%        100%
fastpath runs:                          675941  0.84%         N/A
id updates:       1233989  1.53%         50541  0.06%         96%
cs checks:        1125366  1.39%             0  0.00%        100%
cs cleared:       1125366   100%             0               100%
cs fixup:               0     0%             0
RSEQ selftests     Before                After          Reduction
exit to user:   386281778            387373750
signal checks:   35661203                    0               100%
slowpath runs:  140542396 36.38%           100  0.00%        100%
fastpath runs:                         9509789  2.51%         N/A
id updates:     176203599 45.62%       9087994  2.35%         95%
cs checks:      175587856 45.46%       4728394  1.22%         98%
cs cleared:     172359544 98.16%       1319307 27.90%         99%
cs fixup:         3228312  1.84%       3409087 72.10%
The 'cs cleared' and 'cs fixup' percentages are not relative to the exit
to user invocations; they are relative to the actual 'cs check'
invocations.
While some of this could have been avoided in the original code, like the
obvious clearing of CS when it's already clear, the main problem of going
through TIF_NOTIFY_RESUME cannot be solved that way. In some workloads the
RSEQ notify handler is invoked more than once before going out to user
space. Doing the work once, after everything has stabilized, is the only
way to avoid that.
The initial attempt to completely decouple it from the TIF work turned out
to be suboptimal for workloads which do a lot of quick and short system
calls. Even though the fast path decision is only 4 instructions (including
a conditional branch), this adds up quickly and becomes measurable when the
rate of actually having to handle rseq is in the low single digit
percentage range of user/kernel transitions.
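Part of why that decision stays so small is the bitwise '&' noted in the
fast path comment in the first hunk: the event flags are plain byte-sized
fields, so the compiler can emit one combined test instead of two
conditional branches. A minimal stand-alone sketch, illustrative only (the
demo_* names are made up and not the kernel types):

/* Illustrative only; the real flags live in struct rseq_event */
struct demo_event {
	unsigned char	sched_switch;	/* set when the task was scheduled */
	unsigned char	has_rseq;	/* task has rseq registered */
};

static inline int demo_needs_rseq_work(const struct demo_event *ev)
{
	/* Bitwise '&' on purpose: one combined test, no second branch */
	return ev->sched_switch & ev->has_rseq;
}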
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
V4: Move the rseq handling into a separate loop to avoid gotos later on
---
include/linux/irq-entry-common.h | 7 ++-----
include/linux/resume_user_mode.h | 2 +-
include/linux/rseq.h | 23 +++++++++++++++++------
init/Kconfig | 2 +-
kernel/entry/common.c | 26 +++++++++++++++++++-------
kernel/rseq.c | 8 ++++++--
6 files changed, 46 insertions(+), 22 deletions(-)
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -197,11 +197,8 @@ static __always_inline void arch_exit_to
*/
void arch_do_signal_or_restart(struct pt_regs *regs);
-/**
- * exit_to_user_mode_loop - do any pending work before leaving to user space
- */
-unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
- unsigned long ti_work);
+/* Handle pending TIF work */
+unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work);
/**
* exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
--- a/include/linux/resume_user_mode.h
+++ b/include/linux/resume_user_mode.h
@@ -59,7 +59,7 @@ static inline void resume_user_mode_work
mem_cgroup_handle_over_high(GFP_KERNEL);
blkcg_maybe_throttle_current();
- rseq_handle_notify_resume(regs);
+ rseq_handle_slowpath(regs);
}
#endif /* LINUX_RESUME_USER_MODE_H */
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -5,13 +5,19 @@
#ifdef CONFIG_RSEQ
#include <linux/sched.h>
-void __rseq_handle_notify_resume(struct pt_regs *regs);
+void __rseq_handle_slowpath(struct pt_regs *regs);
-static inline void rseq_handle_notify_resume(struct pt_regs *regs)
+/* Invoked from resume_user_mode_work() */
+static inline void rseq_handle_slowpath(struct pt_regs *regs)
{
- /* '&' is intentional to spare one conditional branch */
- if (current->rseq.event.sched_switch & current->rseq.event.has_rseq)
- __rseq_handle_notify_resume(regs);
+ if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
+ if (current->rseq.event.slowpath)
+ __rseq_handle_slowpath(regs);
+ } else {
+ /* '&' is intentional to spare one conditional branch */
+ if (current->rseq.event.sched_switch & current->rseq.event.has_rseq)
+ __rseq_handle_slowpath(regs);
+ }
}
void __rseq_signal_deliver(int sig, struct pt_regs *regs);
@@ -142,11 +148,16 @@ static inline void rseq_fork(struct task
} else {
t->rseq = current->rseq;
t->rseq.ids.cpu_cid = ~0ULL;
+ /*
+ * If it has rseq, force it into the slow path right away
+ * because it is guaranteed to fault.
+ */
+ t->rseq.event.slowpath = t->rseq.event.has_rseq;
}
}
#else /* CONFIG_RSEQ */
-static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { }
+static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { }
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1911,7 +1911,7 @@ config RSEQ_DEBUG_DEFAULT_ENABLE
config DEBUG_RSEQ
default n
bool "Enable debugging of rseq() system call" if EXPERT
- depends on RSEQ && DEBUG_KERNEL
+ depends on RSEQ && DEBUG_KERNEL && !GENERIC_ENTRY
select RSEQ_DEBUG_DEFAULT_ENABLE
help
Enable extra debugging checks for the rseq system call.
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -11,13 +11,8 @@
/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
-/**
- * exit_to_user_mode_loop - do any pending work before leaving to user space
- * @regs: Pointer to pt_regs on entry stack
- * @ti_work: TIF work flags as read by the caller
- */
-__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
- unsigned long ti_work)
+static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs,
+ unsigned long ti_work)
{
/*
* Before returning to user space ensure that all pending work
@@ -62,6 +57,23 @@ void __weak arch_do_signal_or_restart(st
return ti_work;
}
+/**
+ * exit_to_user_mode_loop - do any pending work before leaving to user space
+ * @regs: Pointer to pt_regs on entry stack
+ * @ti_work: TIF work flags as read by the caller
+ */
+__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+ unsigned long ti_work)
+{
+ for (;;) {
+ ti_work = __exit_to_user_mode_loop(regs, ti_work);
+
+ if (likely(!rseq_exit_to_user_mode_restart(regs)))
+ return ti_work;
+ ti_work = read_thread_flags();
+ }
+}
+
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
irqentry_state_t ret = {
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -234,7 +234,11 @@ static bool rseq_handle_cs(struct task_s
static void rseq_slowpath_update_usr(struct pt_regs *regs)
{
- /* Preserve rseq state and user_irq state for exit to user */
+ /*
+ * Preserve rseq state and user_irq state. The generic entry code
+ * clears user_irq on the way out; architectures not using the generic
+ * entry code do not have user_irq.
+ */
const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
struct task_struct *t = current;
struct rseq_ids ids;
@@ -286,7 +290,7 @@ static void rseq_slowpath_update_usr(str
}
}
-void __rseq_handle_notify_resume(struct pt_regs *regs)
+void __rseq_handle_slowpath(struct pt_regs *regs)
{
/*
* If invoked from hypervisors before entering the guest via
exit_to_user_mode_prepare() is used for both interrupts and syscalls, but
there is extra rseq work which is only required in the interrupt exit
case.
Split up the function and provide wrappers for syscalls and interrupts,
which allows separating the rseq exit work in the next step.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
include/linux/entry-common.h | 2 -
include/linux/irq-entry-common.h | 42 ++++++++++++++++++++++++++++++++++-----
2 files changed, 38 insertions(+), 6 deletions(-)
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -156,7 +156,7 @@ static __always_inline void syscall_exit
if (unlikely(work & SYSCALL_WORK_EXIT))
syscall_exit_work(regs, work);
local_irq_disable_exit_to_user();
- exit_to_user_mode_prepare(regs);
+ syscall_exit_to_user_mode_prepare(regs);
}
/**
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -201,7 +201,7 @@ void arch_do_signal_or_restart(struct pt
unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work);
/**
- * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
+ * __exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
* @regs: Pointer to pt_regs on entry stack
*
* 1) check that interrupts are disabled
@@ -209,8 +209,10 @@ unsigned long exit_to_user_mode_loop(str
* 3) call exit_to_user_mode_loop() if any flags from
* EXIT_TO_USER_MODE_WORK are set
* 4) check that interrupts are still disabled
+ *
+ * Don't invoke directly, use the syscall/irqentry_ prefixed variants below
*/
-static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
+static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs)
{
unsigned long ti_work;
@@ -224,15 +226,45 @@ static __always_inline void exit_to_user
ti_work = exit_to_user_mode_loop(regs, ti_work);
arch_exit_to_user_mode_prepare(regs, ti_work);
+}
- rseq_exit_to_user_mode();
-
+static __always_inline void __exit_to_user_mode_validate(void)
+{
/* Ensure that kernel state is sane for a return to userspace */
kmap_assert_nomap();
lockdep_assert_irqs_disabled();
lockdep_sys_exit();
}
+
+/**
+ * syscall_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
+ * @regs: Pointer to pt_regs on entry stack
+ *
+ * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for
+ * syscalls and interrupts.
+ */
+static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
+{
+ __exit_to_user_mode_prepare(regs);
+ rseq_exit_to_user_mode();
+ __exit_to_user_mode_validate();
+}
+
+/**
+ * irqentry_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
+ * @regs: Pointer to pt_regs on entry stack
+ *
+ * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for
+ * syscalls and interrupts.
+ */
+static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs)
+{
+ __exit_to_user_mode_prepare(regs);
+ rseq_exit_to_user_mode();
+ __exit_to_user_mode_validate();
+}
+
/**
* exit_to_user_mode - Fixup state when exiting to user mode
*
@@ -297,7 +329,7 @@ static __always_inline void irqentry_ent
static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
instrumentation_begin();
- exit_to_user_mode_prepare(regs);
+ irqentry_exit_to_user_mode_prepare(regs);
instrumentation_end();
exit_to_user_mode();
}
Separate the interrupt and syscall exit handling. Syscall exit does not
require clearing the user_irq bit as it cannot be set there. On interrupt
exit it can be set when the interrupt did not result in a scheduling event
and therefore the return path did not invoke the TIF work handling, which
would have cleared it.
The debug check for the event state is also not really required even when
debug mode is enabled via the static key. Debug mode largely aids user
space by enabling additional validation checks, which cause a segfault
when a malformed critical section is detected. In production mode the
critical section handling takes the content mostly as is and lets user
space keep the pieces when it screwed up.
For kernel changes in that area the state check is useful, but that can be
done when lockdep is enabled, which is a required test scenario for
fundamental changes anyway.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
include/linux/irq-entry-common.h | 4 ++--
include/linux/rseq_entry.h | 21 +++++++++++++++++----
2 files changed, 19 insertions(+), 6 deletions(-)
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -247,7 +247,7 @@ static __always_inline void __exit_to_us
static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
__exit_to_user_mode_prepare(regs);
- rseq_exit_to_user_mode();
+ rseq_syscall_exit_to_user_mode();
__exit_to_user_mode_validate();
}
@@ -261,7 +261,7 @@ static __always_inline void syscall_exit
static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs)
{
__exit_to_user_mode_prepare(regs);
- rseq_exit_to_user_mode();
+ rseq_irqentry_exit_to_user_mode();
__exit_to_user_mode_validate();
}
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -532,19 +532,31 @@ static __always_inline bool rseq_exit_to
#endif /* CONFIG_GENERIC_ENTRY */
-static __always_inline void rseq_exit_to_user_mode(void)
+static __always_inline void rseq_syscall_exit_to_user_mode(void)
{
struct rseq_event *ev = ¤t->rseq.event;
rseq_stat_inc(rseq_stats.exit);
- if (static_branch_unlikely(&rseq_debug_enabled))
+ /* Needed to remove the store for the !lockdep case */
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
WARN_ON_ONCE(ev->sched_switch);
+ ev->events = 0;
+ }
+}
+
+static __always_inline void rseq_irqentry_exit_to_user_mode(void)
+{
+ struct rseq_event *ev = ¤t->rseq.event;
+
+ rseq_stat_inc(rseq_stats.exit);
+
+ lockdep_assert_once(!ev->sched_switch);
/*
* Ensure that event (especially user_irq) is cleared when the
* interrupt did not result in a schedule and therefore the
- * rseq processing did not clear it.
+ * rseq processing could not clear it.
*/
ev->events = 0;
}
@@ -562,7 +574,8 @@ static inline bool rseq_exit_to_user_mod
{
return false;
}
-static inline void rseq_exit_to_user_mode(void) { }
+static inline void rseq_syscall_exit_to_user_mode(void) { }
+static inline void rseq_irqentry_exit_to_user_mode(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
#endif /* !CONFIG_RSEQ */
Common TIF bits do not have to be defined by every architecture. They can
be defined in a generic header.
That allows adding generic TIF bits without chasing a gazillion
architecture headers, which is again an unjustified burden on anyone who
works on generic infrastructure, as it always takes a boatload of work to
keep existing architecture code working when adding new stuff.
While it is not as horrible as the ignorance of the generic entry
infrastructure, it is a welcome mechanism to make architecture people
rethink their approach of just leeching generic improvements into
architecture code and thereby making it cumulatively harder to maintain
and improve generic code. It's about time that this changes.
Provide the infrastructure and split the TIF space in half, 16 generic and
16 architecture specific bits.
This could probably be extended by TIF_SINGLESTEP and BLOCKSTEP, but those
are only used in architecture specific code. So leave them alone for now.
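The per architecture conversion pattern is then small; see the follow-up
patches. A hedged sketch of what a converted (hypothetical) architecture
header ends up looking like, modeled on those conversions:

/* arch/foo/include/asm/thread_info.h -- 'foo' is made up for illustration */

/* Tell the generic TIF infrastructure which optional bits foo supports */
#define HAVE_TIF_NEED_RESCHED_LAZY
#define HAVE_TIF_RESTORE_SIGMASK

#include <asm-generic/thread_info_tif.h>

/* Architecture specific TIF space starts at 16 */
#define TIF_FOO_FEATURE		16	/* some architecture specific state */
#define _TIF_FOO_FEATURE	BIT(TIF_FOO_FEATURE)

The Kconfig side of the conversion is a one line 'select
HAVE_GENERIC_TIF_BITS'.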
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
---
arch/Kconfig | 4 ++
include/asm-generic/thread_info_tif.h | 48 ++++++++++++++++++++++++++++++++++
2 files changed, 52 insertions(+)
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1730,6 +1730,10 @@ config ARCH_VMLINUX_NEEDS_RELOCS
relocations preserved. This is used by some architectures to
construct bespoke relocation tables for KASLR.
+# Select if architecture uses the common generic TIF bits
+config HAVE_GENERIC_TIF_BITS
+ bool
+
source "kernel/gcov/Kconfig"
source "scripts/gcc-plugins/Kconfig"
--- /dev/null
+++ b/include/asm-generic/thread_info_tif.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_GENERIC_THREAD_INFO_TIF_H_
+#define _ASM_GENERIC_THREAD_INFO_TIF_H_
+
+#include <vdso/bits.h>
+
+/* Bits 16-31 are reserved for architecture specific purposes */
+
+#define TIF_NOTIFY_RESUME 0 // callback before returning to user
+#define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME)
+
+#define TIF_SIGPENDING 1 // signal pending
+#define _TIF_SIGPENDING BIT(TIF_SIGPENDING)
+
+#define TIF_NOTIFY_SIGNAL 2 // signal notifications exist
+#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL)
+
+#define TIF_MEMDIE 3 // is terminating due to OOM killer
+#define _TIF_MEMDIE BIT(TIF_MEMDIE)
+
+#define TIF_NEED_RESCHED 4 // rescheduling necessary
+#define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED)
+
+#ifdef HAVE_TIF_NEED_RESCHED_LAZY
+# define TIF_NEED_RESCHED_LAZY 5 // Lazy rescheduling needed
+# define _TIF_NEED_RESCHED_LAZY BIT(TIF_NEED_RESCHED_LAZY)
+#endif
+
+#ifdef HAVE_TIF_POLLING_NRFLAG
+# define TIF_POLLING_NRFLAG 6 // idle is polling for TIF_NEED_RESCHED
+# define _TIF_POLLING_NRFLAG BIT(TIF_POLLING_NRFLAG)
+#endif
+
+#define TIF_USER_RETURN_NOTIFY 7 // notify kernel of userspace return
+#define _TIF_USER_RETURN_NOTIFY BIT(TIF_USER_RETURN_NOTIFY)
+
+#define TIF_UPROBE 8 // breakpointed or singlestepping
+#define _TIF_UPROBE BIT(TIF_UPROBE)
+
+#define TIF_PATCH_PENDING 9 // pending live patching update
+#define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING)
+
+#ifdef HAVE_TIF_RESTORE_SIGMASK
+# define TIF_RESTORE_SIGMASK 10 // Restore signal mask in do_signal()
+# define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK)
+#endif
+
+#endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */
There is no point in defining the generic items locally, and the upcoming
RSEQ optimizations are only available with this _and_ the generic entry
infrastructure, which is already used by x86. So no further action is
required here.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: x86@kernel.org
---
arch/x86/Kconfig | 1
arch/x86/include/asm/thread_info.h | 74 +++++++++++++++----------------------
2 files changed, 31 insertions(+), 44 deletions(-)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -239,6 +239,7 @@ config X86
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_EISA if X86_32
select HAVE_EXIT_THREAD
+ select HAVE_GENERIC_TIF_BITS
select HAVE_GUP_FAST
select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
select HAVE_FTRACE_GRAPH_FUNC if HAVE_FUNCTION_GRAPH_TRACER
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -80,56 +80,42 @@ struct thread_info {
#endif
/*
- * thread information flags
- * - these are process state flags that various assembly files
- * may need to access
+ * Tell the generic TIF infrastructure which bits x86 supports
*/
-#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */
-#define TIF_SIGPENDING 2 /* signal pending */
-#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
-#define TIF_NEED_RESCHED_LAZY 4 /* Lazy rescheduling needed */
-#define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/
-#define TIF_SSBD 6 /* Speculative store bypass disable */
-#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */
-#define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */
-#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
-#define TIF_UPROBE 12 /* breakpointed or singlestepping */
-#define TIF_PATCH_PENDING 13 /* pending live patching update */
-#define TIF_NEED_FPU_LOAD 14 /* load FPU on return to userspace */
-#define TIF_NOCPUID 15 /* CPUID is not accessible in userland */
-#define TIF_NOTSC 16 /* TSC is not accessible in userland */
-#define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */
-#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
-#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
+#define HAVE_TIF_NEED_RESCHED_LAZY
+#define HAVE_TIF_POLLING_NRFLAG
+#define HAVE_TIF_SINGLESTEP
+
+#include <asm-generic/thread_info_tif.h>
+
+/* Architecture specific TIF space starts at 16 */
+#define TIF_SSBD 16 /* Speculative store bypass disable */
+#define TIF_SPEC_IB 17 /* Indirect branch speculation mitigation */
+#define TIF_SPEC_L1D_FLUSH 18 /* Flush L1D on mm switches (processes) */
+#define TIF_NEED_FPU_LOAD 19 /* load FPU on return to userspace */
+#define TIF_NOCPUID 20 /* CPUID is not accessible in userland */
+#define TIF_NOTSC 21 /* TSC is not accessible in userland */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
#define TIF_SPEC_FORCE_UPDATE 23 /* Force speculation MSR update in context switch */
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
-#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
+#define TIF_SINGLESTEP 25 /* reenable singlestep on user return*/
+#define TIF_BLOCKSTEP 26 /* set when we want DEBUGCTLMSR_BTF */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
-#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */
+#define TIF_ADDR32 28 /* 32-bit address space on 64 bits */
-#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
-#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
-#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
-#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
-#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
-#define _TIF_SSBD (1 << TIF_SSBD)
-#define _TIF_SPEC_IB (1 << TIF_SPEC_IB)
-#define _TIF_SPEC_L1D_FLUSH (1 << TIF_SPEC_L1D_FLUSH)
-#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
-#define _TIF_UPROBE (1 << TIF_UPROBE)
-#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING)
-#define _TIF_NEED_FPU_LOAD (1 << TIF_NEED_FPU_LOAD)
-#define _TIF_NOCPUID (1 << TIF_NOCPUID)
-#define _TIF_NOTSC (1 << TIF_NOTSC)
-#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
-#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
-#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
-#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE)
-#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
-#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
-#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
-#define _TIF_ADDR32 (1 << TIF_ADDR32)
+#define _TIF_SSBD BIT(TIF_SSBD)
+#define _TIF_SPEC_IB BIT(TIF_SPEC_IB)
+#define _TIF_SPEC_L1D_FLUSH BIT(TIF_SPEC_L1D_FLUSH)
+#define _TIF_NEED_FPU_LOAD BIT(TIF_NEED_FPU_LOAD)
+#define _TIF_NOCPUID BIT(TIF_NOCPUID)
+#define _TIF_NOTSC BIT(TIF_NOTSC)
+#define _TIF_IO_BITMAP BIT(TIF_IO_BITMAP)
+#define _TIF_SPEC_FORCE_UPDATE BIT(TIF_SPEC_FORCE_UPDATE)
+#define _TIF_FORCED_TF BIT(TIF_FORCED_TF)
+#define _TIF_BLOCKSTEP BIT(TIF_BLOCKSTEP)
+#define _TIF_SINGLESTEP BIT(TIF_SINGLESTEP)
+#define _TIF_LAZY_MMU_UPDATES BIT(TIF_LAZY_MMU_UPDATES)
+#define _TIF_ADDR32 BIT(TIF_ADDR32)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW_BASE \
There is no point in defining the generic items locally, and the upcoming
RSEQ optimizations are only available with this _and_ the generic entry
infrastructure, which is already used by s390. So no further action is
required here.
This leaves a comment about the AUDIT/TRACE/SECCOMP bits which are handled
by SYSCALL_WORK in the generic code, so they seem redundant, but that's a
problem for the s390 wizards to think about.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
---
arch/s390/Kconfig | 1
arch/s390/include/asm/thread_info.h | 44 ++++++++++++++----------------------
2 files changed, 19 insertions(+), 26 deletions(-)
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -199,6 +199,7 @@ config S390
select HAVE_DYNAMIC_FTRACE_WITH_REGS
select HAVE_EBPF_JIT if HAVE_MARCH_Z196_FEATURES
select HAVE_EFFICIENT_UNALIGNED_ACCESS
+ select HAVE_GENERIC_TIF_BITS
select HAVE_GUP_FAST
select HAVE_FENTRY
select HAVE_FTRACE_GRAPH_FUNC
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -56,43 +56,35 @@ void arch_setup_new_exec(void);
/*
* thread information flags bit numbers
+ *
+ * Tell the generic TIF infrastructure which special bits s390 supports
*/
-#define TIF_NOTIFY_RESUME 0 /* callback before returning to user */
-#define TIF_SIGPENDING 1 /* signal pending */
-#define TIF_NEED_RESCHED 2 /* rescheduling necessary */
-#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling needed */
-#define TIF_UPROBE 4 /* breakpointed or single-stepping */
-#define TIF_PATCH_PENDING 5 /* pending live patching update */
-#define TIF_ASCE_PRIMARY 6 /* primary asce is kernel asce */
-#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */
-#define TIF_GUARDED_STORAGE 8 /* load guarded storage control block */
-#define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */
-#define TIF_PER_TRAP 10 /* Need to handle PER trap on exit to usermode */
-#define TIF_31BIT 16 /* 32bit process */
-#define TIF_MEMDIE 17 /* is terminating due to OOM killer */
-#define TIF_RESTORE_SIGMASK 18 /* restore signal mask in do_signal() */
-#define TIF_SINGLE_STEP 19 /* This task is single stepped */
-#define TIF_BLOCK_STEP 20 /* This task is block stepped */
-#define TIF_UPROBE_SINGLESTEP 21 /* This task is uprobe single stepped */
+#define HAVE_TIF_NEED_RESCHED_LAZY
+#define HAVE_TIF_RESTORE_SIGMASK
+
+#include <asm-generic/thread_info_tif.h>
+
+/* Architecture specific bits */
+#define TIF_ASCE_PRIMARY 16 /* primary asce is kernel asce */
+#define TIF_GUARDED_STORAGE 17 /* load guarded storage control block */
+#define TIF_ISOLATE_BP_GUEST 18 /* Run KVM guests with isolated BP */
+#define TIF_PER_TRAP 19 /* Need to handle PER trap on exit to usermode */
+#define TIF_31BIT 20 /* 32bit process */
+#define TIF_SINGLE_STEP 21 /* This task is single stepped */
+#define TIF_BLOCK_STEP 22 /* This task is block stepped */
+#define TIF_UPROBE_SINGLESTEP 23 /* This task is uprobe single stepped */
+
+/* These could move over to SYSCALL_WORK bits, no? */
#define TIF_SYSCALL_TRACE 24 /* syscall trace active */
#define TIF_SYSCALL_AUDIT 25 /* syscall auditing active */
#define TIF_SECCOMP 26 /* secure computing */
#define TIF_SYSCALL_TRACEPOINT 27 /* syscall tracepoint instrumentation */
-#define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME)
-#define _TIF_SIGPENDING BIT(TIF_SIGPENDING)
-#define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED)
-#define _TIF_NEED_RESCHED_LAZY BIT(TIF_NEED_RESCHED_LAZY)
-#define _TIF_UPROBE BIT(TIF_UPROBE)
-#define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING)
#define _TIF_ASCE_PRIMARY BIT(TIF_ASCE_PRIMARY)
-#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL)
#define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE)
#define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST)
#define _TIF_PER_TRAP BIT(TIF_PER_TRAP)
#define _TIF_31BIT BIT(TIF_31BIT)
-#define _TIF_MEMDIE BIT(TIF_MEMDIE)
-#define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK)
#define _TIF_SINGLE_STEP BIT(TIF_SINGLE_STEP)
#define _TIF_BLOCK_STEP BIT(TIF_BLOCK_STEP)
#define _TIF_UPROBE_SINGLESTEP BIT(TIF_UPROBE_SINGLESTEP)
There is no point in defining the generic items locally, and the upcoming
RSEQ optimizations are only available with this _and_ the generic entry
infrastructure, which is already used by loongarch. So no further action is
required here.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Huacai Chen <chenhuacai@kernel.org>
---
arch/loongarch/Kconfig | 1
arch/loongarch/include/asm/thread_info.h | 76 +++++++++++++------------------
2 files changed, 35 insertions(+), 42 deletions(-)
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -140,6 +140,7 @@ config LOONGARCH
select HAVE_EBPF_JIT
select HAVE_EFFICIENT_UNALIGNED_ACCESS if !ARCH_STRICT_ALIGN
select HAVE_EXIT_THREAD
+ select HAVE_GENERIC_TIF_BITS
select HAVE_GUP_FAST
select HAVE_FTRACE_GRAPH_FUNC
select HAVE_FUNCTION_ARG_ACCESS_API
--- a/arch/loongarch/include/asm/thread_info.h
+++ b/arch/loongarch/include/asm/thread_info.h
@@ -65,50 +65,42 @@ register unsigned long current_stack_poi
* access
* - pending work-to-be-done flags are in LSW
* - other flags in MSW
+ *
+ * Tell the generic TIF infrastructure which special bits loongarch supports
*/
-#define TIF_NEED_RESCHED 0 /* rescheduling necessary */
-#define TIF_NEED_RESCHED_LAZY 1 /* lazy rescheduling necessary */
-#define TIF_SIGPENDING 2 /* signal pending */
-#define TIF_NOTIFY_RESUME 3 /* callback before returning to user */
-#define TIF_NOTIFY_SIGNAL 4 /* signal notifications exist */
-#define TIF_RESTORE_SIGMASK 5 /* restore signal mask in do_signal() */
-#define TIF_NOHZ 6 /* in adaptive nohz mode */
-#define TIF_UPROBE 7 /* breakpointed or singlestepping */
-#define TIF_USEDFPU 8 /* FPU was used by this task this quantum (SMP) */
-#define TIF_USEDSIMD 9 /* SIMD has been used this quantum */
-#define TIF_MEMDIE 10 /* is terminating due to OOM killer */
-#define TIF_FIXADE 11 /* Fix address errors in software */
-#define TIF_LOGADE 12 /* Log address errors to syslog */
-#define TIF_32BIT_REGS 13 /* 32-bit general purpose registers */
-#define TIF_32BIT_ADDR 14 /* 32-bit address space */
-#define TIF_LOAD_WATCH 15 /* If set, load watch registers */
-#define TIF_SINGLESTEP 16 /* Single Step */
-#define TIF_LSX_CTX_LIVE 17 /* LSX context must be preserved */
-#define TIF_LASX_CTX_LIVE 18 /* LASX context must be preserved */
-#define TIF_USEDLBT 19 /* LBT was used by this task this quantum (SMP) */
-#define TIF_LBT_CTX_LIVE 20 /* LBT context must be preserved */
-#define TIF_PATCH_PENDING 21 /* pending live patching update */
+#define HAVE_TIF_NEED_RESCHED_LAZY
+#define HAVE_TIF_RESTORE_SIGMASK
-#define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
-#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
-#define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
-#define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
-#define _TIF_NOTIFY_SIGNAL (1<<TIF_NOTIFY_SIGNAL)
-#define _TIF_NOHZ (1<<TIF_NOHZ)
-#define _TIF_UPROBE (1<<TIF_UPROBE)
-#define _TIF_USEDFPU (1<<TIF_USEDFPU)
-#define _TIF_USEDSIMD (1<<TIF_USEDSIMD)
-#define _TIF_FIXADE (1<<TIF_FIXADE)
-#define _TIF_LOGADE (1<<TIF_LOGADE)
-#define _TIF_32BIT_REGS (1<<TIF_32BIT_REGS)
-#define _TIF_32BIT_ADDR (1<<TIF_32BIT_ADDR)
-#define _TIF_LOAD_WATCH (1<<TIF_LOAD_WATCH)
-#define _TIF_SINGLESTEP (1<<TIF_SINGLESTEP)
-#define _TIF_LSX_CTX_LIVE (1<<TIF_LSX_CTX_LIVE)
-#define _TIF_LASX_CTX_LIVE (1<<TIF_LASX_CTX_LIVE)
-#define _TIF_USEDLBT (1<<TIF_USEDLBT)
-#define _TIF_LBT_CTX_LIVE (1<<TIF_LBT_CTX_LIVE)
-#define _TIF_PATCH_PENDING (1<<TIF_PATCH_PENDING)
+#include <asm-generic/thread_info_tif.h>
+
+/* Architecture specific bits */
+#define TIF_NOHZ 16 /* in adaptive nohz mode */
+#define TIF_USEDFPU 17 /* FPU was used by this task this quantum (SMP) */
+#define TIF_USEDSIMD 18 /* SIMD has been used this quantum */
+#define TIF_FIXADE 19 /* Fix address errors in software */
+#define TIF_LOGADE 20 /* Log address errors to syslog */
+#define TIF_32BIT_REGS 21 /* 32-bit general purpose registers */
+#define TIF_32BIT_ADDR 22 /* 32-bit address space */
+#define TIF_LOAD_WATCH 23 /* If set, load watch registers */
+#define TIF_SINGLESTEP 24 /* Single Step */
+#define TIF_LSX_CTX_LIVE 25 /* LSX context must be preserved */
+#define TIF_LASX_CTX_LIVE 26 /* LASX context must be preserved */
+#define TIF_USEDLBT 27 /* LBT was used by this task this quantum (SMP) */
+#define TIF_LBT_CTX_LIVE 28 /* LBT context must be preserved */
+
+#define _TIF_NOHZ BIT(TIF_NOHZ)
+#define _TIF_USEDFPU BIT(TIF_USEDFPU)
+#define _TIF_USEDSIMD BIT(TIF_USEDSIMD)
+#define _TIF_FIXADE BIT(TIF_FIXADE)
+#define _TIF_LOGADE BIT(TIF_LOGADE)
+#define _TIF_32BIT_REGS BIT(TIF_32BIT_REGS)
+#define _TIF_32BIT_ADDR BIT(TIF_32BIT_ADDR)
+#define _TIF_LOAD_WATCH BIT(TIF_LOAD_WATCH)
+#define _TIF_SINGLESTEP BIT(TIF_SINGLESTEP)
+#define _TIF_LSX_CTX_LIVE BIT(TIF_LSX_CTX_LIVE)
+#define _TIF_LASX_CTX_LIVE BIT(TIF_LASX_CTX_LIVE)
+#define _TIF_USEDLBT BIT(TIF_USEDLBT)
+#define _TIF_LBT_CTX_LIVE BIT(TIF_LBT_CTX_LIVE)
#endif /* __KERNEL__ */
#endif /* _ASM_THREAD_INFO_H */
There is no point in defining the generic items locally, and the upcoming
RSEQ optimizations are only available with this _and_ the generic entry
infrastructure, which is already used by RISCV. So no further action is
required here.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
---
arch/riscv/Kconfig | 1 +
arch/riscv/include/asm/thread_info.h | 29 ++++++++++++-----------------
2 files changed, 13 insertions(+), 17 deletions(-)
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -161,6 +161,7 @@ config RISCV
select HAVE_FUNCTION_GRAPH_FREGS
select HAVE_FUNCTION_TRACER if !XIP_KERNEL && HAVE_DYNAMIC_FTRACE
select HAVE_EBPF_JIT if MMU
+ select HAVE_GENERIC_TIF_BITS
select HAVE_GUP_FAST if MMU
select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_FUNCTION_ERROR_INJECTION
--- a/arch/riscv/include/asm/thread_info.h
+++ b/arch/riscv/include/asm/thread_info.h
@@ -107,23 +107,18 @@ int arch_dup_task_struct(struct task_str
* - pending work-to-be-done flags are in lowest half-word
* - other flags in upper half-word(s)
*/
-#define TIF_NEED_RESCHED 0 /* rescheduling necessary */
-#define TIF_NEED_RESCHED_LAZY 1 /* Lazy rescheduling needed */
-#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
-#define TIF_SIGPENDING 3 /* signal pending */
-#define TIF_RESTORE_SIGMASK 4 /* restore signal mask in do_signal() */
-#define TIF_MEMDIE 5 /* is terminating due to OOM killer */
-#define TIF_NOTIFY_SIGNAL 9 /* signal notifications exist */
-#define TIF_UPROBE 10 /* uprobe breakpoint or singlestep */
-#define TIF_32BIT 11 /* compat-mode 32bit process */
-#define TIF_RISCV_V_DEFER_RESTORE 12 /* restore Vector before returing to user */
-#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
-#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
-#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
-#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
-#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
-#define _TIF_UPROBE (1 << TIF_UPROBE)
-#define _TIF_RISCV_V_DEFER_RESTORE (1 << TIF_RISCV_V_DEFER_RESTORE)
+/*
+ * Tell the generic TIF infrastructure which bits riscv supports
+ */
+#define HAVE_TIF_NEED_RESCHED_LAZY
+#define HAVE_TIF_RESTORE_SIGMASK
+
+#include <asm-generic/thread_info_tif.h>
+
+#define TIF_32BIT 16 /* compat-mode 32bit process */
+#define TIF_RISCV_V_DEFER_RESTORE 17 /* restore Vector before returning to user */
+
+#define _TIF_RISCV_V_DEFER_RESTORE BIT(TIF_RISCV_V_DEFER_RESTORE)
#endif /* _ASM_RISCV_THREAD_INFO_H */
TIF_NOTIFY_RESUME is a multiplexing TIF bit, which is suboptimal especially
with the RSEQ fast path depending on it without really handling it.
Define a separate TIF_RSEQ in the generic TIF space and enable the full
separation of fast and slow path for architectures which utilize that.
That avoids the hassle with invocations of resume_user_mode_work() from
hypervisors, which clear TIF_NOTIFY_RESUME. It makes the otherwise required
re-evaluation at the end of vcpu_run() a NOOP on architectures which
utilize the generic TIF space and therefore have a separate TIF_RSEQ.
The hypervisor TIF handling does not include the separate TIF_RSEQ as there
is no point in doing so. The guest neither knows nor cares about the VMM
host application's RSEQ state. That state is only relevant when the ioctl()
returns to user space.
The fastpath implementation still utilizes TIF_NOTIFY_RESUME for failure
handling, but that only happens within exit_to_user_mode_loop(), so
arguably the hypervisor ioctl() code is long done when it happens.
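To make the hypervisor angle concrete, here is a rough sketch of the vcpu
ioctl() return path this refers to; only the helper names match the kernel,
the surrounding structure and enter_guest() are illustrative placeholders:

static int enter_guest(void);	/* placeholder, not a real kernel API */

static int demo_vcpu_ioctl_run(void)
{
	int ret;

	/* resume_user_mode_work() clears TIF_NOTIFY_RESUME on entry */
	if (test_thread_flag(TIF_NOTIFY_RESUME))
		resume_user_mode_work(NULL);

	ret = enter_guest();

	/*
	 * NOOP on architectures with a dedicated TIF_RSEQ: the RSEQ fast
	 * path is keyed off TIF_RSEQ, which the work above never touched.
	 * Without it, TIF_NOTIFY_RESUME has to be re-raised here when
	 * required.
	 */
	rseq_virt_userspace_exit();
	return ret;
}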
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
V4: Adjust it to the new outer loop mechanism
V3: Updated the comment for rseq_virt_userspace_exit() - Sean
Added a static assert for TIF_RSEQ != TIF_NOTIFY_RESUME - Sean
---
include/asm-generic/thread_info_tif.h | 3 +++
include/linux/irq-entry-common.h | 2 +-
include/linux/rseq.h | 32 ++++++++++++++++++++++++--------
include/linux/rseq_entry.h | 29 +++++++++++++++++++++++++++--
include/linux/thread_info.h | 5 +++++
kernel/entry/common.c | 10 ++++++++--
6 files changed, 68 insertions(+), 13 deletions(-)
--- a/include/asm-generic/thread_info_tif.h
+++ b/include/asm-generic/thread_info_tif.h
@@ -45,4 +45,7 @@
# define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK)
#endif
+#define TIF_RSEQ 11 // Run RSEQ fast path
+#define _TIF_RSEQ BIT(TIF_RSEQ)
+
#endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -30,7 +30,7 @@
#define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \
- _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
+ _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ | \
ARCH_EXIT_TO_USER_MODE_WORK)
/**
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -40,7 +40,7 @@ static inline void rseq_signal_deliver(s
static inline void rseq_raise_notify_resume(struct task_struct *t)
{
- set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+ set_tsk_thread_flag(t, TIF_RSEQ);
}
/* Invoked from context switch to force evaluation on exit to user */
@@ -112,17 +112,25 @@ static inline void rseq_force_update(voi
/*
* KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
- * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in
- * that case just to do it eventually again before returning to user space,
- * the entry resume_user_mode_work() invocation is ignored as the register
- * argument is NULL.
+ * which clears TIF_NOTIFY_RESUME on architectures that don't use the
+ * generic TIF bits and therefore can't provide a separate TIF_RSEQ flag.
*
- * After returning from guest mode, they have to invoke this function to
- * re-raise TIF_NOTIFY_RESUME if necessary.
+ * Updating user space RSEQ in that case, just to do it again before the
+ * final return to user space, is avoided: __rseq_handle_slowpath() does
+ * nothing when invoked with NULL register state.
+ *
+ * After returning from guest mode, before exiting to userspace, hypervisors
+ * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary.
*/
static inline void rseq_virt_userspace_exit(void)
{
- if (current->rseq.event.sched_switch)
+ /*
+ * The generic optimization for deferring RSEQ updates until the next
+ * exit relies on having a dedicated TIF_RSEQ.
+ */
+ if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) &&
+ current->rseq.event.sched_switch)
rseq_raise_notify_resume(current);
}
@@ -151,8 +159,16 @@ static inline void rseq_fork(struct task
/*
* If it has rseq, force it into the slow path right away
* because it is guaranteed to fault.
+ *
+ * Setting TIF_NOTIFY_RESUME is redundant but harmless for
+ * architectures which do not have a separate TIF_RSEQ, but
+ * for those that do it's required to enforce the slow path
+ * as the scheduler sets only TIF_RSEQ.
*/
- t->rseq.event.slowpath = t->rseq.event.has_rseq;
+ if (t->rseq.event.has_rseq) {
+ t->rseq.event.slowpath = true;
+ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+ }
}
}
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -522,11 +522,36 @@ static __always_inline bool __rseq_exit_
return true;
}
-static __always_inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs)
+/*
+ * Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS
+ * as that's not upstream yet.
+ */
+#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
+static __always_inline bool test_tif_rseq(unsigned long ti_work)
{
+ return ti_work & _TIF_RSEQ;
+}
+
+static __always_inline void clear_tif_rseq(void)
+{
+ static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME);
+ clear_thread_flag(TIF_RSEQ);
+}
+#else
+static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; }
+static __always_inline void clear_tif_rseq(void) { }
+#endif
+
+static __always_inline bool
+rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
+{
+ if (likely(!test_tif_rseq(ti_work)))
+ return false;
+
if (unlikely(__rseq_exit_to_user_mode_restart(regs)))
return true;
+ clear_tif_rseq();
return false;
}
@@ -570,7 +595,7 @@ static inline void rseq_debug_syscall_re
}
#else /* CONFIG_RSEQ */
static inline void rseq_note_user_irq_entry(void) { }
-static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs)
+static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
return false;
}
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -67,6 +67,11 @@ enum syscall_work_bit {
#define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED
#endif
+#ifndef TIF_RSEQ
+# define TIF_RSEQ TIF_NOTIFY_RESUME
+# define _TIF_RSEQ _TIF_NOTIFY_RESUME
+#endif
+
#ifdef __KERNEL__
#ifndef arch_set_restart_data
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -11,6 +11,12 @@
/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
+#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
+#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ)
+#else
+#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK)
+#endif
+
static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
{
@@ -18,7 +24,7 @@ static __always_inline unsigned long __e
* Before returning to user space ensure that all pending work
* items have been completed.
*/
- while (ti_work & EXIT_TO_USER_MODE_WORK) {
+ while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) {
local_irq_enable_exit_to_user(ti_work);
@@ -68,7 +74,7 @@ static __always_inline unsigned long __e
for (;;) {
ti_work = __exit_to_user_mode_loop(regs, ti_work);
- if (likely(!rseq_exit_to_user_mode_restart(regs)))
+ if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work)))
return ti_work;
ti_work = read_thread_flags();
}