[patch V2 36/37] rseq: Switch to TIF_RSEQ if supported

Thomas Gleixner posted 37 patches 1 month, 1 week ago
There is a newer version of this series
[patch V2 36/37] rseq: Switch to TIF_RSEQ if supported
Posted by Thomas Gleixner 1 month, 1 week ago
TIF_NOTIFY_RESUME is a multiplexing TIF bit, which is suboptimal especially
with the RSEQ fast path depending on it, but not really handling it.

Define a separate TIF_RSEQ in the generic TIF space and enable the full
separation of fast and slow path for architectures which utilize that.

That avoids the hassle with invocations of resume_user_mode_work() from
hypervisors, which clear TIF_NOTIFY_RESUME. It makes the therefore required
re-evaluation at the end of vcpu_run() a NOOP on architectures which
utilize the generic TIF space and have a separate TIF_RSEQ.

The hypervisor TIF handling does not include the separate TIF_RSEQ as there
is no point in doing so. The guest neither knows nor cares about the VMM
host application's RSEQ state. That state is only relevant when the ioctl()
returns to user space.

The fastpath implementation still utilizes TIF_NOTIFY_RESUME for failure
handling, but this only happens within exit_to_user_mode_loop(), so
arguably the hypervisor ioctl() code is long done when this happens.

This allows further optimizations for blocking syscall heavy workloads in a
subsequent step.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/asm-generic/thread_info_tif.h |    3 +++
 include/linux/irq-entry-common.h      |    2 +-
 include/linux/rseq.h                  |   13 ++++++++++---
 include/linux/rseq_entry.h            |   23 +++++++++++++++++++----
 include/linux/thread_info.h           |    5 +++++
 5 files changed, 38 insertions(+), 8 deletions(-)

--- a/include/asm-generic/thread_info_tif.h
+++ b/include/asm-generic/thread_info_tif.h
@@ -45,4 +45,7 @@
 # define _TIF_RESTORE_SIGMASK	BIT(TIF_RESTORE_SIGMASK)
 #endif
 
+#define TIF_RSEQ		11	// Run RSEQ fast path
+#define _TIF_RSEQ		BIT(TIF_RSEQ)
+
 #endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -30,7 +30,7 @@
 #define EXIT_TO_USER_MODE_WORK						\
 	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |		\
 	 _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY |			\
-	 _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL |			\
+	 _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ |		\
 	 ARCH_EXIT_TO_USER_MODE_WORK)
 
 /**
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -40,7 +40,7 @@ static inline void rseq_signal_deliver(s
 
 static inline void rseq_raise_notify_resume(struct task_struct *t)
 {
-	set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+	set_tsk_thread_flag(t, TIF_RSEQ);
 }
 
 /* Invoked from context switch to force evaluation on exit to user */
@@ -122,7 +122,7 @@ static inline void rseq_force_update(voi
  */
 static inline void rseq_virt_userspace_exit(void)
 {
-	if (current->rseq_event.sched_switch)
+	if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) && current->rseq_event.sched_switch)
 		rseq_raise_notify_resume(current);
 }
 
@@ -147,9 +147,16 @@ static inline void rseq_fork(struct task
 		/*
 		 * If it has rseq, force it into the slow path right away
 		 * because it is guaranteed to fault.
+		 *
+		 * Setting TIF_NOTIFY_RESUME is redundant but harmless for
+		 * architectures which do not have a separate TIF_RSEQ, but
+		 * for those which do it's required to enforce the slow path
+		 * as the scheduler sets only TIF_RSEQ.
 		 */
-		if (t->rseq_event.has_rseq)
+		if (t->rseq_event.has_rseq) {
 			t->rseq_event.slowpath = true;
+			set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+		}
 	}
 }
 
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -502,18 +502,33 @@ static __always_inline bool __rseq_exit_
 	return true;
 }
 
+#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
+# define CHECK_TIF_RSEQ		_TIF_RSEQ
+static __always_inline void clear_tif_rseq(void)
+{
+	clear_thread_flag(TIF_RSEQ);
+}
+#else
+# define CHECK_TIF_RSEQ		0UL
+static inline void clear_tif_rseq(void) { }
+#endif
+
 static __always_inline unsigned long
 rseq_exit_to_user_mode_work(struct pt_regs *regs, unsigned long ti_work, const unsigned long mask)
 {
 	/*
 	 * Check if all work bits have been cleared before handling rseq.
+	 *
+	 * In case of a separate TIF_RSEQ this checks for all other bits to
+	 * be cleared and TIF_RSEQ to be set.
 	 */
-	if ((ti_work & mask) != 0)
-		return ti_work;
-
-	if (likely(!__rseq_exit_to_user_mode_restart(regs)))
+	if ((ti_work & mask) != CHECK_TIF_RSEQ)
 		return ti_work;
 
+	if (likely(!__rseq_exit_to_user_mode_restart(regs))) {
+		clear_tif_rseq();
+		return ti_work & ~CHECK_TIF_RSEQ;
+	}
 	return ti_work | _TIF_NOTIFY_RESUME;
 }
 
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -67,6 +67,11 @@ enum syscall_work_bit {
 #define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED
 #endif
 
+#ifndef TIF_RSEQ
+# define TIF_RSEQ	TIF_NOTIFY_RESUME
+# define _TIF_RSEQ	_TIF_NOTIFY_RESUME
+#endif
+
 #ifdef __KERNEL__
 
 #ifndef arch_set_restart_data
Re: [patch V2 36/37] rseq: Switch to TIF_RSEQ if supported
Posted by Sean Christopherson 1 month, 1 week ago
On Sat, Aug 23, 2025, Thomas Gleixner wrote:
> @@ -122,7 +122,7 @@ static inline void rseq_force_update(voi
>   */
>  static inline void rseq_virt_userspace_exit(void)
>  {
> -	if (current->rseq_event.sched_switch)
> +	if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) && current->rseq_event.sched_switch)

Rather than pivot on CONFIG_HAVE_GENERIC_TIF_BITS, which makes the "why" quite
difficult to find/understand, what if this checks TIF_RSEQ == TIF_NOTIFY_RESUME?
That would also allow architectures to define TIF_RSEQ without switching to the
generic TIF bits implementation (though I don't know that we want to encourage
that?).

Updating the comment to explain what's going on would also be helpful, e.g.

diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index 185a4875b261..9a8e238ae9d1 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -112,17 +112,17 @@ static inline void rseq_force_update(void)
 
 /*
  * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
- * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in
- * that case just to do it eventually again before returning to user space,
- * the entry resume_user_mode_work() invocation is ignored as the register
- * argument is NULL.
+ * which clears TIF_NOTIFY_RESUME on architectures that don't provide a separate
+ * TIF_RSEQ flag. To avoid updating user space RSEQ in that case just to do it
+ * eventually again before returning to user space, __rseq_handle_slowpath()
+ * does nothing when invoked with NULL register state.
  *
- * After returning from guest mode, they have to invoke this function to
- * re-raise TIF_NOTIFY_RESUME if necessary.
+ * After returning from guest mode, before exiting to userspace, hypervisors
+ * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary.
  */
 static inline void rseq_virt_userspace_exit(void)
 {
-       if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) && current->rseq_event.sched_switch)
+       if (TIF_RSEQ == TIF_NOTIFY_RESUME && current->rseq_event.sched_switch)
                rseq_raise_notify_resume(current);
 }
 
>  		rseq_raise_notify_resume(current);
>  }
Re: [patch V2 36/37] rseq: Switch to TIF_RSEQ if supported
Posted by Thomas Gleixner 1 month ago
On Mon, Aug 25 2025 at 13:02, Sean Christopherson wrote:
> On Sat, Aug 23, 2025, Thomas Gleixner wrote:
>> @@ -122,7 +122,7 @@ static inline void rseq_force_update(voi
>>   */
>>  static inline void rseq_virt_userspace_exit(void)
>>  {
>> -	if (current->rseq_event.sched_switch)
>> +	if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) && current->rseq_event.sched_switch)
>
> Rather than pivot on CONFIG_HAVE_GENERIC_TIF_BITS, which makes the "why" quite
> difficult to find/understand, what if this checks TIF_RSEQ == TIF_NOTIFY_RESUME?
> That would also allow architectures to define TIF_RSEQ without switching to the
> generic TIF bits implementation (though I don't know that we want to encourage
> that?).

Did you read the cover letter?

Consolidating on common infrastructure is the goal here. Stop
proliferating the architecture specific hackery, which has zero value
and justification. If people want to harvest the core improvements, then
they should get their act together and mop up their architecture
code. If they can't be bothered, so be it.

I'm happy to add a comment which explains that.

Thanks,

        tglx
Re: [patch V2 36/37] rseq: Switch to TIF_RSEQ if supported
Posted by Sean Christopherson 4 weeks, 1 day ago
On Tue, Sep 02, 2025, Thomas Gleixner wrote:
> On Mon, Aug 25 2025 at 13:02, Sean Christopherson wrote:
> > On Sat, Aug 23, 2025, Thomas Gleixner wrote:
> >> @@ -122,7 +122,7 @@ static inline void rseq_force_update(voi
> >>   */
> >>  static inline void rseq_virt_userspace_exit(void)
> >>  {
> >> -	if (current->rseq_event.sched_switch)
> >> +	if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) && current->rseq_event.sched_switch)
> >
> > Rather than pivot on CONFIG_HAVE_GENERIC_TIF_BITS, which makes the "why" quite
> > difficult to find/understand, what if this checks TIF_RSEQ == TIF_NOTIFY_RESUME?
> > That would also allow architectures to define TIF_RSEQ without switching to the
> > generic TIF bits implementation (though I don't know that we want to encourage
> > that?).
> 
> Did you read the cover letter?

I read part of it :-)

> Consolidating on common infrastructure is the goal here. Stop
> proliferating the architecture specific hackery, which has zero value
> and justification. If people want to harvest the core improvements, then
> they should get their act together and mop up their architecture
> code. If they can't be bothered, so be it.

Definitely no argument on that front.

> I'm happy to add a comment which explains that.

And maybe a BUILD_BUG_ON() to assert that TIF_RSEQ != TIF_NOTIFY_RESUME?  My main
interest is documenting why the generic implementation doesn't need to re-raise
TIF_NOTIFY_RESUME.  E.g. something like this?

/*
 * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
 * which clears TIF_NOTIFY_RESUME on architectures that don't support the
 * generic TIF bits.  To avoid updating user space RSEQ in that case just
 * to do it eventually again before returning to user space,
 * __rseq_handle_slowpath() does nothing when invoked with NULL register state.
 *
 * After returning from guest mode, before exiting to userspace, hypervisors
 * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary.
 */
static inline void rseq_virt_userspace_exit(void)
{
	/*
	 * The generic optimization for deferring RSEQ updates until the next
	 * exit relies on having a dedicated TIF_RSEQ.
	 */
	if (IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS))
		BUILD_BUG_ON(TIF_RSEQ == TIF_NOTIFY_RESUME);
	else if (current->rseq_event.sched_switch)
		rseq_raise_notify_resume(current);
}
Re: [patch V2 36/37] rseq: Switch to TIF_RSEQ if supported
Posted by Thomas Gleixner 4 weeks, 1 day ago
On Thu, Sep 04 2025 at 03:08, Sean Christopherson wrote:
> On Tue, Sep 02, 2025, Thomas Gleixner wrote:
>> I'm happy to add a comment which explains that.
>
> And maybe a BUILD_BUG_ON() to assert that TIF_RSEQ != TIF_NOTIFY_RESUME?  My main
> interest is documenting why the generic implementation doesn't need to re-raise
> TIF_NOTIFY_RESUME.  E.g. something like this?

Done.
Re: [patch V2 36/37] rseq: Switch to TIF_RSEQ if supported
Posted by Mathieu Desnoyers 1 month, 1 week ago
On 2025-08-23 12:40, Thomas Gleixner wrote:
> TIF_NOTIFY_RESUME is a multiplexing TIF bit, which is suboptimal especially
> with the RSEQ fast path depending on it, but not really handling it.
> 
> Define a seperate TIF_RSEQ in the generic TIF space and enable the full
> seperation of fast and slow path for architectures which utilize that.
> 
> That avoids the hassle with invocations of resume_user_mode_work() from
> hypervisors, which clear TIF_NOTIFY_RESUME. It makes the therefore required
> re-evaluation at the end of vcpu_run() a NOOP on architectures which
> utilize the generic TIF space and have a seperate TIF_RSEQ.
> 
> The hypervisor TIF handling does not include the seperate TIF_RSEQ as there
> is no point in doing so. The guest does neither know nor care about the VMM
> host applications RSEQ state. That state is only relevant when the ioctl()
> returns to user space.
> 
> The fastpath implementation still utilizes TIF_NOTIFY_RESUME for failure
> handling, but this only happens within exit_to_user_mode_loop(), so
> arguably the hypervisor ioctl() code is long done when this happens.
> 
> This allows further optimizations for blocking syscall heavy workloads in a
> subsequent step.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

> ---
>   include/asm-generic/thread_info_tif.h |    3 +++
>   include/linux/irq-entry-common.h      |    2 +-
>   include/linux/rseq.h                  |   13 ++++++++++---
>   include/linux/rseq_entry.h            |   23 +++++++++++++++++++----
>   include/linux/thread_info.h           |    5 +++++
>   5 files changed, 38 insertions(+), 8 deletions(-)
> 
> --- a/include/asm-generic/thread_info_tif.h
> +++ b/include/asm-generic/thread_info_tif.h
> @@ -45,4 +45,7 @@
>   # define _TIF_RESTORE_SIGMASK	BIT(TIF_RESTORE_SIGMASK)
>   #endif
>   
> +#define TIF_RSEQ		11	// Run RSEQ fast path
> +#define _TIF_RSEQ		BIT(TIF_RSEQ)
> +
>   #endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */
> --- a/include/linux/irq-entry-common.h
> +++ b/include/linux/irq-entry-common.h
> @@ -30,7 +30,7 @@
>   #define EXIT_TO_USER_MODE_WORK						\
>   	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |		\
>   	 _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY |			\
> -	 _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL |			\
> +	 _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ |		\
>   	 ARCH_EXIT_TO_USER_MODE_WORK)
>   
>   /**
> --- a/include/linux/rseq.h
> +++ b/include/linux/rseq.h
> @@ -40,7 +40,7 @@ static inline void rseq_signal_deliver(s
>   
>   static inline void rseq_raise_notify_resume(struct task_struct *t)
>   {
> -	set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
> +	set_tsk_thread_flag(t, TIF_RSEQ);
>   }
>   
>   /* Invoked from context switch to force evaluation on exit to user */
> @@ -122,7 +122,7 @@ static inline void rseq_force_update(voi
>    */
>   static inline void rseq_virt_userspace_exit(void)
>   {
> -	if (current->rseq_event.sched_switch)
> +	if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) && current->rseq_event.sched_switch)
>   		rseq_raise_notify_resume(current);
>   }
>   
> @@ -147,9 +147,16 @@ static inline void rseq_fork(struct task
>   		/*
>   		 * If it has rseq, force it into the slow path right away
>   		 * because it is guaranteed to fault.
> +		 *
> +		 * Setting TIF_NOTIFY_RESUME is redundant but harmless for
> +		 * architectures which do not have a seperate TIF_RSEQ, but
> +		 * for those who do it's required to enforce the slow path
> +		 * as the scheduler sets only TIF_RSEQ.
>   		 */
> -		if (t->rseq_event.has_rseq)
> +		if (t->rseq_event.has_rseq) {
>   			t->rseq_event.slowpath = true;
> +			set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
> +		}
>   	}
>   }
>   
> --- a/include/linux/rseq_entry.h
> +++ b/include/linux/rseq_entry.h
> @@ -502,18 +502,33 @@ static __always_inline bool __rseq_exit_
>   	return true;
>   }
>   
> +#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
> +# define CHECK_TIF_RSEQ		_TIF_RSEQ
> +static __always_inline void clear_tif_rseq(void)
> +{
> +	clear_thread_flag(TIF_RSEQ);
> +}
> +#else
> +# define CHECK_TIF_RSEQ		0UL
> +static inline void clear_tif_rseq(void) { }
> +#endif
> +
>   static __always_inline unsigned long
>   rseq_exit_to_user_mode_work(struct pt_regs *regs, unsigned long ti_work, const unsigned long mask)
>   {
>   	/*
>   	 * Check if all work bits have been cleared before handling rseq.
> +	 *
> +	 * In case of a seperate TIF_RSEQ this checks for all other bits to
> +	 * be cleared and TIF_RSEQ to be set.
>   	 */
> -	if ((ti_work & mask) != 0)
> -		return ti_work;
> -
> -	if (likely(!__rseq_exit_to_user_mode_restart(regs)))
> +	if ((ti_work & mask) != CHECK_TIF_RSEQ)
>   		return ti_work;
>   
> +	if (likely(!__rseq_exit_to_user_mode_restart(regs))) {
> +		clear_tif_rseq();
> +		return ti_work & ~CHECK_TIF_RSEQ;
> +	}
>   	return ti_work | _TIF_NOTIFY_RESUME;
>   }
>   
> --- a/include/linux/thread_info.h
> +++ b/include/linux/thread_info.h
> @@ -67,6 +67,11 @@ enum syscall_work_bit {
>   #define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED
>   #endif
>   
> +#ifndef TIF_RSEQ
> +# define TIF_RSEQ	TIF_NOTIFY_RESUME
> +# define _TIF_RSEQ	_TIF_NOTIFY_RESUME
> +#endif
> +
>   #ifdef __KERNEL__
>   
>   #ifndef arch_set_restart_data
> 


-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com