[patch V2 25/37] rseq: Rework the TIF_NOTIFY handler

Thomas Gleixner posted 37 patches 1 month, 1 week ago
There is a newer version of this series
[patch V2 25/37] rseq: Rework the TIF_NOTIFY handler
Posted by Thomas Gleixner 1 month, 1 week ago
Replace the whole logic with the new implementation, which is shared with
signal delivery and the upcoming exit fast path.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/rseq.c |   78 +++++++++++++++++++++++++---------------------------------
 1 file changed, 34 insertions(+), 44 deletions(-)

--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -82,12 +82,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/rseq.h>
 
-#ifdef CONFIG_MEMBARRIER
-# define RSEQ_EVENT_GUARD	irq
-#else
-# define RSEQ_EVENT_GUARD	preempt
-#endif
-
 DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
 
 static inline void rseq_control_debug(bool on)
@@ -236,38 +230,15 @@ static bool rseq_handle_cs(struct task_s
 	return rseq_update_user_cs(t, regs, csaddr);
 }
 
-/*
- * This resume handler must always be executed between any of:
- * - preemption,
- * - signal delivery,
- * and return to user-space.
- *
- * This is how we can ensure that the entire rseq critical section
- * will issue the commit instruction only if executed atomically with
- * respect to other threads scheduled on the same CPU, and with respect
- * to signal handlers.
- */
-void __rseq_handle_notify_resume(struct pt_regs *regs)
+static void rseq_slowpath_update_usr(struct pt_regs *regs)
 {
+	/* Preserve rseq state and user_irq state for exit to user */
+	const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
 	struct task_struct *t = current;
 	struct rseq_ids ids;
 	u32 node_id;
 	bool event;
 
-	/*
-	 * If invoked from hypervisors before entering the guest via
-	 * resume_user_mode_work(), then @regs is a NULL pointer.
-	 *
-	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
-	 * it before returning from the ioctl() to user space when
-	 * rseq_event.sched_switch is set.
-	 *
-	 * So it's safe to ignore here instead of pointlessly updating it
-	 * in the vcpu_run() loop.
-	 */
-	if (!regs)
-		return;
-
 	if (unlikely(t->flags & PF_EXITING))
 		return;
 
@@ -291,26 +262,45 @@ void __rseq_handle_notify_resume(struct
 	 * with the result handed in to allow the detection of
 	 * inconsistencies.
 	 */
-	scoped_guard(RSEQ_EVENT_GUARD) {
-		event = t->rseq_event.sched_switch;
-		t->rseq_event.sched_switch = false;
+	scoped_guard(irq) {
 		ids.cpu_id = task_cpu(t);
 		ids.mm_cid = task_mm_cid(t);
+		event = t->rseq_event.sched_switch;
+		t->rseq_event.all &= evt_mask.all;
 	}
 
-	if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event)
+	if (!event)
 		return;
 
-	if (!rseq_handle_cs(t, regs))
-		goto error;
-
 	node_id = cpu_to_node(ids.cpu_id);
-	if (!rseq_set_uids(t, &ids, node_id))
-		goto error;
-	return;
 
-error:
-	force_sig(SIGSEGV);
+	if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) {
+		/*
+		 * Clear the errors just in case this might survive magically, but
+		 * leave the rest intact.
+		 */
+		t->rseq_event.error = 0;
+		force_sig(SIGSEGV);
+	}
+}
+
+void __rseq_handle_notify_resume(struct pt_regs *regs)
+{
+	/*
+	 * If invoked from hypervisors before entering the guest via
+	 * resume_user_mode_work(), then @regs is a NULL pointer.
+	 *
+	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
+	 * it before returning from the ioctl() to user space when
+	 * rseq_event.sched_switch is set.
+	 *
+	 * So it's safe to ignore here instead of pointlessly updating it
+	 * in the vcpu_run() loop.
+	 */
+	if (!regs)
+		return;
+
+	rseq_slowpath_update_usr(regs);
 }
 
 void __rseq_signal_deliver(int sig, struct pt_regs *regs)
Re: [patch V2 25/37] rseq: Rework the TIF_NOTIFY handler
Posted by Mathieu Desnoyers 1 month, 1 week ago
On 2025-08-23 12:40, Thomas Gleixner wrote:
> Replace the whole logic with the new implementation, which is shared with
> signal delivery and the upcoming exit fast path.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>   kernel/rseq.c |   78 +++++++++++++++++++++++++---------------------------------
>   1 file changed, 34 insertions(+), 44 deletions(-)
> 
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -82,12 +82,6 @@
>   #define CREATE_TRACE_POINTS
>   #include <trace/events/rseq.h>
>   
> -#ifdef CONFIG_MEMBARRIER
> -# define RSEQ_EVENT_GUARD	irq
> -#else
> -# define RSEQ_EVENT_GUARD	preempt
> -#endif
> -
>   DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
>   
>   static inline void rseq_control_debug(bool on)
> @@ -236,38 +230,15 @@ static bool rseq_handle_cs(struct task_s
>   	return rseq_update_user_cs(t, regs, csaddr);
>   }
>   
> -/*
> - * This resume handler must always be executed between any of:
> - * - preemption,
> - * - signal delivery,
> - * and return to user-space.
> - *
> - * This is how we can ensure that the entire rseq critical section
> - * will issue the commit instruction only if executed atomically with
> - * respect to other threads scheduled on the same CPU, and with respect
> - * to signal handlers.
> - */
> -void __rseq_handle_notify_resume(struct pt_regs *regs)
> +static void rseq_slowpath_update_usr(struct pt_regs *regs)
>   {
> +	/* Preserve rseq state and user_irq state for exit to user */
> +	const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
>   	struct task_struct *t = current;
>   	struct rseq_ids ids;
>   	u32 node_id;
>   	bool event;
>   
> -	/*
> -	 * If invoked from hypervisors before entering the guest via
> -	 * resume_user_mode_work(), then @regs is a NULL pointer.
> -	 *
> -	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
> -	 * it before returning from the ioctl() to user space when
> -	 * rseq_event.sched_switch is set.
> -	 *
> -	 * So it's safe to ignore here instead of pointlessly updating it
> -	 * in the vcpu_run() loop.
> -	 */
> -	if (!regs)
> -		return;
> -
>   	if (unlikely(t->flags & PF_EXITING))
>   		return;
>   
> @@ -291,26 +262,45 @@ void __rseq_handle_notify_resume(struct
>   	 * with the result handed in to allow the detection of
>   	 * inconsistencies.
>   	 */
> -	scoped_guard(RSEQ_EVENT_GUARD) {
> -		event = t->rseq_event.sched_switch;
> -		t->rseq_event.sched_switch = false;
> +	scoped_guard(irq) {
>   		ids.cpu_id = task_cpu(t);
>   		ids.mm_cid = task_mm_cid(t);
> +		event = t->rseq_event.sched_switch;
> +		t->rseq_event.all &= evt_mask.all;
>   	}
>   
> -	if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event)
> +	if (!event)
>   		return;
>   
> -	if (!rseq_handle_cs(t, regs))
> -		goto error;
> -
>   	node_id = cpu_to_node(ids.cpu_id);
> -	if (!rseq_set_uids(t, &ids, node_id))
> -		goto error;
> -	return;
>   
> -error:
> -	force_sig(SIGSEGV);
> +	if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) {
> +		/*
> +		 * Clear the errors just in case this might survive magically, but
> +		 * leave the rest intact.
> +		 */
> +		t->rseq_event.error = 0;
> +		force_sig(SIGSEGV);
> +	}
> +}
> +
> +void __rseq_handle_notify_resume(struct pt_regs *regs)
> +{
> +	/*
> +	 * If invoked from hypervisors before entering the guest via
> +	 * resume_user_mode_work(), then @regs is a NULL pointer.
> +	 *
> +	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
> +	 * it before returning from the ioctl() to user space when
> +	 * rseq_event.sched_switch is set.
> +	 *
> +	 * So it's safe to ignore here instead of pointlessly updating it
> +	 * in the vcpu_run() loop.

I don't think any virt user should expect the userspace fields to be
updated on the host process while running in guest mode, but it's good
to clarify that we intend to change this user-visible behavior within
this series, to spare any unwelcome surprise.

Other than that:

Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

Thanks,

Mathieu

> +	 */
> +	if (!regs)
> +		return;
> +
> +	rseq_slowpath_update_usr(regs);
>   }
>   
>   void __rseq_signal_deliver(int sig, struct pt_regs *regs)
> 


-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
Re: [patch V2 25/37] rseq: Rework the TIF_NOTIFY handler
Posted by Thomas Gleixner 1 month ago
On Tue, Aug 26 2025 at 11:12, Mathieu Desnoyers wrote:
> On 2025-08-23 12:40, Thomas Gleixner wrote:
>> +void __rseq_handle_notify_resume(struct pt_regs *regs)
>> +{
>> +	/*
>> +	 * If invoked from hypervisors before entering the guest via
>> +	 * resume_user_mode_work(), then @regs is a NULL pointer.
>> +	 *
>> +	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
>> +	 * it before returning from the ioctl() to user space when
>> +	 * rseq_event.sched_switch is set.
>> +	 *
>> +	 * So it's safe to ignore here instead of pointlessly updating it
>> +	 * in the vcpu_run() loop.
>
> I don't think any virt user should expect the userspace fields to be
> updated on the host process while running in guest mode, but it's good
> to clarify that we intend to change this user-visible behavior within
> this series, to spare any unwelcome surprise.

Actually it is not really a user-visible change.

TLS::rseq is thread local and any update to it becomes only visible to
user space once the vCPU thread actually returns to user space. Arguably
no guest has legitimately access to the hosts VCPU thread's TLS.

You might argue, that GDB might look at the thread's TLS::rseq while the
task runs in VCPUs guest mode. But that's completely irrelevant because
once a task enters the kernel the RSEQ CPU/NODE/MM ids have no meaning
anymore. They are only valid as long as the task runs in user space.
When a task hits a breakpoint GDB can only look at the state _before_
that and that's all what it can see when it looks at the TLS of a
thread, which voluntarily went into the kernel via the KVM ioctl.

That update is truly a kernel internal implementation detail and it got
introduced way _after_ the initial RSEQ implementation.

Before 5.9 KVM ignored most of the pending TIF work including
TIF_NOTIFY_RESUME. Once that got fixed it turned out that handling the
other TIF_NOTIFY_RESUME work could result in losing an RSEQ update. To
cure that the rseq handler got pulled in to that TIF_NOTIFY_RESUME
demultiplexing function and gained that NULL pointer check inside to
exclude the critical section check.

In hindsight RSEQ should have used a separate TIF bit right from the
beginning, but that's water under the bridge...

Thanks,

        tglx
Re: [patch V2 25/37] rseq: Rework the TIF_NOTIFY handler
Posted by Sean Christopherson 4 weeks, 1 day ago
On Tue, Sep 02, 2025, Thomas Gleixner wrote:
> On Tue, Aug 26 2025 at 11:12, Mathieu Desnoyers wrote:
> > On 2025-08-23 12:40, Thomas Gleixner wrote:
> >> +void __rseq_handle_notify_resume(struct pt_regs *regs)
> >> +{
> >> +	/*
> >> +	 * If invoked from hypervisors before entering the guest via
> >> +	 * resume_user_mode_work(), then @regs is a NULL pointer.
> >> +	 *
> >> +	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
> >> +	 * it before returning from the ioctl() to user space when
> >> +	 * rseq_event.sched_switch is set.
> >> +	 *
> >> +	 * So it's safe to ignore here instead of pointlessly updating it
> >> +	 * in the vcpu_run() loop.
> >
> > I don't think any virt user should expect the userspace fields to be
> > updated on the host process while running in guest mode, but it's good
> > to clarify that we intend to change this user-visible behavior within
> > this series, to spare any unwelcome surprise.
> 
> Actually it is not really a user-visible change.

It's definitely a user-visible change in the sense that userspace, via the guest,
will see different behavior.

> TLS::rseq is thread local and any update to it becomes only visible to
> user space once the vCPU thread actually returns to user space. Arguably
> no guest has legitimately access to the hosts VCPU thread's TLS.
> 
> You might argue, that GDB might look at the thread's TLS::rseq while the
> task runs in VCPUs guest mode. But that's completely irrelevant because
> once a task enters the kernel the RSEQ CPU/NODE/MM ids have no meaning
> anymore. They are only valid as long as the task runs in user space.

Paravirt setups, e.g. hoisting host-controlled workloads into VMs, have explored
(ab)using rseq.  In such setups, host threads are often mapped 1:1 to vCPUs, in
which case the pCPU in particular becomes interesting.

> When a task hits a breakpoint GDB can only look at the state _before_
> that and that's all what it can see when it looks at the TLS of a
> thread, which voluntarily went into the kernel via the KVM ioctl.
> 
> That update is truly a kernel internal implementation detail and it got
> introduced way _after_ the initial RSEQ implementation.

Yes, but that doesn't change the fact that a user _could_ have come to depend on
the current behavior sometime in the last ~5 years.

I'm ok formally stating that exposing rseq directly to a KVM guest is unsupported,
but I would like to explicitly call out and document the change.
Re: [patch V2 25/37] rseq: Rework the TIF_NOTIFY handler
Posted by Thomas Gleixner 4 weeks, 1 day ago
On Thu, Sep 04 2025 at 02:52, Sean Christopherson wrote:
> On Tue, Sep 02, 2025, Thomas Gleixner wrote:
>> > I don't think any virt user should expect the userspace fields to be
>> > updated on the host process while running in guest mode, but it's good
>> > to clarify that we intend to change this user-visible behavior within
>> > this series, to spare any unwelcome surprise.
>> 
>> Actually it is not really a user-visible change.
>
> It's definitely a user-visible change in the sense that userspace, via the guest,
> will see different behavior.
>
>> TLS::rseq is thread local and any update to it becomes only visible to
>> user space once the vCPU thread actually returns to user space. Arguably
>> no guest has legitimately access to the hosts VCPU thread's TLS.
>> 
>> You might argue, that GDB might look at the thread's TLS::rseq while the
>> task runs in VCPUs guest mode. But that's completely irrelevant because
>> once a task enters the kernel the RSEQ CPU/NODE/MM ids have no meaning
>> anymore. They are only valid as long as the task runs in user space.
>
> Paravirt setups, e.g. hoisting host-controlled workloads into VMs, have explored
> (ab)using rseq.  In such setups, host threads are often mapped 1:1 to vCPUs, in
> which case the pCPU in particular becomes interesting.

Why am I not suprised?

>> When a task hits a breakpoint GDB can only look at the state _before_
>> that and that's all what it can see when it looks at the TLS of a
>> thread, which voluntarily went into the kernel via the KVM ioctl.
>> 
>> That update is truly a kernel internal implementation detail and it got
>> introduced way _after_ the initial RSEQ implementation.
>
> Yes, but that doesn't change the fact that a user _could_ have come to depend on
> the current behavior sometime in the last ~5 years.

So it depends on a kernel internal implementation detail which happened
to be introduced by chance rather by design and without any guaranteed
behaviour vs. a guest.

> I'm ok formally stating that exposing rseq directly to a KVM guest is unsupported,
> but I would like to explicitly call out and document the change.

Fair enough. I've amended the change log accordingly.

If that turns out to be a real world problem, then it needs to be
brought back explicitly into the virt TIF work handling code, but I
prefer not to :)

Thanks,

        tglx
Re: [patch V2 25/37] rseq: Rework the TIF_NOTIFY handler
Posted by Mathieu Desnoyers 4 weeks, 1 day ago
On 2025-09-04 06:53, Thomas Gleixner wrote:
> On Thu, Sep 04 2025 at 02:52, Sean Christopherson wrote:
>> On Tue, Sep 02, 2025, Thomas Gleixner wrote:
>>>> I don't think any virt user should expect the userspace fields to be
>>>> updated on the host process while running in guest mode, but it's good
>>>> to clarify that we intend to change this user-visible behavior within
>>>> this series, to spare any unwelcome surprise.
>>>
>>> Actually it is not really a user-visible change.
>>
>> It's definitely a user-visible change in the sense that userspace, via the guest,
>> will see different behavior.
>>
>>> TLS::rseq is thread local and any update to it becomes only visible to
>>> user space once the vCPU thread actually returns to user space. Arguably
>>> no guest has legitimately access to the hosts VCPU thread's TLS.
>>>
>>> You might argue, that GDB might look at the thread's TLS::rseq while the
>>> task runs in VCPUs guest mode. But that's completely irrelevant because
>>> once a task enters the kernel the RSEQ CPU/NODE/MM ids have no meaning
>>> anymore. They are only valid as long as the task runs in user space.
>>
>> Paravirt setups, e.g. hoisting host-controlled workloads into VMs, have explored
>> (ab)using rseq.  In such setups, host threads are often mapped 1:1 to vCPUs, in
>> which case the pCPU in particular becomes interesting.
> 
> Why am I not suprised?
> 
>>> When a task hits a breakpoint GDB can only look at the state _before_
>>> that and that's all what it can see when it looks at the TLS of a
>>> thread, which voluntarily went into the kernel via the KVM ioctl.
>>>
>>> That update is truly a kernel internal implementation detail and it got
>>> introduced way _after_ the initial RSEQ implementation.
>>
>> Yes, but that doesn't change the fact that a user _could_ have come to depend on
>> the current behavior sometime in the last ~5 years.
> 
> So it depends on a kernel internal implementation detail which happened
> to be introduced by chance rather by design and without any guaranteed
> behaviour vs. a guest.
> 
>> I'm ok formally stating that exposing rseq directly to a KVM guest is unsupported,
>> but I would like to explicitly call out and document the change.
> 
> Fair enough. I've amended the change log accordingly.
> 
> If that turns out to be a real world problem, then it needs to be
> brought back explicitly into the virt TIF work handling code, but I
> prefer not to :)

That works for me !

Thanks,

Mathieu

> 
> Thanks,
> 
>          tglx


-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com