For RSEQ the only relevant reason to inspect and eventually fixup (abort)
user space critical sections is when user space was interrupted and the
task was scheduled out.
If the user to kernel entry was from a syscall no fixup is required. If
user space invokes a syscall from a critical section it can keep the
pieces as documented.
This is only supported on architectures which utilize the generic entry
code. If your architecture does not use it, bad luck.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
include/linux/irq-entry-common.h | 3 ++-
include/linux/rseq.h | 16 +++++++++++-----
include/linux/rseq_entry.h | 18 ++++++++++++++++++
include/linux/rseq_types.h | 2 ++
4 files changed, 33 insertions(+), 6 deletions(-)
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -4,7 +4,7 @@
#include <linux/context_tracking.h>
#include <linux/kmsan.h>
-#include <linux/rseq.h>
+#include <linux/rseq_entry.h>
#include <linux/static_call_types.h>
#include <linux/syscalls.h>
#include <linux/tick.h>
@@ -281,6 +281,7 @@ static __always_inline void exit_to_user
static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
enter_from_user_mode(regs);
+ rseq_note_user_irq_entry();
}
/**
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -31,11 +31,17 @@ static inline void rseq_sched_switch_eve
static __always_inline void rseq_exit_to_user_mode(void)
{
- if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
- if (WARN_ON_ONCE(current->rseq.event.has_rseq &&
- current->rseq.event.events))
- current->rseq.event.events = 0;
- }
+ struct rseq_event *ev = &current->rseq.event;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
+ WARN_ON_ONCE(ev->sched_switch);
+
+ /*
+ * Ensure that event (especially user_irq) is cleared when the
+ * interrupt did not result in a schedule and therefore the
+ * rseq processing did not clear it.
+ */
+ ev->events = 0;
}
/*
--- /dev/null
+++ b/include/linux/rseq_entry.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RSEQ_ENTRY_H
+#define _LINUX_RSEQ_ENTRY_H
+
+#ifdef CONFIG_RSEQ
+#include <linux/rseq.h>
+
+static __always_inline void rseq_note_user_irq_entry(void)
+{
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
+ current->rseq.event.user_irq = true;
+}
+
+#else /* CONFIG_RSEQ */
+static inline void rseq_note_user_irq_entry(void) { }
+#endif /* !CONFIG_RSEQ */
+
+#endif /* _LINUX_RSEQ_ENTRY_H */
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -12,6 +12,7 @@ struct rseq;
* @all: Compound to initialize and clear the data efficiently
* @events: Compound to access events with a single load/store
* @sched_switch: True if the task was scheduled out
+ * @user_irq: True on interrupt entry from user mode
* @has_rseq: True if the task has a rseq pointer installed
*/
struct rseq_event {
@@ -22,6 +23,7 @@ struct rseq_event {
u16 events;
struct {
u8 sched_switch;
+ u8 user_irq;
};
};
On 2025-10-27 04:44, Thomas Gleixner wrote:
[...]
> @@ -281,6 +281,7 @@ static __always_inline void exit_to_user
> static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs)
> {
> enter_from_user_mode(regs);
> + rseq_note_user_irq_entry();
> }
>
Looking at x86, both exc_debug_user() and exc_int3() invoke
irqentry_enter_from_user_mode(), but there are various
other traps that can come from userspace (e.g. math_error,
exc_general_protection, ...). Some of those traps don't
necessarily end with a signal delivery to the offending
process. And some of those traps enable interrupts.
So what happens if such a trap is triggered from userspace,
and then scheduling happens on top of this trap ? Is this
skipping rseq ip fixup and rseq fields updates ?
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On Tue, Oct 28 2025 at 11:26, Mathieu Desnoyers wrote:
> On 2025-10-27 04:44, Thomas Gleixner wrote:
> [...]
>> @@ -281,6 +281,7 @@ static __always_inline void exit_to_user
>> static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs)
>> {
>> enter_from_user_mode(regs);
>> + rseq_note_user_irq_entry();
>> }
>>
> Looking at x86, both exc_debug_user() and exc_int3() invoke
> irqentry_enter_from_user_mode(), but there are various
> other traps that can come from userspace (e.g. math_error,
> exc_general_protection, ...). Some of those traps don't
> necessarily end with a signal delivery to the offending
> process. And some of those traps enable interrupts.
They all go through irqentry_enter_from_user_mode(). See
DEFINE_IDTENTRY*() macros. They invoke:
irqentry_enter()
if (user_mode())
irqentry_enter_from_user_mode();
If that wouldn't be the case then all the RCU/NOHZ magic would not work
either. So any exception, trap, interrupt must go through this to
establish state correctly. Whether that's explicit as it's required for
int3 and debug_user or implicit through the IDTENTRY magic.
Thanks,
tglx
On 2025-10-28 13:02, Thomas Gleixner wrote:
> On Tue, Oct 28 2025 at 11:26, Mathieu Desnoyers wrote:
>> On 2025-10-27 04:44, Thomas Gleixner wrote:
>> [...]
>>> @@ -281,6 +281,7 @@ static __always_inline void exit_to_user
>>> static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs)
>>> {
>>> enter_from_user_mode(regs);
>>> + rseq_note_user_irq_entry();
>>> }
>>>
>> Looking at x86, both exc_debug_user() and exc_int3() invoke
>> irqentry_enter_from_user_mode(), but there are various
>> other traps that can come from userspace (e.g. math_error,
>> exc_general_protection, ...). Some of those traps don't
>> necessarily end with a signal delivery to the offending
>> process. And some of those traps enable interrupts.
>
> They all go through irqentry_enter_from_user_mode(). See
> DEFINE_IDTENTRY*() macros. They invoke:
>
> irqentry_enter()
> if (user_mode())
> irqentry_enter_from_user_mode();
>
> If that wouldn't be the case then all the RCU/NOHZ magic would not work
> either. So any exception, trap, interrupt must go through this to
> establish state correctly. Whether that's explicit as it's required for
> int3 and debug_user or implicit through the IDTENTRY magic.
That's what I missed, thanks for the explanation.
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
The following commit has been merged into the core/rseq branch of tip:
Commit-ID: 2fc0e4b4126caadfa5772ba69276b350609584dd
Gitweb: https://git.kernel.org/tip/2fc0e4b4126caadfa5772ba69276b350609584dd
Author: Thomas Gleixner <tglx@linutronix.de>
AuthorDate: Mon, 27 Oct 2025 09:44:48 +01:00
Committer: Ingo Molnar <mingo@kernel.org>
CommitterDate: Tue, 04 Nov 2025 08:32:23 +01:00
rseq: Record interrupt from user space
For RSEQ the only relevant reason to inspect and eventually fixup (abort)
user space critical sections is when user space was interrupted and the
task was scheduled out.
If the user to kernel entry was from a syscall no fixup is required. If
user space invokes a syscall from a critical section it can keep the
pieces as documented.
This is only supported on architectures which utilize the generic entry
code. If your architecture does not use it, bad luck.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.905067101@linutronix.de
---
include/linux/irq-entry-common.h | 3 ++-
include/linux/rseq.h | 16 +++++++++++-----
include/linux/rseq_entry.h | 18 ++++++++++++++++++
include/linux/rseq_types.h | 2 ++
4 files changed, 33 insertions(+), 6 deletions(-)
create mode 100644 include/linux/rseq_entry.h
diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index 83c9d84..cb31fb8 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -4,7 +4,7 @@
#include <linux/context_tracking.h>
#include <linux/kmsan.h>
-#include <linux/rseq.h>
+#include <linux/rseq_entry.h>
#include <linux/static_call_types.h>
#include <linux/syscalls.h>
#include <linux/tick.h>
@@ -281,6 +281,7 @@ static __always_inline void exit_to_user_mode(void)
static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
enter_from_user_mode(regs);
+ rseq_note_user_irq_entry();
}
/**
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index d315a92..a200836 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -31,11 +31,17 @@ static inline void rseq_sched_switch_event(struct task_struct *t)
static __always_inline void rseq_exit_to_user_mode(void)
{
- if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
- if (WARN_ON_ONCE(current->rseq.event.has_rseq &&
- current->rseq.event.events))
- current->rseq.event.events = 0;
- }
+ struct rseq_event *ev = &current->rseq.event;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
+ WARN_ON_ONCE(ev->sched_switch);
+
+ /*
+ * Ensure that event (especially user_irq) is cleared when the
+ * interrupt did not result in a schedule and therefore the
+ * rseq processing did not clear it.
+ */
+ ev->events = 0;
}
/*
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
new file mode 100644
index 0000000..ce30e87
--- /dev/null
+++ b/include/linux/rseq_entry.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RSEQ_ENTRY_H
+#define _LINUX_RSEQ_ENTRY_H
+
+#ifdef CONFIG_RSEQ
+#include <linux/rseq.h>
+
+static __always_inline void rseq_note_user_irq_entry(void)
+{
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
+ current->rseq.event.user_irq = true;
+}
+
+#else /* CONFIG_RSEQ */
+static inline void rseq_note_user_irq_entry(void) { }
+#endif /* !CONFIG_RSEQ */
+
+#endif /* _LINUX_RSEQ_ENTRY_H */
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 40901b0..80f6c39 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -12,6 +12,7 @@ struct rseq;
* @all: Compound to initialize and clear the data efficiently
* @events: Compound to access events with a single load/store
* @sched_switch: True if the task was scheduled out
+ * @user_irq: True on interrupt entry from user mode
* @has_rseq: True if the task has a rseq pointer installed
*/
struct rseq_event {
@@ -22,6 +23,7 @@ struct rseq_event {
u16 events;
struct {
u8 sched_switch;
+ u8 user_irq;
};
};
The following commit has been merged into the core/rseq branch of tip:
Commit-ID: 60cbf3a8e3b17637498dbe5a13c58008ecec09ba
Gitweb: https://git.kernel.org/tip/60cbf3a8e3b17637498dbe5a13c58008ecec09ba
Author: Thomas Gleixner <tglx@linutronix.de>
AuthorDate: Mon, 27 Oct 2025 09:44:48 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 03 Nov 2025 15:26:17 +01:00
rseq: Record interrupt from user space
For RSEQ the only relevant reason to inspect and eventually fixup (abort)
user space critical sections is when user space was interrupted and the
task was scheduled out.
If the user to kernel entry was from a syscall no fixup is required. If
user space invokes a syscall from a critical section it can keep the
pieces as documented.
This is only supported on architectures which utilize the generic entry
code. If your architecture does not use it, bad luck.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.905067101@linutronix.de
---
include/linux/irq-entry-common.h | 3 ++-
include/linux/rseq.h | 16 +++++++++++-----
include/linux/rseq_entry.h | 18 ++++++++++++++++++
include/linux/rseq_types.h | 2 ++
4 files changed, 33 insertions(+), 6 deletions(-)
create mode 100644 include/linux/rseq_entry.h
diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index 83c9d84..cb31fb8 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -4,7 +4,7 @@
#include <linux/context_tracking.h>
#include <linux/kmsan.h>
-#include <linux/rseq.h>
+#include <linux/rseq_entry.h>
#include <linux/static_call_types.h>
#include <linux/syscalls.h>
#include <linux/tick.h>
@@ -281,6 +281,7 @@ static __always_inline void exit_to_user_mode(void)
static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
enter_from_user_mode(regs);
+ rseq_note_user_irq_entry();
}
/**
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index d315a92..a200836 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -31,11 +31,17 @@ static inline void rseq_sched_switch_event(struct task_struct *t)
static __always_inline void rseq_exit_to_user_mode(void)
{
- if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
- if (WARN_ON_ONCE(current->rseq.event.has_rseq &&
- current->rseq.event.events))
- current->rseq.event.events = 0;
- }
+ struct rseq_event *ev = &current->rseq.event;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
+ WARN_ON_ONCE(ev->sched_switch);
+
+ /*
+ * Ensure that event (especially user_irq) is cleared when the
+ * interrupt did not result in a schedule and therefore the
+ * rseq processing did not clear it.
+ */
+ ev->events = 0;
}
/*
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
new file mode 100644
index 0000000..ce30e87
--- /dev/null
+++ b/include/linux/rseq_entry.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RSEQ_ENTRY_H
+#define _LINUX_RSEQ_ENTRY_H
+
+#ifdef CONFIG_RSEQ
+#include <linux/rseq.h>
+
+static __always_inline void rseq_note_user_irq_entry(void)
+{
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
+ current->rseq.event.user_irq = true;
+}
+
+#else /* CONFIG_RSEQ */
+static inline void rseq_note_user_irq_entry(void) { }
+#endif /* !CONFIG_RSEQ */
+
+#endif /* _LINUX_RSEQ_ENTRY_H */
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 40901b0..80f6c39 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -12,6 +12,7 @@ struct rseq;
* @all: Compound to initialize and clear the data efficiently
* @events: Compound to access events with a single load/store
* @sched_switch: True if the task was scheduled out
+ * @user_irq: True on interrupt entry from user mode
* @has_rseq: True if the task has a rseq pointer installed
*/
struct rseq_event {
@@ -22,6 +23,7 @@ struct rseq_event {
u16 events;
struct {
u8 sched_switch;
+ u8 user_irq;
};
};
The following commit has been merged into the core/rseq branch of tip:
Commit-ID: 3cdfc5701dcd290c100ece742bcd13c2e6415cda
Gitweb: https://git.kernel.org/tip/3cdfc5701dcd290c100ece742bcd13c2e6415cda
Author: Thomas Gleixner <tglx@linutronix.de>
AuthorDate: Mon, 27 Oct 2025 09:44:48 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 29 Oct 2025 11:07:15 +01:00
rseq: Record interrupt from user space
For RSEQ the only relevant reason to inspect and eventually fixup (abort)
user space critical sections is when user space was interrupted and the
task was scheduled out.
If the user to kernel entry was from a syscall no fixup is required. If
user space invokes a syscall from a critical section it can keep the
pieces as documented.
This is only supported on architectures which utilize the generic entry
code. If your architecture does not use it, bad luck.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.905067101@linutronix.de
---
include/linux/irq-entry-common.h | 3 ++-
include/linux/rseq.h | 16 +++++++++++-----
include/linux/rseq_entry.h | 18 ++++++++++++++++++
include/linux/rseq_types.h | 2 ++
4 files changed, 33 insertions(+), 6 deletions(-)
create mode 100644 include/linux/rseq_entry.h
diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index 83c9d84..cb31fb8 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -4,7 +4,7 @@
#include <linux/context_tracking.h>
#include <linux/kmsan.h>
-#include <linux/rseq.h>
+#include <linux/rseq_entry.h>
#include <linux/static_call_types.h>
#include <linux/syscalls.h>
#include <linux/tick.h>
@@ -281,6 +281,7 @@ static __always_inline void exit_to_user_mode(void)
static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
enter_from_user_mode(regs);
+ rseq_note_user_irq_entry();
}
/**
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index 88067a6..eb0dd13 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -31,11 +31,17 @@ static inline void rseq_sched_switch_event(struct task_struct *t)
static __always_inline void rseq_exit_to_user_mode(void)
{
- if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
- if (WARN_ON_ONCE(current->rseq.event.has_rseq &&
- current->rseq.event.events))
- current->rseq.event.events = 0;
- }
+ struct rseq_event *ev = &current->rseq.event;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
+ WARN_ON_ONCE(ev->sched_switch);
+
+ /*
+ * Ensure that event (especially user_irq) is cleared when the
+ * interrupt did not result in a schedule and therefore the
+ * rseq processing did not clear it.
+ */
+ ev->events = 0;
}
/*
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
new file mode 100644
index 0000000..ce30e87
--- /dev/null
+++ b/include/linux/rseq_entry.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RSEQ_ENTRY_H
+#define _LINUX_RSEQ_ENTRY_H
+
+#ifdef CONFIG_RSEQ
+#include <linux/rseq.h>
+
+static __always_inline void rseq_note_user_irq_entry(void)
+{
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
+ current->rseq.event.user_irq = true;
+}
+
+#else /* CONFIG_RSEQ */
+static inline void rseq_note_user_irq_entry(void) { }
+#endif /* !CONFIG_RSEQ */
+
+#endif /* _LINUX_RSEQ_ENTRY_H */
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 40901b0..80f6c39 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -12,6 +12,7 @@ struct rseq;
* @all: Compound to initialize and clear the data efficiently
* @events: Compound to access events with a single load/store
* @sched_switch: True if the task was scheduled out
+ * @user_irq: True on interrupt entry from user mode
* @has_rseq: True if the task has a rseq pointer installed
*/
struct rseq_event {
@@ -22,6 +23,7 @@ struct rseq_event {
u16 events;
struct {
u8 sched_switch;
+ u8 user_irq;
};
};
© 2016 - 2025 Red Hat, Inc.