If an application registers rseq and later switches to a different pkey
protection setting (such that struct rseq becomes inaccessible), then any
context switch will cause a failure in __rseq_handle_notify_resume()
when it attempts to read/write struct rseq and/or rseq_cs. Since context
switches are asynchronous and are outside of the application's control
(not part of the restricted code scope), temporarily switch to a
pkey value that allows access to the default (0) pkey.
Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Aruna Ramakrishna <aruna.ramakrishna@oracle.com>
Cc: x86@kernel.org
Cc: linux-kernel@vger.kernel.org
Fixes: d7822b1e24f2 ("rseq: Introduce restartable sequences system call")
---
Changes in v6:
- Added a comment to struct rseq with MPK rules
Changes in v4:
- Added Fixes tag
Changes in v3:
- simplify control flow to always enable access to 0 pkey
Changes in v2:
- fixed typos and reworded the comment
---
include/uapi/linux/rseq.h | 4 ++++
kernel/rseq.c | 11 +++++++++++
2 files changed, 15 insertions(+)
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index c233aae5eac90..019fd248cf749 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -58,6 +58,10 @@ struct rseq_cs {
* contained within a single cache-line.
*
* A single struct rseq per thread is allowed.
+ *
+ * If struct rseq or struct rseq_cs is used with Memory Protection Keys,
+ * then the assigned pkey should either be accessible whenever these structs
+ * are registered/installed, or they should be protected with pkey 0.
*/
struct rseq {
/*
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 2cb16091ec0ae..9d9c976d3b78c 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -10,6 +10,7 @@
#include <linux/sched.h>
#include <linux/uaccess.h>
+#include <linux/pkeys.h>
#include <linux/syscalls.h>
#include <linux/rseq.h>
#include <linux/types.h>
@@ -402,11 +403,19 @@ static int rseq_ip_fixup(struct pt_regs *regs)
void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
{
struct task_struct *t = current;
+ pkey_reg_t saved_pkey;
int ret, sig;
if (unlikely(t->flags & PF_EXITING))
return;
+ /*
+ * Enable access to the default (0) pkey in case the thread has
+ * currently disabled access to it and struct rseq/rseq_cs has
+ * 0 pkey assigned (the only supported value for now).
+ */
+ saved_pkey = enable_zero_pkey_val();
+
/*
* regs is NULL if and only if the caller is in a syscall path. Skip
* fixup and leave rseq_cs as is so that rseq_sycall() will detect and
@@ -419,9 +428,11 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
}
if (unlikely(rseq_update_cpu_node_id(t)))
goto error;
+ write_pkey_val(saved_pkey);
return;
error:
+ write_pkey_val(saved_pkey);
sig = ksig ? ksig->sig : 0;
force_sigsegv(sig);
}
--
2.48.1.658.g4767266eb4-goog
On 2025-02-27 09:03, Dmitry Vyukov wrote:
> If an application registers rseq, and ever switches to another pkey
> protection (such that the rseq becomes inaccessible), then any
> context switch will cause failure in __rseq_handle_notify_resume()
> attempting to read/write struct rseq and/or rseq_cs. Since context
> switches are asynchronous and are outside of the application control
> (not part of the restricted code scope), temporarily switch to
> pkey value that allows access to the 0 (default) PKEY.
>
> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: "Paul E. McKenney" <paulmck@kernel.org>
> Cc: Boqun Feng <boqun.feng@gmail.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: Aruna Ramakrishna <aruna.ramakrishna@oracle.com>
> Cc: x86@kernel.org
> Cc: linux-kernel@vger.kernel.org
> Fixes: d7822b1e24f2 ("rseq: Introduce restartable sequences system call")
>
> ---
> Changes in v6:
> - Added a comment to struct rseq with MPK rules
>
> Changes in v4:
> - Added Fixes tag
>
> Changes in v3:
> - simplify control flow to always enable access to 0 pkey
>
> Changes in v2:
> - fixed typos and reworded the comment
> ---
> include/uapi/linux/rseq.h | 4 ++++
> kernel/rseq.c | 11 +++++++++++
> 2 files changed, 15 insertions(+)
>
> diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
> index c233aae5eac90..019fd248cf749 100644
> --- a/include/uapi/linux/rseq.h
> +++ b/include/uapi/linux/rseq.h
> @@ -58,6 +58,10 @@ struct rseq_cs {
> * contained within a single cache-line.
> *
> * A single struct rseq per thread is allowed.
> + *
> + * If struct rseq or struct rseq_cs is used with Memory Protection Keys,
> + * then the assigned pkey should either be accessible whenever these structs
> + * are registered/installed, or they should be protected with pkey 0.
> */
> struct rseq {
> /*
> diff --git a/kernel/rseq.c b/kernel/rseq.c
> index 2cb16091ec0ae..9d9c976d3b78c 100644
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -10,6 +10,7 @@
>
> #include <linux/sched.h>
> #include <linux/uaccess.h>
> +#include <linux/pkeys.h>
> #include <linux/syscalls.h>
> #include <linux/rseq.h>
> #include <linux/types.h>
> @@ -402,11 +403,19 @@ static int rseq_ip_fixup(struct pt_regs *regs)
> void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
> {
> struct task_struct *t = current;
> + pkey_reg_t saved_pkey;
> int ret, sig;
>
> if (unlikely(t->flags & PF_EXITING))
> return;
>
> + /*
> + * Enable access to the default (0) pkey in case the thread has
> + * currently disabled access to it and struct rseq/rseq_cs has
> + * 0 pkey assigned (the only supported value for now).
> + */
> + saved_pkey = enable_zero_pkey_val();
> +
> /*
> * regs is NULL if and only if the caller is in a syscall path. Skip
> * fixup and leave rseq_cs as is so that rseq_sycall() will detect and
> @@ -419,9 +428,11 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
> }
> if (unlikely(rseq_update_cpu_node_id(t)))
> goto error;
> + write_pkey_val(saved_pkey);
> return;
>
> error:
> + write_pkey_val(saved_pkey);
> sig = ksig ? ksig->sig : 0;
> force_sigsegv(sig);
> }
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On Thu, 27 Feb 2025 at 15:03, Dmitry Vyukov <dvyukov@google.com> wrote:
>
> If an application registers rseq, and ever switches to another pkey
> protection (such that the rseq becomes inaccessible), then any
> context switch will cause failure in __rseq_handle_notify_resume()
> attempting to read/write struct rseq and/or rseq_cs. Since context
> switches are asynchronous and are outside of the application control
> (not part of the restricted code scope), temporarily switch to
> pkey value that allows access to the 0 (default) PKEY.
>
> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: "Paul E. McKenney" <paulmck@kernel.org>
> Cc: Boqun Feng <boqun.feng@gmail.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: Aruna Ramakrishna <aruna.ramakrishna@oracle.com>
> Cc: x86@kernel.org
> Cc: linux-kernel@vger.kernel.org
> Fixes: d7822b1e24f2 ("rseq: Introduce restartable sequences system call")
Any remaining concerns with this series?
What tree should it go into?
> ---
> Changes in v6:
> - Added a comment to struct rseq with MPK rules
>
> Changes in v4:
> - Added Fixes tag
>
> Changes in v3:
> - simplify control flow to always enable access to 0 pkey
>
> Changes in v2:
> - fixed typos and reworded the comment
> ---
> include/uapi/linux/rseq.h | 4 ++++
> kernel/rseq.c | 11 +++++++++++
> 2 files changed, 15 insertions(+)
>
> diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
> index c233aae5eac90..019fd248cf749 100644
> --- a/include/uapi/linux/rseq.h
> +++ b/include/uapi/linux/rseq.h
> @@ -58,6 +58,10 @@ struct rseq_cs {
> * contained within a single cache-line.
> *
> * A single struct rseq per thread is allowed.
> + *
> + * If struct rseq or struct rseq_cs is used with Memory Protection Keys,
> + * then the assigned pkey should either be accessible whenever these structs
> + * are registered/installed, or they should be protected with pkey 0.
> */
> struct rseq {
> /*
> diff --git a/kernel/rseq.c b/kernel/rseq.c
> index 2cb16091ec0ae..9d9c976d3b78c 100644
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -10,6 +10,7 @@
>
> #include <linux/sched.h>
> #include <linux/uaccess.h>
> +#include <linux/pkeys.h>
> #include <linux/syscalls.h>
> #include <linux/rseq.h>
> #include <linux/types.h>
> @@ -402,11 +403,19 @@ static int rseq_ip_fixup(struct pt_regs *regs)
> void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
> {
> struct task_struct *t = current;
> + pkey_reg_t saved_pkey;
> int ret, sig;
>
> if (unlikely(t->flags & PF_EXITING))
> return;
>
> + /*
> + * Enable access to the default (0) pkey in case the thread has
> + * currently disabled access to it and struct rseq/rseq_cs has
> + * 0 pkey assigned (the only supported value for now).
> + */
> + saved_pkey = enable_zero_pkey_val();
> +
> /*
> * regs is NULL if and only if the caller is in a syscall path. Skip
> * fixup and leave rseq_cs as is so that rseq_sycall() will detect and
> @@ -419,9 +428,11 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
> }
> if (unlikely(rseq_update_cpu_node_id(t)))
> goto error;
> + write_pkey_val(saved_pkey);
> return;
>
> error:
> + write_pkey_val(saved_pkey);
> sig = ksig ? ksig->sig : 0;
> force_sigsegv(sig);
> }
> --
> 2.48.1.658.g4767266eb4-goog
>
On 2025-03-08 05:02, Dmitry Vyukov wrote:
> On Thu, 27 Feb 2025 at 15:03, Dmitry Vyukov <dvyukov@google.com> wrote:
>>
>> If an application registers rseq, and ever switches to another pkey
>> protection (such that the rseq becomes inaccessible), then any
>> context switch will cause failure in __rseq_handle_notify_resume()
>> attempting to read/write struct rseq and/or rseq_cs. Since context
>> switches are asynchronous and are outside of the application control
>> (not part of the restricted code scope), temporarily switch to
>> pkey value that allows access to the 0 (default) PKEY.
>>
>> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
>> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
>> Cc: Peter Zijlstra <peterz@infradead.org>
>> Cc: "Paul E. McKenney" <paulmck@kernel.org>
>> Cc: Boqun Feng <boqun.feng@gmail.com>
>> Cc: Thomas Gleixner <tglx@linutronix.de>
>> Cc: Ingo Molnar <mingo@redhat.com>
>> Cc: Borislav Petkov <bp@alien8.de>
>> Cc: Dave Hansen <dave.hansen@linux.intel.com>
>> Cc: "H. Peter Anvin" <hpa@zytor.com>
>> Cc: Aruna Ramakrishna <aruna.ramakrishna@oracle.com>
>> Cc: x86@kernel.org
>> Cc: linux-kernel@vger.kernel.org
>> Fixes: d7822b1e24f2 ("rseq: Introduce restartable sequences system call")
>
> Any remaining concerns with this series?
>
> What tree should it go into?
Usually the rseq bits go through the -tip tree.
Thanks,
Mathieu
>
>> ---
>> Changes in v6:
>> - Added a comment to struct rseq with MPK rules
>>
>> Changes in v4:
>> - Added Fixes tag
>>
>> Changes in v3:
>> - simplify control flow to always enable access to 0 pkey
>>
>> Changes in v2:
>> - fixed typos and reworded the comment
>> ---
>> include/uapi/linux/rseq.h | 4 ++++
>> kernel/rseq.c | 11 +++++++++++
>> 2 files changed, 15 insertions(+)
>>
>> diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
>> index c233aae5eac90..019fd248cf749 100644
>> --- a/include/uapi/linux/rseq.h
>> +++ b/include/uapi/linux/rseq.h
>> @@ -58,6 +58,10 @@ struct rseq_cs {
>> * contained within a single cache-line.
>> *
>> * A single struct rseq per thread is allowed.
>> + *
>> + * If struct rseq or struct rseq_cs is used with Memory Protection Keys,
>> + * then the assigned pkey should either be accessible whenever these structs
>> + * are registered/installed, or they should be protected with pkey 0.
>> */
>> struct rseq {
>> /*
>> diff --git a/kernel/rseq.c b/kernel/rseq.c
>> index 2cb16091ec0ae..9d9c976d3b78c 100644
>> --- a/kernel/rseq.c
>> +++ b/kernel/rseq.c
>> @@ -10,6 +10,7 @@
>>
>> #include <linux/sched.h>
>> #include <linux/uaccess.h>
>> +#include <linux/pkeys.h>
>> #include <linux/syscalls.h>
>> #include <linux/rseq.h>
>> #include <linux/types.h>
>> @@ -402,11 +403,19 @@ static int rseq_ip_fixup(struct pt_regs *regs)
>> void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
>> {
>> struct task_struct *t = current;
>> + pkey_reg_t saved_pkey;
>> int ret, sig;
>>
>> if (unlikely(t->flags & PF_EXITING))
>> return;
>>
>> + /*
>> + * Enable access to the default (0) pkey in case the thread has
>> + * currently disabled access to it and struct rseq/rseq_cs has
>> + * 0 pkey assigned (the only supported value for now).
>> + */
>> + saved_pkey = enable_zero_pkey_val();
>> +
>> /*
>> * regs is NULL if and only if the caller is in a syscall path. Skip
>> * fixup and leave rseq_cs as is so that rseq_sycall() will detect and
>> @@ -419,9 +428,11 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
>> }
>> if (unlikely(rseq_update_cpu_node_id(t)))
>> goto error;
>> + write_pkey_val(saved_pkey);
>> return;
>>
>> error:
>> + write_pkey_val(saved_pkey);
>> sig = ksig ? ksig->sig : 0;
>> force_sigsegv(sig);
>> }
>> --
>> 2.48.1.658.g4767266eb4-goog
>>
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On Mon, 10 Mar 2025 at 15:31, Mathieu Desnoyers
<mathieu.desnoyers@efficios.com> wrote:
>
> On 2025-03-08 05:02, Dmitry Vyukov wrote:
> > On Thu, 27 Feb 2025 at 15:03, Dmitry Vyukov <dvyukov@google.com> wrote:
> >>
> >> If an application registers rseq, and ever switches to another pkey
> >> protection (such that the rseq becomes inaccessible), then any
> >> context switch will cause failure in __rseq_handle_notify_resume()
> >> attempting to read/write struct rseq and/or rseq_cs. Since context
> >> switches are asynchronous and are outside of the application control
> >> (not part of the restricted code scope), temporarily switch to
> >> pkey value that allows access to the 0 (default) PKEY.
> >>
> >> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> >> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> >> Cc: Peter Zijlstra <peterz@infradead.org>
> >> Cc: "Paul E. McKenney" <paulmck@kernel.org>
> >> Cc: Boqun Feng <boqun.feng@gmail.com>
> >> Cc: Thomas Gleixner <tglx@linutronix.de>
> >> Cc: Ingo Molnar <mingo@redhat.com>
> >> Cc: Borislav Petkov <bp@alien8.de>
> >> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> >> Cc: "H. Peter Anvin" <hpa@zytor.com>
> >> Cc: Aruna Ramakrishna <aruna.ramakrishna@oracle.com>
> >> Cc: x86@kernel.org
> >> Cc: linux-kernel@vger.kernel.org
> >> Fixes: d7822b1e24f2 ("rseq: Introduce restartable sequences system call")
> >
> > Any remaining concerns with this series?
> >
> > What tree should it go into?
>
> Usually the rseq bits go through the -tip tree.
Thomas, Ingo, can you please take this to -tip tree? Or who would that be?
> Thanks,
>
> Mathieu
>
> >
> >> ---
> >> Changes in v6:
> >> - Added a comment to struct rseq with MPK rules
> >>
> >> Changes in v4:
> >> - Added Fixes tag
> >>
> >> Changes in v3:
> >> - simplify control flow to always enable access to 0 pkey
> >>
> >> Changes in v2:
> >> - fixed typos and reworded the comment
> >> ---
> >> include/uapi/linux/rseq.h | 4 ++++
> >> kernel/rseq.c | 11 +++++++++++
> >> 2 files changed, 15 insertions(+)
> >>
> >> diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
> >> index c233aae5eac90..019fd248cf749 100644
> >> --- a/include/uapi/linux/rseq.h
> >> +++ b/include/uapi/linux/rseq.h
> >> @@ -58,6 +58,10 @@ struct rseq_cs {
> >> * contained within a single cache-line.
> >> *
> >> * A single struct rseq per thread is allowed.
> >> + *
> >> + * If struct rseq or struct rseq_cs is used with Memory Protection Keys,
> >> + * then the assigned pkey should either be accessible whenever these structs
> >> + * are registered/installed, or they should be protected with pkey 0.
> >> */
> >> struct rseq {
> >> /*
> >> diff --git a/kernel/rseq.c b/kernel/rseq.c
> >> index 2cb16091ec0ae..9d9c976d3b78c 100644
> >> --- a/kernel/rseq.c
> >> +++ b/kernel/rseq.c
> >> @@ -10,6 +10,7 @@
> >>
> >> #include <linux/sched.h>
> >> #include <linux/uaccess.h>
> >> +#include <linux/pkeys.h>
> >> #include <linux/syscalls.h>
> >> #include <linux/rseq.h>
> >> #include <linux/types.h>
> >> @@ -402,11 +403,19 @@ static int rseq_ip_fixup(struct pt_regs *regs)
> >> void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
> >> {
> >> struct task_struct *t = current;
> >> + pkey_reg_t saved_pkey;
> >> int ret, sig;
> >>
> >> if (unlikely(t->flags & PF_EXITING))
> >> return;
> >>
> >> + /*
> >> + * Enable access to the default (0) pkey in case the thread has
> >> + * currently disabled access to it and struct rseq/rseq_cs has
> >> + * 0 pkey assigned (the only supported value for now).
> >> + */
> >> + saved_pkey = enable_zero_pkey_val();
> >> +
> >> /*
> >> * regs is NULL if and only if the caller is in a syscall path. Skip
> >> * fixup and leave rseq_cs as is so that rseq_sycall() will detect and
> >> @@ -419,9 +428,11 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
> >> }
> >> if (unlikely(rseq_update_cpu_node_id(t)))
> >> goto error;
> >> + write_pkey_val(saved_pkey);
> >> return;
> >>
> >> error:
> >> + write_pkey_val(saved_pkey);
> >> sig = ksig ? ksig->sig : 0;
> >> force_sigsegv(sig);
> >> }
> >> --
> >> 2.48.1.658.g4767266eb4-goog
> >>
>
>
> --
> Mathieu Desnoyers
> EfficiOS Inc.
> https://www.efficios.com
* Dmitry Vyukov <dvyukov@google.com> wrote:
> > >> If an application registers rseq, and ever switches to another
> > >> pkey protection (such that the rseq becomes inaccessible), then
> > >> any context switch will cause failure in
> > >> __rseq_handle_notify_resume() attempting to read/write struct
> > >> rseq and/or rseq_cs. Since context switches are asynchronous and
> > >> are outside of the application control (not part of the
> > >> restricted code scope), temporarily switch to pkey value that
> > >> allows access to the 0 (default) PKEY.
> > >>
> > >> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> > >> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> > >> Cc: Peter Zijlstra <peterz@infradead.org>
> > >> Cc: "Paul E. McKenney" <paulmck@kernel.org>
> > >> Cc: Boqun Feng <boqun.feng@gmail.com>
> > >> Cc: Thomas Gleixner <tglx@linutronix.de>
> > >> Cc: Ingo Molnar <mingo@redhat.com>
> > >> Cc: Borislav Petkov <bp@alien8.de>
> > >> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> > >> Cc: "H. Peter Anvin" <hpa@zytor.com>
> > >> Cc: Aruna Ramakrishna <aruna.ramakrishna@oracle.com>
> > >> Cc: x86@kernel.org
> > >> Cc: linux-kernel@vger.kernel.org
> > >> Fixes: d7822b1e24f2 ("rseq: Introduce restartable sequences system call")
> > >
> > > Any remaining concerns with this series?
> > >
> > > What tree should it go into?
> >
> > Usually the rseq bits go through the -tip tree.
>
> Thomas, Ingo, can you please take this to -tip tree? Or who would that be?
I was waiting to see whether Dave Hansen would have an opinion on this series.
Also, could you please add all the new Reviewed-by tags for the next
version? There was also still a bit of discussion on patch #4 - has
that been resolved?
Thanks,
Ingo
© 2016 - 2026 Red Hat, Inc.