arch/x86/include/asm/fpu/internal.h | 13 ++++++++----- arch/x86/kernel/process_32.c | 6 ++---- arch/x86/kernel/process_64.c | 6 ++---- 3 files changed, 12 insertions(+), 13 deletions(-)
When eagerly switching PKRU in switch_fpu_finish() it checks that
current is not a kernel thread as kernel threads will never use PKRU.
It's possible that this_cpu_read_stable() on current_task
(ie. get_current()) is returning an old cached value. To resolve this
reference next_p directly rather than relying on current.
As written it's possible when switching from a kernel thread to a
userspace thread to observe a cached PF_KTHREAD flag and never restore
the PKRU. And as a result this issue only occurs when switching
from a kernel thread to a userspace thread, switching from a non kernel
thread works perfectly fine because all that is considered in that
situation are the flags from some other non kernel task and the next fpu
is passed in to switch_fpu_finish().
This behavior only exists between 5.2 and 5.13 when it was fixed by a
rewrite decoupling PKRU from xstate, in:
commit 954436989cc5 ("x86/fpu: Remove PKRU handling from switch_fpu_finish()")
Unfortunately backporting the fix from 5.13 is probably not realistic as
it's part of a 60+ patch series which rewrites most of the PKRU handling.
Fixes: 0cecca9d03c9 ("x86/fpu: Eager switch PKRU state")
Signed-off-by: Brian Geffon <bgeffon@google.com>
Signed-off-by: Willis Kung <williskung@google.com>
Tested-by: Willis Kung <williskung@google.com>
Cc: <stable@vger.kernel.org> # v5.4.x
Cc: <stable@vger.kernel.org> # v5.10.x
---
arch/x86/include/asm/fpu/internal.h | 13 ++++++++-----
arch/x86/kernel/process_32.c | 6 ++----
arch/x86/kernel/process_64.c | 6 ++----
3 files changed, 12 insertions(+), 13 deletions(-)
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 03b3de491b5e..5ed702e2c55f 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -560,9 +560,11 @@ static inline void __fpregs_load_activate(void)
* The FPU context is only stored/restored for a user task and
* PF_KTHREAD is used to distinguish between kernel and user threads.
*/
-static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
+static inline void switch_fpu_prepare(struct task_struct *prev, int cpu)
{
- if (static_cpu_has(X86_FEATURE_FPU) && !(current->flags & PF_KTHREAD)) {
+ struct fpu *old_fpu = &prev->thread.fpu;
+
+ if (static_cpu_has(X86_FEATURE_FPU) && !(prev->flags & PF_KTHREAD)) {
if (!copy_fpregs_to_fpstate(old_fpu))
old_fpu->last_cpu = -1;
else
@@ -581,10 +583,11 @@ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
* Load PKRU from the FPU context if available. Delay loading of the
* complete FPU state until the return to userland.
*/
-static inline void switch_fpu_finish(struct fpu *new_fpu)
+static inline void switch_fpu_finish(struct task_struct *next)
{
u32 pkru_val = init_pkru_value;
struct pkru_state *pk;
+ struct fpu *next_fpu = &next->thread.fpu;
if (!static_cpu_has(X86_FEATURE_FPU))
return;
@@ -598,7 +601,7 @@ static inline void switch_fpu_finish(struct fpu *new_fpu)
* PKRU state is switched eagerly because it needs to be valid before we
* return to userland e.g. for a copy_to_user() operation.
*/
- if (!(current->flags & PF_KTHREAD)) {
+ if (!(next->flags & PF_KTHREAD)) {
/*
* If the PKRU bit in xsave.header.xfeatures is not set,
* then the PKRU component was in init state, which means
@@ -607,7 +610,7 @@ static inline void switch_fpu_finish(struct fpu *new_fpu)
* in memory is not valid. This means pkru_val has to be
* set to 0 and not to init_pkru_value.
*/
- pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU);
+ pk = get_xsave_addr(&next_fpu->state.xsave, XFEATURE_PKRU);
pkru_val = pk ? pk->pkru : 0;
}
__write_pkru(pkru_val);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index b8ceec4974fe..352f876950ab 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -229,14 +229,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
- struct fpu *prev_fpu = &prev->fpu;
- struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
if (!test_thread_flag(TIF_NEED_FPU_LOAD))
- switch_fpu_prepare(prev_fpu, cpu);
+ switch_fpu_prepare(prev_p, cpu);
/*
* Save away %gs. No need to save %fs, as it was saved on the
@@ -292,7 +290,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
this_cpu_write(current_task, next_p);
- switch_fpu_finish(next_fpu);
+ switch_fpu_finish(next_p);
/* Load the Intel cache allocation PQR MSR. */
resctrl_sched_in();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index da3cc3a10d63..633788362906 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -505,15 +505,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread;
struct thread_struct *next = &next_p->thread;
- struct fpu *prev_fpu = &prev->fpu;
- struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
this_cpu_read(irq_count) != -1);
if (!test_thread_flag(TIF_NEED_FPU_LOAD))
- switch_fpu_prepare(prev_fpu, cpu);
+ switch_fpu_prepare(prev_p, cpu);
/* We must save %fs and %gs before load_TLS() because
* %fs and %gs may be cleared by load_TLS().
@@ -565,7 +563,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
this_cpu_write(current_task, next_p);
this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
- switch_fpu_finish(next_fpu);
+ switch_fpu_finish(next_p);
/* Reload sp0. */
update_task_stack(next_p);
--
2.35.1.265.g69c8d7142f-goog
On Tue, Feb 15, 2022 at 11:22:33AM -0800, Brian Geffon wrote:
> When eagerly switching PKRU in switch_fpu_finish() it checks that
> current is not a kernel thread as kernel threads will never use PKRU.
> It's possible that this_cpu_read_stable() on current_task
> (ie. get_current()) is returning an old cached value. To resolve this
> reference next_p directly rather than relying on current.
>
> As written it's possible when switching from a kernel thread to a
> userspace thread to observe a cached PF_KTHREAD flag and never restore
> the PKRU. And as a result this issue only occurs when switching
> from a kernel thread to a userspace thread, switching from a non kernel
> thread works perfectly fine because all that is considered in that
> situation are the flags from some other non kernel task and the next fpu
> is passed in to switch_fpu_finish().
>
> This behavior only exists between 5.2 and 5.13 when it was fixed by a
> rewrite decoupling PKRU from xstate, in:
> commit 954436989cc5 ("x86/fpu: Remove PKRU handling from switch_fpu_finish()")
>
> Unfortunately backporting the fix from 5.13 is probably not realistic as
> it's part of a 60+ patch series which rewrites most of the PKRU handling.
>
> Fixes: 0cecca9d03c9 ("x86/fpu: Eager switch PKRU state")
> Signed-off-by: Brian Geffon <bgeffon@google.com>
> Signed-off-by: Willis Kung <williskung@google.com>
> Tested-by: Willis Kung <williskung@google.com>
> Cc: <stable@vger.kernel.org> # v5.4.x
> Cc: <stable@vger.kernel.org> # v5.10.x
> ---
> arch/x86/include/asm/fpu/internal.h | 13 ++++++++-----
> arch/x86/kernel/process_32.c | 6 ++----
> arch/x86/kernel/process_64.c | 6 ++----
> 3 files changed, 12 insertions(+), 13 deletions(-)
So this is ONLY for 5.4.y and 5.10.y? I'm really really loath to take
non-upstream changes as 95% of the time (really) it goes wrong.
How was this tested, and what do the maintainers of this subsystem
think? And will you be around to fix the bugs in this when they are
found?
And finally, what's wrong with 60+ patches to backport to fix a severe
issue? What's preventing that from happening? Did you try it and see
what exactly is involved?
thanks,
greg k-h
On Tue, Feb 15, 2022 at 2:45 PM Greg KH <gregkh@linuxfoundation.org> wrote:
>
> On Tue, Feb 15, 2022 at 11:22:33AM -0800, Brian Geffon wrote:
> > When eagerly switching PKRU in switch_fpu_finish() it checks that
> > current is not a kernel thread as kernel threads will never use PKRU.
> > It's possible that this_cpu_read_stable() on current_task
> > (ie. get_current()) is returning an old cached value. To resolve this
> > reference next_p directly rather than relying on current.
> >
> > As written it's possible when switching from a kernel thread to a
> > userspace thread to observe a cached PF_KTHREAD flag and never restore
> > the PKRU. And as a result this issue only occurs when switching
> > from a kernel thread to a userspace thread, switching from a non kernel
> > thread works perfectly fine because all that is considered in that
> > situation are the flags from some other non kernel task and the next fpu
> > is passed in to switch_fpu_finish().
> >
> > This behavior only exists between 5.2 and 5.13 when it was fixed by a
> > rewrite decoupling PKRU from xstate, in:
> > commit 954436989cc5 ("x86/fpu: Remove PKRU handling from switch_fpu_finish()")
> >
> > Unfortunately backporting the fix from 5.13 is probably not realistic as
> > it's part of a 60+ patch series which rewrites most of the PKRU handling.
> >
> > Fixes: 0cecca9d03c9 ("x86/fpu: Eager switch PKRU state")
> > Signed-off-by: Brian Geffon <bgeffon@google.com>
> > Signed-off-by: Willis Kung <williskung@google.com>
> > Tested-by: Willis Kung <williskung@google.com>
> > Cc: <stable@vger.kernel.org> # v5.4.x
> > Cc: <stable@vger.kernel.org> # v5.10.x
> > ---
> > arch/x86/include/asm/fpu/internal.h | 13 ++++++++-----
> > arch/x86/kernel/process_32.c | 6 ++----
> > arch/x86/kernel/process_64.c | 6 ++----
> > 3 files changed, 12 insertions(+), 13 deletions(-)
>
> So this is ONLY for 5.4.y and 5.10.y? I'm really really loath to take
> non-upstream changes as 95% of the time (really) it goes wrong.
That's correct, this bug was introduced in 5.2 and that code was
completely refactored in 5.13 indirectly fixing it.
>
> How was this tested, and what do the maintainers of this subsystem
> think? And will you be around to fix the bugs in this when they are
> found?
This has been trivial to reproduce, I've used a small repro which I've
put here: https://gist.github.com/bgaff/9f8cbfc8dd22e60f9492e4f0aff8f04f
, I also was able to reproduce this using the protection_keys self
tests on a 11th Gen Core i5-1135G7. I'm happy to commit to addressing
any bugs that may appear. I'll see what the maintainers say, but there
is also a smaller fix that just involves using this_cpu_read() in
switch_fpu_finish() for this specific issue, although that approach
isn't as clean.
>
> And finally, what's wrong with 60+ patches to backport to fix a severe
> issue? What's preventing that from happening? Did you try it and see
> what exactly is involved?
It was quite a substantial rewrite of that code with fixes layered on since.
>
> thanks,
>
> greg k-h
On 2/15/22 13:32, Brian Geffon wrote: >> How was this tested, and what do the maintainers of this subsystem >> think? And will you be around to fix the bugs in this when they are >> found? > This has been trivial to reproduce, I've used a small repro which I've > put here: https://gist.github.com/bgaff/9f8cbfc8dd22e60f9492e4f0aff8f04f > , I also was able to reproduce this using the protection_keys self > tests on a 11th Gen Core i5-1135G7. I've got an i7-1165G7, but I'm not seeing any failures on a 5.11 distro kernel. How long does this take for you to reproduce?
On Tue, Feb 15, 2022 at 4:42 PM Dave Hansen <dave.hansen@intel.com> wrote: > > On 2/15/22 13:32, Brian Geffon wrote: > >> How was this tested, and what do the maintainers of this subsystem > >> think? And will you be around to fix the bugs in this when they are > >> found? > > This has been trivial to reproduce, I've used a small repro which I've > > put here: https://gist.github.com/bgaff/9f8cbfc8dd22e60f9492e4f0aff8f04f > > , I also was able to reproduce this using the protection_keys self > > tests on a 11th Gen Core i5-1135G7. > > I've got an i7-1165G7, but I'm not seeing any failures on a > 5.11 distro kernel. > > How long does this take for you to reproduce? It reproduces for me in just a few seconds. I'm not sure what could be different about my setup, I've tested on a vanilla 5.10 kernel and it reproduced the same way. Brian
On Tue, Feb 15, 2022 at 4:42 PM Dave Hansen <dave.hansen@intel.com> wrote: > > On 2/15/22 13:32, Brian Geffon wrote: > >> How was this tested, and what do the maintainers of this subsystem > >> think? And will you be around to fix the bugs in this when they are > >> found? > > This has been trivial to reproduce, I've used a small repro which I've > > put here: https://gist.github.com/bgaff/9f8cbfc8dd22e60f9492e4f0aff8f04f > > , I also was able to reproduce this using the protection_keys self > > tests on a 11th Gen Core i5-1135G7. > > I've got an i7-1165G7, but I'm not seeing any failures on a > 5.11 distro kernel. > Hi Dave, I suspect the reason you're not seeing it is toolchain related, I'm building with clang 14.0.0 and it produces the sequence of instructions which use the cached value. Let me know if there is anything I can do to help you investigate this further. Brian
On Tue, Feb 15, 2022 at 09:01:54PM -0500, Brian Geffon wrote: > On Tue, Feb 15, 2022 at 4:42 PM Dave Hansen <dave.hansen@intel.com> wrote: > > > > On 2/15/22 13:32, Brian Geffon wrote: > > >> How was this tested, and what do the maintainers of this subsystem > > >> think? And will you be around to fix the bugs in this when they are > > >> found? > > > This has been trivial to reproduce, I've used a small repro which I've > > > put here: https://gist.github.com/bgaff/9f8cbfc8dd22e60f9492e4f0aff8f04f > > > , I also was able to reproduce this using the protection_keys self > > > tests on a 11th Gen Core i5-1135G7. > > > > I've got an i7-1165G7, but I'm not seeing any failures on a > > 5.11 distro kernel. > > > > Hi Dave, > I suspect the reason you're not seeing it is toolchain related, I'm > building with clang 14.0.0 and it produces the sequence of > instructions which use the cached value. Let me know if there is > anything I can do to help you investigate this further. Do older versions of clang have this problem? thanks, greg k-h
On Tue, Feb 15, 2022 at 04:32:48PM -0500, Brian Geffon wrote:
> On Tue, Feb 15, 2022 at 2:45 PM Greg KH <gregkh@linuxfoundation.org> wrote:
> >
> > On Tue, Feb 15, 2022 at 11:22:33AM -0800, Brian Geffon wrote:
> > > When eagerly switching PKRU in switch_fpu_finish() it checks that
> > > current is not a kernel thread as kernel threads will never use PKRU.
> > > It's possible that this_cpu_read_stable() on current_task
> > > (ie. get_current()) is returning an old cached value. To resolve this
> > > reference next_p directly rather than relying on current.
> > >
> > > As written it's possible when switching from a kernel thread to a
> > > userspace thread to observe a cached PF_KTHREAD flag and never restore
> > > the PKRU. And as a result this issue only occurs when switching
> > > from a kernel thread to a userspace thread, switching from a non kernel
> > > thread works perfectly fine because all that is considered in that
> > > situation are the flags from some other non kernel task and the next fpu
> > > is passed in to switch_fpu_finish().
> > >
> > > This behavior only exists between 5.2 and 5.13 when it was fixed by a
> > > rewrite decoupling PKRU from xstate, in:
> > > commit 954436989cc5 ("x86/fpu: Remove PKRU handling from switch_fpu_finish()")
> > >
> > > Unfortunately backporting the fix from 5.13 is probably not realistic as
> > > it's part of a 60+ patch series which rewrites most of the PKRU handling.
> > >
> > > Fixes: 0cecca9d03c9 ("x86/fpu: Eager switch PKRU state")
> > > Signed-off-by: Brian Geffon <bgeffon@google.com>
> > > Signed-off-by: Willis Kung <williskung@google.com>
> > > Tested-by: Willis Kung <williskung@google.com>
> > > Cc: <stable@vger.kernel.org> # v5.4.x
> > > Cc: <stable@vger.kernel.org> # v5.10.x
> > > ---
> > > arch/x86/include/asm/fpu/internal.h | 13 ++++++++-----
> > > arch/x86/kernel/process_32.c | 6 ++----
> > > arch/x86/kernel/process_64.c | 6 ++----
> > > 3 files changed, 12 insertions(+), 13 deletions(-)
> >
> > So this is ONLY for 5.4.y and 5.10.y? I'm really really loath to take
> > non-upstream changes as 95% of the time (really) it goes wrong.
>
> That's correct, this bug was introduced in 5.2 and that code was
> completely refactored in 5.13 indirectly fixing it.
What series of commits contain that work?
And again, why not just take them? What is wrong with that if this is
such a big issue?
> > How was this tested, and what do the maintainers of this subsystem
> > think? And will you be around to fix the bugs in this when they are
> > found?
>
> This has been trivial to reproduce, I've used a small repro which I've
> put here: https://gist.github.com/bgaff/9f8cbfc8dd22e60f9492e4f0aff8f04f
> , I also was able to reproduce this using the protection_keys self
> tests on a 11th Gen Core i5-1135G7. I'm happy to commit to addressing
> any bugs that may appear. I'll see what the maintainers say, but there
> is also a smaller fix that just involves using this_cpu_read() in
> switch_fpu_finish() for this specific issue, although that approach
> isn't as clean.
Can you add the test to the in-kernel tests so that we make sure it is
fixed and never comes back?
thanks,
greg k-h
On Wed, Feb 16, 2022 at 5:05 AM Greg KH <gregkh@linuxfoundation.org> wrote:
>
> On Tue, Feb 15, 2022 at 04:32:48PM -0500, Brian Geffon wrote:
> > On Tue, Feb 15, 2022 at 2:45 PM Greg KH <gregkh@linuxfoundation.org> wrote:
> > >
> > > On Tue, Feb 15, 2022 at 11:22:33AM -0800, Brian Geffon wrote:
> > > > When eagerly switching PKRU in switch_fpu_finish() it checks that
> > > > current is not a kernel thread as kernel threads will never use PKRU.
> > > > It's possible that this_cpu_read_stable() on current_task
> > > > (ie. get_current()) is returning an old cached value. To resolve this
> > > > reference next_p directly rather than relying on current.
> > > >
> > > > As written it's possible when switching from a kernel thread to a
> > > > userspace thread to observe a cached PF_KTHREAD flag and never restore
> > > > the PKRU. And as a result this issue only occurs when switching
> > > > from a kernel thread to a userspace thread, switching from a non kernel
> > > > thread works perfectly fine because all that is considered in that
> > > > situation are the flags from some other non kernel task and the next fpu
> > > > is passed in to switch_fpu_finish().
> > > >
> > > > This behavior only exists between 5.2 and 5.13 when it was fixed by a
> > > > rewrite decoupling PKRU from xstate, in:
> > > > commit 954436989cc5 ("x86/fpu: Remove PKRU handling from switch_fpu_finish()")
> > > >
> > > > Unfortunately backporting the fix from 5.13 is probably not realistic as
> > > > it's part of a 60+ patch series which rewrites most of the PKRU handling.
> > > >
> > > > Fixes: 0cecca9d03c9 ("x86/fpu: Eager switch PKRU state")
> > > > Signed-off-by: Brian Geffon <bgeffon@google.com>
> > > > Signed-off-by: Willis Kung <williskung@google.com>
> > > > Tested-by: Willis Kung <williskung@google.com>
> > > > Cc: <stable@vger.kernel.org> # v5.4.x
> > > > Cc: <stable@vger.kernel.org> # v5.10.x
> > > > ---
> > > > arch/x86/include/asm/fpu/internal.h | 13 ++++++++-----
> > > > arch/x86/kernel/process_32.c | 6 ++----
> > > > arch/x86/kernel/process_64.c | 6 ++----
> > > > 3 files changed, 12 insertions(+), 13 deletions(-)
> > >
> > > So this is ONLY for 5.4.y and 5.10.y? I'm really really loath to take
> > > non-upstream changes as 95% of the time (really) it goes wrong.
> >
> > That's correct, this bug was introduced in 5.2 and that code was
> > completely refactored in 5.13 indirectly fixing it.
>
Hi Greg,
> What series of commits contain that work?
This is the series,
https://lore.kernel.org/all/20210623120127.327154589@linutronix.de/,
it does quite a bit of cleanup to correct the larger problem of having
pkru merged into xstate.
> And again, why not just take them? What is wrong with that if this is
> such a big issue?
Anything is possible I suppose but looking through the series it seems
that it's not going to apply cleanly so we're going to end up with
something that, if we made it work, would look very different and
would touch a much larger cross section of code. If the concern here
is risk of things going wrong, attempting to pull back or cherry-pick
parts of this series seems riskier. However, if we don't attempt to
pull back all those patches I will likely be proposing at least one
more small patch for 5.4 and 5.10 to fix PKRU in these kernels because
right now it's broken, particularly on AMD processors as Dave
mentioned.
>
> > > How was this tested, and what do the maintainers of this subsystem
> > > think? And will you be around to fix the bugs in this when they are
> > > found?
> >
> > This has been trivial to reproduce, I've used a small repro which I've
> > put here: https://gist.github.com/bgaff/9f8cbfc8dd22e60f9492e4f0aff8f04f
> > , I also was able to reproduce this using the protection_keys self
> > tests on a 11th Gen Core i5-1135G7. I'm happy to commit to addressing
> > any bugs that may appear. I'll see what the maintainers say, but there
> > is also a smaller fix that just involves using this_cpu_read() in
> > switch_fpu_finish() for this specific issue, although that approach
> > isn't as clean.
>
> Can you add the test to the in-kernel tests so that we make sure it is
> fixed and never comes back?
I'm already able to reproduce it with the kernel selftests. To be
honest, I'm not sure why this hasn't been reported yet. I could be
doing something horribly wrong. But it seems the likely reason is that
my compiler is doing what it's allowed to do, which is optimize the
load of current_task. current -> get_current() ->
this_cpu_read_stable(current_task) is allowed to read a cached value.
Perhaps gcc is just not taking advantage of that optimization, I'm not
sure. But writing to current_task and then immediately reading it back
via this_cpu_read_stable() can be problematic for this reason, and the
disassembled code shows that this happening.
Brian
>
> thanks,
>
> greg k-h
On 2/16/22 02:05, Greg KH wrote: >>> How was this tested, and what do the maintainers of this subsystem >>> think? And will you be around to fix the bugs in this when they are >>> found? >> This has been trivial to reproduce, I've used a small repro which I've >> put here: https://gist.github.com/bgaff/9f8cbfc8dd22e60f9492e4f0aff8f04f >> , I also was able to reproduce this using the protection_keys self >> tests on a 11th Gen Core i5-1135G7. I'm happy to commit to addressing >> any bugs that may appear. I'll see what the maintainers say, but there >> is also a smaller fix that just involves using this_cpu_read() in >> switch_fpu_finish() for this specific issue, although that approach >> isn't as clean. > Can you add the test to the in-kernel tests so that we make sure it is > fixed and never comes back? It would be great if Brian could confirm this. But, I'm 99% sure that this can be reproduced in the vm/protection_keys.c selftest, if you run it for long enough. The symptom here is corruption of the PKRU register. I created *lots* of bugs like this during protection keys development so the selftest keeps a shadow copy of the register to specifically watch for corruption. It's _plausible_ that no one ever ran the pkey selftests with a clang-compiled kernel for long enough to hit this issue.
On Wed, Feb 16, 2022 at 10:16 AM Dave Hansen <dave.hansen@intel.com> wrote: > > On 2/16/22 02:05, Greg KH wrote: > >>> How was this tested, and what do the maintainers of this subsystem > >>> think? And will you be around to fix the bugs in this when they are > >>> found? > >> This has been trivial to reproduce, I've used a small repro which I've > >> put here: https://gist.github.com/bgaff/9f8cbfc8dd22e60f9492e4f0aff8f04f > >> , I also was able to reproduce this using the protection_keys self > >> tests on a 11th Gen Core i5-1135G7. I'm happy to commit to addressing > >> any bugs that may appear. I'll see what the maintainers say, but there > >> is also a smaller fix that just involves using this_cpu_read() in > >> switch_fpu_finish() for this specific issue, although that approach > >> isn't as clean. > > Can you add the test to the in-kernel tests so that we make sure it is > > fixed and never comes back? > > It would be great if Brian could confirm this. But, I'm 99% sure that > this can be reproduced in the vm/protection_keys.c selftest, if you run > it for long enough. Hi Dave, Yes, this is reproduced by the protection keys selfs tests. If your kernel was compiled in a way which caches current_task when read via this_cpu_read_stable(), then when switching from a kernel thread to a user thread you will observe this behavior, so the only situation where it's timing related is waiting for that switch from a kernel to user thread. If your kernel is compiled in a way which does not cache the value of current_task then you will never observe this behavior. My understanding is that this is perfectly valid for the compiler to produce that code. > > The symptom here is corruption of the PKRU register. I created *lots* > of bugs like this during protection keys development so the selftest > keeps a shadow copy of the register to specifically watch for corruption. > > It's _plausible_ that no one ever ran the pkey selftests with a > clang-compiled kernel for long enough to hit this issue. For ChromeOS we use clang and when I tested a vanilla 5.10 kernel _built with clang_ it also reproduced, so I suspect you're right that the self tests just haven't been run against clang built kernels all that often. How would you and Greg KH like to proceed with this? I'm happy to help however I can. Brian
On 2/17/22 05:31, Brian Geffon wrote: > How would you and Greg KH like to proceed with this? I'm happy to help > however I can. If I could wave a magic wand, I'd just apply the whole FPU rewrite to stable. My second choice would be to stop managing PKRU with XSAVE. x86_pkru_load() uses WRPKRU instead of XSAVE and keeps the task's PKRU in task->pkru instead of the XSAVE buffer. Doing that will take some care, including pulling XFEATURE_PKRU out of the feature mask (RFBM) at XRSTOR. I _think_ that can be done in a manageable set of patches which will keep stable close to mainline. I recognize that more bugs might get introduced in the process which are unique to stable. If you give that a shot and realize that it's not feasible to do a subset, then we can fall back to the minimal fix. I'm not asking for a multi-month engineering effort here. Maybe an hour or two to see if it's really as scary as it looks.
On Thu, Feb 17, 2022 at 11:44 AM Dave Hansen <dave.hansen@intel.com> wrote: > > On 2/17/22 05:31, Brian Geffon wrote: > > How would you and Greg KH like to proceed with this? I'm happy to help > > however I can. > > If I could wave a magic wand, I'd just apply the whole FPU rewrite to > stable. > > My second choice would be to stop managing PKRU with XSAVE. > x86_pkru_load() uses WRPKRU instead of XSAVE and keeps the task's PKRU > in task->pkru instead of the XSAVE buffer. Doing that will take some > care, including pulling XFEATURE_PKRU out of the feature mask (RFBM) at > XRSTOR. I _think_ that can be done in a manageable set of patches which > will keep stable close to mainline. I recognize that more bugs might > get introduced in the process which are unique to stable. Hi Dave, I did take some time to look through that series and pick out what I think is the minimum set that would pull out PKRU from xstate, that list is: 9782a712eb x86/fpu: Add PKRU storage outside of task XSAVE buffer 784a46618f x86/pkeys: Move read_pkru() and write_pkru() ff7ebff47c59 x86/pkru: Provide pkru_write_default() 739e2eec0f x86/pkru: Provide pkru_get_init_value() fa8c84b77a x86/cpu: Write the default PKRU value when enabling PKE 72a6c08c44 x86/pkru: Remove xstate fiddling from write_pkru() 2ebe81c6d8 x86/fpu: Dont restore PKRU in fpregs_restore_userspace() 71ef453355 x86/kvm: Avoid looking up PKRU in XSAVE buffer 954436989c x86/fpu: Remove PKRU handling from switch_fpu_finish() e84ba47e31 x86/fpu: Hook up PKRU into ptrace() 30a304a138 x86/fpu: Mask PKRU from kernel XRSTOR[S] operations 0e8c54f6b2c x86/fpu: Don't store PKRU in xstate in fpu_reset_fpstate() The majority of these don't apply cleanly to 5.4, and there are some other patches we'd have to pull back too that moved code and some of the those won't be needed for 5.10 though. TBH, I'm not sure it makes sense to try to do this just given the fact that most do not cleanly apply. Brian > > If you give that a shot and realize that it's not feasible to do a > subset, then we can fall back to the minimal fix. I'm not asking for a > multi-month engineering effort here. Maybe an hour or two to see if > it's really as scary as it looks.
On 2/15/22 11:22, Brian Geffon wrote:
> When eagerly switching PKRU in switch_fpu_finish() it checks that
> current is not a kernel thread as kernel threads will never use PKRU.
> It's possible that this_cpu_read_stable() on current_task
> (ie. get_current()) is returning an old cached value. To resolve this
> reference next_p directly rather than relying on current.
>
> As written it's possible when switching from a kernel thread to a
> userspace thread to observe a cached PF_KTHREAD flag and never restore
> the PKRU. And as a result this issue only occurs when switching
> from a kernel thread to a userspace thread, switching from a non kernel
> thread works perfectly fine because all that is considered in that
> situation are the flags from some other non kernel task and the next fpu
> is passed in to switch_fpu_finish().
>
> This behavior only exists between 5.2 and 5.13 when it was fixed by a
> rewrite decoupling PKRU from xstate, in:
> commit 954436989cc5 ("x86/fpu: Remove PKRU handling from switch_fpu_finish()")
>
> Unfortunately backporting the fix from 5.13 is probably not realistic as
> it's part of a 60+ patch series which rewrites most of the PKRU handling.
>
> Fixes: 0cecca9d03c9 ("x86/fpu: Eager switch PKRU state")
> Signed-off-by: Brian Geffon <bgeffon@google.com>
> Signed-off-by: Willis Kung <williskung@google.com>
> Tested-by: Willis Kung <williskung@google.com>
> Cc: <stable@vger.kernel.org> # v5.4.x
> Cc: <stable@vger.kernel.org> # v5.10.x
I don't like forking the stable code from mainline. But I also think
that backporting the FPU reworking that changed the PKRU handling is
likely to cause more bugs in stable than it fixes.
This fix is at least isolated to the protection keys code.
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
© 2016 - 2026 Red Hat, Inc.