FRED RSP0 is a per-task constant pointing to the top of the task's kernel
stack for user level event delivery, and needs to be updated when a task
is scheduled in.

Introduce a new TI flag, TIF_LOAD_USER_STATES, to track whether FRED RSP0
needs to be loaded, and do the actual load of FRED RSP0 in
arch_exit_to_user_mode_prepare() if the TI flag is set, thus avoiding a
fair number of WRMSRs in both KVM and the kernel.
Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Xin Li (Intel) <xin@zytor.com>
---
arch/x86/include/asm/entry-common.h | 5 +++++
arch/x86/include/asm/switch_to.h | 3 +--
arch/x86/include/asm/thread_info.h | 2 ++
arch/x86/kernel/cpu/cpuid-deps.c | 1 -
4 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index 4c78b99060b5..ae365579efb3 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -51,6 +51,11 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
if (ti_work & _TIF_USER_RETURN_NOTIFY)
fire_user_return_notifiers();
+ if (cpu_feature_enabled(X86_FEATURE_FRED) &&
+ (ti_work & _TIF_LOAD_USER_STATES))
+ wrmsrns(MSR_IA32_FRED_RSP0,
+ (unsigned long)task_stack_page(current) + THREAD_SIZE);
+
if (unlikely(ti_work & _TIF_IO_BITMAP))
tss_update_io_bitmap();
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index c3bd0c0758c9..a31ea544cc0e 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -71,8 +71,7 @@ static inline void update_task_stack(struct task_struct *task)
this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
#else
if (cpu_feature_enabled(X86_FEATURE_FRED)) {
- /* WRMSRNS is a baseline feature for FRED. */
- wrmsrns(MSR_IA32_FRED_RSP0, (unsigned long)task_stack_page(task) + THREAD_SIZE);
+ set_thread_flag(TIF_LOAD_USER_STATES);
} else if (cpu_feature_enabled(X86_FEATURE_XENPV)) {
/* Xen PV enters the kernel on the thread stack. */
load_sp0(task_top_of_stack(task));
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 12da7dfd5ef1..fb51904651c0 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -106,6 +106,7 @@ struct thread_info {
#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */
+#define TIF_LOAD_USER_STATES 30 /* Load user level states */
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
@@ -128,6 +129,7 @@ struct thread_info {
#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
#define _TIF_ADDR32 (1 << TIF_ADDR32)
+#define _TIF_LOAD_USER_STATES (1 << TIF_LOAD_USER_STATES)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW_BASE \
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index b7d9f530ae16..8bd84114c2d9 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -83,7 +83,6 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AMX_TILE, X86_FEATURE_XFD },
{ X86_FEATURE_SHSTK, X86_FEATURE_XSAVES },
{ X86_FEATURE_FRED, X86_FEATURE_LKGS },
- { X86_FEATURE_FRED, X86_FEATURE_WRMSRNS },
{}
};
--
2.45.2
On 7.08.24 at 8:47, Xin Li (Intel) wrote:
> FRED RSP0 is a per-task constant pointing to the top of the task's kernel
> stack for user level event delivery, and needs to be updated when a task
> is scheduled in.
>
> Introduce a new TI flag, TIF_LOAD_USER_STATES, to track whether FRED RSP0
> needs to be loaded, and do the actual load of FRED RSP0 in
> arch_exit_to_user_mode_prepare() if the TI flag is set, thus avoiding a
> fair number of WRMSRs in both KVM and the kernel.
>
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Xin Li (Intel) <xin@zytor.com>
> ---
> arch/x86/include/asm/entry-common.h | 5 +++++
> arch/x86/include/asm/switch_to.h | 3 +--
> arch/x86/include/asm/thread_info.h | 2 ++
> arch/x86/kernel/cpu/cpuid-deps.c | 1 -
> 4 files changed, 8 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
> index 4c78b99060b5..ae365579efb3 100644
> --- a/arch/x86/include/asm/entry-common.h
> +++ b/arch/x86/include/asm/entry-common.h
> @@ -51,6 +51,11 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
> if (ti_work & _TIF_USER_RETURN_NOTIFY)
> fire_user_return_notifiers();
>
> + if (cpu_feature_enabled(X86_FEATURE_FRED) &&
> + (ti_work & _TIF_LOAD_USER_STATES))
> + wrmsrns(MSR_IA32_FRED_RSP0,
> + (unsigned long)task_stack_page(current) + THREAD_SIZE);
> +
> if (unlikely(ti_work & _TIF_IO_BITMAP))
> tss_update_io_bitmap();
>
> diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
> index c3bd0c0758c9..a31ea544cc0e 100644
> --- a/arch/x86/include/asm/switch_to.h
> +++ b/arch/x86/include/asm/switch_to.h
> @@ -71,8 +71,7 @@ static inline void update_task_stack(struct task_struct *task)
> this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
> #else
> if (cpu_feature_enabled(X86_FEATURE_FRED)) {
> - /* WRMSRNS is a baseline feature for FRED. */
> - wrmsrns(MSR_IA32_FRED_RSP0, (unsigned long)task_stack_page(task) + THREAD_SIZE);
> + set_thread_flag(TIF_LOAD_USER_STATES);
> } else if (cpu_feature_enabled(X86_FEATURE_XENPV)) {
> /* Xen PV enters the kernel on the thread stack. */
> load_sp0(task_top_of_stack(task));
> diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
> index 12da7dfd5ef1..fb51904651c0 100644
> --- a/arch/x86/include/asm/thread_info.h
> +++ b/arch/x86/include/asm/thread_info.h
> @@ -106,6 +106,7 @@ struct thread_info {
> #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
> #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
> #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */
> +#define TIF_LOAD_USER_STATES 30 /* Load user level states */
Wouldn't something along the lines of TIF_LOAD_FRED_RSP be more
descriptive, or is it expected that this flag can cover more state in the
future?
>
> #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
> #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
> @@ -128,6 +129,7 @@ struct thread_info {
> #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
> #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
> #define _TIF_ADDR32 (1 << TIF_ADDR32)
> +#define _TIF_LOAD_USER_STATES (1 << TIF_LOAD_USER_STATES)
>
> /* flags to check in __switch_to() */
> #define _TIF_WORK_CTXSW_BASE \
> diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
> index b7d9f530ae16..8bd84114c2d9 100644
> --- a/arch/x86/kernel/cpu/cpuid-deps.c
> +++ b/arch/x86/kernel/cpu/cpuid-deps.c
> @@ -83,7 +83,6 @@ static const struct cpuid_dep cpuid_deps[] = {
> { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD },
> { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES },
> { X86_FEATURE_FRED, X86_FEATURE_LKGS },
> - { X86_FEATURE_FRED, X86_FEATURE_WRMSRNS },
> {}
> };
>
On 8/9/2024 3:45 AM, Nikolay Borisov wrote:
>> +#define TIF_LOAD_USER_STATES	30	/* Load user level states */
>
> Wouldn't something along the lines of TIF_LOAD_FRED_RSP be more
> descriptive, or is it expected that this flag can cover more state in the
> future?

Sean mentioned TIF_LOAD_FRED_RSP, however we also have FRED SSP0, which
needs to be handled similarly.

And we also want to use it to cover future states.
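As a rough sketch of that direction (not part of this patch; fred_load_user_states()
is only a hypothetical helper name, and the SSP0 part is a placeholder), the idea is
that the exit path has a single place loading all user-level state guarded by the one
TIF flag:

	static __always_inline void fred_load_user_states(void)
	{
		/* FRED RSP0 always points at the top of the task's kernel stack. */
		wrmsrns(MSR_IA32_FRED_RSP0,
			(unsigned long)task_stack_page(current) + THREAD_SIZE);

		/*
		 * Placeholder: FRED SSP0 and any future user-level state would be
		 * loaded here as well, under the same TIF_LOAD_USER_STATES flag.
		 */
	}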
On August 9, 2024 10:37:08 AM PDT, Xin Li <xin@zytor.com> wrote:
> On 8/9/2024 3:45 AM, Nikolay Borisov wrote:
>>> +#define TIF_LOAD_USER_STATES	30	/* Load user level states */
>>
>> Wouldn't something along the lines of TIF_LOAD_FRED_RSP be more
>> descriptive, or is it expected that this flag can cover more state in the
>> future?
>
> Sean mentioned TIF_LOAD_FRED_RSP, however we also have FRED SSP0, which
> needs to be handled similarly.
>
> And we also want to use it to cover future states.

It is very likely that there will be additional uses of this beyond FRED.
On Tue, Aug 06 2024 at 22:47, Xin Li wrote:
> --- a/arch/x86/include/asm/entry-common.h
> +++ b/arch/x86/include/asm/entry-common.h
> @@ -51,6 +51,11 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
> if (ti_work & _TIF_USER_RETURN_NOTIFY)
> fire_user_return_notifiers();
>
> + if (cpu_feature_enabled(X86_FEATURE_FRED) &&
> + (ti_work & _TIF_LOAD_USER_STATES))
> + wrmsrns(MSR_IA32_FRED_RSP0,
> + (unsigned long)task_stack_page(current) + THREAD_SIZE);
Eyes are bleeding. If you do the inline in patch 1 then this becomes
	if (cpu_feature_enabled(X86_FEATURE_FRED) && (ti_work & _TIF_LOAD_USER_STATES))
		wrmsrns(MSR_IA32_FRED_RSP0, (unsigned long)task_stack_page(current) + THREAD_SIZE);
which is within the 100 character limit and readable.
Though I wonder if this should not have a per CPU cache for that.
	if (cpu_feature_enabled(X86_FEATURE_FRED)) {
		unsigned long rsp0 = (unsigned long)task_stack_page(current) + THREAD_SIZE;

		if (__this_cpu_read(cpu_fred_rsp0) != rsp0) {
			wrmsrns(MSR_IA32_FRED_RSP0, rsp0);
			this_cpu_write(cpu_fred_rsp0, rsp0);
		}
	}
Hmm?
> if (unlikely(ti_work & _TIF_IO_BITMAP))
> tss_update_io_bitmap();
>
> diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
> index c3bd0c0758c9..a31ea544cc0e 100644
> --- a/arch/x86/include/asm/switch_to.h
> +++ b/arch/x86/include/asm/switch_to.h
> @@ -71,8 +71,7 @@ static inline void update_task_stack(struct task_struct *task)
> this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
> #else
> if (cpu_feature_enabled(X86_FEATURE_FRED)) {
> - /* WRMSRNS is a baseline feature for FRED. */
This comment is wrong when using the alternative WRMSR fallback and
should be removed by the alternatives patch.
> - wrmsrns(MSR_IA32_FRED_RSP0, (unsigned long)task_stack_page(task) + THREAD_SIZE);
> + set_thread_flag(TIF_LOAD_USER_STATES);
Thanks,
tglx
On Wed, Aug 07, 2024, Thomas Gleixner wrote:
> Though I wonder if this should not have a per CPU cache for that.
>
> 	if (cpu_feature_enabled(X86_FEATURE_FRED)) {
> 		unsigned long rsp0 = (unsigned long)task_stack_page(current) + THREAD_SIZE;
>
> 		if (__this_cpu_read(cpu_fred_rsp0) != rsp0) {
> 			wrmsrns(MSR_IA32_FRED_RSP0, rsp0);
> 			this_cpu_write(cpu_fred_rsp0, rsp0);
> 		}
> 	}
>
> Hmm?
A per-CPU cache would work for KVM too, KVM would just need to stuff the cache
with the guest's value instead of setting _TIF_LOAD_USER_STATES.
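A minimal sketch of that, assuming the hypothetical per-CPU cache cpu_fred_rsp0
from the example above (this is not existing KVM code):

	static void fred_note_guest_rsp0(unsigned long guest_rsp0)
	{
		/*
		 * Record the value the guest left in MSR_IA32_FRED_RSP0, so the
		 * exit-to-user path sees a mismatch with the host task's RSP0 and
		 * rewrites the MSR only when the values actually differ.
		 */
		this_cpu_write(cpu_fred_rsp0, guest_rsp0);
	}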
On Wed, Aug 07 2024 at 14:55, Sean Christopherson wrote:
> On Wed, Aug 07, 2024, Thomas Gleixner wrote:
>> Though I wonder if this should not have a per CPU cache for that.
>>
>> 	if (cpu_feature_enabled(X86_FEATURE_FRED)) {
>> 		unsigned long rsp0 = (unsigned long)task_stack_page(current) + THREAD_SIZE;
>>
>> 		if (__this_cpu_read(cpu_fred_rsp0) != rsp0) {
>> 			wrmsrns(MSR_IA32_FRED_RSP0, rsp0);
>> 			this_cpu_write(cpu_fred_rsp0, rsp0);
>> 		}
>> 	}
>>
>> Hmm?
>
> A per-CPU cache would work for KVM too, KVM would just need to stuff the cache
> with the guest's value instead of setting _TIF_LOAD_USER_STATES.
There are two options vs. the cache:
1) Use a cache only
   static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
						      unsigned long ti_work)
   {
	if (IS_ENABLED(CONFIG_X86_DEBUG_FPU) || unlikely(ti_work))
		arch_exit_work(ti_work);

	fred_rsp0_magic();
2) Use both the TIF bit and the cache
   static inline void arch_exit_work(unsigned long ti_work)
   {
	if (....)
		...

	fred_rsp0_magic();
   }
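In both variants fred_rsp0_magic() is shorthand for the cached update sketched
further up, roughly (cpu_fred_rsp0 again being the hypothetical per-CPU cache):

	static __always_inline void fred_rsp0_magic(void)
	{
		unsigned long rsp0 = (unsigned long)task_stack_page(current) + THREAD_SIZE;

		if (!cpu_feature_enabled(X86_FEATURE_FRED))
			return;

		/* Skip the MSR write when the cached value is already current. */
		if (__this_cpu_read(cpu_fred_rsp0) != rsp0) {
			wrmsrns(MSR_IA32_FRED_RSP0, rsp0);
			this_cpu_write(cpu_fred_rsp0, rsp0);
		}
	}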
#1 has the charm that it avoids arch_exit_work() completely when ti_work
is 0, but has the unconditional check on the per CPU cache variable
and the extra conditional on every return
#2 has the charm that it avoids touching the per CPU cache variable on
return when the TIF bit is not set, but comes with the downside of
going through the ti_work chain to update RSP0
I'm inclined to claim that #1 wins. Why?
The probability for a RSP0 update is higher than the probability for
ti_work != 0, and for syscall heavy workloads the extra per CPU read and
the conditional is not really relevant when we stick this into struct
pcpu_hot. That cacheline is hot anyway in that code path due to
'current' and other members there.
But that's a problem for micro benchmark experts to figure out. I'm not
one of them :)
In any case the cache should be a win over an unconditional MSR write,
independent of WRMSRNS.
Thanks,
tglx