[PATCH v4 09/14] x86/pv: Adjust GS handling for FRED mode

Andrew Cooper posted 14 patches 3 days, 4 hours ago
[PATCH v4 09/14] x86/pv: Adjust GS handling for FRED mode
Posted by Andrew Cooper 3 days, 4 hours ago
When FRED is active, hardware automatically swaps GS when changing privilege,
and the SWAPGS instruction is disallowed.

For native OSes using GS as the thread local pointer this is a massive
improvement on the pre-FRED architecture, but under Xen it makes handling PV
guests more complicated.  Specifically, it means that GS_BASE and GS_SHADOW
are the opposite way around in FRED mode, as opposed to IDT mode.

This leads to the following changes:

  * In load_segments(), we have to load both GSes.  Account for this in the
    SWAP() condition and avoid the path with SWAGS.

  * In save_segments(), we need to read GS_SHADOW rather than GS_BASE.

  * In toggle_guest_mode(), we need to emulate SWAPGS.

  * In {read,write}_msr() which access the live registers, GS_SHADOW and
    GS_BASE need swapping.

  * In do_set_segment_base(), merge the SEGBASE_GS_{USER,KERNEL} cases and
    take FRED into account when choosing which base to update.

    SEGBASE_GS_USER_SEL was already an LKGS invocation (decades before FRED)
    so under FRED needs to be just a MOV %gs.  Simply skip the SWAPGSes.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>

v4:
 * Adjust GS accesses for emulated {RD,WR}MSR too.
---
 xen/arch/x86/domain.c             | 16 +++++++++++-----
 xen/arch/x86/pv/domain.c          | 22 ++++++++++++++++++++--
 xen/arch/x86/pv/emul-priv-op.c    | 24 +++++++++++++++---------
 xen/arch/x86/pv/misc-hypercalls.c | 16 ++++++++++------
 4 files changed, 56 insertions(+), 22 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index e658c2d647b7..9c1f6ef76d52 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1791,9 +1791,10 @@ static void load_segments(struct vcpu *n)
 
         /*
          * Figure out which way around gsb/gss want to be.  gsb needs to be
-         * the active context, and gss needs to be the inactive context.
+         * the active context, and gss needs to be the inactive context,
+         * unless we're in FRED mode where they're reversed.
          */
-        if ( !(n->arch.flags & TF_kernel_mode) )
+        if ( !(n->arch.flags & TF_kernel_mode) ^ opt_fred )
             SWAP(gsb, gss);
 
         if ( using_svm() && (n->arch.pv.fs | n->arch.pv.gs) <= 3 )
@@ -1814,7 +1815,9 @@ static void load_segments(struct vcpu *n)
 
     if ( !fs_gs_done && !compat )
     {
-        if ( read_cr4() & X86_CR4_FSGSBASE )
+        unsigned long cr4 = read_cr4();
+
+        if ( !(cr4 & X86_CR4_FRED) && (cr4 & X86_CR4_FSGSBASE) )
         {
             __wrgsbase(gss);
             __wrfsbase(n->arch.pv.fs_base);
@@ -1931,6 +1934,9 @@ static void load_segments(struct vcpu *n)
  * Guests however cannot use SWAPGS, so there is no mechanism to modify the
  * inactive GS base behind Xen's back.  Therefore, Xen's copy of the inactive
  * GS base is still accurate, and doesn't need reading back from hardware.
+ *
+ * Under FRED, hardware automatically swaps GS for us, so SHADOW_GS is the
+ * active GS from the guest's point of view.
  */
 static void save_segments(struct vcpu *v)
 {
@@ -1946,12 +1952,12 @@ static void save_segments(struct vcpu *v)
         if ( read_cr4() & X86_CR4_FSGSBASE )
         {
             fs_base = __rdfsbase();
-            gs_base = __rdgsbase();
+            gs_base = opt_fred ? rdmsr(MSR_SHADOW_GS_BASE) : __rdgsbase();
         }
         else
         {
             fs_base = rdmsr(MSR_FS_BASE);
-            gs_base = rdmsr(MSR_GS_BASE);
+            gs_base = opt_fred ? rdmsr(MSR_SHADOW_GS_BASE) : rdmsr(MSR_GS_BASE);
         }
 
         v->arch.pv.fs_base = fs_base;
diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
index d16583a7454d..b85abb5ed903 100644
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -14,9 +14,10 @@
 #include <asm/cpufeature.h>
 #include <asm/fsgsbase.h>
 #include <asm/invpcid.h>
-#include <asm/spec_ctrl.h>
 #include <asm/pv/domain.h>
 #include <asm/shadow.h>
+#include <asm/spec_ctrl.h>
+#include <asm/traps.h>
 
 #ifdef CONFIG_PV32
 int8_t __read_mostly opt_pv32 = -1;
@@ -514,11 +515,28 @@ void toggle_guest_mode(struct vcpu *v)
      * subsequent context switch won't bother re-reading it.
      */
     gs_base = read_gs_base();
+
+    /*
+     * In FRED mode, not only are the two GSes the other way around (i.e. we
+     * want to read GS_SHADOW here), the SWAPGS instruction is disallowed so
+     * we have to emulate it.
+     */
+    if ( opt_fred )
+    {
+        unsigned long gs_shadow = rdmsr(MSR_SHADOW_GS_BASE);
+
+        wrmsrns(MSR_SHADOW_GS_BASE, gs_base);
+        write_gs_base(gs_shadow);
+
+        gs_base = gs_shadow;
+    }
+    else
+        asm volatile ( "swapgs" );
+
     if ( v->arch.flags & TF_kernel_mode )
         v->arch.pv.gs_base_kernel = gs_base;
     else
         v->arch.pv.gs_base_user = gs_base;
-    asm volatile ( "swapgs" );
 
     _toggle_guest_pt(v);
 
diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
index 87d3bbcf901f..81153084129a 100644
--- a/xen/arch/x86/pv/emul-priv-op.c
+++ b/xen/arch/x86/pv/emul-priv-op.c
@@ -25,6 +25,7 @@
 #include <asm/pv/traps.h>
 #include <asm/shared.h>
 #include <asm/stubs.h>
+#include <asm/traps.h>
 
 #include <xsm/xsm.h>
 
@@ -926,7 +927,7 @@ static int cf_check read_msr(
     case MSR_GS_BASE:
         if ( !cp->extd.lm )
             break;
-        *val = read_gs_base();
+        *val = opt_fred ? rdmsr(MSR_SHADOW_GS_BASE) : read_gs_base();
         return X86EMUL_OKAY;
 
     case MSR_SHADOW_GS_BASE:
@@ -1066,17 +1067,22 @@ static int cf_check write_msr(
         if ( !cp->extd.lm || !is_canonical_address(val) )
             break;
 
-        if ( reg == MSR_FS_BASE )
-            write_fs_base(val);
-        else if ( reg == MSR_GS_BASE )
-            write_gs_base(val);
-        else if ( reg == MSR_SHADOW_GS_BASE )
+        switch ( reg )
         {
-            write_gs_shadow(val);
+        case MSR_FS_BASE:
+            write_fs_base(val);
+            break;
+
+        case MSR_SHADOW_GS_BASE:
             curr->arch.pv.gs_base_user = val;
+            fallthrough;
+        case MSR_GS_BASE:
+            if ( (reg == MSR_GS_BASE) ^ opt_fred )
+                write_gs_base(val);
+            else
+                write_gs_shadow(val);
+            break;
         }
-        else
-            ASSERT_UNREACHABLE();
         return X86EMUL_OKAY;
 
     case MSR_EFER:
diff --git a/xen/arch/x86/pv/misc-hypercalls.c b/xen/arch/x86/pv/misc-hypercalls.c
index 4c2abeb4add8..2c9cf50638db 100644
--- a/xen/arch/x86/pv/misc-hypercalls.c
+++ b/xen/arch/x86/pv/misc-hypercalls.c
@@ -11,6 +11,7 @@
 
 #include <asm/debugreg.h>
 #include <asm/fsgsbase.h>
+#include <asm/traps.h>
 
 long do_set_debugreg(int reg, unsigned long value)
 {
@@ -192,11 +193,12 @@ long do_set_segment_base(unsigned int which, unsigned long base)
 
         case SEGBASE_GS_USER:
             v->arch.pv.gs_base_user = base;
-            write_gs_shadow(base);
-            break;
-
+            fallthrough;
         case SEGBASE_GS_KERNEL:
-            write_gs_base(base);
+            if ( (which == SEGBASE_GS_KERNEL) ^ opt_fred )
+                write_gs_base(base);
+            else
+                write_gs_shadow(base);
             break;
         }
         break;
@@ -209,7 +211,8 @@ long do_set_segment_base(unsigned int which, unsigned long base)
          * We wish to update the user %gs from the GDT/LDT.  Currently, the
          * guest kernel's GS_BASE is in context.
          */
-        asm volatile ( "swapgs" );
+        if ( !opt_fred )
+            asm volatile ( "swapgs" );
 
         if ( sel > 3 )
             /* Fix up RPL for non-NUL selectors. */
@@ -247,7 +250,8 @@ long do_set_segment_base(unsigned int which, unsigned long base)
         /* Update the cache of the inactive base, as read from the GDT/LDT. */
         v->arch.pv.gs_base_user = read_gs_base();
 
-        asm volatile ( safe_swapgs );
+        if ( !opt_fred )
+            asm volatile ( safe_swapgs );
         break;
     }
 
-- 
2.39.5


Re: [PATCH v4 09/14] x86/pv: Adjust GS handling for FRED mode
Posted by Jan Beulich 11 hours ago
On 28.02.2026 00:16, Andrew Cooper wrote:
> When FRED is active, hardware automatically swaps GS when changing privilege,
> and the SWAPGS instruction is disallowed.
> 
> For native OSes using GS as the thread local pointer this is a massive
> improvement on the pre-FRED architecture, but under Xen it makes handling PV
> guests more complicated.  Specifically, it means that GS_BASE and GS_SHADOW
> are the opposite way around in FRED mode, as opposed to IDT mode.
> 
> This leads to the following changes:
> 
>   * In load_segments(), we have to load both GSes.  Account for this in the
>     SWAP() condition and avoid the path with SWAGS.
> 
>   * In save_segments(), we need to read GS_SHADOW rather than GS_BASE.
> 
>   * In toggle_guest_mode(), we need to emulate SWAPGS.
> 
>   * In {read,write}_msr() which access the live registers, GS_SHADOW and
>     GS_BASE need swapping.
> 
>   * In do_set_segment_base(), merge the SEGBASE_GS_{USER,KERNEL} cases and
>     take FRED into account when choosing which base to update.
> 
>     SEGBASE_GS_USER_SEL was already an LKGS invocation (decades before FRED)
>     so under FRED needs to be just a MOV %gs.  Simply skip the SWAPGSes.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Reviewed-by: Jan Beulich <jbeulich@suse.com>
> ---
> CC: Jan Beulich <JBeulich@suse.com>
> CC: Roger Pau Monné <roger.pau@citrix.com>
> 
> v4:
>  * Adjust GS accesses for emulated {RD,WR}MSR too.

I think this ...

> @@ -926,7 +927,7 @@ static int cf_check read_msr(
>      case MSR_GS_BASE:
>          if ( !cp->extd.lm )
>              break;
> -        *val = read_gs_base();
> +        *val = opt_fred ? rdmsr(MSR_SHADOW_GS_BASE) : read_gs_base();
>          return X86EMUL_OKAY;

... calls for a comment both here and ...

> @@ -1066,17 +1067,22 @@ static int cf_check write_msr(
>          if ( !cp->extd.lm || !is_canonical_address(val) )
>              break;
>  
> -        if ( reg == MSR_FS_BASE )
> -            write_fs_base(val);
> -        else if ( reg == MSR_GS_BASE )
> -            write_gs_base(val);
> -        else if ( reg == MSR_SHADOW_GS_BASE )
> +        switch ( reg )
>          {
> -            write_gs_shadow(val);
> +        case MSR_FS_BASE:
> +            write_fs_base(val);
> +            break;
> +
> +        case MSR_SHADOW_GS_BASE:
>              curr->arch.pv.gs_base_user = val;
> +            fallthrough;
> +        case MSR_GS_BASE:
> +            if ( (reg == MSR_GS_BASE) ^ opt_fred )
> +                write_gs_base(val);
> +            else
> +                write_gs_shadow(val);
> +            break;

... here, much like you do about everywhere else.

> @@ -192,11 +193,12 @@ long do_set_segment_base(unsigned int which, unsigned long base)
>  
>          case SEGBASE_GS_USER:
>              v->arch.pv.gs_base_user = base;
> -            write_gs_shadow(base);
> -            break;
> -
> +            fallthrough;
>          case SEGBASE_GS_KERNEL:
> -            write_gs_base(base);
> +            if ( (which == SEGBASE_GS_KERNEL) ^ opt_fred )
> +                write_gs_base(base);
> +            else
> +                write_gs_shadow(base);
>              break;
>          }
>          break;

Same perhaps here, and ...

> @@ -209,7 +211,8 @@ long do_set_segment_base(unsigned int which, unsigned long base)
>           * We wish to update the user %gs from the GDT/LDT.  Currently, the
>           * guest kernel's GS_BASE is in context.
>           */
> -        asm volatile ( "swapgs" );
> +        if ( !opt_fred )
> +            asm volatile ( "swapgs" );

... the comment in context could also do with inserting "unless using FRED"
or some such.

Jan