[PATCH v3 13/22] x86/traps: Enable FRED when requested

Andrew Cooper posted 22 patches 23 hours ago
[PATCH v3 13/22] x86/traps: Enable FRED when requested
Posted by Andrew Cooper 23 hours ago
With the shadow stack and exception handling adjustments in place, we can now
activate FRED when appropriate.  Note that opt_fred is still disabled by
default.

Introduce init_fred() to set up all the MSRs relevant for FRED.  FRED uses
MSR_STAR (entries from Ring3 only), and MSR_FRED_SSP_SL0 aliases MSR_PL0_SSP
when CET-SS is active.  Otherwise, they're all new MSRs.

With init_fred() existing, load_system_tables() and legacy_syscall_init()
should only be used when setting up IDT delivery.  Insert ASSERT()s to this
effect, and adjust the various *_init() functions to make this property true.

Per the documentation, percpu_early_traps_init() is responsible for switching
off the boot GDT, which needs doing even in FRED mode.

Finally, set CR4.FRED in traps_init()/percpu_early_traps_init().

Xen can now boot in FRED mode up until starting a PV guest, where it faults
because IRET is not permitted to change privilege.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>

v3:
 * Fix poisoning of SL1 pointers.
 * Adjust bsp_traps_reinit().  It probably doesn't matter.

v2:
 * Explain the lack of BUG_ON()
 * Poison SL1

In principle we can stop allocating the IDT and TSS for CPUs now, although I
want to get shutdown and kexec working before making this optimisation, in
case there's something I've overlooked.
---
 xen/arch/x86/include/asm/current.h |  3 ++
 xen/arch/x86/include/asm/traps.h   |  2 +
 xen/arch/x86/traps-setup.c         | 83 ++++++++++++++++++++++++++++--
 3 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/xen/arch/x86/include/asm/current.h b/xen/arch/x86/include/asm/current.h
index 62817e8476ec..6139980ab115 100644
--- a/xen/arch/x86/include/asm/current.h
+++ b/xen/arch/x86/include/asm/current.h
@@ -23,6 +23,9 @@
  * 2 - NMI IST stack
  * 1 - #MC IST stack
  * 0 - IST Shadow Stacks (4x 1k, read-only)
+ *
+ * In FRED mode, #DB and NMI do not need special stacks, so their IST stacks
+ * are unused.
  */
 
 /*
diff --git a/xen/arch/x86/include/asm/traps.h b/xen/arch/x86/include/asm/traps.h
index 73097e957d05..5d7504bc44d1 100644
--- a/xen/arch/x86/include/asm/traps.h
+++ b/xen/arch/x86/include/asm/traps.h
@@ -16,6 +16,8 @@ void traps_init(void);
 void bsp_traps_reinit(void);
 void percpu_traps_init(void);
 
+void nocall entry_FRED_R3(void);
+
 extern unsigned int ler_msr;
 
 const char *vector_name(unsigned int vec);
diff --git a/xen/arch/x86/traps-setup.c b/xen/arch/x86/traps-setup.c
index d77be8f83921..d937209ae606 100644
--- a/xen/arch/x86/traps-setup.c
+++ b/xen/arch/x86/traps-setup.c
@@ -59,6 +59,8 @@ static void load_system_tables(void)
         .limit = sizeof(bsp_idt) - 1,
     };
 
+    ASSERT(opt_fred == 0);
+
     /*
      * Set up the TSS.  Warning - may be live, and the NMI/#MC must remain
      * valid on every instruction boundary.  (Note: these are all
@@ -191,6 +193,8 @@ static void legacy_syscall_init(void)
     unsigned char *stub_page;
     unsigned int offset;
 
+    ASSERT(opt_fred == 0);
+
     /* No PV guests?  No need to set up SYSCALL/SYSENTER infrastructure. */
     if ( !IS_ENABLED(CONFIG_PV) )
         return;
@@ -268,6 +272,52 @@ static void __init init_ler(void)
     setup_force_cpu_cap(X86_FEATURE_XEN_LBR);
 }
 
+/*
+ * Set up all MSRs relevant for FRED event delivery.
+ *
+ * Xen does not use any of the optional config in MSR_FRED_CONFIG, so all that
+ * is needed is the entrypoint.
+ *
+ * Because FRED always provides a good stack, NMI and #DB do not need any
+ * special treatment.  Only #DF needs another stack level, and #MC for the
+ * off chance that Xen's main stack suffers an uncorrectable error.
+ *
+ * This leaves Stack Level 1 unused.  It is pointed at #DB's stacks, but with
+ * the regular and shadow stacks swapped, as poison to guarantee that any use
+ * escalates to #DF.
+ *
+ * FRED reuses MSR_STAR to provide the segment selector values to load on
+ * entry from Ring3.  Entry from Ring0 leaves %cs and %ss unmodified.
+ */
+static void init_fred(void)
+{
+    unsigned long stack_top = get_stack_bottom() & ~(STACK_SIZE - 1);
+
+    ASSERT(opt_fred == 1);
+
+    wrmsrns(MSR_STAR, XEN_MSR_STAR);
+    wrmsrns(MSR_FRED_CONFIG, (unsigned long)entry_FRED_R3);
+
+    /*
+     * MSR_FRED_RSP_* all come with a 64-byte alignment check, avoiding the
+     * need for an explicit BUG_ON().
+     */
+    wrmsrns(MSR_FRED_RSP_SL0, (unsigned long)(&get_cpu_info()->_fred + 1));
+    wrmsrns(MSR_FRED_RSP_SL1, stack_top + (IST_DB * IST_SHSTK_SIZE)); /* Poison */
+    wrmsrns(MSR_FRED_RSP_SL2, stack_top + (1 + IST_MCE)  * PAGE_SIZE);
+    wrmsrns(MSR_FRED_RSP_SL3, stack_top + (1 + IST_DF)   * PAGE_SIZE);
+    wrmsrns(MSR_FRED_STK_LVLS, ((2UL << (X86_EXC_MC * 2)) |
+                                (3UL << (X86_EXC_DF * 2))));
+
+    if ( cpu_has_xen_shstk )
+    {
+        wrmsrns(MSR_FRED_SSP_SL0, stack_top + (PRIMARY_SHSTK_SLOT + 1) * PAGE_SIZE);
+        wrmsrns(MSR_FRED_SSP_SL1, stack_top + (1 + IST_DB) * PAGE_SIZE); /* Poison */
+        wrmsrns(MSR_FRED_SSP_SL2, stack_top + (IST_MCE * IST_SHSTK_SIZE));
+        wrmsrns(MSR_FRED_SSP_SL3, stack_top + (IST_DF  * IST_SHSTK_SIZE));
+    }
+}
+
 /*
  * Configure basic exception handling.  This is prior to parsing the command
  * line or configuring a console, and needs to be as simple as possible.
@@ -329,16 +379,20 @@ void __init traps_init(void)
             printk(XENLOG_INFO "Disabling PV32 due to FRED\n");
         }
 #endif
+
+        init_fred();
+        set_in_cr4(X86_CR4_FRED);
+
         setup_force_cpu_cap(X86_FEATURE_XEN_FRED);
         printk("Using FRED event delivery\n");
     }
     else
     {
+        load_system_tables();
+
         printk("Using IDT event delivery\n");
     }
 
-    load_system_tables();
-
     init_ler();
 
     /* Cache {,compat_}gdt_l1e now that physically relocation is done. */
@@ -356,7 +410,11 @@ void __init traps_init(void)
  */
 void __init bsp_traps_reinit(void)
 {
-    load_system_tables();
+    if ( opt_fred )
+        init_fred();
+    else
+        load_system_tables();
+
     percpu_traps_init();
 }
 
@@ -366,7 +424,8 @@ void __init bsp_traps_reinit(void)
  */
 void percpu_traps_init(void)
 {
-    legacy_syscall_init();
+    if ( !opt_fred )
+        legacy_syscall_init();
 
     if ( cpu_has_xen_lbr )
         wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR);
@@ -381,7 +440,21 @@ void percpu_traps_init(void)
  */
 void asmlinkage percpu_early_traps_init(void)
 {
-    load_system_tables();
+    if ( opt_fred )
+    {
+        const seg_desc_t *gdt = this_cpu(gdt) - FIRST_RESERVED_GDT_ENTRY;
+        const struct desc_ptr gdtr = {
+            .base = (unsigned long)gdt,
+            .limit = LAST_RESERVED_GDT_BYTE,
+        };
+
+        lgdt(&gdtr);
+
+        init_fred();
+        write_cr4(read_cr4() | X86_CR4_FRED);
+    }
+    else
+        load_system_tables();
 }
 
 static void __init __maybe_unused build_assertions(void)
-- 
2.39.5