Delay setting X86_CR4_SMAP on the BSP until domain building is done, so
that there's no need to disable SMAP during that phase. Note however that
SMAP is still enabled for the APs on bringup, as the domain builder code
runs strictly on the BSP. Delaying the setting for the APs as well would
mean having to do a callfunc IPI later in order to set it on all of them.
The Fixes tag accounts for the wrong usage of cpu_has_smap in
create_dom0(); it should instead have used
boot_cpu_has(X86_FEATURE_XEN_SMAP).
While there, also make cr4_pv32_mask __ro_after_init.
Fixes: 493ab190e5b1 ('xen/sm{e, a}p: allow disabling sm{e, a}p for Xen itself')
Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Changes since v2:
- Change approach.
- Add fixes tag.
---
xen/arch/x86/setup.c | 46 ++++++++++++++++++++++++--------------------
1 file changed, 25 insertions(+), 21 deletions(-)
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index eee20bb1753c..3f792a7a8f39 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -80,7 +80,7 @@ int8_t __initdata opt_probe_port_aliases = -1;
 boolean_param("probe-port-aliases", opt_probe_port_aliases);
 
 /* Only used in asm code and within this source file */
-unsigned long asmlinkage __read_mostly cr4_pv32_mask;
+unsigned long asmlinkage __ro_after_init cr4_pv32_mask;
 
 /**** Linux config option: propagated to domain0. ****/
 /* "acpi=off": Disables both ACPI table parsing and interpreter. */
@@ -955,26 +955,9 @@ static struct domain *__init create_dom0(const module_t *image,
         }
     }
 
-    /*
-     * Temporarily clear SMAP in CR4 to allow user-accesses in construct_dom0().
-     * This saves a large number of corner cases interactions with
-     * copy_from_user().
-     */
-    if ( cpu_has_smap )
-    {
-        cr4_pv32_mask &= ~X86_CR4_SMAP;
-        write_cr4(read_cr4() & ~X86_CR4_SMAP);
-    }
-
     if ( construct_dom0(d, image, headroom, initrd, cmdline) != 0 )
         panic("Could not construct domain 0\n");
 
-    if ( cpu_has_smap )
-    {
-        write_cr4(read_cr4() | X86_CR4_SMAP);
-        cr4_pv32_mask |= X86_CR4_SMAP;
-    }
-
     return d;
 }
@@ -1907,16 +1890,25 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
     if ( cpu_has_smep && opt_smep != SMEP_HVM_ONLY )
         setup_force_cpu_cap(X86_FEATURE_XEN_SMEP);
     if ( boot_cpu_has(X86_FEATURE_XEN_SMEP) )
+    {
         set_in_cr4(X86_CR4_SMEP);
+        cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
+    }
 
     if ( !opt_smap )
         setup_clear_cpu_cap(X86_FEATURE_SMAP);
     if ( cpu_has_smap && opt_smap != SMAP_HVM_ONLY )
         setup_force_cpu_cap(X86_FEATURE_XEN_SMAP);
     if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) )
-        set_in_cr4(X86_CR4_SMAP);
-
-    cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
+        /*
+         * Set SMAP on the %cr4 mask so that it's set for APs on bringup, but
+         * don't set for the BSP until domain building is done.
+         *
+         * Don't set it in cr4_pv32_mask either, until it's also set on the
+         * BSP. Otherwise the BUG in cr4_pv32_restore would trigger for events
+         * received on the BSP.
+         */
+        mmu_cr4_features |= X86_CR4_SMAP;
 
     if ( boot_cpu_has(X86_FEATURE_FSGSBASE) )
         set_in_cr4(X86_CR4_FSGSBASE);
@@ -2048,6 +2040,18 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
     if ( !dom0 )
         panic("Could not set up DOM0 guest OS\n");
 
+    /*
+     * Enable SMAP only after being done with the domain building phase, as the
+     * PV builder switches to the domain page-tables and must be run with SMAP
+     * disabled.
+     */
+    if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) )
+    {
+        ASSERT(mmu_cr4_features & X86_CR4_SMAP);
+        write_cr4(read_cr4() | X86_CR4_SMAP);
+        cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
+    }
+
     heap_init_late();
 
     init_trace_bufs();
--
2.45.2
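
The sequencing the patch establishes is easiest to see end to end. The
sketch below is a hypothetical standalone C model, not the Xen code:
plain variables stand in for the BSP's %cr4, mmu_cr4_features and
cr4_pv32_mask, and bsp_event_check() approximates the condition the BUG
in cr4_pv32_restore enforces.

/*
 * Hypothetical standalone model of the boot sequencing -- not Xen code.
 */
#include <assert.h>
#include <stdint.h>

#define X86_CR4_SMEP  (UINT64_C(1) << 20)
#define X86_CR4_SMAP  (UINT64_C(1) << 21)

static uint64_t bsp_cr4;          /* the BSP's %cr4 */
static uint64_t mmu_cr4_features; /* %cr4 template applied to APs on bringup */
static uint64_t cr4_pv32_mask;    /* bits cr4_pv32_restore insists on */

/* The BUG only fires when the mask holds bits that %cr4 lacks. */
static void bsp_event_check(void)
{
    assert((bsp_cr4 & cr4_pv32_mask) == cr4_pv32_mask);
}

int main(void)
{
    /* __start_xen(): SMEP goes into %cr4, the AP template and the mask. */
    bsp_cr4 |= X86_CR4_SMEP;
    mmu_cr4_features |= X86_CR4_SMEP;
    cr4_pv32_mask |= X86_CR4_SMEP;

    /*
     * SMAP only goes into the AP template: APs pick it up at bringup,
     * while the BSP keeps running without it through domain building.
     */
    mmu_cr4_features |= X86_CR4_SMAP;
    bsp_event_check();              /* events during dom0 build are fine */

    /* After dom0 is built: %cr4 first, then the mask. */
    bsp_cr4 |= X86_CR4_SMAP;
    bsp_event_check();              /* an event in this window is tolerated */
    cr4_pv32_mask |= X86_CR4_SMAP;
    bsp_event_check();

    return 0;
}

All the asserts hold as written; swapping the last two assignments makes
the final window unsafe, which is the ordering question the thread below
turns on.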
On 01.08.2024 11:52, Roger Pau Monne wrote:
> @@ -1907,16 +1890,25 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
>      if ( cpu_has_smep && opt_smep != SMEP_HVM_ONLY )
>          setup_force_cpu_cap(X86_FEATURE_XEN_SMEP);
>      if ( boot_cpu_has(X86_FEATURE_XEN_SMEP) )
> +    {
>          set_in_cr4(X86_CR4_SMEP);
> +        cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;

Could be just "cr4_pv32_mask = X86_CR4_SMEP" now?

> +    }
> 
>      if ( !opt_smap )
>          setup_clear_cpu_cap(X86_FEATURE_SMAP);
>      if ( cpu_has_smap && opt_smap != SMAP_HVM_ONLY )
>          setup_force_cpu_cap(X86_FEATURE_XEN_SMAP);
>      if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) )
> -        set_in_cr4(X86_CR4_SMAP);
> -
> -    cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
> +        /*
> +         * Set SMAP on the %cr4 mask so that it's set for APs on bringup, but
> +         * don't set for the BSP until domain building is done.
> +         *
> +         * Don't set it in cr4_pv32_mask either, until it's also set on the
> +         * BSP. Otherwise the BUG in cr4_pv32_restore would trigger for events
> +         * received on the BSP.
> +         */
> +        mmu_cr4_features |= X86_CR4_SMAP;

Don't you put APs at risk this way of triggering the BUG in cr4_pv32_restore()?
They'll have the bit set in %cr4, but the bit remains clear in cr4_pv32_mask
until much later.

> @@ -2048,6 +2040,18 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
>      if ( !dom0 )
>          panic("Could not set up DOM0 guest OS\n");
> 
> +    /*
> +     * Enable SMAP only after being done with the domain building phase, as the
> +     * PV builder switches to the domain page-tables and must be run with SMAP
> +     * disabled.
> +     */
> +    if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) )
> +    {
> +        ASSERT(mmu_cr4_features & X86_CR4_SMAP);
> +        write_cr4(read_cr4() | X86_CR4_SMAP);
> +        cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
> +    }

Similarly for the BSP here: If we take an NMI between setting CR4.SMAP and
setting the bit in cr4_pv32_mask, cr4_pv32_restore() would hit the BUG
there if I'm not mistaken. I further fear that switching things around won't
help either. The code you remove from create_dom0() looks to have the same
issue. The only NMI-safe sequence looks to be: Clear both bits from %cr4,
update cr4_pv32_mask as wanted, and then write %cr4 with the bits from
cr4_pv32_mask ORed in.

Jan
On Thu, Aug 01, 2024 at 12:28:06PM +0200, Jan Beulich wrote:
> On 01.08.2024 11:52, Roger Pau Monne wrote:
> > @@ -1907,16 +1890,25 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
> >      if ( cpu_has_smep && opt_smep != SMEP_HVM_ONLY )
> >          setup_force_cpu_cap(X86_FEATURE_XEN_SMEP);
> >      if ( boot_cpu_has(X86_FEATURE_XEN_SMEP) )
> > +    {
> >          set_in_cr4(X86_CR4_SMEP);
> > +        cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
> 
> Could be just "cr4_pv32_mask = X86_CR4_SMEP" now?

Yes, indeed, same below then.

> > +    }
> > 
> >      if ( !opt_smap )
> >          setup_clear_cpu_cap(X86_FEATURE_SMAP);
> >      if ( cpu_has_smap && opt_smap != SMAP_HVM_ONLY )
> >          setup_force_cpu_cap(X86_FEATURE_XEN_SMAP);
> >      if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) )
> > -        set_in_cr4(X86_CR4_SMAP);
> > -
> > -    cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
> > +        /*
> > +         * Set SMAP on the %cr4 mask so that it's set for APs on bringup, but
> > +         * don't set for the BSP until domain building is done.
> > +         *
> > +         * Don't set it in cr4_pv32_mask either, until it's also set on the
> > +         * BSP. Otherwise the BUG in cr4_pv32_restore would trigger for events
> > +         * received on the BSP.
> > +         */
> > +        mmu_cr4_features |= X86_CR4_SMAP;
> 
> Don't you put APs at risk this way of triggering the BUG in cr4_pv32_restore()?
> They'll have the bit set in %cr4, but the bit remains clear in cr4_pv32_mask
> until much later.

As long as the bit is set in %cr4, but not in cr4_pv32_mask, the BUG in
cr4_pv32_restore won't hit.

> > @@ -2048,6 +2040,18 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
> >      if ( !dom0 )
> >          panic("Could not set up DOM0 guest OS\n");
> > 
> > +    /*
> > +     * Enable SMAP only after being done with the domain building phase, as the
> > +     * PV builder switches to the domain page-tables and must be run with SMAP
> > +     * disabled.
> > +     */
> > +    if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) )
> > +    {
> > +        ASSERT(mmu_cr4_features & X86_CR4_SMAP);
> > +        write_cr4(read_cr4() | X86_CR4_SMAP);
> > +        cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
> > +    }
> 
> Similarly for the BSP here: If we take an NMI between setting CR4.SMAP and
> setting the bit in cr4_pv32_mask, cr4_pv32_restore() would hit the BUG
> there if I'm not mistaken.

No, having extra bits set in %cr4 not present in cr4_pv32_mask won't trigger
the BUG. It's the other way around: having bits set in cr4_pv32_mask but not
in %cr4. As long as %cr4 is always updated ahead of setting the bit in
cr4_pv32_mask that's fine. See the logic in cr4_pv32_restore:

        /* Check that _all_ of the bits intended to be set actually are. */
        mov   %cr4, %rax
        and   cr4_pv32_mask(%rip), %rax
        cmp   cr4_pv32_mask(%rip), %rax
        je    1f
        [...]
        BUG

Otherwise none of this would be safe, as it's impossible to set %cr4 and
cr4_pv32_mask atomically in a single instruction.

> I further fear that switching things around won't
> help either. The code you remove from create_dom0() looks to have the same
> issue. The only NMI-safe sequence looks to be: Clear both bits from %cr4,
> update cr4_pv32_mask as wanted, and then write %cr4 with the bits from
> cr4_pv32_mask ORed in.

No, that won't be safe, as cr4_pv32_mask would contain bits not set in %cr4
and cr4_pv32_restore would hit the BUG. The update order is the other way
around: first set in %cr4, then in cr4_pv32_mask.

Thanks, Roger.
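
To make the ordering argument concrete, here is a minimal standalone C
sketch of that check (an approximation of the asm quoted above, not the
real entry.S code). It shows that the %cr4-first order never trips the
check, while the mask-first order Jan sketched leaves a window that does.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define X86_CR4_SMAP (UINT64_C(1) << 21)

/* "BUG" iff cr4_pv32_mask contains bits that %cr4 does not. */
static bool would_bug(uint64_t cr4, uint64_t mask)
{
    return (cr4 & mask) != mask;
}

int main(void)
{
    uint64_t cr4 = 0, mask = 0;

    /*
     * Safe order: %cr4 first. An NMI between the two steps sees SMAP in
     * %cr4 but not in the mask, which the check tolerates.
     */
    cr4 |= X86_CR4_SMAP;
    printf("cr4 first, mid-update:  %s\n", would_bug(cr4, mask) ? "BUG" : "ok");
    mask |= X86_CR4_SMAP;
    printf("cr4 first, done:        %s\n", would_bug(cr4, mask) ? "BUG" : "ok");

    /* Unsafe order: mask first. An NMI in the window hits the BUG. */
    cr4 = mask = 0;
    mask |= X86_CR4_SMAP;
    printf("mask first, mid-update: %s\n", would_bug(cr4, mask) ? "BUG" : "ok");

    return 0;
}

Running it prints "ok", "ok", "BUG", matching the invariant stated in the
reply above.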
On 01.08.2024 17:36, Roger Pau Monné wrote:
> On Thu, Aug 01, 2024 at 12:28:06PM +0200, Jan Beulich wrote:
>> On 01.08.2024 11:52, Roger Pau Monne wrote:
>>> @@ -1907,16 +1890,25 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
>>>      if ( cpu_has_smep && opt_smep != SMEP_HVM_ONLY )
>>>          setup_force_cpu_cap(X86_FEATURE_XEN_SMEP);
>>>      if ( boot_cpu_has(X86_FEATURE_XEN_SMEP) )
>>> +    {
>>>          set_in_cr4(X86_CR4_SMEP);
>>> +        cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
>>
>> Could be just "cr4_pv32_mask = X86_CR4_SMEP" now?
>
> Yes, indeed, same below then.
>
>>> +    }
>>> 
>>>      if ( !opt_smap )
>>>          setup_clear_cpu_cap(X86_FEATURE_SMAP);
>>>      if ( cpu_has_smap && opt_smap != SMAP_HVM_ONLY )
>>>          setup_force_cpu_cap(X86_FEATURE_XEN_SMAP);
>>>      if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) )
>>> -        set_in_cr4(X86_CR4_SMAP);
>>> -
>>> -    cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
>>> +        /*
>>> +         * Set SMAP on the %cr4 mask so that it's set for APs on bringup, but
>>> +         * don't set for the BSP until domain building is done.
>>> +         *
>>> +         * Don't set it in cr4_pv32_mask either, until it's also set on the
>>> +         * BSP. Otherwise the BUG in cr4_pv32_restore would trigger for events
>>> +         * received on the BSP.
>>> +         */
>>> +        mmu_cr4_features |= X86_CR4_SMAP;
>>
>> Don't you put APs at risk this way of triggering the BUG in cr4_pv32_restore()?
>> They'll have the bit set in %cr4, but the bit remains clear in cr4_pv32_mask
>> until much later.
>
> As long as the bit is set in %cr4, but not in cr4_pv32_mask the BUG in
> cr4_pv32_restore won't hit.

Hmm, you're right. Despite staring at that for quite some time, I got
it wrong. Feel free to add
Reviewed-by: Jan Beulich <jbeulich@suse.com>
then, preferably with said minor adjustment (left in context above).

Jan
On Thu, Aug 01, 2024 at 05:58:38PM +0200, Jan Beulich wrote:
> On 01.08.2024 17:36, Roger Pau Monné wrote:
> > On Thu, Aug 01, 2024 at 12:28:06PM +0200, Jan Beulich wrote:
> >> On 01.08.2024 11:52, Roger Pau Monne wrote:
> >>> @@ -1907,16 +1890,25 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
> >>>      if ( cpu_has_smep && opt_smep != SMEP_HVM_ONLY )
> >>>          setup_force_cpu_cap(X86_FEATURE_XEN_SMEP);
> >>>      if ( boot_cpu_has(X86_FEATURE_XEN_SMEP) )
> >>> +    {
> >>>          set_in_cr4(X86_CR4_SMEP);
> >>> +        cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
> >>
> >> Could be just "cr4_pv32_mask = X86_CR4_SMEP" now?
> >
> > Yes, indeed, same below then.
> >
> >>> +    }
> >>> 
> >>>      if ( !opt_smap )
> >>>          setup_clear_cpu_cap(X86_FEATURE_SMAP);
> >>>      if ( cpu_has_smap && opt_smap != SMAP_HVM_ONLY )
> >>>          setup_force_cpu_cap(X86_FEATURE_XEN_SMAP);
> >>>      if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) )
> >>> -        set_in_cr4(X86_CR4_SMAP);
> >>> -
> >>> -    cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
> >>> +        /*
> >>> +         * Set SMAP on the %cr4 mask so that it's set for APs on bringup, but
> >>> +         * don't set for the BSP until domain building is done.
> >>> +         *
> >>> +         * Don't set it in cr4_pv32_mask either, until it's also set on the
> >>> +         * BSP. Otherwise the BUG in cr4_pv32_restore would trigger for events
> >>> +         * received on the BSP.
> >>> +         */
> >>> +        mmu_cr4_features |= X86_CR4_SMAP;
> >>
> >> Don't you put APs at risk this way of triggering the BUG in cr4_pv32_restore()?
> >> They'll have the bit set in %cr4, but the bit remains clear in cr4_pv32_mask
> >> until much later.
> >
> > As long as the bit is set in %cr4, but not in cr4_pv32_mask the BUG in
> > cr4_pv32_restore won't hit.
> 
> Hmm, you're right. Despite staring at that for quite some time, I got
> it wrong. Feel free to add
> Reviewed-by: Jan Beulich <jbeulich@suse.com>
> then, preferably with said minor adjustment (left in context above).

Sure, thanks! Will adjust and send v4.
On 01.08.2024 12:28, Jan Beulich wrote:
> On 01.08.2024 11:52, Roger Pau Monne wrote:
>> @@ -2048,6 +2040,18 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
>>      if ( !dom0 )
>>          panic("Could not set up DOM0 guest OS\n");
>> 
>> +    /*
>> +     * Enable SMAP only after being done with the domain building phase, as the
>> +     * PV builder switches to the domain page-tables and must be run with SMAP
>> +     * disabled.
>> +     */
>> +    if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) )
>> +    {
>> +        ASSERT(mmu_cr4_features & X86_CR4_SMAP);
>> +        write_cr4(read_cr4() | X86_CR4_SMAP);
>> +        cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
>> +    }
> 
> Similarly for the BSP here: If we take an NMI between setting CR4.SMAP and
> setting the bit in cr4_pv32_mask, cr4_pv32_restore() would hit the BUG
> there if I'm not mistaken. I further fear that switching things around won't
> help either. The code you remove from create_dom0() looks to have the same
> issue. The only NMI-safe sequence looks to be: Clear both bits from %cr4,
> update cr4_pv32_mask as wanted, and then write %cr4 with the bits from
> cr4_pv32_mask ORed in.

Argh - and that would need doing simultaneously on all CPUs, as it seems.
Getting a little too complicated, I guess.

Jan