Instead of using the pv_ops vector for the RDMSR/WRMSR related
functions, use a more explicit approach which allows the RDMSR/WRMSR
instructions to be inlined directly when not running as a Xen PV guest.
By using cpu_feature_enabled(X86_FEATURE_XENPV) for the Xen PV case,
the related calls into Xen-specific code are statically disabled via
alternatives-based runtime patching on all other systems.
Signed-off-by: Juergen Gross <jgross@suse.com>
---
V2:
- new patch
---
arch/x86/include/asm/msr.h | 57 ++++++++++++++++++++++-----
arch/x86/include/asm/paravirt.h | 45 ---------------------
arch/x86/include/asm/paravirt_types.h | 13 ------
arch/x86/kernel/paravirt.c | 5 ---
arch/x86/xen/enlighten_pv.c | 20 ++++------
arch/x86/xen/pmu.c | 1 +
6 files changed, 57 insertions(+), 84 deletions(-)
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index cc592611e333..d42cd2c19818 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -290,24 +290,22 @@ static __always_inline void native_wrmsr(u32 msr, u32 low, u32 high)
native_wrmsrq(msr, (u64)high << 32 | low);
}
-static inline u64 native_read_msr(u32 msr)
+static __always_inline u64 native_read_msr(u32 msr)
{
return native_rdmsrq(msr);
}
-static inline int native_read_msr_safe(u32 msr, u64 *val)
+static __always_inline int native_read_msr_safe(u32 msr, u64 *val)
{
return __rdmsr(msr, val, EX_TYPE_RDMSR_SAFE) ? -EIO : 0;
}
-/* Can be uninlined because referenced by paravirt */
-static inline void notrace native_write_msr(u32 msr, u64 val)
+static __always_inline void native_write_msr(u32 msr, u64 val)
{
native_wrmsrq(msr, val);
}
-/* Can be uninlined because referenced by paravirt */
-static inline int notrace native_write_msr_safe(u32 msr, u64 val)
+static __always_inline int native_write_msr_safe(u32 msr, u64 val)
{
return __wrmsrq(msr, val, EX_TYPE_WRMSR_SAFE) ? -EIO : 0;
}
@@ -325,8 +323,49 @@ static inline u64 native_read_pmc(int counter)
return EAX_EDX_VAL(val, low, high);
}
-#ifdef CONFIG_PARAVIRT_XXL
-#include <asm/paravirt.h>
+#ifdef CONFIG_XEN_PV
+#include <asm/xen/msr.h>
+
+static __always_inline u64 read_msr(u32 msr)
+{
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return xen_read_msr(msr);
+
+ return native_rdmsrq(msr);
+}
+
+static __always_inline int read_msr_safe(u32 msr, u64 *p)
+{
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return xen_read_msr_safe(msr, p);
+
+ return native_read_msr_safe(msr, p);
+}
+
+static __always_inline void write_msr(u32 msr, u64 val)
+{
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ xen_write_msr(msr, val);
+ else
+ native_wrmsrq(msr, val);
+}
+
+static __always_inline int write_msr_safe(u32 msr, u64 val)
+{
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return xen_write_msr_safe(msr, val);
+
+ return native_write_msr_safe(msr, val);
+}
+
+static __always_inline u64 rdpmc(int counter)
+{
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return xen_read_pmc(counter);
+
+ return native_read_pmc(counter);
+}
+
#else
static __always_inline u64 read_msr(u32 msr)
{
@@ -353,7 +392,7 @@ static __always_inline u64 rdpmc(int counter)
return native_read_pmc(counter);
}
-#endif /* !CONFIG_PARAVIRT_XXL */
+#endif /* !CONFIG_XEN_PV */
/*
* Access to machine-specific registers (available on 586 and better only)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index dc297f62b935..45f47b7d9f56 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -175,51 +175,6 @@ static inline void __write_cr4(unsigned long x)
PVOP_VCALL1(cpu.write_cr4, x);
}
-static inline u64 paravirt_read_msr(u32 msr)
-{
- return PVOP_CALL1(u64, cpu.read_msr, msr);
-}
-
-static inline void paravirt_write_msr(u32 msr, u64 val)
-{
- PVOP_VCALL2(cpu.write_msr, msr, val);
-}
-
-static inline int paravirt_read_msr_safe(u32 msr, u64 *val)
-{
- return PVOP_CALL2(int, cpu.read_msr_safe, msr, val);
-}
-
-static inline int paravirt_write_msr_safe(u32 msr, u64 val)
-{
- return PVOP_CALL2(int, cpu.write_msr_safe, msr, val);
-}
-
-static __always_inline u64 read_msr(u32 msr)
-{
- return paravirt_read_msr(msr);
-}
-
-static __always_inline int read_msr_safe(u32 msr, u64 *p)
-{
- return paravirt_read_msr_safe(msr, p);
-}
-
-static __always_inline void write_msr(u32 msr, u64 val)
-{
- paravirt_write_msr(msr, val);
-}
-
-static __always_inline int write_msr_safe(u32 msr, u64 val)
-{
- return paravirt_write_msr_safe(msr, val);
-}
-
-static __always_inline u64 rdpmc(int counter)
-{
- return PVOP_CALL1(u64, cpu.read_pmc, counter);
-}
-
static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
PVOP_VCALL2(cpu.alloc_ldt, ldt, entries);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 37a8627d8277..0d03e658ea8f 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -90,19 +90,6 @@ struct pv_cpu_ops {
void (*cpuid)(unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx);
- /* Unsafe MSR operations. These will warn or panic on failure. */
- u64 (*read_msr)(u32 msr);
- void (*write_msr)(u32 msr, u64 val);
-
- /*
- * Safe MSR operations.
- * Returns 0 or -EIO.
- */
- int (*read_msr_safe)(u32 msr, u64 *val);
- int (*write_msr_safe)(u32 msr, u64 val);
-
- u64 (*read_pmc)(int counter);
-
void (*start_context_switch)(struct task_struct *prev);
void (*end_context_switch)(struct task_struct *next);
#endif
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index ab3e172dcc69..240eeb1beab5 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -129,11 +129,6 @@ struct paravirt_patch_template pv_ops = {
.cpu.read_cr0 = native_read_cr0,
.cpu.write_cr0 = native_write_cr0,
.cpu.write_cr4 = native_write_cr4,
- .cpu.read_msr = native_read_msr,
- .cpu.write_msr = native_write_msr,
- .cpu.read_msr_safe = native_read_msr_safe,
- .cpu.write_msr_safe = native_write_msr_safe,
- .cpu.read_pmc = native_read_pmc,
.cpu.load_tr_desc = native_load_tr_desc,
.cpu.set_ldt = native_set_ldt,
.cpu.load_gdt = native_load_gdt,
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 26bbaf4b7330..df653099c567 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1160,15 +1160,16 @@ static void xen_do_write_msr(u32 msr, u64 val, int *err)
}
}
-static int xen_read_msr_safe(u32 msr, u64 *val)
+int xen_read_msr_safe(u32 msr, u64 *val)
{
int err = 0;
*val = xen_do_read_msr(msr, &err);
return err;
}
+EXPORT_SYMBOL(xen_read_msr_safe);
-static int xen_write_msr_safe(u32 msr, u64 val)
+int xen_write_msr_safe(u32 msr, u64 val)
{
int err = 0;
@@ -1176,20 +1177,23 @@ static int xen_write_msr_safe(u32 msr, u64 val)
return err;
}
+EXPORT_SYMBOL(xen_write_msr_safe);
-static u64 xen_read_msr(u32 msr)
+u64 xen_read_msr(u32 msr)
{
int err = 0;
return xen_do_read_msr(msr, xen_msr_safe ? &err : NULL);
}
+EXPORT_SYMBOL(xen_read_msr);
-static void xen_write_msr(u32 msr, u64 val)
+void xen_write_msr(u32 msr, u64 val)
{
int err;
xen_do_write_msr(msr, val, xen_msr_safe ? &err : NULL);
}
+EXPORT_SYMBOL(xen_write_msr);
/* This is called once we have the cpu_possible_mask */
void __init xen_setup_vcpu_info_placement(void)
@@ -1225,14 +1229,6 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = {
.write_cr4 = xen_write_cr4,
- .read_msr = xen_read_msr,
- .write_msr = xen_write_msr,
-
- .read_msr_safe = xen_read_msr_safe,
- .write_msr_safe = xen_write_msr_safe,
-
- .read_pmc = xen_read_pmc,
-
.load_tr_desc = paravirt_nop,
.set_ldt = xen_set_ldt,
.load_gdt = xen_load_gdt,
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
index d49a3bdc448b..d0dea950cd4f 100644
--- a/arch/x86/xen/pmu.c
+++ b/arch/x86/xen/pmu.c
@@ -370,6 +370,7 @@ u64 xen_read_pmc(int counter)
else
return xen_intel_read_pmc(counter);
}
+EXPORT_SYMBOL(xen_read_pmc);
int pmu_apic_update(uint32_t val)
{
--
2.51.0
Hi Juergen,

kernel test robot noticed the following build errors:

[auto build test ERROR on tip/x86/core]
[also build test ERROR on linus/master v6.17]
[cannot apply to kvm/queue kvm/next tip/master kvm/linux-next tip/auto-latest next-20250929]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Juergen-Gross/coco-tdx-Rename-MSR-access-helpers/20250930-150753
base:   tip/x86/core
patch link:    https://lore.kernel.org/r/20250930070356.30695-12-jgross%40suse.com
patch subject: [PATCH v2 11/12] x86/paravirt: Don't use pv_ops vector for MSR access functions
config: x86_64-buildonly-randconfig-001-20251001 (https://download.01.org/0day-ci/archive/20251001/202510010555.InsgYDTd-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251001/202510010555.InsgYDTd-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202510010555.InsgYDTd-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from arch/x86/kernel/asm-offsets.c:9:
   In file included from include/linux/crypto.h:18:
   In file included from include/linux/slab.h:16:
   In file included from include/linux/gfp.h:7:
   In file included from include/linux/mmzone.h:22:
   In file included from include/linux/mm_types.h:16:
   In file included from include/linux/uprobes.h:18:
   In file included from include/linux/timer.h:6:
   In file included from include/linux/ktime.h:25:
   In file included from include/linux/jiffies.h:10:
   In file included from include/linux/time.h:60:
   In file included from include/linux/time32.h:13:
   In file included from include/linux/timex.h:67:
   In file included from arch/x86/include/asm/timex.h:6:
   In file included from arch/x86/include/asm/tsc.h:11:
>> arch/x86/include/asm/msr.h:327:10: fatal error: 'asm/xen/msr.h' file not found
     327 | #include <asm/xen/msr.h>
         |          ^~~~~~~~~~~~~~~
   1 error generated.
   make[3]: *** [scripts/Makefile.build:182: arch/x86/kernel/asm-offsets.s] Error 1 shuffle=3471495288
   make[3]: Target 'prepare' not remade because of errors.
   make[2]: *** [Makefile:1282: prepare0] Error 2 shuffle=3471495288
   make[2]: Target 'prepare' not remade because of errors.
   make[1]: *** [Makefile:248: __sub-make] Error 2 shuffle=3471495288
   make[1]: Target 'prepare' not remade because of errors.
   make: *** [Makefile:248: __sub-make] Error 2 shuffle=3471495288
   make: Target 'prepare' not remade because of errors.


vim +327 arch/x86/include/asm/msr.h

   325	
   326	#ifdef CONFIG_XEN_PV
 > 327	#include <asm/xen/msr.h>
   328	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On 30.09.25 23:27, kernel test robot wrote:
> Hi Juergen,
>
> kernel test robot noticed the following build errors:
>
> [auto build test ERROR on tip/x86/core]
> [also build test ERROR on linus/master v6.17]
> [cannot apply to kvm/queue kvm/next tip/master kvm/linux-next tip/auto-latest next-20250929]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> url:    https://github.com/intel-lab-lkp/linux/commits/Juergen-Gross/coco-tdx-Rename-MSR-access-helpers/20250930-150753
> base:   tip/x86/core
> patch link:    https://lore.kernel.org/r/20250930070356.30695-12-jgross%40suse.com
> patch subject: [PATCH v2 11/12] x86/paravirt: Don't use pv_ops vector for MSR access functions
> config: x86_64-buildonly-randconfig-001-20251001 (https://download.01.org/0day-ci/archive/20251001/202510010555.InsgYDTd-lkp@intel.com/config)
> compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251001/202510010555.InsgYDTd-lkp@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202510010555.InsgYDTd-lkp@intel.com/
>
> All errors (new ones prefixed by >>):
>
>    In file included from arch/x86/kernel/asm-offsets.c:9:
>    In file included from include/linux/crypto.h:18:
>    In file included from include/linux/slab.h:16:
>    In file included from include/linux/gfp.h:7:
>    In file included from include/linux/mmzone.h:22:
>    In file included from include/linux/mm_types.h:16:
>    In file included from include/linux/uprobes.h:18:
>    In file included from include/linux/timer.h:6:
>    In file included from include/linux/ktime.h:25:
>    In file included from include/linux/jiffies.h:10:
>    In file included from include/linux/time.h:60:
>    In file included from include/linux/time32.h:13:
>    In file included from include/linux/timex.h:67:
>    In file included from arch/x86/include/asm/timex.h:6:
>    In file included from arch/x86/include/asm/tsc.h:11:
>>> arch/x86/include/asm/msr.h:327:10: fatal error: 'asm/xen/msr.h' file not found
>      327 | #include <asm/xen/msr.h>
>          |          ^~~~~~~~~~~~~~~
>    1 error generated.
>    make[3]: *** [scripts/Makefile.build:182: arch/x86/kernel/asm-offsets.s] Error 1 shuffle=3471495288
>    make[3]: Target 'prepare' not remade because of errors.
>    make[2]: *** [Makefile:1282: prepare0] Error 2 shuffle=3471495288
>    make[2]: Target 'prepare' not remade because of errors.
>    make[1]: *** [Makefile:248: __sub-make] Error 2 shuffle=3471495288
>    make[1]: Target 'prepare' not remade because of errors.
>    make: *** [Makefile:248: __sub-make] Error 2 shuffle=3471495288
>    make: Target 'prepare' not remade because of errors.
>
>
> vim +327 arch/x86/include/asm/msr.h
>
>     325	
>     326	#ifdef CONFIG_XEN_PV
>   > 327	#include <asm/xen/msr.h>
>     328	

Uh, I forgot to "git add" that new header file. Will be corrected in V3.


Juergen
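[For reference, the forgotten header only needs to carry the prototypes used
by the new msr.h wrappers. The following is a minimal sketch reconstructed
from the function signatures visible in the enlighten_pv.c and pmu.c hunks
above; the actual file in V3 may look different:]

/* arch/x86/include/asm/xen/msr.h (sketch only, not the actual V3 file) */
#ifndef _ASM_X86_XEN_MSR_H
#define _ASM_X86_XEN_MSR_H

#include <linux/types.h>

/* Unsafe variants: warn or panic on failure, depending on xen_msr_safe. */
u64 xen_read_msr(u32 msr);
void xen_write_msr(u32 msr, u64 val);

/* Safe variants: return 0 or -EIO. */
int xen_read_msr_safe(u32 msr, u64 *val);
int xen_write_msr_safe(u32 msr, u64 val);

u64 xen_read_pmc(int counter);

#endif /* _ASM_X86_XEN_MSR_H */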
On Tue, Sep 30, 2025 at 09:03:55AM +0200, Juergen Gross wrote:

> +static __always_inline u64 read_msr(u32 msr)
> +{
> +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
> +		return xen_read_msr(msr);
> +
> +	return native_rdmsrq(msr);
> +}
> +
> +static __always_inline int read_msr_safe(u32 msr, u64 *p)
> +{
> +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
> +		return xen_read_msr_safe(msr, p);
> +
> +	return native_read_msr_safe(msr, p);
> +}
> +
> +static __always_inline void write_msr(u32 msr, u64 val)
> +{
> +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
> +		xen_write_msr(msr, val);
> +	else
> +		native_wrmsrq(msr, val);
> +}
> +
> +static __always_inline int write_msr_safe(u32 msr, u64 val)
> +{
> +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
> +		return xen_write_msr_safe(msr, val);
> +
> +	return native_write_msr_safe(msr, val);
> +}
> +
> +static __always_inline u64 rdpmc(int counter)
> +{
> +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
> +		return xen_read_pmc(counter);
> +
> +	return native_read_pmc(counter);
> +}

Egads, didn't we just construct giant ALTERNATIVE()s for the native_
things? Why wrap that in a cpu_feature_enabled() instead of just adding
one more case to the ALTERNATIVE() ?
On 30.09.25 10:38, Peter Zijlstra wrote:
> On Tue, Sep 30, 2025 at 09:03:55AM +0200, Juergen Gross wrote:
>
>> +static __always_inline u64 read_msr(u32 msr)
>> +{
>> +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
>> +		return xen_read_msr(msr);
>> +
>> +	return native_rdmsrq(msr);
>> +}
>> +
>> +static __always_inline int read_msr_safe(u32 msr, u64 *p)
>> +{
>> +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
>> +		return xen_read_msr_safe(msr, p);
>> +
>> +	return native_read_msr_safe(msr, p);
>> +}
>> +
>> +static __always_inline void write_msr(u32 msr, u64 val)
>> +{
>> +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
>> +		xen_write_msr(msr, val);
>> +	else
>> +		native_wrmsrq(msr, val);
>> +}
>> +
>> +static __always_inline int write_msr_safe(u32 msr, u64 val)
>> +{
>> +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
>> +		return xen_write_msr_safe(msr, val);
>> +
>> +	return native_write_msr_safe(msr, val);
>> +}
>> +
>> +static __always_inline u64 rdpmc(int counter)
>> +{
>> +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
>> +		return xen_read_pmc(counter);
>> +
>> +	return native_read_pmc(counter);
>> +}
>
> Egads, didn't we just construct giant ALTERNATIVE()s for the native_
> things? Why wrap that in a cpu_feature_enabled() instead of just adding
> one more case to the ALTERNATIVE() ?

The problem I encountered with using pv_ops was to implement the *_safe()
variants. There is no simple way to do that using ALTERNATIVE_<n>(), as
in the Xen PV case the call will remain, and I didn't find a way to
specify a sane interface between the call-site and the called Xen function
to return the error indicator. Remember that at the call site the main
interface is the one of the RDMSR/WRMSR instructions. They lack an error
indicator.

In Xin's series there was a patch written initially by you to solve such
a problem by adding the _ASM_EXTABLE_FUNC_REWIND() exception table method.
I think this is a dead end, as it will break when using a shadow stack.

Additionally I found a rather ugly hack only to avoid re-iterating most of
the bare metal ALTERNATIVE() for the paravirt case. It is possible, but the
bare metal case is gaining one additional ALTERNATIVE level, resulting in
patching the original instruction with an identical copy first.

Another benefit of my approach is the dropping of
"#include <asm/paravirt.h>" from msr.h, which is making life a little
bit easier.


Juergen
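[To make the sticking point concrete: the plain accessors map onto
ALTERNATIVE() naturally, because the replacement call can keep the register
interface of the RDMSR/WRMSR instruction it replaces. A minimal sketch of
that easy case only, assuming a hypothetical asm_xen_write_msr trampoline
and the kernel's ALTERNATIVE() inline asm macro:]

/*
 * Sketch only, not from the patch.  The asm_xen_write_msr trampoline is
 * hypothetical: it would take the MSR index in %ecx and the value in
 * %edx:%eax, i.e. the WRMSR register interface, and would have to
 * preserve all other registers.
 */
static __always_inline void write_msr_alt(u32 msr, u64 val)
{
	asm_inline volatile(
		ALTERNATIVE("wrmsr",
			    "call asm_xen_write_msr", X86_FEATURE_XENPV)
		: /* no outputs: WRMSR has none, so the call has none */
		: "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32))
		: "memory");
}

[The *_safe() variants do not fit this pattern as-is, since no register in
the WRMSR interface is reserved for an error code; that is exactly the gap
discussed in the rest of the thread.]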
On Tue, Sep 30, 2025 at 11:02:52AM +0200, Jürgen Groß wrote:
> On 30.09.25 10:38, Peter Zijlstra wrote:
> > On Tue, Sep 30, 2025 at 09:03:55AM +0200, Juergen Gross wrote:
> >
> > > +static __always_inline u64 read_msr(u32 msr)
> > > +{
> > > +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
> > > +		return xen_read_msr(msr);
> > > +
> > > +	return native_rdmsrq(msr);
> > > +}
> > > +
> > > +static __always_inline int read_msr_safe(u32 msr, u64 *p)
> > > +{
> > > +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
> > > +		return xen_read_msr_safe(msr, p);
> > > +
> > > +	return native_read_msr_safe(msr, p);
> > > +}
> > > +
> > > +static __always_inline void write_msr(u32 msr, u64 val)
> > > +{
> > > +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
> > > +		xen_write_msr(msr, val);
> > > +	else
> > > +		native_wrmsrq(msr, val);
> > > +}
> > > +
> > > +static __always_inline int write_msr_safe(u32 msr, u64 val)
> > > +{
> > > +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
> > > +		return xen_write_msr_safe(msr, val);
> > > +
> > > +	return native_write_msr_safe(msr, val);
> > > +}
> > > +
> > > +static __always_inline u64 rdpmc(int counter)
> > > +{
> > > +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
> > > +		return xen_read_pmc(counter);
> > > +
> > > +	return native_read_pmc(counter);
> > > +}
> >
> > Egads, didn't we just construct giant ALTERNATIVE()s for the native_
> > things? Why wrap that in a cpu_feature_enabled() instead of just adding
> > one more case to the ALTERNATIVE() ?
>
> The problem I encountered with using pv_ops was to implement the *_safe()
> variants. There is no simple way to do that using ALTERNATIVE_<n>(), as
> in the Xen PV case the call will remain, and I didn't find a way to
> specify a sane interface between the call-site and the called Xen function
> to return the error indicator. Remember that at the call site the main
> interface is the one of the RDMSR/WRMSR instructions. They lack an error
> indicator.

Would've been useful Changelog material that I suppose.

> In Xin's series there was a patch written initially by you to solve such
> a problem by adding the _ASM_EXTABLE_FUNC_REWIND() exception table method.
> I think this is a dead end, as it will break when using a shadow stack.

No memories, let me go search. I found this:

  https://patchwork.ozlabs.org/project/linux-ide/patch/20250331082251.3171276-12-xin@zytor.com/

That's the other Peter :-)

Anyway, with shadowstack you should be able to frob SSP along with SP in
the exception context. IIRC the SSP 'return' value is on the SS itself,
so a WRSS to that field can easily make the whole CALL go away.

> Additionally I found a rather ugly hack only to avoid re-iterating most of
> the bare metal ALTERNATIVE() for the paravirt case. It is possible, but the
> bare metal case is gaining one additional ALTERNATIVE level, resulting in
> patching the original instruction with an identical copy first.

OTOH the above generates atrocious crap code :/

You get that _static_cpu_has() crud, which is basically a really fat
jump_label (because it needs to include the runtime test) and then the
code for both your xen thing and the alternative.

/me ponders things a bit..

> Remember that at the call site the main interface is the one of the
> RDMSR/WRMSR instructions. They lack an error indicator.

This, that isn't true.

Note how ex_handler_msr() takes a reg argument and how that sets that
reg to -EIO. See how the current native_read_msr_safe() uses that:

	_ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err])

(also note how using _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_*_SAFE) like you
do, will result in reg being 0 or ax. Scribbling your 0 return value)

It very explicitly uses @err as error return value. So your call would
return eax:edx and take ecx to be the msr, but there is nothing stopping
us from then using say ebx for error return, like:

	int err = 0;

	asm_inline(
		"1:\n"
		ALTERNATIVE("ds rdmsr",
			    "call xen_rdmsr", XENPV)
		"2:\n"

		_ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %%ebx)

		: "a" (ax), "d" (dx), "+b" (err)
		: "c" (msr));

	return err;

Hmm?
On 30.09.25 12:04, Peter Zijlstra wrote:
> On Tue, Sep 30, 2025 at 11:02:52AM +0200, Jürgen Groß wrote:
>> On 30.09.25 10:38, Peter Zijlstra wrote:
>>> On Tue, Sep 30, 2025 at 09:03:55AM +0200, Juergen Gross wrote:
>>>
>>>> +static __always_inline u64 read_msr(u32 msr)
>>>> +{
>>>> +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
>>>> +		return xen_read_msr(msr);
>>>> +
>>>> +	return native_rdmsrq(msr);
>>>> +}
>>>> +
>>>> +static __always_inline int read_msr_safe(u32 msr, u64 *p)
>>>> +{
>>>> +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
>>>> +		return xen_read_msr_safe(msr, p);
>>>> +
>>>> +	return native_read_msr_safe(msr, p);
>>>> +}
>>>> +
>>>> +static __always_inline void write_msr(u32 msr, u64 val)
>>>> +{
>>>> +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
>>>> +		xen_write_msr(msr, val);
>>>> +	else
>>>> +		native_wrmsrq(msr, val);
>>>> +}
>>>> +
>>>> +static __always_inline int write_msr_safe(u32 msr, u64 val)
>>>> +{
>>>> +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
>>>> +		return xen_write_msr_safe(msr, val);
>>>> +
>>>> +	return native_write_msr_safe(msr, val);
>>>> +}
>>>> +
>>>> +static __always_inline u64 rdpmc(int counter)
>>>> +{
>>>> +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
>>>> +		return xen_read_pmc(counter);
>>>> +
>>>> +	return native_read_pmc(counter);
>>>> +}
>>>
>>> Egads, didn't we just construct giant ALTERNATIVE()s for the native_
>>> things? Why wrap that in a cpu_feature_enabled() instead of just adding
>>> one more case to the ALTERNATIVE() ?
>>
>> The problem I encountered with using pv_ops was to implement the *_safe()
>> variants. There is no simple way to do that using ALTERNATIVE_<n>(), as
>> in the Xen PV case the call will remain, and I didn't find a way to
>> specify a sane interface between the call-site and the called Xen function
>> to return the error indicator. Remember that at the call site the main
>> interface is the one of the RDMSR/WRMSR instructions. They lack an error
>> indicator.
>
> Would've been useful Changelog material that I suppose.
>
>> In Xin's series there was a patch written initially by you to solve such
>> a problem by adding the _ASM_EXTABLE_FUNC_REWIND() exception table method.
>> I think this is a dead end, as it will break when using a shadow stack.
>
> No memories, let me go search. I found this:
>
>   https://patchwork.ozlabs.org/project/linux-ide/patch/20250331082251.3171276-12-xin@zytor.com/
>
> That's the other Peter :-)

Oh, my bad, sorry. :-)

> Anyway, with shadowstack you should be able to frob SSP along with SP in
> the exception context. IIRC the SSP 'return' value is on the SS itself,
> so a WRSS to that field can easily make the whole CALL go away.

Yeah, but being able to avoid all of that dance wouldn't be too bad either.

>> Additionally I found a rather ugly hack only to avoid re-iterating most of
>> the bare metal ALTERNATIVE() for the paravirt case. It is possible, but the
>> bare metal case is gaining one additional ALTERNATIVE level, resulting in
>> patching the original instruction with an identical copy first.
>
> OTOH the above generates atrocious crap code :/

Yes.

> You get that _static_cpu_has() crud, which is basically a really fat
> jump_label (because it needs to include the runtime test) and then the
> code for both your xen thing and the alternative.

Seeing both variants would make it easier to decide, I guess.

>
> /me ponders things a bit..
>
>> Remember that at the call site the main interface is the one of the
>> RDMSR/WRMSR instructions. They lack an error indicator.
>
> This, that isn't true.
>
> Note how ex_handler_msr() takes a reg argument and how that sets that
> reg to -EIO. See how the current native_read_msr_safe() uses that:
>
> 	_ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err])
>
> (also note how using _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_*_SAFE) like you
> do, will result in reg being 0 or ax. Scribbling your 0 return value)
>
> It very explicitly uses @err as error return value. So your call would
> return eax:edx and take ecx to be the msr, but there is nothing stopping
> us from then using say ebx for error return, like:
>
> 	int err = 0;
>
> 	asm_inline(
> 		"1:\n"
> 		ALTERNATIVE("ds rdmsr",
> 			    "call xen_rdmsr", XENPV)
> 		"2:\n"
>
> 		_ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %%ebx)
>
> 		: "a" (ax), "d" (dx), "+b" (err)
> 		: "c" (msr));
>
> 	return err;
>
> Hmm?

Oh, indeed.

Let me try that and we can choose the less evil. :-)


Juergen
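[Spelled out as a complete helper, the fragment above might end up roughly
as follows. Illustration only: xen_rdmsr is the hypothetical trampoline name
from the fragment, returning the value in %edx:%eax and reporting failure
via %ebx, XENPV is written out as X86_FEATURE_XENPV, and the output
constraints are filled in where the fragment left placeholders:]

/* Sketch of the ALTERNATIVE() based safe read proposed above. */
static __always_inline int read_msr_safe_alt(u32 msr, u64 *val)
{
	u32 low, high;
	int err = 0;

	asm_inline volatile(
		"1:\n"
		ALTERNATIVE("rdmsr",
			    "call xen_rdmsr", X86_FEATURE_XENPV)
		"2:\n"
		/* On #GP the fixup lands at 2: with %ebx set to -EIO. */
		_ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %%ebx)
		: "=a" (low), "=d" (high), "+b" (err)
		: "c" (msr));

	*val = (u64)high << 32 | low;
	return err;
}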
On 2025-09-30 03:43, Jürgen Groß wrote:
>>
>>> In Xin's series there was a patch written initially by you to solve such
>>> a problem by adding the _ASM_EXTABLE_FUNC_REWIND() exception table method.
>>> I think this is a dead end, as it will break when using a shadow stack.
>>
>> No memories, let me go search. I found this:
>>
>> https://patchwork.ozlabs.org/project/linux-ide/patch/20250331082251.3171276-12-xin@zytor.com/
>>
>> That's the other Peter :-)
>
> Oh, my bad, sorry. :-)

Yes, you would have to patch both the stack and the shadow stack.

BUT: in the end it really doesn't buy much. The only thing that benefits
is Xen, but it is fairly easy for Xen (my original POC did this) to filter
out the quite few MSRs that they do special dispatch for (plus the
variable case), and then they can just use the native code including the
benefit of using WRMSRNS and immediates.

The other approach that I also came up with looks like this:

/* Native code (non-immediate): trap point at +7 */

   0:	48 89 c2             	mov    %rax,%rdx
   3:	48 c1 ea 20          	shr    $0x20,%rdx
   7:	0f 01 c6             	wrmsrns
   a:

/* Native code (immediate): trap point at +0 */

   0:	36 c4 e7 7a f6 c0 xx xx xx xx	ds wrmsrns %rax,$XX
   a:

/* Xen code, stub sets CF = 1 on failure */

   0:	e8 xx xx xx xx       	call   asm_xen_pv_wrmsr
   5:	73 03                	jnc    0xa
   7:	0f 0b                	ud2
   9:	90                   	nop
   a:

The trap point even ends up in the same place! UD2 can be any 1-, 2-, or
3-byte trapping instruction.

>>> Additionally I found a rather ugly hack only to avoid re-iterating most of
>>> the bare metal ALTERNATIVE() for the paravirt case. It is possible, but the
>>> bare metal case is gaining one additional ALTERNATIVE level, resulting in
>>> patching the original instruction with an identical copy first.
>>
>> OTOH the above generates atrocious crap code :/
>
> Yes.

Please don't generate crap code -- that's exactly The Wrong Thing. I
didn't actually look at the output code; I honestly didn't think that
that would even be an issue.

If it is REALLY evil, then do something like a shell script to generate
the code instead... (One big problem here is that cpp doesn't understand
colons as separators...)

	-hpa
On Tue, Sep 30, 2025 at 12:49:21PM -0700, H. Peter Anvin wrote:

> /* Xen code, stub sets CF = 1 on failure */
>
>    0:	e8 xx xx xx xx       	call   asm_xen_pv_wrmsr
>    5:	73 03                	jnc    0xa
>    7:	0f 0b                	ud2
>    9:	90                   	nop
>    a:
>
> The trap point even ends up in the same place! UD2 can be any 1-, 2-, or
> 3-byte trapping instruction.

Please don't rely on flags to be retained by RET. The various
mitigations have trouble with that.
On 2025-09-30 12:49, H. Peter Anvin wrote:
>
> /* Xen code, stub sets CF = 1 on failure */
>
>    0:	e8 xx xx xx xx       	call   asm_xen_pv_wrmsr
>    5:	73 03                	jnc    0xa
>    7:	0f 0b                	ud2
>    9:	90                   	nop
>    a:
>
> The trap point even ends up in the same place! UD2 can be any 1-, 2-, or
> 3-byte trapping instruction.
>

You can, of course, also simply have a conditional jump, at the expense
of making the whole alternative block one byte longer:

   0:	e8 xx xx xx xx       	call   asm_xen_pv_wrmsr
   5:	0f 82 xx xx xx xx    	jc     wrmsr_failed

	-hpa