[RFC PATCH v2 21/34] x86/msr: Utilize the alternatives mechanism to write MSR

Posted by Xin Li (Intel) 8 months ago
The story started from tglx's reply in [1]:

  For actual performance relevant code the current PV ops mechanics
  are a horrorshow when the op defaults to the native instruction.

  look at wrmsrl():

  wrmsrl(msr, val
   wrmsr(msr, (u32)val, (u32)val >> 32))
    paravirt_write_msr(msr, low, high)
      PVOP_VCALL3(cpu.write_msr, msr, low, high)

  Which results in

	mov	$msr, %edi
	mov	$val, %rdx
	mov	%edx, %esi
	shr	$0x20, %rdx
	call	native_write_msr

  and native_write_msr() does at minimum:

	mov    %edi,%ecx
	mov    %esi,%eax
	wrmsr
	ret

  In the worst case 'ret' is going through the return thunk. Not to
  talk about function prologues and whatever.

  This becomes even more silly for trivial instructions like STI/CLI
  or in the worst case paravirt_nop().

  The call makes only sense, when the native default is an actual
  function, but for the trivial cases it's a blatant engineering
  trainwreck.

Later a consensus was reached to utilize the alternatives mechanism to
eliminate the indirect call overhead introduced by the pv_ops APIs:

    1) When built with !CONFIG_XEN_PV, X86_FEATURE_XENPV becomes a
       disabled feature, preventing the Xen code from being built
       and ensuring the native code is executed unconditionally.

    2) When built with CONFIG_XEN_PV:

       2.1) If not running on the Xen hypervisor (!X86_FEATURE_XENPV),
            the kernel runtime binary is patched to unconditionally
            jump to the native MSR write code.

       2.2) If running on the Xen hypervisor (X86_FEATURE_XENPV), the
            kernel runtime binary is patched to unconditionally jump
            to the Xen MSR write code.

The alternatives mechanism is also used to choose the new immediate
form MSR write instruction when it's available.

Consequently, remove the pv_ops MSR write APIs and the Xen callbacks.

[1]: https://lore.kernel.org/lkml/87y1h81ht4.ffs@tglx/
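
For illustration only, here is a hypothetical caller (not part of the change
itself; the real conversions are in the hunks below, e.g. the fred.h one)
showing the constant vs. variable MSR index split described in the msr.h
comment block:

static void example_msr_writes(u32 msr, u64 val)
{
	/*
	 * Constant index: __native_wrmsrq_constant() is selected at compile
	 * time, so the immediate form WRMSRNS can be patched in when the CPU
	 * supports it.  The native_*() APIs always execute the instruction
	 * directly, even on a paravirtualized CPU.
	 */
	native_wrmsrq(MSR_IA32_FRED_RSP0, val);

	/*
	 * Runtime index: the generic wrmsrq() additionally dispatches to the
	 * Xen path on a Xen PV guest, otherwise to the variable-index
	 * WRMSR/WRMSRNS alternative.
	 */
	wrmsrq(msr, val);
}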

Originally-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li (Intel) <xin@zytor.com>
---
 arch/x86/include/asm/fred.h           |   2 +-
 arch/x86/include/asm/msr.h            | 294 ++++++++++++++++++++------
 arch/x86/include/asm/paravirt.h       |  25 ---
 arch/x86/include/asm/paravirt_types.h |   2 -
 arch/x86/kernel/paravirt.c            |   2 -
 arch/x86/xen/enlighten_pv.c           |  41 +---
 arch/x86/xen/xen-asm.S                |  64 ++++++
 arch/x86/xen/xen-ops.h                |   2 +
 8 files changed, 302 insertions(+), 130 deletions(-)

diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h
index 12b34d5b2953..8ae4429e5401 100644
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -101,7 +101,7 @@ static __always_inline void fred_update_rsp0(void)
 	unsigned long rsp0 = (unsigned long) task_stack_page(current) + THREAD_SIZE;
 
 	if (cpu_feature_enabled(X86_FEATURE_FRED) && (__this_cpu_read(fred_rsp0) != rsp0)) {
-		wrmsrns(MSR_IA32_FRED_RSP0, rsp0);
+		native_wrmsrq(MSR_IA32_FRED_RSP0, rsp0);
 		__this_cpu_write(fred_rsp0, rsp0);
 	}
 }
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 8f7a67b1c61c..bd3bdb3c3d23 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -75,9 +75,40 @@ static inline void do_trace_rdpmc(u32 msr, u64 val, int failed) {}
 #endif
 
 #ifdef CONFIG_XEN_PV
+extern void asm_xen_write_msr(void);
 extern u64 xen_read_pmc(int counter);
 #endif
 
+/* The GNU Assembler (Gas) with Binutils 2.40 adds WRMSRNS support */
+#if defined(CONFIG_AS_IS_GNU) && CONFIG_AS_VERSION >= 24000
+#define ASM_WRMSRNS		"wrmsrns"
+#else
+#define ASM_WRMSRNS		_ASM_BYTES(0x0f,0x01,0xc6)
+#endif
+
+/* The GNU Assembler (Gas) with Binutils 2.41 adds the .insn directive support */
+#if defined(CONFIG_AS_IS_GNU) && CONFIG_AS_VERSION >= 24100
+#define ASM_WRMSRNS_IMM			\
+	" .insn VEX.128.F3.M7.W0 0xf6 /0, %[val], %[msr]%{:u32}\n\t"
+#else
+/*
+ * Note, clang also doesn't support the .insn directive.
+ *
+ * The register operand is encoded as %rax because all uses of the immediate
+ * form MSR access instructions reference %rax as the register operand.
+ */
+#define ASM_WRMSRNS_IMM			\
+	" .byte 0xc4,0xe7,0x7a,0xf6,0xc0; .long %c[msr]"
+#endif
+
+#define PREPARE_RDX_FOR_WRMSR		\
+	"mov %%rax, %%rdx\n\t"		\
+	"shr $0x20, %%rdx\n\t"
+
+#define PREPARE_RCX_RDX_FOR_WRMSR	\
+	"mov %[msr], %%ecx\n\t"		\
+	PREPARE_RDX_FOR_WRMSR
+
 /*
  * Called only from an MSR fault handler, the instruction pointer points to
  * the MSR access instruction that caused the fault.
@@ -96,13 +127,6 @@ static __always_inline bool is_msr_imm_insn(void *ip)
 #endif
 }
 
-/*
- * __rdmsr() and __wrmsr() are the two primitives which are the bare minimum MSR
- * accessors and should not have any tracing or other functionality piggybacking
- * on them - those are *purely* for accessing MSRs and nothing more. So don't even
- * think of extending them - you will be slapped with a stinking trout or a frozen
- * shark will reach you, wherever you are! You've been warned.
- */
 static __always_inline u64 __rdmsr(u32 msr)
 {
 	DECLARE_ARGS(val, low, high);
@@ -115,14 +139,6 @@ static __always_inline u64 __rdmsr(u32 msr)
 	return EAX_EDX_VAL(val, low, high);
 }
 
-static __always_inline void __wrmsrq(u32 msr, u64 val)
-{
-	asm volatile("1: wrmsr\n"
-		     "2:\n"
-		     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
-		     : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32)) : "memory");
-}
-
 #define native_rdmsr(msr, val1, val2)			\
 do {							\
 	u64 __val = __rdmsr((msr));			\
@@ -135,12 +151,6 @@ static __always_inline u64 native_rdmsrq(u32 msr)
 	return __rdmsr(msr);
 }
 
-#define native_wrmsr(msr, low, high)			\
-	__wrmsrq((msr), (u64)(high) << 32 | (low))
-
-#define native_wrmsrq(msr, val)				\
-	__wrmsrq((msr), (val))
-
 static inline u64 native_read_msr(u32 msr)
 {
 	u64 val;
@@ -171,7 +181,132 @@ static inline int native_read_msr_safe(u32 msr, u64 *p)
 	return err;
 }
 
-/* Can be uninlined because referenced by paravirt */
+/*
+ * There are two sets of APIs for MSR accesses: native APIs and generic APIs.
+ * Native MSR APIs execute MSR instructions directly, regardless of whether the
+ * CPU is paravirtualized or native.  Generic MSR APIs determine the appropriate
+ * MSR access method at runtime, allowing them to be used generically on both
+ * paravirtualized and native CPUs.
+ *
+ * When the compiler can determine the MSR number at compile time, the APIs
+ * with the suffix _constant() are used to enable the immediate form MSR
+ * instructions when available.  The APIs with the suffix _variable() are
+ * used when the MSR number is not known until run time.
+ *
+ * Below is a diagram illustrating the derivation of the MSR write APIs:
+ *
+ *      __native_wrmsrq_variable()    __native_wrmsrq_constant()
+ *                         \           /
+ *                          \         /
+ *                         __native_wrmsrq()   -----------------------
+ *                            /     \                                |
+ *                           /       \                               |
+ *               native_wrmsrq()    native_write_msr_safe()          |
+ *                   /    \                                          |
+ *                  /      \                                         |
+ *      native_wrmsr()    native_write_msr()                         |
+ *                                                                   |
+ *                                                                   |
+ *                                                                   |
+ *                   __xenpv_wrmsrq()                                |
+ *                         |                                         |
+ *                         |                                         |
+ *                      __wrmsrq()   <--------------------------------
+ *                       /    \
+ *                      /      \
+ *                 wrmsrq()   wrmsrq_safe()
+ *                    /          \
+ *                   /            \
+ *                wrmsr()        wrmsr_safe()
+ */
+
+/*
+ * Non-serializing WRMSR, when available.
+ *
+ * Otherwise, it falls back to a serializing WRMSR.
+ */
+static __always_inline bool __native_wrmsrq_variable(u32 msr, u64 val, int type)
+{
+#ifdef CONFIG_X86_64
+	BUILD_BUG_ON(__builtin_constant_p(msr));
+#endif
+
+	/*
+	 * WRMSR is 2 bytes.  WRMSRNS is 3 bytes.  Pad WRMSR with a redundant
+	 * DS prefix to avoid a trailing NOP.
+	 */
+	asm_inline volatile goto(
+		"1:\n"
+		ALTERNATIVE("ds wrmsr",
+			    ASM_WRMSRNS,
+			    X86_FEATURE_WRMSRNS)
+		_ASM_EXTABLE_TYPE(1b, %l[badmsr], %c[type])
+
+		:
+		: "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32)), [type] "i" (type)
+		: "memory"
+		: badmsr);
+
+	return false;
+
+badmsr:
+	return true;
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * Non-serializing WRMSR or its immediate form, when available.
+ *
+ * Otherwise, it falls back to a serializing WRMSR.
+ */
+static __always_inline bool __native_wrmsrq_constant(u32 msr, u64 val, int type)
+{
+	BUILD_BUG_ON(!__builtin_constant_p(msr));
+
+	asm_inline volatile goto(
+		"1:\n"
+		ALTERNATIVE_2(PREPARE_RCX_RDX_FOR_WRMSR
+			      "2: ds wrmsr",
+			      PREPARE_RCX_RDX_FOR_WRMSR
+			      ASM_WRMSRNS,
+			      X86_FEATURE_WRMSRNS,
+			      ASM_WRMSRNS_IMM,
+			      X86_FEATURE_MSR_IMM)
+		_ASM_EXTABLE_TYPE(1b, %l[badmsr], %c[type])	/* For WRMSRNS immediate */
+		_ASM_EXTABLE_TYPE(2b, %l[badmsr], %c[type])	/* For WRMSR(NS) */
+
+		:
+		: [val] "a" (val), [msr] "i" (msr), [type] "i" (type)
+		: "memory", "ecx", "rdx"
+		: badmsr);
+
+	return false;
+
+badmsr:
+	return true;
+}
+#endif
+
+static __always_inline bool __native_wrmsrq(u32 msr, u64 val, int type)
+{
+#ifdef CONFIG_X86_64
+	if (__builtin_constant_p(msr))
+		return __native_wrmsrq_constant(msr, val, type);
+#endif
+
+	return __native_wrmsrq_variable(msr, val, type);
+}
+
+static __always_inline void native_wrmsrq(u32 msr, u64 val)
+{
+	__native_wrmsrq(msr, val, EX_TYPE_WRMSR);
+}
+
+static __always_inline void native_wrmsr(u32 msr, u32 low, u32 high)
+{
+	native_wrmsrq(msr, (u64)high << 32 | low);
+}
+
 static inline void notrace native_write_msr(u32 msr, u64 val)
 {
 	native_wrmsrq(msr, val);
@@ -180,22 +315,82 @@ static inline void notrace native_write_msr(u32 msr, u64 val)
 		do_trace_write_msr(msr, val, 0);
 }
 
-/* Can be uninlined because referenced by paravirt */
 static inline int notrace native_write_msr_safe(u32 msr, u64 val)
 {
-	int err;
+	int err = __native_wrmsrq(msr, val, EX_TYPE_WRMSR_SAFE) ? -EIO : 0;
 
-	asm volatile("1: wrmsr ; xor %[err],%[err]\n"
-		     "2:\n\t"
-		     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %[err])
-		     : [err] "=a" (err)
-		     : "c" (msr), "0" ((u32)val), "d" ((u32)(val >> 32))
-		     : "memory");
 	if (tracepoint_enabled(write_msr))
 		do_trace_write_msr(msr, val, err);
+
 	return err;
 }
 
+#ifdef CONFIG_XEN_PV
+/* No plan to support immediate form MSR instructions in Xen */
+static __always_inline bool __xenpv_wrmsrq(u32 msr, u64 val, int type)
+{
+	asm_inline volatile goto(
+		"call asm_xen_write_msr\n\t"
+		"jnz 2f\n\t"
+		ALTERNATIVE("1: ds wrmsr",
+			    ASM_WRMSRNS,
+			    X86_FEATURE_WRMSRNS)
+		"2:\n"
+		_ASM_EXTABLE_TYPE(1b, %l[badmsr], %c[type])	/* For WRMSR(NS) */
+
+		: ASM_CALL_CONSTRAINT
+		: "a" (val), "c" (msr), [type] "i" (type)
+		: "memory", "rdx"
+		: badmsr);
+
+	return false;
+
+badmsr:
+	return true;
+}
+#endif
+
+static __always_inline bool __wrmsrq(u32 msr, u64 val, int type)
+{
+	bool ret;
+
+#ifdef CONFIG_XEN_PV
+	if (cpu_feature_enabled(X86_FEATURE_XENPV))
+		return __xenpv_wrmsrq(msr, val, type);
+#endif
+
+	/*
+	 * 1) When built with !CONFIG_XEN_PV.
+	 * 2) When built with CONFIG_XEN_PV but not running on Xen hypervisor.
+	 */
+	ret = __native_wrmsrq(msr, val, type);
+
+	if (tracepoint_enabled(write_msr))
+		do_trace_write_msr(msr, val, ret ? -EIO : 0);
+
+	return ret;
+}
+
+static __always_inline void wrmsrq(u32 msr, u64 val)
+{
+	__wrmsrq(msr, val, EX_TYPE_WRMSR);
+}
+
+static __always_inline void wrmsr(u32 msr, u32 low, u32 high)
+{
+	wrmsrq(msr, (u64)high << 32 | low);
+}
+
+static __always_inline int wrmsrq_safe(u32 msr, u64 val)
+{
+	return __wrmsrq(msr, val, EX_TYPE_WRMSR_SAFE) ? -EIO : 0;
+}
+
+static __always_inline int wrmsr_safe(u32 msr, u32 low, u32 high)
+{
+	return wrmsrq_safe(msr, (u64)high << 32 | low);
+}
+
 extern int rdmsr_safe_regs(u32 regs[8]);
 extern int wrmsr_safe_regs(u32 regs[8]);
 
@@ -242,25 +437,9 @@ do {								\
 	(void)((high) = (u32)(__val >> 32));			\
 } while (0)
 
-static inline void wrmsr(u32 msr, u32 low, u32 high)
-{
-	native_write_msr(msr, (u64)high << 32 | low);
-}
-
 #define rdmsrq(msr, val)			\
 	((val) = native_read_msr((msr)))
 
-static inline void wrmsrq(u32 msr, u64 val)
-{
-	native_write_msr(msr, val);
-}
-
-/* wrmsr with exception handling */
-static inline int wrmsrq_safe(u32 msr, u64 val)
-{
-	return native_write_msr_safe(msr, val);
-}
-
 /* rdmsr with exception handling */
 #define rdmsr_safe(msr, low, high)				\
 ({								\
@@ -277,29 +456,6 @@ static inline int rdmsrq_safe(u32 msr, u64 *p)
 }
 #endif	/* !CONFIG_PARAVIRT_XXL */
 
-/* Instruction opcode for WRMSRNS supported in binutils >= 2.40 */
-#define WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6)
-
-/* Non-serializing WRMSR, when available.  Falls back to a serializing WRMSR. */
-static __always_inline void wrmsrns(u32 msr, u64 val)
-{
-	/*
-	 * WRMSR is 2 bytes.  WRMSRNS is 3 bytes.  Pad WRMSR with a redundant
-	 * DS prefix to avoid a trailing NOP.
-	 */
-	asm volatile("1: " ALTERNATIVE("ds wrmsr", WRMSRNS, X86_FEATURE_WRMSRNS)
-		     "2: " _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
-		     : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32)));
-}
-
-/*
- * Dual u32 version of wrmsrq_safe():
- */
-static inline int wrmsr_safe(u32 msr, u32 low, u32 high)
-{
-	return wrmsrq_safe(msr, (u64)high << 32 | low);
-}
-
 struct msr __percpu *msrs_alloc(void);
 void msrs_free(struct msr __percpu *msrs);
 int msr_set_bit(u32 msr, u8 bit);
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 1bd1dad8da5a..6634f6cf801f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -180,21 +180,11 @@ static inline u64 paravirt_read_msr(unsigned msr)
 	return PVOP_CALL1(u64, cpu.read_msr, msr);
 }
 
-static inline void paravirt_write_msr(u32 msr, u64 val)
-{
-	PVOP_VCALL2(cpu.write_msr, msr, val);
-}
-
 static inline u64 paravirt_read_msr_safe(unsigned msr, int *err)
 {
 	return PVOP_CALL2(u64, cpu.read_msr_safe, msr, err);
 }
 
-static inline int paravirt_write_msr_safe(u32 msr, u64 val)
-{
-	return PVOP_CALL2(int, cpu.write_msr_safe, msr, val);
-}
-
 #define rdmsr(msr, val1, val2)			\
 do {						\
 	u64 _l = paravirt_read_msr(msr);	\
@@ -202,26 +192,11 @@ do {						\
 	val2 = _l >> 32;			\
 } while (0)
 
-static __always_inline void wrmsr(u32 msr, u32 low, u32 high)
-{
-	paravirt_write_msr(msr, (u64)high << 32 | low);
-}
-
 #define rdmsrq(msr, val)			\
 do {						\
 	val = paravirt_read_msr(msr);		\
 } while (0)
 
-static inline void wrmsrq(u32 msr, u64 val)
-{
-	paravirt_write_msr(msr, val);
-}
-
-static inline int wrmsrq_safe(u32 msr, u64 val)
-{
-	return paravirt_write_msr_safe(msr, val)
-}
-
 /* rdmsr with exception handling */
 #define rdmsr_safe(msr, a, b)				\
 ({							\
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index d2db38c32bc5..18bb0e5bd22f 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -92,14 +92,12 @@ struct pv_cpu_ops {
 
 	/* Unsafe MSR operations.  These will warn or panic on failure. */
 	u64 (*read_msr)(unsigned int msr);
-	void (*write_msr)(u32 msr, u64 val);
 
 	/*
 	 * Safe MSR operations.
 	 * Returns 0 or -EIO.
 	 */
 	int (*read_msr_safe)(unsigned int msr, u64 *val);
-	int (*write_msr_safe)(u32 msr, u64 val);
 
 	void (*start_context_switch)(struct task_struct *prev);
 	void (*end_context_switch)(struct task_struct *next);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 28d195ad7514..62bf66f61821 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -129,9 +129,7 @@ struct paravirt_patch_template pv_ops = {
 	.cpu.write_cr0		= native_write_cr0,
 	.cpu.write_cr4		= native_write_cr4,
 	.cpu.read_msr		= native_read_msr,
-	.cpu.write_msr		= native_write_msr,
 	.cpu.read_msr_safe	= native_read_msr_safe,
-	.cpu.write_msr_safe	= native_write_msr_safe,
 	.cpu.load_tr_desc	= native_load_tr_desc,
 	.cpu.set_ldt		= native_set_ldt,
 	.cpu.load_gdt		= native_load_gdt,
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 195e6501a000..4672de7fc084 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1118,26 +1118,26 @@ static void set_seg(u32 which, u64 base)
 }
 
 /*
- * Support write_msr_safe() and write_msr() semantics.
- * With err == NULL write_msr() semantics are selected.
- * Supplying an err pointer requires err to be pre-initialized with 0.
+ * Return true to indicate the requested MSR write has been done successfully,
+ * otherwise return false so that the calling MSR write primitives in msr.h
+ * fail.
  */
-static void xen_do_write_msr(u32 msr, u64 val, int *err)
+bool xen_write_msr(u32 msr, u64 val)
 {
 	bool emulated;
 
 	switch (msr) {
 	case MSR_FS_BASE:
 		set_seg(SEGBASE_FS, val);
-		break;
+		return true;
 
 	case MSR_KERNEL_GS_BASE:
 		set_seg(SEGBASE_GS_USER, val);
-		break;
+		return true;
 
 	case MSR_GS_BASE:
 		set_seg(SEGBASE_GS_KERNEL, val);
-		break;
+		return true;
 
 	case MSR_STAR:
 	case MSR_CSTAR:
@@ -1149,16 +1149,13 @@ static void xen_do_write_msr(u32 msr, u64 val, int *err)
 		/* Fast syscall setup is all done in hypercalls, so
 		   these are all ignored.  Stub them out here to stop
 		   Xen console noise. */
-		break;
+		return true;
 
 	default:
 		if (pmu_msr_chk_emulated(msr, &val, false, &emulated) && emulated)
-			return;
+			return true;
 
-		if (err)
-			*err = native_write_msr_safe(msr, val);
-		else
-			native_write_msr(msr, val);
+		return false;
 	}
 }
 
@@ -1170,15 +1167,6 @@ static int xen_read_msr_safe(unsigned int msr, u64 *val)
 	return err;
 }
 
-static int xen_write_msr_safe(u32 msr, u64 val)
-{
-	int err = 0;
-
-	xen_do_write_msr(msr, val, &err);
-
-	return err;
-}
-
 static u64 xen_read_msr(unsigned int msr)
 {
 	int err;
@@ -1186,13 +1174,6 @@ static u64 xen_read_msr(unsigned int msr)
 	return xen_do_read_msr(msr, xen_msr_safe ? &err : NULL);
 }
 
-static void xen_write_msr(u32 msr, u64 val)
-{
-	int err;
-
-	xen_do_write_msr(msr, val, xen_msr_safe ? &err : NULL);
-}
-
 /* This is called once we have the cpu_possible_mask */
 void __init xen_setup_vcpu_info_placement(void)
 {
@@ -1228,10 +1209,8 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = {
 		.write_cr4 = xen_write_cr4,
 
 		.read_msr = xen_read_msr,
-		.write_msr = xen_write_msr,
 
 		.read_msr_safe = xen_read_msr_safe,
-		.write_msr_safe = xen_write_msr_safe,
 
 		.load_tr_desc = paravirt_nop,
 		.set_ldt = xen_set_ldt,
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 461bb1526502..eecce47fbe49 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -342,3 +342,67 @@ SYM_CODE_END(xen_entry_SYSENTER_compat)
 SYM_CODE_END(xen_entry_SYSCALL_compat)
 
 #endif	/* CONFIG_IA32_EMULATION */
+
+/*
+ * To leverage the alternatives mechanism and eliminate the overhead of Xen
+ * MSR and PMU counter access on native systems, as well as to enable new MSR
+ * instructions based on their availability, assembly trampoline functions
+ * are introduced when CONFIG_XEN_PV is enabled.
+ *
+ * Since these trampoline functions are invoked without saving callee registers,
+ * they must save the callee registers and the frame pointer.
+ */
+.macro XEN_SAVE_CALLEE_REGS_FOR_MSR
+	push %rcx
+	push %rdi
+	push %rsi
+	push %r8
+	push %r9
+	push %r10
+	push %r11
+.endm
+
+.macro XEN_RESTORE_CALLEE_REGS_FOR_MSR
+	pop %r11
+	pop %r10
+	pop %r9
+	pop %r8
+	pop %rsi
+	pop %rdi
+	pop %rcx
+.endm
+
+/*
+ * MSR number in %ecx, MSR value in %rax.
+ *
+ * %edx is set up to match %rax >> 32 like the native stub
+ * is expected to do
+ *
+ * Let xen_write_msr() return 'false' if the MSR access should
+ * be executed natively, IOW, 'true' means it has done the job.
+ *
+ * 	bool xen_write_msr(u32 msr, u64 value)
+ *
+ * If ZF=1 then this will fall through to the actual native WRMSR[NS]
+ * instruction.
+ *
+ * This also removes the need for Xen to maintain different safe and
+ * unsafe MSR routines, as the difference is handled by the same
+ * trap handler as is used natively.
+ */
+SYM_FUNC_START(asm_xen_write_msr)
+	ENDBR
+	FRAME_BEGIN
+	push %rax		/* Save in case of native fallback */
+	XEN_SAVE_CALLEE_REGS_FOR_MSR
+	mov %ecx, %edi		/* MSR number */
+	mov %rax, %rsi		/* MSR data */
+	call xen_write_msr
+	test %al, %al		/* %al=1, i.e., ZF=0, means successfully done */
+	XEN_RESTORE_CALLEE_REGS_FOR_MSR
+	mov 4(%rsp), %edx	/* Set up %edx for native execution */
+	pop %rax
+	FRAME_END
+	RET
+SYM_FUNC_END(asm_xen_write_msr)
+EXPORT_SYMBOL_GPL(asm_xen_write_msr)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index fde9f9d7415f..56712242262a 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -146,6 +146,8 @@ __visible unsigned long xen_read_cr2_direct(void);
 /* These are not functions, and cannot be called normally */
 __visible void xen_iret(void);
 
+extern bool xen_write_msr(u32 msr, u64 val);
+
 extern int xen_panic_handler_init(void);
 
 int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int),
-- 
2.49.0
Re: [RFC PATCH v2 21/34] x86/msr: Utilize the alternatives mechanism to write MSR
Posted by Jürgen Groß 8 months ago
On 22.04.25 10:22, Xin Li (Intel) wrote:
> The story started from tglx's reply in [1]:
> 
>    For actual performance relevant code the current PV ops mechanics
>    are a horrorshow when the op defaults to the native instruction.
> 
>    look at wrmsrl():
> 
>    wrmsrl(msr, val
>     wrmsr(msr, (u32)val, (u32)val >> 32))
>      paravirt_write_msr(msr, low, high)
>        PVOP_VCALL3(cpu.write_msr, msr, low, high)
> 
>    Which results in
> 
> 	mov	$msr, %edi
> 	mov	$val, %rdx
> 	mov	%edx, %esi
> 	shr	$0x20, %rdx
> 	call	native_write_msr
> 
>    and native_write_msr() does at minimum:
> 
> 	mov    %edi,%ecx
> 	mov    %esi,%eax
> 	wrmsr
> 	ret
> 
>    In the worst case 'ret' is going through the return thunk. Not to
>    talk about function prologues and whatever.
> 
>    This becomes even more silly for trivial instructions like STI/CLI
>    or in the worst case paravirt_nop().

This is nonsense.

In the non-Xen case the initial indirect call is directly replaced with
STI/CLI via alternative patching, while for Xen it is replaced by a direct
call.

The paravirt_nop() case is handled in alt_replace_call() by replacing the
indirect call with a nop in case the target of the call was paravirt_nop()
(which is in fact no_func()).

> 
>    The call makes only sense, when the native default is an actual
>    function, but for the trivial cases it's a blatant engineering
>    trainwreck.

The trivial cases are all handled as stated above: a direct replacement
instruction is placed at the indirect call position.

> Later a consensus was reached to utilize the alternatives mechanism to
> eliminate the indirect call overhead introduced by the pv_ops APIs:
> 
>      1) When built with !CONFIG_XEN_PV, X86_FEATURE_XENPV becomes a
>         disabled feature, preventing the Xen code from being built
>         and ensuring the native code is executed unconditionally.

This is the case today already. There is no need for any change to have
this in place.

> 
>      2) When built with CONFIG_XEN_PV:
> 
>         2.1) If not running on the Xen hypervisor (!X86_FEATURE_XENPV),
>              the kernel runtime binary is patched to unconditionally
>              jump to the native MSR write code.
> 
>         2.2) If running on the Xen hypervisor (X86_FEATURE_XENPV), the
>              kernel runtime binary is patched to unconditionally jump
>              to the Xen MSR write code.

I can't see what is different here compared to today's state.

> 
> The alternatives mechanism is also used to choose the new immediate
> form MSR write instruction when it's available.

Yes, this needs to be added.

> Consequently, remove the pv_ops MSR write APIs and the Xen callbacks.

I still don't see a major difference to today's solution.

Only the "paravirt" term has been eliminated.


Juergen
Re: [RFC PATCH v2 21/34] x86/msr: Utilize the alternatives mechanism to write MSR
Posted by Peter Zijlstra 7 months, 4 weeks ago
On Tue, Apr 22, 2025 at 11:57:01AM +0200, Jürgen Groß wrote:
> On 22.04.25 10:22, Xin Li (Intel) wrote:

> >    This becomes even more silly for trivial instructions like STI/CLI
> >    or in the worst case paravirt_nop().
> 
> This is nonsense.

What Jurgen says. Someone hasn't done their homework.

static __always_inline void arch_local_irq_disable(void)
{
        PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT_XEN);
}

static __always_inline void arch_local_irq_enable(void)
{
        PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT_XEN);
}

That very much patches in STI/CLI directly when not Xen.

Re: [RFC PATCH v2 21/34] x86/msr: Utilize the alternatives mechanism to write MSR
Posted by Xin Li 8 months ago
On 4/22/2025 2:57 AM, Jürgen Groß wrote:
> On 22.04.25 10:22, Xin Li (Intel) wrote:
>> The story started from tglx's reply in [1]:
>>
>>    For actual performance relevant code the current PV ops mechanics
>>    are a horrorshow when the op defaults to the native instruction.
>>
>>    look at wrmsrl():
>>
>>    wrmsrl(msr, val
>>     wrmsr(msr, (u32)val, (u32)val >> 32))
>>      paravirt_write_msr(msr, low, high)
>>        PVOP_VCALL3(cpu.write_msr, msr, low, high)
>>
>>    Which results in
>>
>>     mov    $msr, %edi
>>     mov    $val, %rdx
>>     mov    %edx, %esi
>>     shr    $0x20, %rdx
>>     call    native_write_msr
>>
>>    and native_write_msr() does at minimum:
>>
>>     mov    %edi,%ecx
>>     mov    %esi,%eax
>>     wrmsr
>>     ret
>>
>>    In the worst case 'ret' is going through the return thunk. Not to
>>    talk about function prologues and whatever.
>>
>>    This becomes even more silly for trivial instructions like STI/CLI
>>    or in the worst case paravirt_nop().
> 
> This is nonsense.
> 
> In the non-Xen case the initial indirect call is directly replaced with
> STI/CLI via alternative patching, while for Xen it is replaced by a direct
> call.
> 
> The paravirt_nop() case is handled in alt_replace_call() by replacing the
> indirect call with a nop in case the target of the call was paravirt_nop()
> (which is in fact no_func()).
> 
>>
>>    The call makes only sense, when the native default is an actual
>>    function, but for the trivial cases it's a blatant engineering
>>    trainwreck.
> 
> The trivial cases are all handled as stated above: a direct replacement
> instruction is placed at the indirect call position.

The above comment was given in 2023 IIRC, and you have addressed it.

> 
>> Later a consensus was reached to utilize the alternatives mechanism to
>> eliminate the indirect call overhead introduced by the pv_ops APIs:
>>
>>      1) When built with !CONFIG_XEN_PV, X86_FEATURE_XENPV becomes a
>>         disabled feature, preventing the Xen code from being built
>>         and ensuring the native code is executed unconditionally.
> 
> This is the case today already. There is no need for any change to have
> this in place.
> 
>>
>>      2) When built with CONFIG_XEN_PV:
>>
>>         2.1) If not running on the Xen hypervisor (!X86_FEATURE_XENPV),
>>              the kernel runtime binary is patched to unconditionally
>>              jump to the native MSR write code.
>>
>>         2.2) If running on the Xen hypervisor (X86_FEATURE_XENPV), the
>>              kernel runtime binary is patched to unconditionally jump
>>              to the Xen MSR write code.
> 
> I can't see what is different here compared to today's state.
> 
>>
>> The alternatives mechanism is also used to choose the new immediate
>> form MSR write instruction when it's available.
> 
> Yes, this needs to be added.
> 
>> Consequently, remove the pv_ops MSR write APIs and the Xen callbacks.
> 
> I still don't see a major difference to today's solution.

The existing code generates:

     ...
     bf e0 06 00 00          mov    $0x6e0,%edi
     89 d6                   mov    %edx,%esi
     48 c1 ea 20             shr    $0x20,%rdx
     ff 15 07 48 8c 01       call   *0x18c4807(%rip)  # <pv_ops+0xb8>
     31 c0                   xor    %eax,%eax
     ...

And on native, the indirect call instruction is patched to a direct call
as you mentioned:

     ...
     bf e0 06 00 00          mov    $0x6e0,%edi
     89 d6                   mov    %edx,%esi
     48 c1 ea 20             shr    $0x20,%rdx
     e8 60 3e 01 00          call   <{native,xen}_write_msr> # direct
     90                      nop
     31 c0                   xor    %eax,%eax
     ...


This patch set generates assembly w/o CALL on native:

     ...
     e9 e6 22 c6 01          jmp    1f   # on native or nop on Xen
     b9 e0 06 00 00          mov    $0x6e0,%ecx
     e8 91 d4 fa ff          call   ffffffff8134ee80 <asm_xen_write_msr>
     e9 a4 9f eb 00          jmp    ffffffff8225b9a0 <__x86_return_thunk>
         ...
1:  b9 e0 06 00 00          mov    $0x6e0,%ecx   # immediate form here
     48 89 c2                mov    %rax,%rdx
     48 c1 ea 20             shr    $0x20,%rdx
     3e 0f 30                ds wrmsr
     ...

It's not a major change, but when it is patched to use the immediate 
form MSR write instruction, it's straightforwardly streamlined.

> 
> Only the "paravirt" term has been eliminated.

Yes.

But a PV guest doesn't operate at the highest privilege level, which
means MSR instructions typically result in a #GP fault.  I actually 
think the pv_ops MSR APIs are unnecessary because of this inherent
limitation.

Looking at the Xen MSR code, except for the PMU and just a few MSRs, it falls
back to executing native MSR instructions.  As MSR instructions trigger
#GP, Xen takes control and handles them in 2 ways:

   1) emulate (or ignore) an MSR operation and skip the guest instruction.

   2) inject the #GP back to the guest OS and let its #GP handler handle it.
      But the Linux MSR exception handler just ignores the MSR instruction
      (an MCE MSR exception will panic).

So why not let Xen handle all the details which it already tries to do?
(Linux w/ such a change may not be able to run on old Xen hypervisors.)

BTW, if performance is a concern, writes to MSR_KERNEL_GS_BASE and
MSR_GS_BASE are hypercalls into Xen anyway.

Thanks!
     Xin

Re: [RFC PATCH v2 21/34] x86/msr: Utilize the alternatives mechanism to write MSR
Posted by Jürgen Groß 8 months ago
On 23.04.25 10:51, Xin Li wrote:
> On 4/22/2025 2:57 AM, Jürgen Groß wrote:
>> On 22.04.25 10:22, Xin Li (Intel) wrote:
>>> The story started from tglx's reply in [1]:
>>>
>>>    For actual performance relevant code the current PV ops mechanics
>>>    are a horrorshow when the op defaults to the native instruction.
>>>
>>>    look at wrmsrl():
>>>
>>>    wrmsrl(msr, val
>>>     wrmsr(msr, (u32)val, (u32)val >> 32))
>>>      paravirt_write_msr(msr, low, high)
>>>        PVOP_VCALL3(cpu.write_msr, msr, low, high)
>>>
>>>    Which results in
>>>
>>>     mov    $msr, %edi
>>>     mov    $val, %rdx
>>>     mov    %edx, %esi
>>>     shr    $0x20, %rdx
>>>     call    native_write_msr
>>>
>>>    and native_write_msr() does at minimum:
>>>
>>>     mov    %edi,%ecx
>>>     mov    %esi,%eax
>>>     wrmsr
>>>     ret
>>>
>>>    In the worst case 'ret' is going through the return thunk. Not to
>>>    talk about function prologues and whatever.
>>>
>>>    This becomes even more silly for trivial instructions like STI/CLI
>>>    or in the worst case paravirt_nop().
>>
>> This is nonsense.
>>
>> In the non-Xen case the initial indirect call is directly replaced with
>> STI/CLI via alternative patching, while for Xen it is replaced by a direct
>> call.
>>
>> The paravirt_nop() case is handled in alt_replace_call() by replacing the
>> indirect call with a nop in case the target of the call was paravirt_nop()
>> (which is in fact no_func()).
>>
>>>
>>>    The call makes only sense, when the native default is an actual
>>>    function, but for the trivial cases it's a blatant engineering
>>>    trainwreck.
>>
>> The trivial cases are all handled as stated above: a direct replacement
>> instruction is placed at the indirect call position.
> 
> The above comment was given in 2023 IIRC, and you have addressed it.
> 
>>
>>> Later a consensus was reached to utilize the alternatives mechanism to
>>> eliminate the indirect call overhead introduced by the pv_ops APIs:
>>>
>>>      1) When built with !CONFIG_XEN_PV, X86_FEATURE_XENPV becomes a
>>>         disabled feature, preventing the Xen code from being built
>>>         and ensuring the native code is executed unconditionally.
>>
>> This is the case today already. There is no need for any change to have
>> this in place.
>>
>>>
>>>      2) When built with CONFIG_XEN_PV:
>>>
>>>         2.1) If not running on the Xen hypervisor (!X86_FEATURE_XENPV),
>>>              the kernel runtime binary is patched to unconditionally
>>>              jump to the native MSR write code.
>>>
>>>         2.2) If running on the Xen hypervisor (X86_FEATURE_XENPV), the
>>>              kernel runtime binary is patched to unconditionally jump
>>>              to the Xen MSR write code.
>>
>> I can't see what is different here compared to today's state.
>>
>>>
>>> The alternatives mechanism is also used to choose the new immediate
>>> form MSR write instruction when it's available.
>>
>> Yes, this needs to be added.
>>
>>> Consequently, remove the pv_ops MSR write APIs and the Xen callbacks.
>>
>> I still don't see a major difference to today's solution.
> 
> The existing code generates:
> 
>      ...
>      bf e0 06 00 00          mov    $0x6e0,%edi
>      89 d6                   mov    %edx,%esi
>      48 c1 ea 20             shr    $0x20,%rdx
>      ff 15 07 48 8c 01       call   *0x18c4807(%rip)  # <pv_ops+0xb8>
>      31 c0                   xor    %eax,%eax
>      ...
> 
> And on native, the indirect call instruction is patched to a direct call
> as you mentioned:
> 
>      ...
>      bf e0 06 00 00          mov    $0x6e0,%edi
>      89 d6                   mov    %edx,%esi
>      48 c1 ea 20             shr    $0x20,%rdx
>      e8 60 3e 01 00          call   <{native,xen}_write_msr> # direct
>      90                      nop
>      31 c0                   xor    %eax,%eax
>      ...
> 
> 
> This patch set generates assembly w/o CALL on native:
> 
>      ...
>      e9 e6 22 c6 01          jmp    1f   # on native or nop on Xen
>      b9 e0 06 00 00          mov    $0x6e0,%ecx
>      e8 91 d4 fa ff          call   ffffffff8134ee80 <asm_xen_write_msr>
>      e9 a4 9f eb 00          jmp    ffffffff8225b9a0 <__x86_return_thunk>
>          ...
> 1:  b9 e0 06 00 00          mov    $0x6e0,%ecx   # immediate form here
>      48 89 c2                mov    %rax,%rdx
>      48 c1 ea 20             shr    $0x20,%rdx
>      3e 0f 30                ds wrmsr
>      ...
> 
> It's not a major change, but when it is patched to use the immediate form MSR 
> write instruction, it's straightforwardly streamlined.

It should be rather easy to switch the current wrmsr/rdmsr paravirt patching
locations to use the rdmsr/wrmsr instructions instead of doing a call to
native_*msr().

The case of the new immediate form could be handled the same way.
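
A minimal sketch of that idea, assuming the call site is kept and only the
patched instruction changes; the stub name and the exact constraints are
illustrative, not the actual API:

static __always_inline void pv_wrmsrq_inline(u32 msr, u64 val)
{
	/*
	 * The base case is a call to a (hypothetical) Xen stub that takes
	 * %ecx/%eax:%edx like WRMSR and preserves the caller-saved registers
	 * itself, as asm_xen_write_msr does in the patch; on !XENPV the call
	 * is patched to the bare instruction.
	 */
	asm volatile(ALTERNATIVE("call xen_wrmsr_stub",
				 "wrmsr",
				 ALT_NOT(X86_FEATURE_XENPV))
		     : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32))
		     : "memory");
}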

> 
>>
>> Only the "paravirt" term has been eliminated.
> 
> Yes.
> 
> But a PV guest doesn't operate at the highest privilege level, which
> means MSR instructions typically result in a #GP fault.  I actually think the 
> pv_ops MSR APIs are unnecessary because of this inherent
> limitation.
> 
> Looking at the Xen MSR code, except PMU and just a few MSRs, it falls
> back to executes native MSR instructions.  As MSR instructions trigger
> #GP, Xen takes control and handles them in 2 ways:
> 
>    1) emulate (or ignore) a MSR operation and skip the guest instruction.
> 
>    2) inject the #GP back to guest OS and let its #GP handler handle it.
>       But Linux MSR exception handler just ignores the MSR instruction
>       (MCE MSR exception will panic).
> 
> So why not let Xen handle all the details which it already tries to do?

Some MSRs are not handled that way, but via kernel-internal emulation.
Those are handled that way mostly for performance reasons, and some
need special treatment.

> (Linux w/ such a change may not be able to run on old Xen hypervisors.)

Yes, and this is something to avoid.

And remember that Linux isn't the only existing PV-mode guest.

> BTW, if performance is a concern, writes to MSR_KERNEL_GS_BASE and
> MSR_GS_BASE anyway are hpyercalls into Xen.

Yes, and some other MSR writes are just NOPs with Xen-PV.


Juergen
Re: [RFC PATCH v2 21/34] x86/msr: Utilize the alternatives mechanism to write MSR
Posted by Peter Zijlstra 7 months, 4 weeks ago
On Wed, Apr 23, 2025 at 06:05:19PM +0200, Jürgen Groß wrote:

> > It's not a major change, but when it is patched to use the immediate
> > form MSR write instruction, it's straightforwardly streamlined.
> 
> It should be rather easy to switch the current wrmsr/rdmsr paravirt patching
> locations to use the rdmsr/wrmsr instructions instead of doing a call to
> native_*msr().

Right, just make the Xen functions asm stubs that expect the instruction
registers instead of C-abi and ALT_NOT_XEN the thing.

Shouldn't be hard at all.
Re: [RFC PATCH v2 21/34] x86/msr: Utilize the alternatives mechanism to write MSR
Posted by H. Peter Anvin 7 months, 4 weeks ago
On April 25, 2025 5:33:17 AM PDT, Peter Zijlstra <peterz@infradead.org> wrote:
>On Wed, Apr 23, 2025 at 06:05:19PM +0200, Jürgen Groß wrote:
>
>> > It's not a major change, but when it is patched to use the immediate
>> > form MSR write instruction, it's straightforwardly streamlined.
>> 
>> It should be rather easy to switch the current wrmsr/rdmsr paravirt patching
>> locations to use the rdmsr/wrmsr instructions instead of doing a call to
>> native_*msr().
>
>Right, just make the Xen functions asm stubs that expect the instruction
>registers instead of C-abi and ALT_NOT_XEN the thing.
>
>Shouldn't be hard at all.

And that's what we will be doing. We already have code for that.
Re: [RFC PATCH v2 21/34] x86/msr: Utilize the alternatives mechanism to write MSR
Posted by Jürgen Groß 7 months, 4 weeks ago
On 25.04.25 14:33, Peter Zijlstra wrote:
> On Wed, Apr 23, 2025 at 06:05:19PM +0200, Jürgen Groß wrote:
> 
>>> It's not a major change, but when it is patched to use the immediate
>>> form MSR write instruction, it's straightforwardly streamlined.
>>
>> It should be rather easy to switch the current wrmsr/rdmsr paravirt patching
>> locations to use the rdmsr/wrmsr instructions instead of doing a call to
>> native_*msr().
> 
> Right, just make the Xen functions asm stubs that expect the instruction
> registers instead of C-abi and ALT_NOT_XEN the thing.
> 
> Shouldn't be hard at all.

Correct. And for the new immediate form we can use ALTERNATIVE_3().


Juergen
Re: [RFC PATCH v2 21/34] x86/msr: Utilize the alternatives mechanism to write MSR
Posted by H. Peter Anvin 7 months, 3 weeks ago
On April 25, 2025 5:51:27 AM PDT, "Jürgen Groß" <jgross@suse.com> wrote:
>On 25.04.25 14:33, Peter Zijlstra wrote:
>> On Wed, Apr 23, 2025 at 06:05:19PM +0200, Jürgen Groß wrote:
>> 
>>>> It's not a major change, but when it is patched to use the immediate
>>>> form MSR write instruction, it's straightforwardly streamlined.
>>> 
>>> It should be rather easy to switch the current wrmsr/rdmsr paravirt patching
>>> locations to use the rdmsr/wrmsr instructions instead of doing a call to
>>> native_*msr().
>> 
>> Right, just make the Xen functions asm stubs that expect the instruction
>> registers instead of C-abi and ALT_NOT_XEN the thing.
>> 
>> Shouldn't be hard at all.
>
>Correct. And for the new immediate form we can use ALTERNATIVE_3().
>
>
>Juergen

Yes; in the ultimate case there are *four* alternatives, but the concept is the same and again we have it implemented already.
Re: [RFC PATCH v2 21/34] x86/msr: Utilize the alternatives mechanism to write MSR
Posted by Xin Li 7 months, 4 weeks ago
On 4/23/2025 9:05 AM, Jürgen Groß wrote:
>> It's not a major change, but when it is patched to use the immediate 
>> form MSR write instruction, it's straightforwardly streamlined.
> 
> It should be rather easy to switch the current wrmsr/rdmsr paravirt 
> patching
> locations to use the rdmsr/wrmsr instructions instead of doing a call to
> native_*msr().
> 
> The case of the new immediate form could be handled the same way.

Actually, that is how we get this patch with the existing alternatives
infrastructure.  And we took a step further to also remove the pv_ops
MSR APIs...

It looks to me that you want to add a new facility to the alternatives
infrastructure first?


>>> Only the "paravirt" term has been eliminated.
>>
>> Yes.
>>
>> But a PV guest doesn't operate at the highest privilege level, which
>> means MSR instructions typically result in a #GP fault.  I actually 
>> think the pv_ops MSR APIs are unnecessary because of this inherent
>> limitation.
>>
>> Looking at the Xen MSR code, except PMU and just a few MSRs, it falls
>> back to executes native MSR instructions.  As MSR instructions trigger
>> #GP, Xen takes control and handles them in 2 ways:
>>
>>    1) emulate (or ignore) a MSR operation and skip the guest instruction.
>>
>>    2) inject the #GP back to guest OS and let its #GP handler handle it.
>>       But Linux MSR exception handler just ignores the MSR instruction
>>       (MCE MSR exception will panic).
>>
>> So why not let Xen handle all the details which it already tries to do?
> 
> Some MSRs are not handled that way, but via a kernel internal emulation.
> And those are handled that way mostly due to performance reasons. And some
> need special treatment.
> 
>> (Linux w/ such a change may not be able to run on old Xen hypervisors.)
> 
> Yes, and this is something to avoid.
> 
> And remember that Linux isn't the only PV-mode guest existing.
> 
>> BTW, if performance is a concern, writes to MSR_KERNEL_GS_BASE and
>> MSR_GS_BASE anyway are hpyercalls into Xen.
> 
> Yes, and some other MSR writes are just NOPs with Xen-PV.
> 

I will do some cleanup and refactor first.

BTW, at least we can merge the safe() APIs into the non-safe() ones.

Thanks!
     Xin
Re: [RFC PATCH v2 21/34] x86/msr: Utilize the alternatives mechanism to write MSR
Posted by Jürgen Groß 7 months, 4 weeks ago
On 24.04.25 10:06, Xin Li wrote:
> On 4/23/2025 9:05 AM, Jürgen Groß wrote:
>>> It's not a major change, but when it is patched to use the immediate form MSR 
>>> write instruction, it's straightforwardly streamlined.
>>
>> It should be rather easy to switch the current wrmsr/rdmsr paravirt patching
>> locations to use the rdmsr/wrmsr instructions instead of doing a call to
>> native_*msr().
>>
>> The case of the new immediate form could be handled the same way.
> 
> Actually, that is how we get this patch with the existing alternatives
> infrastructure.  And we took a step further to also remove the pv_ops
> MSR APIs...

And this is what I'm questioning. IMHO this approach is adding more
code by removing the pv_ops MSR APIs just because "pv_ops is bad". And
I believe most refusal of pv_ops is based on no longer valid reasoning.

> It looks to me that you want to add a new facility to the alternatives
> infrastructure first?

Why would we need a new facility in the alternatives infrastructure?


Juergen
Re: [RFC PATCH v2 21/34] x86/msr: Utilize the alternatives mechanism to write MSR
Posted by H. Peter Anvin 7 months, 4 weeks ago
On 4/24/25 01:14, Jürgen Groß wrote:
>>
>> Actually, that is how we get this patch with the existing alternatives
>> infrastructure.  And we took a step further to also remove the pv_ops
>> MSR APIs...
> 
> And this is what I'm questioning. IMHO this approach is adding more
> code by removing the pv_ops MSR_APIs just because "pv_ops is bad". And
> I believe most refusal of pv_ops is based on no longer valid reasoning.
> 

pvops are a headache because they are effectively a secondary alternatives 
infrastructure that is incompatible with the alternatives one...

>> It looks to me that you want to add a new facility to the alternatives
>> infrastructure first?
> 
> Why would we need a new facility in the alternatives infrastructure?

I'm not sure what Xin means with "facility", but a key motivation for 
this is to:

a. Avoid using the pvops for MSRs when the only remaining user 
thereof (Xen) is only using it for a very small subset of MSRs and for 
the rest it is just overhead, even for Xen;

b. Being able to do wrmsrns immediate/wrmsrns/wrmsr and rdmsr 
immediate/rdmsr alternatives.

Of these, (b) is by far the biggest motivation. The architectural 
direction for supervisor states is to avoid ad hoc and XSAVES ISA and 
instead use MSRs. The immediate forms are expected to be significantly 
faster, because they make the MSR index available at the very beginning 
of the pipeline instead of at a relatively late stage.

	-hpa


Re: [RFC PATCH v2 21/34] x86/msr: Utilize the alternatives mechanism to write MSR
Posted by Jürgen Groß 7 months, 4 weeks ago
On 25.04.25 03:15, H. Peter Anvin wrote:
> On 4/24/25 01:14, Jürgen Groß wrote:
>>>
>>> Actually, that is how we get this patch with the existing alternatives
>>> infrastructure.  And we took a step further to also remove the pv_ops
>>> MSR APIs...
>>
>> And this is what I'm questioning. IMHO this approach is adding more
>> code by removing the pv_ops MSR_APIs just because "pv_ops is bad". And
>> I believe most refusal of pv_ops is based on no longer valid reasoning.
>>
> 
> pvops are a headache because it is effectively a secondary alternatives 
> infrastructure that is incompatible with the alternatives one...

Huh? How can that be, as pv_ops is using only the alternatives infrastructure
for doing the patching?

I'd say today pv_ops is a convenience wrapper around alternatives.

> 
>>> It looks to me that you want to add a new facility to the alternatives
>>> infrastructure first?
>>
>> Why would we need a new facility in the alternatives infrastructure?
> 
> I'm not sure what Xin means with "facility", but a key motivation for this is to:
> 
> a. Avoid using the pvops for MSRs when on the only remaining user thereof (Xen) 
> is only using it for a very small subset of MSRs and for the rest it is just 
> overhead, even for Xen;
> 
> b. Being able to do wrmsrns immediate/wrmsrns/wrmsr and rdmsr immediate/rdmsr 
> alternatives.
> 
> Of these, (b) is by far the biggest motivation. The architectural direction for 
> supervisor states is to avoid ad hoc and XSAVES ISA and instead use MSRs. The 
> immediate forms are expected to be significantly faster, because they make the 
> MSR index available at the very beginning of the pipeline instead of at a 
> relatively late stage.

I understand the motivation for b), but I think this could be achieved without
a) rather easily. And I continue to believe that your reasoning for a) is based
on old facts. But may be I'm just not understanding your concerns with today's
pv_ops implementation.


Juergen
Re: [RFC PATCH v2 21/34] x86/msr: Utilize the alternatives mechanism to write MSR
Posted by H. Peter Anvin 7 months, 4 weeks ago
On 4/24/25 18:15, H. Peter Anvin wrote:
> On 4/24/25 01:14, Jürgen Groß wrote:
>>>
>>> Actually, that is how we get this patch with the existing alternatives
>>> infrastructure.  And we took a step further to also remove the pv_ops
>>> MSR APIs...
>>
>> And this is what I'm questioning. IMHO this approach is adding more
>> code by removing the pv_ops MSR_APIs just because "pv_ops is bad". And
>> I believe most refusal of pv_ops is based on no longer valid reasoning.
>>
> 
> pvops are a headache because it is effectively a secondary alternatives 
> infrastructure that is incompatible with the alternatives one...
> 
>>> It looks to me that you want to add a new facility to the alternatives
>>> infrastructure first?
>>
>> Why would we need a new facility in the alternatives infrastructure?
> 
> I'm not sure what Xin means with "facility", but a key motivation for 
> this is to:
> 
> a. Avoid using the pvops for MSRs when on the only remaining user 
> thereof (Xen) is only using it for a very small subset of MSRs and for 
> the rest it is just overhead, even for Xen;
> 
> b. Being able to do wrmsrns immediate/wrmsrns/wrmsr and rdmsr immediate/ 
> rdmsr alternatives.
> 
> Of these, (b) is by far the biggest motivation. The architectural 
> direction for supervisor states is to avoid ad hoc and XSAVES ISA and 
> instead use MSRs. The immediate forms are expected to be significantly 
> faster, because they make the MSR index available at the very beginning 
> of the pipeline instead of at a relatively late stage.
> 

Note that to support the immediate forms, we *must* do these inline, or 
the const-ness of the MSR index -- which applies to by far the vast 
majority of MSR references -- gets lost. pvops does exactly that.

Furthermore, the MSR immediate instructions take a 64-bit number in a 
single register; as these instructions are by necessity relatively long, 
it makes sense for the alternative sequence to accept a 64-bit input 
register and do the %eax/%edx shuffle in the legacy fallback code... we 
did a bunch of experiments to see what made most sense.
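
For reference, that is what the fallback in the patch does: the value arrives
in %rax, and only the legacy WRMSR path rebuilds the %edx:%eax pair (quoted
from the msr.h hunk):

#define PREPARE_RDX_FOR_WRMSR		\
	"mov %%rax, %%rdx\n\t"		\
	"shr $0x20, %%rdx\n\t"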

	-hpa

Re: [RFC PATCH v2 21/34] x86/msr: Utilize the alternatives mechanism to write MSR
Posted by Jürgen Groß 7 months, 4 weeks ago
On 25.04.25 05:44, H. Peter Anvin wrote:
> On 4/24/25 18:15, H. Peter Anvin wrote:
>> On 4/24/25 01:14, Jürgen Groß wrote:
>>>>
>>>> Actually, that is how we get this patch with the existing alternatives
>>>> infrastructure.  And we took a step further to also remove the pv_ops
>>>> MSR APIs...
>>>
>>> And this is what I'm questioning. IMHO this approach is adding more
>>> code by removing the pv_ops MSR_APIs just because "pv_ops is bad". And
>>> I believe most refusal of pv_ops is based on no longer valid reasoning.
>>>
>>
>> pvops are a headache because it is effectively a secondary alternatives 
>> infrastructure that is incompatible with the alternatives one...
>>
>>>> It looks to me that you want to add a new facility to the alternatives
>>>> infrastructure first?
>>>
>>> Why would we need a new facility in the alternatives infrastructure?
>>
>> I'm not sure what Xin means with "facility", but a key motivation for this is to:
>>
>> a. Avoid using the pvops for MSRs when on the only remaining user thereof 
>> (Xen) is only using it for a very small subset of MSRs and for the rest it is 
>> just overhead, even for Xen;
>>
>> b. Being able to do wrmsrns immediate/wrmsrns/wrmsr and rdmsr immediate/ rdmsr 
>> alternatives.
>>
>> Of these, (b) is by far the biggest motivation. The architectural direction 
>> for supervisor states is to avoid ad hoc and XSAVES ISA and instead use MSRs. 
>> The immediate forms are expected to be significantly faster, because they make 
>> the MSR index available at the very beginning of the pipeline instead of at a 
>> relatively late stage.
>>
> 
> Note that to support the immediate forms, we *must* do these inline, or the 
> const-ness of the MSR index -- which applies to by far the vast majority of MSR 
> references -- gets lost. pvops does exactly that.
> 
> Furthermore, the MSR immediate instructions take a 64-bit number in a single 
> register; as these instructions are by necessity relatively long, it makes sense 
> for the alternative sequence to accept a 64-bit input register and do the %eax/ 
> %edx shuffle in the legacy fallback code... we did a bunch of experiments to see 
> what made most sense.

Yes, I understand that.

And I'm totally in favor of Xin's rework of the MSR low level functions.

Inlining the MSR access instructions with pv_ops should not be very
complicated. We do that with other instructions (STI/CLI, PTE accesses)
today, so this is no new kind of functionality.

I could have a try writing a patch achieving that, but I would only start
that work in case you might consider taking it instead of Xin's patch
removing the pv_ops usage for rdmsr/wrmsr. In case it turns out that my
version results in more code changes than Xin's patch, I'd be fine to drop
my patch, of course.


Juergen
Re: [RFC PATCH v2 21/34] x86/msr: Utilize the alternatives mechanism to write MSR
Posted by H. Peter Anvin 7 months, 4 weeks ago
On April 25, 2025 12:01:29 AM PDT, "Jürgen Groß" <jgross@suse.com> wrote:
>On 25.04.25 05:44, H. Peter Anvin wrote:
>> On 4/24/25 18:15, H. Peter Anvin wrote:
>>> On 4/24/25 01:14, Jürgen Groß wrote:
>>>>> 
>>>>> Actually, that is how we get this patch with the existing alternatives
>>>>> infrastructure.  And we took a step further to also remove the pv_ops
>>>>> MSR APIs...
>>>> 
>>>> And this is what I'm questioning. IMHO this approach is adding more
>>>> code by removing the pv_ops MSR_APIs just because "pv_ops is bad". And
>>>> I believe most refusal of pv_ops is based on no longer valid reasoning.
>>>> 
>>> 
>>> pvops are a headache because it is effectively a secondary alternatives infrastructure that is incompatible with the alternatives one...
>>> 
>>>>> It looks to me that you want to add a new facility to the alternatives
>>>>> infrastructure first?
>>>> 
>>>> Why would we need a new facility in the alternatives infrastructure?
>>> 
>>> I'm not sure what Xin means with "facility", but a key motivation for this is to:
>>> 
>>> a. Avoid using the pvops for MSRs when on the only remaining user thereof (Xen) is only using it for a very small subset of MSRs and for the rest it is just overhead, even for Xen;
>>> 
>>> b. Being able to do wrmsrns immediate/wrmsrns/wrmsr and rdmsr immediate/ rdmsr alternatives.
>>> 
>>> Of these, (b) is by far the biggest motivation. The architectural direction for supervisor states is to avoid ad hoc and XSAVES ISA and instead use MSRs. The immediate forms are expected to be significantly faster, because they make the MSR index available at the very beginning of the pipeline instead of at a relatively late stage.
>>> 
>> 
>> Note that to support the immediate forms, we *must* do these inline, or the const-ness of the MSR index -- which applies to by far the vast majority of MSR references -- gets lost. pvops does exactly that.
>> 
>> Furthermore, the MSR immediate instructions take a 64-bit number in a single register; as these instructions are by necessity relatively long, it makes sense for the alternative sequence to accept a 64-bit input register and do the %eax/ %edx shuffle in the legacy fallback code... we did a bunch of experiments to see what made most sense.
>
>Yes, I understand that.
>
>And I'm totally in favor of Xin's rework of the MSR low level functions.
>
>Inlining the MSR access instructions with pv_ops should not be very
>complicated. We do that with other instructions (STI/CLI, PTE accesses)
>today, so this is no new kind of functionality.
>
>I could have a try writing a patch achieving that, but I would only start
>that work in case you might consider taking it instead of Xin's patch
>removing the pv_ops usage for rdmsr/wrmsr. In case it turns out that my
>version results in more code changes than Xin's patch, I'd be fine to drop
>my patch, of course.
>
>
>Juergen

The wrapper in question is painfully opaque, but if it is much simpler, then I'm certainly willing to consider it... but I don't really see how it would be possible given, among other things, the need for trap points for the safe MSRs.

Keep in mind this needs to work even without PV enabled!

Note that Andrew encouraged us to pursue the pvops removal for MSRs. Note that Xen benefits pretty heavily because it can dispatch the proper path for the few MSRs that are left, for the common case of fixed MSRs.