Instead of having callback functions for rdmsr/wrmsr on native, switch
to inlining the respective instructions directly in order to avoid
the overhead of the call interface.
This requires using the instruction interfaces for rdmsr/wrmsr
emulation when running as a Xen PV guest.
In order to prepare support for the immediate forms of RDMSR and WRMSR
when not running as a Xen PV guest, use the RDMSR and WRMSR
instructions as the fallback case instead of ALT_CALL_INSTR.
Note that in the Xen PV case the RDMSR/WRMSR patching must not happen
even as an intermediate step, as this would clobber the indirect call
information needed when patching in the direct call for the Xen case.
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 arch/x86/include/asm/paravirt.h           | 114 +++++++++++++++++-----
 arch/x86/include/asm/paravirt_types.h     |  13 ++-
 arch/x86/include/asm/qspinlock_paravirt.h |   5 +-
 arch/x86/kernel/paravirt.c                |  26 ++++-
 arch/x86/xen/enlighten_pv.c               |  56 ++++++++---
 5 files changed, 167 insertions(+), 47 deletions(-)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index a463c747c780..df10b0e4f7b8 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -175,24 +175,72 @@ static inline void __write_cr4(unsigned long x)
 	PVOP_VCALL1(cpu.write_cr4, x);
 }
 
-static inline u64 paravirt_read_msr(u32 msr)
+static __always_inline u64 paravirt_read_msr(u32 msr)
 {
-	return PVOP_CALL1(u64, cpu.read_msr, msr);
+	EAX_EDX_DECLARE_ARGS(val, low, high);
+
+	PVOP_TEST_NULL(cpu.read_msr);
+	asm volatile("1: "ALTERNATIVE_2(PARAVIRT_CALL,
+					"rdmsr", ALT_NOT_XEN,
+					ALT_CALL_INSTR, ALT_XENPV_CALL)
+		     "2:\n"
+		     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR)
+		     : EAX_EDX_RET(val, low, high), ASM_CALL_CONSTRAINT
+		     : paravirt_ptr(cpu.read_msr), "c" (msr));
+
+	return EAX_EDX_VAL(val, low, high);
 }
 
-static inline void paravirt_write_msr(u32 msr, u64 val)
+static __always_inline void paravirt_write_msr(u32 msr, u64 val)
 {
-	PVOP_VCALL2(cpu.write_msr, msr, val);
+	PVOP_TEST_NULL(cpu.write_msr);
+	asm volatile("1: "ALTERNATIVE_2(PARAVIRT_CALL,
+					"wrmsr", ALT_NOT_XEN,
+					ALT_CALL_INSTR, ALT_XENPV_CALL)
+		      "2:\n"
+		      _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
+		      : ASM_CALL_CONSTRAINT
+		      : paravirt_ptr(cpu.write_msr),
+			  "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32))
+		      : "memory");
 }
 
-static inline int paravirt_read_msr_safe(u32 msr, u64 *val)
+static __always_inline int paravirt_read_msr_safe(u32 msr, u64 *p)
 {
-	return PVOP_CALL2(int, cpu.read_msr_safe, msr, val);
+	int err;
+	EAX_EDX_DECLARE_ARGS(val, low, high);
+
+	PVOP_TEST_NULL(cpu.read_msr_safe);
+	asm volatile("1: "ALTERNATIVE_2(PARAVIRT_CALL,
+					"rdmsr; xor %[err],%[err]", ALT_NOT_XEN,
+					ALT_CALL_INSTR, ALT_XENPV_CALL)
+		     "2:\n"
+		     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err])
+		     : [err] "=c" (err), EAX_EDX_RET(val, low, high),
+		       ASM_CALL_CONSTRAINT
+		     : paravirt_ptr(cpu.read_msr_safe), "0" (msr));
+
+	*p = EAX_EDX_VAL(val, low, high);
+
+	return err;
 }
 
-static inline int paravirt_write_msr_safe(u32 msr, u64 val)
+static __always_inline int paravirt_write_msr_safe(u32 msr, u64 val)
 {
-	return PVOP_CALL2(int, cpu.write_msr_safe, msr, val);
+	int err;
+
+	PVOP_TEST_NULL(cpu.write_msr_safe);
+	asm volatile("1: "ALTERNATIVE_2(PARAVIRT_CALL,
+					"wrmsr; xor %[err],%[err]", ALT_NOT_XEN,
+					ALT_CALL_INSTR, ALT_XENPV_CALL)
+		     "2:\n"
+		     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %[err])
+		     : [err] "=a" (err), ASM_CALL_CONSTRAINT
+		     : paravirt_ptr(cpu.write_msr_safe),
+		       "c" (msr), "0" ((u32)val), "d" ((u32)(val >> 32))
+		     : "memory");
+
+	return err;
 }
 
 static __always_inline u64 read_msr(u32 msr)
@@ -573,27 +621,43 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
 #define PV_SAVE_ALL_CALLER_REGS		"pushl %ecx;"
 #define PV_RESTORE_ALL_CALLER_REGS	"popl  %ecx;"
 #else
+/* save and restore caller-save registers, except %rax, %rcx and %rdx. */
+#define PV_SAVE_COMMON_CALLER_REGS	\
+	"push %rsi;"			\
+	"push %rdi;"			\
+	"push %r8;"			\
+	"push %r9;"			\
+	"push %r10;"			\
+	"push %r11;"
+#define PV_RESTORE_COMMON_CALLER_REGS	\
+	"pop %r11;"			\
+	"pop %r10;"			\
+	"pop %r9;"			\
+	"pop %r8;"			\
+	"pop %rdi;"			\
+	"pop %rsi;"
+
+#define PV_PROLOGUE_MSR(func)		\
+	PV_SAVE_COMMON_CALLER_REGS	\
+	PV_PROLOGUE_MSR_##func
+#define PV_EPILOGUE_MSR(func)		\
+	PV_EPILOGUE_MSR_##func		\
+	PV_RESTORE_COMMON_CALLER_REGS
+
 /* save and restore all caller-save registers, except return value */
 #define PV_SAVE_ALL_CALLER_REGS						\
 	"push %rcx;"							\
 	"push %rdx;"							\
-	"push %rsi;"							\
-	"push %rdi;"							\
-	"push %r8;"							\
-	"push %r9;"							\
-	"push %r10;"							\
-	"push %r11;"
+	PV_SAVE_COMMON_CALLER_REGS
 #define PV_RESTORE_ALL_CALLER_REGS					\
-	"pop %r11;"							\
-	"pop %r10;"							\
-	"pop %r9;"							\
-	"pop %r8;"							\
-	"pop %rdi;"							\
-	"pop %rsi;"							\
+	PV_RESTORE_COMMON_CALLER_REGS					\
 	"pop %rdx;"							\
 	"pop %rcx;"
 #endif
 
+#define PV_PROLOGUE_ALL(func)	PV_SAVE_ALL_CALLER_REGS
+#define PV_EPILOGUE_ALL(func)	PV_RESTORE_ALL_CALLER_REGS
+
 /*
  * Generate a thunk around a function which saves all caller-save
  * registers except for the return value.  This allows C functions to
@@ -607,7 +671,7 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
  * functions.
  */
 #define PV_THUNK_NAME(func) "__raw_callee_save_" #func
-#define __PV_CALLEE_SAVE_REGS_THUNK(func, section)			\
+#define __PV_CALLEE_SAVE_REGS_THUNK(func, section, helper)		\
 	extern typeof(func) __raw_callee_save_##func;			\
 									\
 	asm(".pushsection " section ", \"ax\";"				\
@@ -617,16 +681,18 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
 	    PV_THUNK_NAME(func) ":"					\
 	    ASM_ENDBR							\
 	    FRAME_BEGIN							\
-	    PV_SAVE_ALL_CALLER_REGS					\
+	    PV_PROLOGUE_##helper(func)					\
 	    "call " #func ";"						\
-	    PV_RESTORE_ALL_CALLER_REGS					\
+	    PV_EPILOGUE_##helper(func)					\
 	    FRAME_END							\
 	    ASM_RET							\
 	    ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";"	\
 	    ".popsection")
 
 #define PV_CALLEE_SAVE_REGS_THUNK(func)			\
-	__PV_CALLEE_SAVE_REGS_THUNK(func, ".text")
+	__PV_CALLEE_SAVE_REGS_THUNK(func, ".text", ALL)
+#define PV_CALLEE_SAVE_REGS_MSR_THUNK(func)		\
+	__PV_CALLEE_SAVE_REGS_THUNK(func, ".text", MSR)
 
 /* Get a reference to a callee-save function */
 #define PV_CALLEE_SAVE(func)						\
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b08b9d3122d6..f7f879319e90 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -91,15 +91,15 @@ struct pv_cpu_ops {
 		      unsigned int *ecx, unsigned int *edx);
 
 	/* Unsafe MSR operations.  These will warn or panic on failure. */
-	u64 (*read_msr)(u32 msr);
-	void (*write_msr)(u32 msr, u64 val);
+	struct paravirt_callee_save read_msr;
+	struct paravirt_callee_save write_msr;
 
 	/*
 	 * Safe MSR operations.
 	 * Returns 0 or -EIO.
 	 */
-	int (*read_msr_safe)(u32 msr, u64 *val);
-	int (*write_msr_safe)(u32 msr, u64 val);
+	struct paravirt_callee_save read_msr_safe;
+	struct paravirt_callee_save write_msr_safe;
 
 	u64 (*read_pmc)(int counter);
 
@@ -520,6 +520,10 @@ unsigned long pv_native_save_fl(void);
 void pv_native_irq_disable(void);
 void pv_native_irq_enable(void);
 unsigned long pv_native_read_cr2(void);
+void pv_native_rdmsr(void);
+void pv_native_wrmsr(void);
+void pv_native_rdmsr_safe(void);
+void pv_native_wrmsr_safe(void);
 #endif
 
 #define paravirt_nop	((void *)nop_func)
@@ -527,6 +531,7 @@ unsigned long pv_native_read_cr2(void);
 #endif	/* __ASSEMBLER__ */
 
 #define ALT_NOT_XEN	ALT_NOT(X86_FEATURE_XENPV)
+#define ALT_XENPV_CALL	ALT_DIRECT_CALL(X86_FEATURE_XENPV)
 
 #endif  /* CONFIG_PARAVIRT */
 #endif	/* _ASM_X86_PARAVIRT_TYPES_H */
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index 0a985784be9b..0351acb5a143 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -14,7 +14,8 @@ void __lockfunc __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 lock
  */
 #ifdef CONFIG_64BIT
 
-__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text");
+__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text",
+			    ALL);
 #define __pv_queued_spin_unlock	__pv_queued_spin_unlock
 
 /*
@@ -61,7 +62,7 @@ DEFINE_ASM_FUNC(__raw_callee_save___pv_queued_spin_unlock,
 #else /* CONFIG_64BIT */
 
 extern void __lockfunc __pv_queued_spin_unlock(struct qspinlock *lock);
-__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock, ".spinlock.text");
+__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock, ".spinlock.text", ALL);
 
 #endif /* CONFIG_64BIT */
 #endif
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 015bf298434f..ff7d7fdae360 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -50,6 +50,24 @@ DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop %rax", .noinstr.text);
 DEFINE_ASM_FUNC(pv_native_irq_disable, "cli", .noinstr.text);
 DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text);
 DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_rdmsr,
+		"1: rdmsr\n"
+		"2:\n"
+		_ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR), .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_wrmsr,
+		"1: wrmsr\n"
+		"2:\n"
+		_ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR), .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_rdmsr_safe,
+		"1: rdmsr; xor %ecx, %ecx\n"
+		"2:\n"
+		_ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %%ecx),
+		.noinstr.text);
+DEFINE_ASM_FUNC(pv_native_wrmsr_safe,
+		"1: wrmsr; xor %eax, %eax\n"
+		"2:\n"
+		_ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %%eax),
+		.noinstr.text);
 #endif
 
 DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
@@ -129,10 +147,10 @@ struct paravirt_patch_template pv_ops = {
 	.cpu.read_cr0		= native_read_cr0,
 	.cpu.write_cr0		= native_write_cr0,
 	.cpu.write_cr4		= native_write_cr4,
-	.cpu.read_msr		= native_read_msr,
-	.cpu.write_msr		= native_write_msr,
-	.cpu.read_msr_safe	= native_read_msr_safe,
-	.cpu.write_msr_safe	= native_write_msr_safe,
+	.cpu.read_msr		= __PV_IS_CALLEE_SAVE(pv_native_rdmsr),
+	.cpu.write_msr		= __PV_IS_CALLEE_SAVE(pv_native_wrmsr),
+	.cpu.read_msr_safe	= __PV_IS_CALLEE_SAVE(pv_native_rdmsr_safe),
+	.cpu.write_msr_safe	= __PV_IS_CALLEE_SAVE(pv_native_wrmsr_safe),
 	.cpu.read_pmc		= native_read_pmc,
 	.cpu.load_tr_desc	= native_load_tr_desc,
 	.cpu.set_ldt		= native_set_ldt,
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 3be38350f044..c279b2bef7eb 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1160,36 +1160,66 @@ static void xen_do_write_msr(u32 msr, u64 val, int *err)
 	}
 }
 
-static int xen_read_msr_safe(u32 msr, u64 *val)
-{
+/*
+ * Prototypes for functions called via PV_CALLEE_SAVE_REGS_THUNK() in order
+ * to avoid warnings with "-Wmissing-prototypes".
+ */
+struct xen_rdmsr_safe_ret {
+	u64 val;
 	int err;
+};
+struct xen_rdmsr_safe_ret xen_read_msr_safe(u32 msr);
+int xen_write_msr_safe(u32 msr, u32 low, u32 high);
+u64 xen_read_msr(u32 msr);
+void xen_write_msr(u32 msr, u32 low, u32 high);
 
-	*val = xen_do_read_msr(msr, &err);
-	return err;
+__visible struct xen_rdmsr_safe_ret xen_read_msr_safe(u32 msr)
+{
+	struct xen_rdmsr_safe_ret ret;
+
+	ret.val = xen_do_read_msr(msr, &ret.err);
+	return ret;
 }
+#define PV_PROLOGUE_MSR_xen_read_msr_safe	"mov %ecx, %edi;"
+#define PV_EPILOGUE_MSR_xen_read_msr_safe	\
+	"mov %edx, %ecx; mov %rax, %rdx; mov %eax, %eax; shr $0x20, %rdx;"
+PV_CALLEE_SAVE_REGS_MSR_THUNK(xen_read_msr_safe);
 
-static int xen_write_msr_safe(u32 msr, u64 val)
+__visible int xen_write_msr_safe(u32 msr, u32 low, u32 high)
 {
 	int err = 0;
 
-	xen_do_write_msr(msr, val, &err);
+	xen_do_write_msr(msr, (u64)high << 32 | low, &err);
 
 	return err;
 }
+#define PV_PROLOGUE_MSR_xen_write_msr_safe	\
+	"mov %ecx, %edi; mov %eax, %esi;"
+#define PV_EPILOGUE_MSR_xen_write_msr_safe
+PV_CALLEE_SAVE_REGS_MSR_THUNK(xen_write_msr_safe);
 
-static u64 xen_read_msr(u32 msr)
+__visible u64 xen_read_msr(u32 msr)
 {
 	int err;
 
 	return xen_do_read_msr(msr, xen_msr_safe ? &err : NULL);
 }
+#define PV_PROLOGUE_MSR_xen_read_msr	"mov %ecx, %edi;"
+#define PV_EPILOGUE_MSR_xen_read_msr	\
+	"mov %rax, %rdx; mov %eax, %eax; shr $0x20, %rdx;"
+PV_CALLEE_SAVE_REGS_MSR_THUNK(xen_read_msr);
 
-static void xen_write_msr(u32 msr, u64 val)
+__visible void xen_write_msr(u32 msr, u32 low, u32 high)
 {
 	int err;
 
-	xen_do_write_msr(msr, val, xen_msr_safe ? &err : NULL);
+	xen_do_write_msr(msr, (u64)high << 32 | low,
+			 xen_msr_safe ? &err : NULL);
 }
+#define PV_PROLOGUE_MSR_xen_write_msr	\
+	"mov %ecx, %edi; mov %eax, %esi;"
+#define PV_EPILOGUE_MSR_xen_write_msr
+PV_CALLEE_SAVE_REGS_MSR_THUNK(xen_write_msr);
 
 /* This is called once we have the cpu_possible_mask */
 void __init xen_setup_vcpu_info_placement(void)
@@ -1225,11 +1255,11 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = {
 
 		.write_cr4 = xen_write_cr4,
 
-		.read_msr = xen_read_msr,
-		.write_msr = xen_write_msr,
+		.read_msr = PV_CALLEE_SAVE(xen_read_msr),
+		.write_msr = PV_CALLEE_SAVE(xen_write_msr),
 
-		.read_msr_safe = xen_read_msr_safe,
-		.write_msr_safe = xen_write_msr_safe,
+		.read_msr_safe = PV_CALLEE_SAVE(xen_read_msr_safe),
+		.write_msr_safe = PV_CALLEE_SAVE(xen_write_msr_safe),
 
 		.read_pmc = xen_read_pmc,
 
-- 
2.43.0
On 5/6/2025 2:20 AM, Juergen Gross wrote:
> Instead of having callback functions for rdmsr/wrmsr on native, switch
> to inline the respective instructions directly in order to avoid
> overhead with the call interface.
To me, this is a beneficial addition to the existing pvops MSR code.
> 
> This requires to use the instruction interfaces for rdmsr/wrmsr
> emulation when running as a Xen PV guest.
> 
> In order to prepare support for the immediate forms of RDMSR and WRMSR
> when not running as a Xen PV guest, use the RDMSR and WRMSR
> instructions as the fallback case instead of ALT_CALL_INSTR.
I'm trying to evaluate how to add the immediate form MSR instructions
on top of this patch set, and I'm close to getting it done.
> 
> Note that in the Xen PV case the RDMSR/WRMSR patching must not happen
> even as an intermediate step, as this would clobber the indirect call
> information needed when patching in the direct call for the Xen case.
Good point!
Deciding whether to retain the pvops MSR API is the responsibility of
the x86 maintainers, who are the ones experiencing the challenges of 
maintaining the code.
tglx said @https://lore.kernel.org/lkml/87y1h81ht4.ffs@tglx/:
 > I fundamentaly hate adding this to the PV infrastructure. We don't
 > want more PV ops, quite the contrary.
That is the reason I took a different direction, i.e., removing the
pvops MSR APIs.  But if your approach is cleaner, they may prefer it.
> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
> index a463c747c780..df10b0e4f7b8 100644
> --- a/arch/x86/include/asm/paravirt.h
> +++ b/arch/x86/include/asm/paravirt.h
> @@ -175,24 +175,72 @@ static inline void __write_cr4(unsigned long x)
>   	PVOP_VCALL1(cpu.write_cr4, x);
>   }
>   
> -static inline u64 paravirt_read_msr(u32 msr)
> +static __always_inline u64 paravirt_read_msr(u32 msr)
>   {
> -	return PVOP_CALL1(u64, cpu.read_msr, msr);
> +	EAX_EDX_DECLARE_ARGS(val, low, high);
This is under CONFIG_PARAVIRT_XXL, and thus CONFIG_XEN_PV and
CONFIG_X86_64, so we don't need to consider 32-bit at all, no?
> +
> +	PVOP_TEST_NULL(cpu.read_msr);
> +	asm volatile("1: "ALTERNATIVE_2(PARAVIRT_CALL,
> +					"rdmsr", ALT_NOT_XEN,
> +					ALT_CALL_INSTR, ALT_XENPV_CALL)
> +		     "2:\n"
> +		     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR)
> +		     : EAX_EDX_RET(val, low, high), ASM_CALL_CONSTRAINT
> +		     : paravirt_ptr(cpu.read_msr), "c" (msr));
> +
> +	return EAX_EDX_VAL(val, low, high);
>   }
>   
> -static inline void paravirt_write_msr(u32 msr, u64 val)
> +static __always_inline void paravirt_write_msr(u32 msr, u64 val)
>   {
> -	PVOP_VCALL2(cpu.write_msr, msr, val);
> +	PVOP_TEST_NULL(cpu.write_msr);
> +	asm volatile("1: "ALTERNATIVE_2(PARAVIRT_CALL,
> +					"wrmsr", ALT_NOT_XEN,
> +					ALT_CALL_INSTR, ALT_XENPV_CALL)
> +		      "2:\n"
> +		      _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
> +		      : ASM_CALL_CONSTRAINT
> +		      : paravirt_ptr(cpu.write_msr),
> +			  "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32))
> +		      : "memory");
>   }
>   
> -static inline int paravirt_read_msr_safe(u32 msr, u64 *val)
> +static __always_inline int paravirt_read_msr_safe(u32 msr, u64 *p)
>   {
> -	return PVOP_CALL2(int, cpu.read_msr_safe, msr, val);
> +	int err;
> +	EAX_EDX_DECLARE_ARGS(val, low, high);
> +
> +	PVOP_TEST_NULL(cpu.read_msr_safe);
> +	asm volatile("1: "ALTERNATIVE_2(PARAVIRT_CALL,
> +					"rdmsr; xor %[err],%[err]", ALT_NOT_XEN,
> +					ALT_CALL_INSTR, ALT_XENPV_CALL)
> +		     "2:\n"
> +		     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err])
> +		     : [err] "=c" (err), EAX_EDX_RET(val, low, high),
> +		       ASM_CALL_CONSTRAINT
> +		     : paravirt_ptr(cpu.read_msr_safe), "0" (msr));
> +
> +	*p = EAX_EDX_VAL(val, low, high);
> +
> +	return err;
>   }
>   
> -static inline int paravirt_write_msr_safe(u32 msr, u64 val)
> +static __always_inline int paravirt_write_msr_safe(u32 msr, u64 val)
>   {
> -	return PVOP_CALL2(int, cpu.write_msr_safe, msr, val);
> +	int err;
> +
> +	PVOP_TEST_NULL(cpu.write_msr_safe);
> +	asm volatile("1: "ALTERNATIVE_2(PARAVIRT_CALL,
> +					"wrmsr; xor %[err],%[err]", ALT_NOT_XEN,
> +					ALT_CALL_INSTR, ALT_XENPV_CALL)
> +		     "2:\n"
> +		     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %[err])
> +		     : [err] "=a" (err), ASM_CALL_CONSTRAINT
> +		     : paravirt_ptr(cpu.write_msr_safe),
> +		       "c" (msr), "0" ((u32)val), "d" ((u32)(val >> 32))
> +		     : "memory");
> +
> +	return err;
>   }
>   
>   static __always_inline u64 read_msr(u32 msr)
> @@ -573,27 +621,43 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
>   #define PV_SAVE_ALL_CALLER_REGS		"pushl %ecx;"
>   #define PV_RESTORE_ALL_CALLER_REGS	"popl  %ecx;"
>   #else
> +/* save and restore caller-save registers, except %rax, %rcx and %rdx. */
> +#define PV_SAVE_COMMON_CALLER_REGS	\
> +	"push %rsi;"			\
> +	"push %rdi;"			\
> +	"push %r8;"			\
> +	"push %r9;"			\
> +	"push %r10;"			\
> +	"push %r11;"
Add an empty line please, easier to read.
> +#define PV_RESTORE_COMMON_CALLER_REGS	\
> +	"pop %r11;"			\
> +	"pop %r10;"			\
> +	"pop %r9;"			\
> +	"pop %r8;"			\
> +	"pop %rdi;"			\
> +	"pop %rsi;"
> +
> +#define PV_PROLOGUE_MSR(func)		\
> +	PV_SAVE_COMMON_CALLER_REGS	\
> +	PV_PROLOGUE_MSR_##func
Ditto.  And the following similar cases.
> +#define PV_EPILOGUE_MSR(func)		\
> +	PV_EPILOGUE_MSR_##func		\
> +	PV_RESTORE_COMMON_CALLER_REGS
> +
>   /* save and restore all caller-save registers, except return value */
>   #define PV_SAVE_ALL_CALLER_REGS						\
>   	"push %rcx;"							\
>   	"push %rdx;"							\
> -	"push %rsi;"							\
> -	"push %rdi;"							\
> -	"push %r8;"							\
> -	"push %r9;"							\
> -	"push %r10;"							\
> -	"push %r11;"
> +	PV_SAVE_COMMON_CALLER_REGS
>   #define PV_RESTORE_ALL_CALLER_REGS					\
> -	"pop %r11;"							\
> -	"pop %r10;"							\
> -	"pop %r9;"							\
> -	"pop %r8;"							\
> -	"pop %rdi;"							\
> -	"pop %rsi;"							\
> +	PV_RESTORE_COMMON_CALLER_REGS					\
>   	"pop %rdx;"							\
>   	"pop %rcx;"
>   #endif
>   
> +#define PV_PROLOGUE_ALL(func)	PV_SAVE_ALL_CALLER_REGS
> +#define PV_EPILOGUE_ALL(func)	PV_RESTORE_ALL_CALLER_REGS
> +
>   /*
>    * Generate a thunk around a function which saves all caller-save
>    * registers except for the return value.  This allows C functions to
> @@ -607,7 +671,7 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
>    * functions.
>    */
>   #define PV_THUNK_NAME(func) "__raw_callee_save_" #func
> -#define __PV_CALLEE_SAVE_REGS_THUNK(func, section)			\
> +#define __PV_CALLEE_SAVE_REGS_THUNK(func, section, helper)		\
>   	extern typeof(func) __raw_callee_save_##func;			\
>   									\
>   	asm(".pushsection " section ", \"ax\";"				\
> @@ -617,16 +681,18 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
>   	    PV_THUNK_NAME(func) ":"					\
>   	    ASM_ENDBR							\
>   	    FRAME_BEGIN							\
> -	    PV_SAVE_ALL_CALLER_REGS					\
> +	    PV_PROLOGUE_##helper(func)					\
>   	    "call " #func ";"						\
> -	    PV_RESTORE_ALL_CALLER_REGS					\
> +	    PV_EPILOGUE_##helper(func)					\
>   	    FRAME_END							\
>   	    ASM_RET							\
>   	    ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";"	\
>   	    ".popsection")
>   
>   #define PV_CALLEE_SAVE_REGS_THUNK(func)			\
> -	__PV_CALLEE_SAVE_REGS_THUNK(func, ".text")
> +	__PV_CALLEE_SAVE_REGS_THUNK(func, ".text", ALL)
> +#define PV_CALLEE_SAVE_REGS_MSR_THUNK(func)		\
> +	__PV_CALLEE_SAVE_REGS_THUNK(func, ".text", MSR)
>   
>   /* Get a reference to a callee-save function */
>   #define PV_CALLEE_SAVE(func)						\
> diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
> index b08b9d3122d6..f7f879319e90 100644
> --- a/arch/x86/include/asm/paravirt_types.h
> +++ b/arch/x86/include/asm/paravirt_types.h
> @@ -91,15 +91,15 @@ struct pv_cpu_ops {
>   		      unsigned int *ecx, unsigned int *edx);
>   
>   	/* Unsafe MSR operations.  These will warn or panic on failure. */
> -	u64 (*read_msr)(u32 msr);
> -	void (*write_msr)(u32 msr, u64 val);
> +	struct paravirt_callee_save read_msr;
> +	struct paravirt_callee_save write_msr;
>   
>   	/*
>   	 * Safe MSR operations.
>   	 * Returns 0 or -EIO.
>   	 */
> -	int (*read_msr_safe)(u32 msr, u64 *val);
> -	int (*write_msr_safe)(u32 msr, u64 val);
> +	struct paravirt_callee_save read_msr_safe;
> +	struct paravirt_callee_save write_msr_safe;
>   
>   	u64 (*read_pmc)(int counter);
>   
> @@ -520,6 +520,10 @@ unsigned long pv_native_save_fl(void);
>   void pv_native_irq_disable(void);
>   void pv_native_irq_enable(void);
>   unsigned long pv_native_read_cr2(void);
> +void pv_native_rdmsr(void);
> +void pv_native_wrmsr(void);
> +void pv_native_rdmsr_safe(void);
> +void pv_native_wrmsr_safe(void);
>   #endif
>   
>   #define paravirt_nop	((void *)nop_func)
> @@ -527,6 +531,7 @@ unsigned long pv_native_read_cr2(void);
>   #endif	/* __ASSEMBLER__ */
>   
>   #define ALT_NOT_XEN	ALT_NOT(X86_FEATURE_XENPV)
> +#define ALT_XENPV_CALL	ALT_DIRECT_CALL(X86_FEATURE_XENPV)
>   
>   #endif  /* CONFIG_PARAVIRT */
>   #endif	/* _ASM_X86_PARAVIRT_TYPES_H */
> diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
> index 0a985784be9b..0351acb5a143 100644
> --- a/arch/x86/include/asm/qspinlock_paravirt.h
> +++ b/arch/x86/include/asm/qspinlock_paravirt.h
> @@ -14,7 +14,8 @@ void __lockfunc __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 lock
>    */
>   #ifdef CONFIG_64BIT
>   
> -__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text");
> +__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text",
> +			    ALL);
>   #define __pv_queued_spin_unlock	__pv_queued_spin_unlock
>   
>   /*
> @@ -61,7 +62,7 @@ DEFINE_ASM_FUNC(__raw_callee_save___pv_queued_spin_unlock,
>   #else /* CONFIG_64BIT */
>   
>   extern void __lockfunc __pv_queued_spin_unlock(struct qspinlock *lock);
> -__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock, ".spinlock.text");
> +__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock, ".spinlock.text", ALL);
>   
>   #endif /* CONFIG_64BIT */
>   #endif
> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
> index 015bf298434f..ff7d7fdae360 100644
> --- a/arch/x86/kernel/paravirt.c
> +++ b/arch/x86/kernel/paravirt.c
> @@ -50,6 +50,24 @@ DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop %rax", .noinstr.text);
>   DEFINE_ASM_FUNC(pv_native_irq_disable, "cli", .noinstr.text);
>   DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text);
>   DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
> +DEFINE_ASM_FUNC(pv_native_rdmsr,
> +		"1: rdmsr\n"
> +		"2:\n"
> +		_ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR), .noinstr.text);
> +DEFINE_ASM_FUNC(pv_native_wrmsr,
> +		"1: wrmsr\n"
> +		"2:\n"
> +		_ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR), .noinstr.text);
> +DEFINE_ASM_FUNC(pv_native_rdmsr_safe,
> +		"1: rdmsr; xor %ecx, %ecx\n"
> +		"2:\n"
> +		_ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %%ecx),
> +		.noinstr.text);
> +DEFINE_ASM_FUNC(pv_native_wrmsr_safe,
> +		"1: wrmsr; xor %eax, %eax\n"
> +		"2:\n"
> +		_ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %%eax),
> +		.noinstr.text);
>   #endif
>   
>   DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
> @@ -129,10 +147,10 @@ struct paravirt_patch_template pv_ops = {
>   	.cpu.read_cr0		= native_read_cr0,
>   	.cpu.write_cr0		= native_write_cr0,
>   	.cpu.write_cr4		= native_write_cr4,
> -	.cpu.read_msr		= native_read_msr,
> -	.cpu.write_msr		= native_write_msr,
> -	.cpu.read_msr_safe	= native_read_msr_safe,
> -	.cpu.write_msr_safe	= native_write_msr_safe,
> +	.cpu.read_msr		= __PV_IS_CALLEE_SAVE(pv_native_rdmsr),
> +	.cpu.write_msr		= __PV_IS_CALLEE_SAVE(pv_native_wrmsr),
> +	.cpu.read_msr_safe	= __PV_IS_CALLEE_SAVE(pv_native_rdmsr_safe),
> +	.cpu.write_msr_safe	= __PV_IS_CALLEE_SAVE(pv_native_wrmsr_safe),
>   	.cpu.read_pmc		= native_read_pmc,
>   	.cpu.load_tr_desc	= native_load_tr_desc,
>   	.cpu.set_ldt		= native_set_ldt,
> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
> index 3be38350f044..c279b2bef7eb 100644
> --- a/arch/x86/xen/enlighten_pv.c
> +++ b/arch/x86/xen/enlighten_pv.c
> @@ -1160,36 +1160,66 @@ static void xen_do_write_msr(u32 msr, u64 val, int *err)
>   	}
>   }
>   
> -static int xen_read_msr_safe(u32 msr, u64 *val)
> -{
> +/*
> + * Prototypes for functions called via PV_CALLEE_SAVE_REGS_THUNK() in order
> + * to avoid warnings with "-Wmissing-prototypes".
> + */
> +struct xen_rdmsr_safe_ret {
> +	u64 val;
>   	int err;
> +};
> +struct xen_rdmsr_safe_ret xen_read_msr_safe(u32 msr);
> +int xen_write_msr_safe(u32 msr, u32 low, u32 high);
> +u64 xen_read_msr(u32 msr);
> +void xen_write_msr(u32 msr, u32 low, u32 high);
>   
> -	*val = xen_do_read_msr(msr, &err);
> -	return err;
> +__visible struct xen_rdmsr_safe_ret xen_read_msr_safe(u32 msr)
> +{
> +	struct xen_rdmsr_safe_ret ret;
struct xen_rdmsr_safe_ret ret = { 0, 0 };
Because the 'err' member may not be set in xen_do_read_msr().
> +
> +	ret.val = xen_do_read_msr(msr, &ret.err);
> +	return ret;
>   }
> +#define PV_PROLOGUE_MSR_xen_read_msr_safe	"mov %ecx, %edi;"
> +#define PV_EPILOGUE_MSR_xen_read_msr_safe	\
> +	"mov %edx, %ecx; mov %rax, %rdx; mov %eax, %eax; shr $0x20, %rdx;"
> +PV_CALLEE_SAVE_REGS_MSR_THUNK(xen_read_msr_safe);
>   
> -static int xen_write_msr_safe(u32 msr, u64 val)
> +__visible int xen_write_msr_safe(u32 msr, u32 low, u32 high)
I think we can avoid splitting this u64 into two u32.
>   {
>   	int err = 0;
>   
> -	xen_do_write_msr(msr, val, &err);
> +	xen_do_write_msr(msr, (u64)high << 32 | low, &err);
>   
>   	return err;
>   }
> +#define PV_PROLOGUE_MSR_xen_write_msr_safe	\
> +	"mov %ecx, %edi; mov %eax, %esi;"
> +#define PV_EPILOGUE_MSR_xen_write_msr_safe
> +PV_CALLEE_SAVE_REGS_MSR_THUNK(xen_write_msr_safe);
>   
> -static u64 xen_read_msr(u32 msr)
> +__visible u64 xen_read_msr(u32 msr)
>   {
>   	int err;
>   
>   	return xen_do_read_msr(msr, xen_msr_safe ? &err : NULL);
>   }
> +#define PV_PROLOGUE_MSR_xen_read_msr	"mov %ecx, %edi;"
> +#define PV_EPILOGUE_MSR_xen_read_msr	\
> +	"mov %rax, %rdx; mov %eax, %eax; shr $0x20, %rdx;"
> +PV_CALLEE_SAVE_REGS_MSR_THUNK(xen_read_msr);
>   
> -static void xen_write_msr(u32 msr, u64 val)
> +__visible void xen_write_msr(u32 msr, u32 low, u32 high)
Ditto.
>   {
>   	int err;
>   
> -	xen_do_write_msr(msr, val, xen_msr_safe ? &err : NULL);
> +	xen_do_write_msr(msr, (u64)high << 32 | low,
> +			 xen_msr_safe ? &err : NULL);
>   }
> +#define PV_PROLOGUE_MSR_xen_write_msr	\
> +	"mov %ecx, %edi; mov %eax, %esi;"
> +#define PV_EPILOGUE_MSR_xen_write_msr
> +PV_CALLEE_SAVE_REGS_MSR_THUNK(xen_write_msr);
>   
>   /* This is called once we have the cpu_possible_mask */
>   void __init xen_setup_vcpu_info_placement(void)
                
            On 09.05.25 10:18, Xin Li wrote:
> On 5/6/2025 2:20 AM, Juergen Gross wrote:
>> Instead of having callback functions for rdmsr/wrmsr on native, switch
>> to inline the respective instructions directly in order to avoid
>> overhead with the call interface.
> 
> To me, this is a beneficial addition to the existing pvops MSR code.
> 
>>
>> This requires to use the instruction interfaces for rdmsr/wrmsr
>> emulation when running as a Xen PV guest.
>>
>> In order to prepare support for the immediate forms of RDMSR and WRMSR
>> when not running as a Xen PV guest, use the RDMSR and WRMSR
>> instructions as the fallback case instead of ALT_CALL_INSTR.
> 
> I'm trying to evaluate how to add the immediate form MSR instructions
> on top of this patch set.  And I'm close to get it done.
There is something to consider when running as a Xen PV guest, ...
> 
>>
>> Note that in the Xen PV case the RDMSR/WRMSR patching must not happen
>> even as an intermediate step, as this would clobber the indirect call
>> information needed when patching in the direct call for the Xen case.
> 
> Good point!
... as this still needs to be true.
There are 2 different ways to deal with this:
1. When running as a Xen PV guest disable X86_FEATURE_WRMSRNS and
    ASM_WRMSRNS_IMM (e.g. in xen_init_capabilities()).
2. Buffer the original instruction before patching in apply_alternatives()
    in order to avoid the sequence limitation above (see attached patch).
> Deciding whether to retain the pvops MSR API is the responsibility of
> the x86 maintainers, who are the ones experiencing the challenges of maintaining 
> the code.
Well, I'm the PV ops maintainer, so it is basically me who needs to deal
with this. OTOH I do understand that diagnosis of problems with PV ops is
more complicated than without.
> 
> tglx said @https://lore.kernel.org/lkml/87y1h81ht4.ffs@tglx/:
> 
>  > I fundamentaly hate adding this to the PV infrastructure. We don't
>  > want more PV ops, quite the contrary.
> 
> That is the reason I took a different direction, i.e., removing the
> pvops MSR APIs.  But if your approach is cleaner, they may prefer it.
In the end it isn't adding additional PV ops interfaces. It is modifying
existing ones.
> 
>> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
>> index a463c747c780..df10b0e4f7b8 100644
>> --- a/arch/x86/include/asm/paravirt.h
>> +++ b/arch/x86/include/asm/paravirt.h
>> @@ -175,24 +175,72 @@ static inline void __write_cr4(unsigned long x)
>>       PVOP_VCALL1(cpu.write_cr4, x);
>>   }
>> -static inline u64 paravirt_read_msr(u32 msr)
>> +static __always_inline u64 paravirt_read_msr(u32 msr)
>>   {
>> -    return PVOP_CALL1(u64, cpu.read_msr, msr);
>> +    EAX_EDX_DECLARE_ARGS(val, low, high);
> 
> This is under CONFIG_PARAVIRT_XXL, thus CONFIG_XEN_PV and CONFIG_X86_64,
> therefore we don't need to consider 32-bit at all, no?
Right. OTOH the macros are there, so why not use them?
In the end I'm fine with open-coding the 64-bit case here.
> 
>> +
>> +    PVOP_TEST_NULL(cpu.read_msr);
>> +    asm volatile("1: "ALTERNATIVE_2(PARAVIRT_CALL,
>> +                    "rdmsr", ALT_NOT_XEN,
>> +                    ALT_CALL_INSTR, ALT_XENPV_CALL)
>> +             "2:\n"
>> +             _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR)
>> +             : EAX_EDX_RET(val, low, high), ASM_CALL_CONSTRAINT
>> +             : paravirt_ptr(cpu.read_msr), "c" (msr));
>> +
>> +    return EAX_EDX_VAL(val, low, high);
>>   }
>> -static inline void paravirt_write_msr(u32 msr, u64 val)
>> +static __always_inline void paravirt_write_msr(u32 msr, u64 val)
>>   {
>> -    PVOP_VCALL2(cpu.write_msr, msr, val);
>> +    PVOP_TEST_NULL(cpu.write_msr);
>> +    asm volatile("1: "ALTERNATIVE_2(PARAVIRT_CALL,
>> +                    "wrmsr", ALT_NOT_XEN,
>> +                    ALT_CALL_INSTR, ALT_XENPV_CALL)
>> +              "2:\n"
>> +              _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
>> +              : ASM_CALL_CONSTRAINT
>> +              : paravirt_ptr(cpu.write_msr),
>> +              "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32))
>> +              : "memory");
>>   }
>> -static inline int paravirt_read_msr_safe(u32 msr, u64 *val)
>> +static __always_inline int paravirt_read_msr_safe(u32 msr, u64 *p)
>>   {
>> -    return PVOP_CALL2(int, cpu.read_msr_safe, msr, val);
>> +    int err;
>> +    EAX_EDX_DECLARE_ARGS(val, low, high);
>> +
>> +    PVOP_TEST_NULL(cpu.read_msr_safe);
>> +    asm volatile("1: "ALTERNATIVE_2(PARAVIRT_CALL,
>> +                    "rdmsr; xor %[err],%[err]", ALT_NOT_XEN,
>> +                    ALT_CALL_INSTR, ALT_XENPV_CALL)
>> +             "2:\n"
>> +             _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err])
>> +             : [err] "=c" (err), EAX_EDX_RET(val, low, high),
>> +               ASM_CALL_CONSTRAINT
>> +             : paravirt_ptr(cpu.read_msr_safe), "0" (msr));
>> +
>> +    *p = EAX_EDX_VAL(val, low, high);
>> +
>> +    return err;
>>   }
>> -static inline int paravirt_write_msr_safe(u32 msr, u64 val)
>> +static __always_inline int paravirt_write_msr_safe(u32 msr, u64 val)
>>   {
>> -    return PVOP_CALL2(int, cpu.write_msr_safe, msr, val);
>> +    int err;
>> +
>> +    PVOP_TEST_NULL(cpu.write_msr_safe);
>> +    asm volatile("1: "ALTERNATIVE_2(PARAVIRT_CALL,
>> +                    "wrmsr; xor %[err],%[err]", ALT_NOT_XEN,
>> +                    ALT_CALL_INSTR, ALT_XENPV_CALL)
>> +             "2:\n"
>> +             _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %[err])
>> +             : [err] "=a" (err), ASM_CALL_CONSTRAINT
>> +             : paravirt_ptr(cpu.write_msr_safe),
>> +               "c" (msr), "0" ((u32)val), "d" ((u32)(val >> 32))
>> +             : "memory");
>> +
>> +    return err;
>>   }
>>   static __always_inline u64 read_msr(u32 msr)
>> @@ -573,27 +621,43 @@ bool __raw_callee_save___native_vcpu_is_preempted(long 
>> cpu);
>>   #define PV_SAVE_ALL_CALLER_REGS        "pushl %ecx;"
>>   #define PV_RESTORE_ALL_CALLER_REGS    "popl  %ecx;"
>>   #else
>> +/* save and restore caller-save registers, except %rax, %rcx and %rdx. */
>> +#define PV_SAVE_COMMON_CALLER_REGS    \
>> +    "push %rsi;"            \
>> +    "push %rdi;"            \
>> +    "push %r8;"            \
>> +    "push %r9;"            \
>> +    "push %r10;"            \
>> +    "push %r11;"
> 
> Add an empty line please, easier to read.
Okay (same below).
> 
>> +#define PV_RESTORE_COMMON_CALLER_REGS    \
>> +    "pop %r11;"            \
>> +    "pop %r10;"            \
>> +    "pop %r9;"            \
>> +    "pop %r8;"            \
>> +    "pop %rdi;"            \
>> +    "pop %rsi;"
>> +
>> +#define PV_PROLOGUE_MSR(func)        \
>> +    PV_SAVE_COMMON_CALLER_REGS    \
>> +    PV_PROLOGUE_MSR_##func
> 
> Ditto.  And the following similar cases.
> 
>> +#define PV_EPILOGUE_MSR(func)        \
>> +    PV_EPILOGUE_MSR_##func        \
>> +    PV_RESTORE_COMMON_CALLER_REGS
>> +
>>   /* save and restore all caller-save registers, except return value */
>>   #define PV_SAVE_ALL_CALLER_REGS                        \
>>       "push %rcx;"                            \
>>       "push %rdx;"                            \
>> -    "push %rsi;"                            \
>> -    "push %rdi;"                            \
>> -    "push %r8;"                            \
>> -    "push %r9;"                            \
>> -    "push %r10;"                            \
>> -    "push %r11;"
>> +    PV_SAVE_COMMON_CALLER_REGS
>>   #define PV_RESTORE_ALL_CALLER_REGS                    \
>> -    "pop %r11;"                            \
>> -    "pop %r10;"                            \
>> -    "pop %r9;"                            \
>> -    "pop %r8;"                            \
>> -    "pop %rdi;"                            \
>> -    "pop %rsi;"                            \
>> +    PV_RESTORE_COMMON_CALLER_REGS                    \
>>       "pop %rdx;"                            \
>>       "pop %rcx;"
>>   #endif
>> +#define PV_PROLOGUE_ALL(func)    PV_SAVE_ALL_CALLER_REGS
>> +#define PV_EPILOGUE_ALL(func)    PV_RESTORE_ALL_CALLER_REGS
>> +
>>   /*
>>    * Generate a thunk around a function which saves all caller-save
>>    * registers except for the return value.  This allows C functions to
>> @@ -607,7 +671,7 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
>>    * functions.
>>    */
>>   #define PV_THUNK_NAME(func) "__raw_callee_save_" #func
>> -#define __PV_CALLEE_SAVE_REGS_THUNK(func, section)            \
>> +#define __PV_CALLEE_SAVE_REGS_THUNK(func, section, helper)        \
>>       extern typeof(func) __raw_callee_save_##func;            \
>>                                       \
>>       asm(".pushsection " section ", \"ax\";"                \
>> @@ -617,16 +681,18 @@ bool __raw_callee_save___native_vcpu_is_preempted(long 
>> cpu);
>>           PV_THUNK_NAME(func) ":"                    \
>>           ASM_ENDBR                            \
>>           FRAME_BEGIN                            \
>> -        PV_SAVE_ALL_CALLER_REGS                    \
>> +        PV_PROLOGUE_##helper(func)                    \
>>           "call " #func ";"                        \
>> -        PV_RESTORE_ALL_CALLER_REGS                    \
>> +        PV_EPILOGUE_##helper(func)                    \
>>           FRAME_END                            \
>>           ASM_RET                            \
>>           ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";"    \
>>           ".popsection")
>>   #define PV_CALLEE_SAVE_REGS_THUNK(func)            \
>> -    __PV_CALLEE_SAVE_REGS_THUNK(func, ".text")
>> +    __PV_CALLEE_SAVE_REGS_THUNK(func, ".text", ALL)
>> +#define PV_CALLEE_SAVE_REGS_MSR_THUNK(func)        \
>> +    __PV_CALLEE_SAVE_REGS_THUNK(func, ".text", MSR)
>>   /* Get a reference to a callee-save function */
>>   #define PV_CALLEE_SAVE(func)                        \
>> diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/ 
>> paravirt_types.h
>> index b08b9d3122d6..f7f879319e90 100644
>> --- a/arch/x86/include/asm/paravirt_types.h
>> +++ b/arch/x86/include/asm/paravirt_types.h
>> @@ -91,15 +91,15 @@ struct pv_cpu_ops {
>>                 unsigned int *ecx, unsigned int *edx);
>>       /* Unsafe MSR operations.  These will warn or panic on failure. */
>> -    u64 (*read_msr)(u32 msr);
>> -    void (*write_msr)(u32 msr, u64 val);
>> +    struct paravirt_callee_save read_msr;
>> +    struct paravirt_callee_save write_msr;
>>       /*
>>        * Safe MSR operations.
>>        * Returns 0 or -EIO.
>>        */
>> -    int (*read_msr_safe)(u32 msr, u64 *val);
>> -    int (*write_msr_safe)(u32 msr, u64 val);
>> +    struct paravirt_callee_save read_msr_safe;
>> +    struct paravirt_callee_save write_msr_safe;
>>       u64 (*read_pmc)(int counter);
>> @@ -520,6 +520,10 @@ unsigned long pv_native_save_fl(void);
>>   void pv_native_irq_disable(void);
>>   void pv_native_irq_enable(void);
>>   unsigned long pv_native_read_cr2(void);
>> +void pv_native_rdmsr(void);
>> +void pv_native_wrmsr(void);
>> +void pv_native_rdmsr_safe(void);
>> +void pv_native_wrmsr_safe(void);
>>   #endif
>>   #define paravirt_nop    ((void *)nop_func)
>> @@ -527,6 +531,7 @@ unsigned long pv_native_read_cr2(void);
>>   #endif    /* __ASSEMBLER__ */
>>   #define ALT_NOT_XEN    ALT_NOT(X86_FEATURE_XENPV)
>> +#define ALT_XENPV_CALL    ALT_DIRECT_CALL(X86_FEATURE_XENPV)
>>   #endif  /* CONFIG_PARAVIRT */
>>   #endif    /* _ASM_X86_PARAVIRT_TYPES_H */
>> diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/ 
>> qspinlock_paravirt.h
>> index 0a985784be9b..0351acb5a143 100644
>> --- a/arch/x86/include/asm/qspinlock_paravirt.h
>> +++ b/arch/x86/include/asm/qspinlock_paravirt.h
>> @@ -14,7 +14,8 @@ void __lockfunc __pv_queued_spin_unlock_slowpath(struct 
>> qspinlock *lock, u8 lock
>>    */
>>   #ifdef CONFIG_64BIT
>> -__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text");
>> +__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text",
>> +                ALL);
>>   #define __pv_queued_spin_unlock    __pv_queued_spin_unlock
>>   /*
>> @@ -61,7 +62,7 @@ DEFINE_ASM_FUNC(__raw_callee_save___pv_queued_spin_unlock,
>>   #else /* CONFIG_64BIT */
>>   extern void __lockfunc __pv_queued_spin_unlock(struct qspinlock *lock);
>> -__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock, ".spinlock.text");
>> +__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock, ".spinlock.text", ALL);
>>   #endif /* CONFIG_64BIT */
>>   #endif
>> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
>> index 015bf298434f..ff7d7fdae360 100644
>> --- a/arch/x86/kernel/paravirt.c
>> +++ b/arch/x86/kernel/paravirt.c
>> @@ -50,6 +50,24 @@ DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop 
>> %rax", .noinstr.text);
>>   DEFINE_ASM_FUNC(pv_native_irq_disable, "cli", .noinstr.text);
>>   DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text);
>>   DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
>> +DEFINE_ASM_FUNC(pv_native_rdmsr,
>> +        "1: rdmsr\n"
>> +        "2:\n"
>> +        _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR), .noinstr.text);
>> +DEFINE_ASM_FUNC(pv_native_wrmsr,
>> +        "1: wrmsr\n"
>> +        "2:\n"
>> +        _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR), .noinstr.text);
>> +DEFINE_ASM_FUNC(pv_native_rdmsr_safe,
>> +        "1: rdmsr; xor %ecx, %ecx\n"
>> +        "2:\n"
>> +        _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %%ecx),
>> +        .noinstr.text);
>> +DEFINE_ASM_FUNC(pv_native_wrmsr_safe,
>> +        "1: wrmsr; xor %eax, %eax\n"
>> +        "2:\n"
>> +        _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %%eax),
>> +        .noinstr.text);
>>   #endif
>>   DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
>> @@ -129,10 +147,10 @@ struct paravirt_patch_template pv_ops = {
>>       .cpu.read_cr0        = native_read_cr0,
>>       .cpu.write_cr0        = native_write_cr0,
>>       .cpu.write_cr4        = native_write_cr4,
>> -    .cpu.read_msr        = native_read_msr,
>> -    .cpu.write_msr        = native_write_msr,
>> -    .cpu.read_msr_safe    = native_read_msr_safe,
>> -    .cpu.write_msr_safe    = native_write_msr_safe,
>> +    .cpu.read_msr        = __PV_IS_CALLEE_SAVE(pv_native_rdmsr),
>> +    .cpu.write_msr        = __PV_IS_CALLEE_SAVE(pv_native_wrmsr),
>> +    .cpu.read_msr_safe    = __PV_IS_CALLEE_SAVE(pv_native_rdmsr_safe),
>> +    .cpu.write_msr_safe    = __PV_IS_CALLEE_SAVE(pv_native_wrmsr_safe),
>>       .cpu.read_pmc        = native_read_pmc,
>>       .cpu.load_tr_desc    = native_load_tr_desc,
>>       .cpu.set_ldt        = native_set_ldt,
>> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
>> index 3be38350f044..c279b2bef7eb 100644
>> --- a/arch/x86/xen/enlighten_pv.c
>> +++ b/arch/x86/xen/enlighten_pv.c
>> @@ -1160,36 +1160,66 @@ static void xen_do_write_msr(u32 msr, u64 val, int *err)
>>       }
>>   }
>> -static int xen_read_msr_safe(u32 msr, u64 *val)
>> -{
>> +/*
>> + * Prototypes for functions called via PV_CALLEE_SAVE_REGS_THUNK() in order
>> + * to avoid warnings with "-Wmissing-prototypes".
>> + */
>> +struct xen_rdmsr_safe_ret {
>> +    u64 val;
>>       int err;
>> +};
>> +struct xen_rdmsr_safe_ret xen_read_msr_safe(u32 msr);
>> +int xen_write_msr_safe(u32 msr, u32 low, u32 high);
>> +u64 xen_read_msr(u32 msr);
>> +void xen_write_msr(u32 msr, u32 low, u32 high);
>> -    *val = xen_do_read_msr(msr, &err);
>> -    return err;
>> +__visible struct xen_rdmsr_safe_ret xen_read_msr_safe(u32 msr)
>> +{
>> +    struct xen_rdmsr_safe_ret ret;
> 
> struct xen_rdmsr_safe_ret ret = { 0, 0 };
> 
> Because the 'err' member may not be set in xen_do_read_msr().
Right.
> 
>> +
>> +    ret.val = xen_do_read_msr(msr, &ret.err);
>> +    return ret;
>>   }
>> +#define PV_PROLOGUE_MSR_xen_read_msr_safe    "mov %ecx, %edi;"
>> +#define PV_EPILOGUE_MSR_xen_read_msr_safe    \
>> +    "mov %edx, %ecx; mov %rax, %rdx; mov %eax, %eax; shr $0x20, %rdx;"
>> +PV_CALLEE_SAVE_REGS_MSR_THUNK(xen_read_msr_safe);
>> -static int xen_write_msr_safe(u32 msr, u64 val)
>> +__visible int xen_write_msr_safe(u32 msr, u32 low, u32 high)
> 
> I think we can avoid splitting this u64 into two u32.
This is related to the native WRMSR interface. The WRMSR instruction must
be replaceable by a call to the Xen-specific function.
I could handle this in the prologue helpers, but I'd prefer to keep
those helpers as small as possible.
> 
>>   {
>>       int err = 0;
>> -    xen_do_write_msr(msr, val, &err);
>> +    xen_do_write_msr(msr, (u64)high << 32 | low, &err);
>>       return err;
>>   }
>> +#define PV_PROLOGUE_MSR_xen_write_msr_safe    \
>> +    "mov %ecx, %edi; mov %eax, %esi;"
>> +#define PV_EPILOGUE_MSR_xen_write_msr_safe
>> +PV_CALLEE_SAVE_REGS_MSR_THUNK(xen_write_msr_safe);
>> -static u64 xen_read_msr(u32 msr)
>> +__visible u64 xen_read_msr(u32 msr)
>>   {
>>       int err;
>>       return xen_do_read_msr(msr, xen_msr_safe ? &err : NULL);
>>   }
>> +#define PV_PROLOGUE_MSR_xen_read_msr    "mov %ecx, %edi;"
>> +#define PV_EPILOGUE_MSR_xen_read_msr    \
>> +    "mov %rax, %rdx; mov %eax, %eax; shr $0x20, %rdx;"
>> +PV_CALLEE_SAVE_REGS_MSR_THUNK(xen_read_msr);
>> -static void xen_write_msr(u32 msr, u64 val)
>> +__visible void xen_write_msr(u32 msr, u32 low, u32 high)
> 
> Ditto.
See above.
> 
>>   {
>>       int err;
>> -    xen_do_write_msr(msr, val, xen_msr_safe ? &err : NULL);
>> +    xen_do_write_msr(msr, (u64)high << 32 | low,
>> +             xen_msr_safe ? &err : NULL);
>>   }
>> +#define PV_PROLOGUE_MSR_xen_write_msr    \
>> +    "mov %ecx, %edi; mov %eax, %esi;"
>> +#define PV_EPILOGUE_MSR_xen_write_msr
>> +PV_CALLEE_SAVE_REGS_MSR_THUNK(xen_write_msr);
>>   /* This is called once we have the cpu_possible_mask */
>>   void __init xen_setup_vcpu_info_placement(void)
Juergen
                
            On 5/12/2025 4:20 AM, Jürgen Groß wrote:
> On 09.05.25 10:18, Xin Li wrote:
>> On 5/6/2025 2:20 AM, Juergen Gross wrote:
>> I'm trying to evaluate how to add the immediate form MSR instructions
>> on top of this patch set.  And I'm close to get it done.
> 
> There is something to consider when running as a Xen PV guest, ...
Andrew said he doesn't plan to expose WRMSRNS to PV guests, and doesn't
expect MSR_IMM to be useful in a PV guest either, which I fully agree with.
>>>
>>> Note that in the Xen PV case the RDMSR/WRMSR patching must not happen
>>> even as an intermediate step, as this would clobber the indirect call
>>> information needed when patching in the direct call for the Xen case.
>>
>> Good point!
> 
> ... as this still needs to be true.
> 
> There are 2 different ways to deal with this:
> 
> 1. When running as a Xen PV guest disable X86_FEATURE_WRMSRNS and
>     ASM_WRMSRNS_IMM (e.g. in xen_init_capabilities()).
> 
> 2. Buffer the original instruction before patching in apply_alternatives()
>     in order to avoid the sequence limitation above (see attached patch).
> 
>> Deciding whether to retain the pvops MSR API is the responsibility of
>> the x86 maintainers, who are the ones experiencing the challenges of 
>> maintaining the code.
> 
> Well, I'm the PV ops maintainer, so it is basically me who needs to deal
> with this. OTOH I do understand that diagnosis of problems with PV ops is
> more complicated than without.
Indeed, significant improvements continue to be implemented.
> 
>>
>> tglx said @https://lore.kernel.org/lkml/87y1h81ht4.ffs@tglx/:
>>
>>  > I fundamentaly hate adding this to the PV infrastructure. We don't
>>  > want more PV ops, quite the contrary.
>>
>> That is the reason I took a different direction, i.e., removing the
>> pvops MSR APIs.  But if your approach is cleaner, they may prefer it.
> 
> In the end it isn't adding additional PV ops interfaces. It is modifying
> existing ones.
> 
>>
>>> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/ 
>>> paravirt.h
>>> index a463c747c780..df10b0e4f7b8 100644
>>> --- a/arch/x86/include/asm/paravirt.h
>>> +++ b/arch/x86/include/asm/paravirt.h
>>> @@ -175,24 +175,72 @@ static inline void __write_cr4(unsigned long x)
>>>       PVOP_VCALL1(cpu.write_cr4, x);
>>>   }
>>> -static inline u64 paravirt_read_msr(u32 msr)
>>> +static __always_inline u64 paravirt_read_msr(u32 msr)
>>>   {
>>> -    return PVOP_CALL1(u64, cpu.read_msr, msr);
>>> +    EAX_EDX_DECLARE_ARGS(val, low, high);
>>
>> This is under CONFIG_PARAVIRT_XXL, thus CONFIG_XEN_PV and CONFIG_X86_64,
>> therefore we don't need to consider 32-bit at all, no?
> 
> Right. OTOH the macros are there, so why not use them?
> 
> In the end I'm fine to open code the 64-bit case here.
> 
Here is a patch I cooked up.  I added an ALTERNATIVE() hack because the new
instructions can't be more than 6 bytes long.  But with the patch you
just sent, it shouldn't be needed.
diff --git a/arch/x86/include/asm/paravirt.h 
b/arch/x86/include/asm/paravirt.h
index df10b0e4f7b8..82ffc11d6f1f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -177,18 +177,20 @@ static inline void __write_cr4(unsigned long x)
  static __always_inline u64 paravirt_read_msr(u32 msr)
  {
-	EAX_EDX_DECLARE_ARGS(val, low, high);
+	u64 val;
  	PVOP_TEST_NULL(cpu.read_msr);
  	asm volatile("1: "ALTERNATIVE_2(PARAVIRT_CALL,
  					"rdmsr", ALT_NOT_XEN,
  					ALT_CALL_INSTR, ALT_XENPV_CALL)
+		     ALTERNATIVE("", "shl $0x20, %%rdx; or %%rdx, %%rax", ALT_NOT_XEN)
  		     "2:\n"
  		     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR)
-		     : EAX_EDX_RET(val, low, high), ASM_CALL_CONSTRAINT
-		     : paravirt_ptr(cpu.read_msr), "c" (msr));
+		     : "=a" (val), ASM_CALL_CONSTRAINT
+		     : paravirt_ptr(cpu.read_msr), "c" (msr)
+		     : "rdx");
-	return EAX_EDX_VAL(val, low, high);
+	return val;
  }
  static __always_inline void paravirt_write_msr(u32 msr, u64 val)
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index ea3d7d583254..cacd9c37c3bd 100644
@@ -1204,20 +1206,20 @@ __visible u64 xen_read_msr(u32 msr)
  	return xen_do_read_msr(msr, xen_msr_safe ? &err : NULL);
  }
+
  #define PV_PROLOGUE_MSR_xen_read_msr	"mov %ecx, %edi;"
-#define PV_EPILOGUE_MSR_xen_read_msr	\
-	"mov %rax, %rdx; mov %eax, %eax; shr $0x20, %rdx;"
+#define PV_EPILOGUE_MSR_xen_read_msr
  PV_CALLEE_SAVE_REGS_MSR_THUNK(xen_read_msr);
-__visible void xen_write_msr(u32 msr, u32 low, u32 high)
+__visible void xen_write_msr(u32 msr, u64 val)
  {
  	int err;
-	xen_do_write_msr(msr, (u64)high << 32 | low,
-			 xen_msr_safe ? &err : NULL);
+	xen_do_write_msr(msr, val, xen_msr_safe ? &err : NULL);
  }
+
  #define PV_PROLOGUE_MSR_xen_write_msr	\
-	"mov %ecx, %edi; mov %eax, %esi;"
+	"mov %ecx, %edi; mov %rax, %rsi;"
  #define PV_EPILOGUE_MSR_xen_write_msr
  PV_CALLEE_SAVE_REGS_MSR_THUNK(xen_write_msr);
>>> +__visible int xen_write_msr_safe(u32 msr, u32 low, u32 high)
>>
>> I think we can avoid splitting this u64 into two u32.
> 
> This is related to the native WRMSR interface. The WRMSR needs to be
> able to be replaced by the call of the Xen specific function.
> 
> I could handle this in the prologue helpers, but I'd prefer to keep
> those helpers as small as possible.
The above patch makes PV_EPILOGUE_MSR_xen_read_msr empty, because only
RDMSR needs to convert edx:eax into a 64-bit register, and the code is
added into paravirt_read_msr() already.
For xen_write_msr(), the change is simple enough.
Thanks!
     Xin
                
            On 13.05.25 09:44, Xin Li wrote:
> On 5/12/2025 4:20 AM, Jürgen Groß wrote:
>> On 09.05.25 10:18, Xin Li wrote:
>>> On 5/6/2025 2:20 AM, Juergen Gross wrote:
>>> I'm trying to evaluate how to add the immediate form MSR instructions
>>> on top of this patch set.  And I'm close to get it done.
>>
>> There is something to consider when running as a Xen PV guest, ...
> 
> Andrew said he doesn't plan to expose WRMSRNS to PV guests, and doesn't
> expect MSR_IMM to be useful in a PV guest either, which I fully agree with.
>>>>
>>>> Note that in the Xen PV case the RDMSR/WRMSR patching must not happen
>>>> even as an intermediate step, as this would clobber the indirect call
>>>> information needed when patching in the direct call for the Xen case.
>>>
>>> Good point!
>>
>> ... as this still needs to be true.
>>
>> There are 2 different ways to deal with this:
>>
>> 1. When running as a Xen PV guest disable X86_FEATURE_WRMSRNS and
>>     ASM_WRMSRNS_IMM (e.g. in xen_init_capabilities()).
>>
>> 2. Buffer the original instruction before patching in apply_alternatives()
>>     in order to avoid the sequence limitation above (see attached patch).
>>
>>> Deciding whether to retain the pvops MSR API is the responsibility of
>>> the x86 maintainers, who are the ones experiencing the challenges of 
>>> maintaining the code.
>>
>> Well, I'm the PV ops maintainer, so it is basically me who needs to deal
>> with this. OTOH I do understand that diagnosis of problems with PV ops is
>> more complicated than without.
> 
> Indeed, significant improvements continue to be implemented.
> 
>>
>>>
>>> tglx said @https://lore.kernel.org/lkml/87y1h81ht4.ffs@tglx/:
>>>
>>>  > I fundamentaly hate adding this to the PV infrastructure. We don't
>>>  > want more PV ops, quite the contrary.
>>>
>>> That is the reason I took a different direction, i.e., removing the
>>> pvops MSR APIs.  But if your approach is cleaner, they may prefer it.
>>
>> In the end it isn't adding additional PV ops interfaces. It is modifying
>> existing ones.
>>
>>>
>>>> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/ paravirt.h
>>>> index a463c747c780..df10b0e4f7b8 100644
>>>> --- a/arch/x86/include/asm/paravirt.h
>>>> +++ b/arch/x86/include/asm/paravirt.h
>>>> @@ -175,24 +175,72 @@ static inline void __write_cr4(unsigned long x)
>>>>       PVOP_VCALL1(cpu.write_cr4, x);
>>>>   }
>>>> -static inline u64 paravirt_read_msr(u32 msr)
>>>> +static __always_inline u64 paravirt_read_msr(u32 msr)
>>>>   {
>>>> -    return PVOP_CALL1(u64, cpu.read_msr, msr);
>>>> +    EAX_EDX_DECLARE_ARGS(val, low, high);
>>>
>>> This is under CONFIG_PARAVIRT_XXL, thus CONFIG_XEN_PV and CONFIG_X86_64,
>>> therefore we don't need to consider 32-bit at all, no?
>>
>> Right. OTOH the macros are there, so why not use them?
>>
>> In the end I'm fine to open code the 64-bit case here.
>>
> 
> Here is a patch I cooked.  I added an ALTERNATIVE() hack because the new 
> instructions can't be more than 6 bytes long.  But with the patch you
> just sent, it shouldn't be needed.
I have meanwhile dropped the patch copying the original indirect call.
The reason is that I'm seeing a potential risk with the current alternative
patching when using ALTERNATIVE_[23](): depending on the tested features
it might happen that an instruction sequence not suitable for the current
runtime environment is patched in as an intermediate step. If an interrupt
happens just then AND the handling of that interrupt uses the patch site,
this could result in crashes or undefined behavior.
I have meanwhile a set of 3 patches fixing that problem by merging all
alternative patching of a patch site in the local buffer and only then
patching the code at the target site with the final result.
The same problem arises with your code below, but this time it isn't
fixed by my patches: the two ALTERNATIVE() instances in the asm() construct
would need to be patched in a single atomic operation to be consistent.
Otherwise you could end up e.g. on bare metal with paravirt_read_msr()
having replaced the indirect call with "rdmsr", but not yet having added
the code to merge %rdx into %rax.
I'm just doing a V2 of my series, but this time including the additional
support of the non-serializing and immediate forms. Let's see what this will
look like. I will drop using the EAX_EDX_* macros, but for the reason
mentioned above I won't switch to your variant completely.
> 
> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
> index df10b0e4f7b8..82ffc11d6f1f 100644
> --- a/arch/x86/include/asm/paravirt.h
> +++ b/arch/x86/include/asm/paravirt.h
> @@ -177,18 +177,20 @@ static inline void __write_cr4(unsigned long x)
> 
>   static __always_inline u64 paravirt_read_msr(u32 msr)
>   {
> -    EAX_EDX_DECLARE_ARGS(val, low, high);
> +    u64 val;
> 
>       PVOP_TEST_NULL(cpu.read_msr);
>       asm volatile("1: "ALTERNATIVE_2(PARAVIRT_CALL,
>                       "rdmsr", ALT_NOT_XEN,
>                       ALT_CALL_INSTR, ALT_XENPV_CALL)
> +             ALTERNATIVE("", "shl $0x20, %%rdx; or %%rdx, %%rax", ALT_NOT_XEN)
>                "2:\n"
>                _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR)
> -             : EAX_EDX_RET(val, low, high), ASM_CALL_CONSTRAINT
> -             : paravirt_ptr(cpu.read_msr), "c" (msr));
> +             : "=a" (val), ASM_CALL_CONSTRAINT
> +             : paravirt_ptr(cpu.read_msr), "c" (msr)
> +             : "rdx");
> 
> -    return EAX_EDX_VAL(val, low, high);
> +    return val;
Juergen
                
            On 6/11/2025 5:58 AM, Juergen Gross wrote:
> I'm just doing a V2 of my series, but this time including the additional
> support of the non-serializing and immediate forms. Lets see how this will
> look like. I will drop using the EAX_EDX_* macros, but due to the reason
> mentioned above I won't switch to your variant completely.
Hi Juergen,
Do you have any update on this?
Thanks!
     Xin
                
            On 25.08.25 03:54, Xin Li wrote:
> On 6/11/2025 5:58 AM, Juergen Gross wrote:
>> I'm just doing a V2 of my series, but this time including the additional
>> support of the non-serializing and immediate forms. Lets see how this will
>> look like. I will drop using the EAX_EDX_* macros, but due to the reason
>> mentioned above I won't switch to your variant completely.
> 
> Hi Juergen,
> 
> Do you have any update on this?
I've been very busy with other stuff (downstream, security, ...).
In between I've been working on the series. I hope to post it some time in
September.
Juergen
On 26.08.25 12:39, Jürgen Groß wrote:
> On 25.08.25 03:54, Xin Li wrote:
>> On 6/11/2025 5:58 AM, Juergen Gross wrote:
>>> I'm just doing a V2 of my series, but this time including the additional
>>> support of the non-serializing and immediate forms. Lets see how this will
>>> look like. I will drop using the EAX_EDX_* macros, but due to the reason
>>> mentioned above I won't switch to your variant completely.
>>
>> Hi Juergen,
>>
>> Do you have any update on this?
> 
> I've been very busy with other stuff (downstream, security, ...).
> 
> In between I've been working on the series. I hope to post it some time in
> September.
I have been working on this the last week. Turns out things are a little bit
complicated for adding them into the paravirt framework, especially regarding
the exception fixups.
I first thought that Peter's patch "x86/extable: Implement EX_TYPE_FUNC_REWIND"
would help, but I'm seeing problems with his approach in case of shadow stack
being enabled. This case would at least need to be handled in his patch, as
otherwise shadow stack and normal stack could get out of sync.
For this reason your patch series won't work easily, too. OTOH using your basic
idea it seems to be possible to solve the fixup problem without needing Peter's
patch. I'm working on that approach now.
Juergen
On 6/11/2025 5:58 AM, Juergen Gross wrote:
>> Here is a patch I cooked.  I added an ALTERNATIVE() hack because the 
>> new instructions can't be more than 6 bytes long.  But with the patch you
>> just sent, it shouldn't be needed.
> 
> I have meanwhile dropped the patch copying the original indirect call.
> 
> Reason is that I'm seeing a potential risk with current alternative
> patching when using ALTERNATIVE_[23](): depending on the tested features
> it might happen that an instruction sequence not suitable for the current
> runtime environment is patched in as an intermediate step. In case there
> is an interrupt happening just then AND the handling of the interrupt is
> using the patch site, this could result in crashes or undefined behavior.
Oh, I had assumed that Linux disables interrupts during the patching
process. Just out of curiosity, why are interrupts allowed in this case?
> 
> I have meanwhile a set of 3 patches fixing that problem by merging all
> alternative patching of a patch site in the local buffer and only then
> patching the code at the target site with the final result.
> 
> The same problem arises with your code below, but this time it isn't
> fixed by my patches: the two ALTERNATIVE() instances in the asm() construct
> would need to be patched in a single atomic operation to be consistent.
> Otherwise you could end up e.g. on bare metal with paravirt_read_msr()
> having replaced the indirect call with "rdmsr", but not yet having added
> the code to merge %rdx into %rax.
> 
> I'm just doing a V2 of my series, but this time including the additional
> support of the non-serializing and immediate forms. Lets see how this will
> look like. I will drop using the EAX_EDX_* macros, but due to the reason
> mentioned above I won't switch to your variant completely.
Great!
Thanks!
     Xin
                
            On 13.06.25 09:31, Xin Li wrote:
> On 6/11/2025 5:58 AM, Juergen Gross wrote:
>>> Here is a patch I cooked. I added an ALTERNATIVE() hack because the new
>>> instructions can't be more than 6 bytes long. But with the patch you
>>> just sent, it shouldn't be needed.
>>
>> I have meanwhile dropped the patch copying the original indirect call.
>>
>> Reason is that I'm seeing a potential risk with current alternative
>> patching when using ALTERNATIVE_[23](): depending on the tested features
>> it might happen that an instruction sequence not suitable for the current
>> runtime environment is patched in as an intermediate step. In case there
>> is an interrupt happening just then AND the handling of the interrupt is
>> using the patch site, this could result in crashes or undefined behavior.
> 
> Oh, I had assumed that Linux disables interrupts during the patching
> process. Just out of curiosity, why are interrupts allowed in this case?
Interrupts are disabled within text_poke_early() while patching a single
instance. I guess keeping interrupts disabled during the complete
apply_alternatives() handling would potentially result in a too long period
without handling any interrupts.
Juergen
Now with the mentioned patch really attached. :-)
On 12.05.25 13:20, Jürgen Groß wrote:
> On 09.05.25 10:18, Xin Li wrote:
>> On 5/6/2025 2:20 AM, Juergen Gross wrote:
>>> Instead of having callback functions for rdmsr/wrmsr on native, switch
>>> to inline the respective instructions directly in order to avoid
>>> overhead with the call interface.
>>
>> To me, this is a beneficial addition to the existing pvops MSR code.
>>
>>>
>>> This requires to use the instruction interfaces for rdmsr/wrmsr
>>> emulation when running as a Xen PV guest.
>>>
>>> In order to prepare support for the immediate forms of RDMSR and WRMSR
>>> when not running as a Xen PV guest, use the RDMSR and WRMSR
>>> instructions as the fallback case instead of ALT_CALL_INSTR.
>>
>> I'm trying to evaluate how to add the immediate form MSR instructions
>> on top of this patch set.  And I'm close to get it done.
> 
> There is something to consider when running as a Xen PV guest, ...
> 
>>
>>>
>>> Note that in the Xen PV case the RDMSR/WRMSR patching must not happen
>>> even as an intermediate step, as this would clobber the indirect call
>>> information needed when patching in the direct call for the Xen case.
>>
>> Good point!
> 
> ... as this still needs to be true.
> 
> There are 2 different ways to deal with this:
> 
> 1. When running as a Xen PV guest disable X86_FEATURE_WRMSRNS and
>     ASM_WRMSRNS_IMM (e.g. in xen_init_capabilities()).
> 
> 2. Buffer the original instruction before patching in apply_alternatives()
>     in order to avoid the sequence limitation above (see attached patch).
> 
>> Deciding whether to retain the pvops MSR API is the responsibility of
>> the x86 maintainers, who are the ones experiencing the challenges of 
>> maintaining the code.
> 
> Well, I'm the PV ops maintainer, so it is basically me who needs to deal
> with this. OTOH I do understand that diagnosis of problems with PV ops is
> more complicated than without.
> 
>>
>> tglx said @https://lore.kernel.org/lkml/87y1h81ht4.ffs@tglx/:
>>
>>  > I fundamentaly hate adding this to the PV infrastructure. We don't
>>  > want more PV ops, quite the contrary.
>>
>> That is the reason I took a different direction, i.e., removing the
>> pvops MSR APIs.  But if your approach is cleaner, they may prefer it.
> 
> In the end it isn't adding additional PV ops interfaces. It is modifying
> existing ones.
> 
>>
>>> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
>>> index a463c747c780..df10b0e4f7b8 100644
>>> --- a/arch/x86/include/asm/paravirt.h
>>> +++ b/arch/x86/include/asm/paravirt.h
>>> @@ -175,24 +175,72 @@ static inline void __write_cr4(unsigned long x)
>>>       PVOP_VCALL1(cpu.write_cr4, x);
>>>   }
>>> -static inline u64 paravirt_read_msr(u32 msr)
>>> +static __always_inline u64 paravirt_read_msr(u32 msr)
>>>   {
>>> -    return PVOP_CALL1(u64, cpu.read_msr, msr);
>>> +    EAX_EDX_DECLARE_ARGS(val, low, high);
>>
>> This is under CONFIG_PARAVIRT_XXL, thus CONFIG_XEN_PV and CONFIG_X86_64,
>> therefore we don't need to consider 32-bit at all, no?
> 
> Right. OTOH the macros are there, so why not use them?
> 
> In the end I'm fine to open code the 64-bit case here.
> 
>>
>>> +
>>> +    PVOP_TEST_NULL(cpu.read_msr);
>>> +    asm volatile("1: "ALTERNATIVE_2(PARAVIRT_CALL,
>>> +                    "rdmsr", ALT_NOT_XEN,
>>> +                    ALT_CALL_INSTR, ALT_XENPV_CALL)
>>> +             "2:\n"
>>> +             _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR)
>>> +             : EAX_EDX_RET(val, low, high), ASM_CALL_CONSTRAINT
>>> +             : paravirt_ptr(cpu.read_msr), "c" (msr));
>>> +
>>> +    return EAX_EDX_VAL(val, low, high);
>>>   }
>>> -static inline void paravirt_write_msr(u32 msr, u64 val)
>>> +static __always_inline void paravirt_write_msr(u32 msr, u64 val)
>>>   {
>>> -    PVOP_VCALL2(cpu.write_msr, msr, val);
>>> +    PVOP_TEST_NULL(cpu.write_msr);
>>> +    asm volatile("1: "ALTERNATIVE_2(PARAVIRT_CALL,
>>> +                    "wrmsr", ALT_NOT_XEN,
>>> +                    ALT_CALL_INSTR, ALT_XENPV_CALL)
>>> +              "2:\n"
>>> +              _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
>>> +              : ASM_CALL_CONSTRAINT
>>> +              : paravirt_ptr(cpu.write_msr),
>>> +              "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32))
>>> +              : "memory");
>>>   }
>>> -static inline int paravirt_read_msr_safe(u32 msr, u64 *val)
>>> +static __always_inline int paravirt_read_msr_safe(u32 msr, u64 *p)
>>>   {
>>> -    return PVOP_CALL2(int, cpu.read_msr_safe, msr, val);
>>> +    int err;
>>> +    EAX_EDX_DECLARE_ARGS(val, low, high);
>>> +
>>> +    PVOP_TEST_NULL(cpu.read_msr_safe);
>>> +    asm volatile("1: "ALTERNATIVE_2(PARAVIRT_CALL,
>>> +                    "rdmsr; xor %[err],%[err]", ALT_NOT_XEN,
>>> +                    ALT_CALL_INSTR, ALT_XENPV_CALL)
>>> +             "2:\n"
>>> +             _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err])
>>> +             : [err] "=c" (err), EAX_EDX_RET(val, low, high),
>>> +               ASM_CALL_CONSTRAINT
>>> +             : paravirt_ptr(cpu.read_msr_safe), "0" (msr));
>>> +
>>> +    *p = EAX_EDX_VAL(val, low, high);
>>> +
>>> +    return err;
>>>   }
>>> -static inline int paravirt_write_msr_safe(u32 msr, u64 val)
>>> +static __always_inline int paravirt_write_msr_safe(u32 msr, u64 val)
>>>   {
>>> -    return PVOP_CALL2(int, cpu.write_msr_safe, msr, val);
>>> +    int err;
>>> +
>>> +    PVOP_TEST_NULL(cpu.write_msr_safe);
>>> +    asm volatile("1: "ALTERNATIVE_2(PARAVIRT_CALL,
>>> +                    "wrmsr; xor %[err],%[err]", ALT_NOT_XEN,
>>> +                    ALT_CALL_INSTR, ALT_XENPV_CALL)
>>> +             "2:\n"
>>> +             _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %[err])
>>> +             : [err] "=a" (err), ASM_CALL_CONSTRAINT
>>> +             : paravirt_ptr(cpu.write_msr_safe),
>>> +               "c" (msr), "0" ((u32)val), "d" ((u32)(val >> 32))
>>> +             : "memory");
>>> +
>>> +    return err;
>>>   }
>>>   static __always_inline u64 read_msr(u32 msr)
>>> @@ -573,27 +621,43 @@ bool __raw_callee_save___native_vcpu_is_preempted(long 
>>> cpu);
>>>   #define PV_SAVE_ALL_CALLER_REGS        "pushl %ecx;"
>>>   #define PV_RESTORE_ALL_CALLER_REGS    "popl  %ecx;"
>>>   #else
>>> +/* save and restore caller-save registers, except %rax, %rcx and %rdx. */
>>> +#define PV_SAVE_COMMON_CALLER_REGS    \
>>> +    "push %rsi;"            \
>>> +    "push %rdi;"            \
>>> +    "push %r8;"            \
>>> +    "push %r9;"            \
>>> +    "push %r10;"            \
>>> +    "push %r11;"
>>
>> Add an empty line please, easier to read.
> 
> Okay (same below).
> 
>>
>>> +#define PV_RESTORE_COMMON_CALLER_REGS    \
>>> +    "pop %r11;"            \
>>> +    "pop %r10;"            \
>>> +    "pop %r9;"            \
>>> +    "pop %r8;"            \
>>> +    "pop %rdi;"            \
>>> +    "pop %rsi;"
>>> +
>>> +#define PV_PROLOGUE_MSR(func)        \
>>> +    PV_SAVE_COMMON_CALLER_REGS    \
>>> +    PV_PROLOGUE_MSR_##func
>>
>> Ditto.  And the following similar cases.
>>
>>> +#define PV_EPILOGUE_MSR(func)        \
>>> +    PV_EPILOGUE_MSR_##func        \
>>> +    PV_RESTORE_COMMON_CALLER_REGS
>>> +
>>>   /* save and restore all caller-save registers, except return value */
>>>   #define PV_SAVE_ALL_CALLER_REGS                        \
>>>       "push %rcx;"                            \
>>>       "push %rdx;"                            \
>>> -    "push %rsi;"                            \
>>> -    "push %rdi;"                            \
>>> -    "push %r8;"                            \
>>> -    "push %r9;"                            \
>>> -    "push %r10;"                            \
>>> -    "push %r11;"
>>> +    PV_SAVE_COMMON_CALLER_REGS
>>>   #define PV_RESTORE_ALL_CALLER_REGS                    \
>>> -    "pop %r11;"                            \
>>> -    "pop %r10;"                            \
>>> -    "pop %r9;"                            \
>>> -    "pop %r8;"                            \
>>> -    "pop %rdi;"                            \
>>> -    "pop %rsi;"                            \
>>> +    PV_RESTORE_COMMON_CALLER_REGS                    \
>>>       "pop %rdx;"                            \
>>>       "pop %rcx;"
>>>   #endif
>>> +#define PV_PROLOGUE_ALL(func)    PV_SAVE_ALL_CALLER_REGS
>>> +#define PV_EPILOGUE_ALL(func)    PV_RESTORE_ALL_CALLER_REGS
>>> +
>>>   /*
>>>    * Generate a thunk around a function which saves all caller-save
>>>    * registers except for the return value.  This allows C functions to
>>> @@ -607,7 +671,7 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
>>>    * functions.
>>>    */
>>>   #define PV_THUNK_NAME(func) "__raw_callee_save_" #func
>>> -#define __PV_CALLEE_SAVE_REGS_THUNK(func, section)            \
>>> +#define __PV_CALLEE_SAVE_REGS_THUNK(func, section, helper)        \
>>>       extern typeof(func) __raw_callee_save_##func;            \
>>>                                       \
>>>       asm(".pushsection " section ", \"ax\";"                \
>>> @@ -617,16 +681,18 @@ bool __raw_callee_save___native_vcpu_is_preempted(long 
>>> cpu);
>>>           PV_THUNK_NAME(func) ":"                    \
>>>           ASM_ENDBR                            \
>>>           FRAME_BEGIN                            \
>>> -        PV_SAVE_ALL_CALLER_REGS                    \
>>> +        PV_PROLOGUE_##helper(func)                    \
>>>           "call " #func ";"                        \
>>> -        PV_RESTORE_ALL_CALLER_REGS                    \
>>> +        PV_EPILOGUE_##helper(func)                    \
>>>           FRAME_END                            \
>>>           ASM_RET                            \
>>>           ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";"    \
>>>           ".popsection")
>>>   #define PV_CALLEE_SAVE_REGS_THUNK(func)            \
>>> -    __PV_CALLEE_SAVE_REGS_THUNK(func, ".text")
>>> +    __PV_CALLEE_SAVE_REGS_THUNK(func, ".text", ALL)
>>> +#define PV_CALLEE_SAVE_REGS_MSR_THUNK(func)        \
>>> +    __PV_CALLEE_SAVE_REGS_THUNK(func, ".text", MSR)
>>>   /* Get a reference to a callee-save function */
>>>   #define PV_CALLEE_SAVE(func)                        \
>>> diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
>>> index b08b9d3122d6..f7f879319e90 100644
>>> --- a/arch/x86/include/asm/paravirt_types.h
>>> +++ b/arch/x86/include/asm/paravirt_types.h
>>> @@ -91,15 +91,15 @@ struct pv_cpu_ops {
>>>                 unsigned int *ecx, unsigned int *edx);
>>>       /* Unsafe MSR operations.  These will warn or panic on failure. */
>>> -    u64 (*read_msr)(u32 msr);
>>> -    void (*write_msr)(u32 msr, u64 val);
>>> +    struct paravirt_callee_save read_msr;
>>> +    struct paravirt_callee_save write_msr;
>>>       /*
>>>        * Safe MSR operations.
>>>        * Returns 0 or -EIO.
>>>        */
>>> -    int (*read_msr_safe)(u32 msr, u64 *val);
>>> -    int (*write_msr_safe)(u32 msr, u64 val);
>>> +    struct paravirt_callee_save read_msr_safe;
>>> +    struct paravirt_callee_save write_msr_safe;
>>>       u64 (*read_pmc)(int counter);
>>> @@ -520,6 +520,10 @@ unsigned long pv_native_save_fl(void);
>>>   void pv_native_irq_disable(void);
>>>   void pv_native_irq_enable(void);
>>>   unsigned long pv_native_read_cr2(void);
>>> +void pv_native_rdmsr(void);
>>> +void pv_native_wrmsr(void);
>>> +void pv_native_rdmsr_safe(void);
>>> +void pv_native_wrmsr_safe(void);
>>>   #endif
>>>   #define paravirt_nop    ((void *)nop_func)
>>> @@ -527,6 +531,7 @@ unsigned long pv_native_read_cr2(void);
>>>   #endif    /* __ASSEMBLER__ */
>>>   #define ALT_NOT_XEN    ALT_NOT(X86_FEATURE_XENPV)
>>> +#define ALT_XENPV_CALL    ALT_DIRECT_CALL(X86_FEATURE_XENPV)
>>>   #endif  /* CONFIG_PARAVIRT */
>>>   #endif    /* _ASM_X86_PARAVIRT_TYPES_H */
>>> diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
>>> index 0a985784be9b..0351acb5a143 100644
>>> --- a/arch/x86/include/asm/qspinlock_paravirt.h
>>> +++ b/arch/x86/include/asm/qspinlock_paravirt.h
>>> @@ -14,7 +14,8 @@ void __lockfunc __pv_queued_spin_unlock_slowpath(struct 
>>> qspinlock *lock, u8 lock
>>>    */
>>>   #ifdef CONFIG_64BIT
>>> -__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, 
>>> ".spinlock.text");
>>> +__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text",
>>> +                ALL);
>>>   #define __pv_queued_spin_unlock    __pv_queued_spin_unlock
>>>   /*
>>> @@ -61,7 +62,7 @@ DEFINE_ASM_FUNC(__raw_callee_save___pv_queued_spin_unlock,
>>>   #else /* CONFIG_64BIT */
>>>   extern void __lockfunc __pv_queued_spin_unlock(struct qspinlock *lock);
>>> -__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock, ".spinlock.text");
>>> +__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock, ".spinlock.text", ALL);
>>>   #endif /* CONFIG_64BIT */
>>>   #endif
>>> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
>>> index 015bf298434f..ff7d7fdae360 100644
>>> --- a/arch/x86/kernel/paravirt.c
>>> +++ b/arch/x86/kernel/paravirt.c
>>> @@ -50,6 +50,24 @@ DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop 
>>> %rax", .noinstr.text);
>>>   DEFINE_ASM_FUNC(pv_native_irq_disable, "cli", .noinstr.text);
>>>   DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text);
>>>   DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
>>> +DEFINE_ASM_FUNC(pv_native_rdmsr,
>>> +        "1: rdmsr\n"
>>> +        "2:\n"
>>> +        _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR), .noinstr.text);
>>> +DEFINE_ASM_FUNC(pv_native_wrmsr,
>>> +        "1: wrmsr\n"
>>> +        "2:\n"
>>> +        _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR), .noinstr.text);
>>> +DEFINE_ASM_FUNC(pv_native_rdmsr_safe,
>>> +        "1: rdmsr; xor %ecx, %ecx\n"
>>> +        "2:\n"
>>> +        _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %%ecx),
>>> +        .noinstr.text);
>>> +DEFINE_ASM_FUNC(pv_native_wrmsr_safe,
>>> +        "1: wrmsr; xor %eax, %eax\n"
>>> +        "2:\n"
>>> +        _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %%eax),
>>> +        .noinstr.text);
>>>   #endif
>>>   DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
>>> @@ -129,10 +147,10 @@ struct paravirt_patch_template pv_ops = {
>>>       .cpu.read_cr0        = native_read_cr0,
>>>       .cpu.write_cr0        = native_write_cr0,
>>>       .cpu.write_cr4        = native_write_cr4,
>>> -    .cpu.read_msr        = native_read_msr,
>>> -    .cpu.write_msr        = native_write_msr,
>>> -    .cpu.read_msr_safe    = native_read_msr_safe,
>>> -    .cpu.write_msr_safe    = native_write_msr_safe,
>>> +    .cpu.read_msr        = __PV_IS_CALLEE_SAVE(pv_native_rdmsr),
>>> +    .cpu.write_msr        = __PV_IS_CALLEE_SAVE(pv_native_wrmsr),
>>> +    .cpu.read_msr_safe    = __PV_IS_CALLEE_SAVE(pv_native_rdmsr_safe),
>>> +    .cpu.write_msr_safe    = __PV_IS_CALLEE_SAVE(pv_native_wrmsr_safe),
>>>       .cpu.read_pmc        = native_read_pmc,
>>>       .cpu.load_tr_desc    = native_load_tr_desc,
>>>       .cpu.set_ldt        = native_set_ldt,
>>> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
>>> index 3be38350f044..c279b2bef7eb 100644
>>> --- a/arch/x86/xen/enlighten_pv.c
>>> +++ b/arch/x86/xen/enlighten_pv.c
>>> @@ -1160,36 +1160,66 @@ static void xen_do_write_msr(u32 msr, u64 val, int *err)
>>>       }
>>>   }
>>> -static int xen_read_msr_safe(u32 msr, u64 *val)
>>> -{
>>> +/*
>>> + * Prototypes for functions called via PV_CALLEE_SAVE_REGS_THUNK() in order
>>> + * to avoid warnings with "-Wmissing-prototypes".
>>> + */
>>> +struct xen_rdmsr_safe_ret {
>>> +    u64 val;
>>>       int err;
>>> +};
>>> +struct xen_rdmsr_safe_ret xen_read_msr_safe(u32 msr);
>>> +int xen_write_msr_safe(u32 msr, u32 low, u32 high);
>>> +u64 xen_read_msr(u32 msr);
>>> +void xen_write_msr(u32 msr, u32 low, u32 high);
>>> -    *val = xen_do_read_msr(msr, &err);
>>> -    return err;
>>> +__visible struct xen_rdmsr_safe_ret xen_read_msr_safe(u32 msr)
>>> +{
>>> +    struct xen_rdmsr_safe_ret ret;
>>
>> struct xen_rdmsr_safe_ret ret = { 0, 0 };
>>
>> Because the 'err' member may not be set in xen_do_read_msr().
> 
> Right.
> 
>>
>>> +
>>> +    ret.val = xen_do_read_msr(msr, &ret.err);
>>> +    return ret;
>>>   }
>>> +#define PV_PROLOGUE_MSR_xen_read_msr_safe    "mov %ecx, %edi;"
>>> +#define PV_EPILOGUE_MSR_xen_read_msr_safe    \
>>> +    "mov %edx, %ecx; mov %rax, %rdx; mov %eax, %eax; shr $0x20, %rdx;"
>>> +PV_CALLEE_SAVE_REGS_MSR_THUNK(xen_read_msr_safe);
>>> -static int xen_write_msr_safe(u32 msr, u64 val)
>>> +__visible int xen_write_msr_safe(u32 msr, u32 low, u32 high)
>>
>> I think we can avoid splitting this u64 into two u32.
> 
> This is related to the native WRMSR interface. The WRMSR needs to be
> able to be replaced by the call of the Xen specific function.
> 
> I could handle this in the prologue helpers, but I'd prefer to keep
> those helpers as small as possible.
> 
>>
>>>   {
>>>       int err = 0;
>>> -    xen_do_write_msr(msr, val, &err);
>>> +    xen_do_write_msr(msr, (u64)high << 32 | low, &err);
>>>       return err;
>>>   }
>>> +#define PV_PROLOGUE_MSR_xen_write_msr_safe    \
>>> +    "mov %ecx, %edi; mov %eax, %esi;"
>>> +#define PV_EPILOGUE_MSR_xen_write_msr_safe
>>> +PV_CALLEE_SAVE_REGS_MSR_THUNK(xen_write_msr_safe);
>>> -static u64 xen_read_msr(u32 msr)
>>> +__visible u64 xen_read_msr(u32 msr)
>>>   {
>>>       int err;
>>>       return xen_do_read_msr(msr, xen_msr_safe ? &err : NULL);
>>>   }
>>> +#define PV_PROLOGUE_MSR_xen_read_msr    "mov %ecx, %edi;"
>>> +#define PV_EPILOGUE_MSR_xen_read_msr    \
>>> +    "mov %rax, %rdx; mov %eax, %eax; shr $0x20, %rdx;"
>>> +PV_CALLEE_SAVE_REGS_MSR_THUNK(xen_read_msr);
>>> -static void xen_write_msr(u32 msr, u64 val)
>>> +__visible void xen_write_msr(u32 msr, u32 low, u32 high)
>>
>> Ditto.
> 
> See above.
> 
>>
>>>   {
>>>       int err;
>>> -    xen_do_write_msr(msr, val, xen_msr_safe ? &err : NULL);
>>> +    xen_do_write_msr(msr, (u64)high << 32 | low,
>>> +             xen_msr_safe ? &err : NULL);
>>>   }
>>> +#define PV_PROLOGUE_MSR_xen_write_msr    \
>>> +    "mov %ecx, %edi; mov %eax, %esi;"
>>> +#define PV_EPILOGUE_MSR_xen_write_msr
>>> +PV_CALLEE_SAVE_REGS_MSR_THUNK(xen_write_msr);
>>>   /* This is called once we have the cpu_possible_mask */
>>>   void __init xen_setup_vcpu_info_placement(void)
> 
> 
> Juergen
                
            On 5/12/2025 4:24 AM, Juergen Gross wrote:
> Now with the mentioned patch really attached. :-)
> 
Does it allow patching with an instruction more than 6 bytes long?
The immediate form MSR instructions are 9 bytes long.
Thanks!
     Xin
                
            On 13.05.25 07:55, Xin Li wrote:
> On 5/12/2025 4:24 AM, Juergen Gross wrote:
>> Now with the mentioned patch really attached. :-)
>>
> 
> Does it allow patching with an instruction more than 6 bytes long?
> 
> The immediate form MSR instructions are 9 bytes long.
Yes, shouldn't be a problem.
Juergen
On May 12, 2025 11:06:02 PM PDT, "Jürgen Groß" <jgross@suse.com> wrote:
>On 13.05.25 07:55, Xin Li wrote:
>> On 5/12/2025 4:24 AM, Juergen Gross wrote:
>>> Now with the mentioned patch really attached. :-)
>>> 
>> 
>> Does it allow patching with an instruction more than 6 bytes long?
>> 
>> The immediate form MSR instructions are 9 bytes long.
>
>Yes, shouldn't be a problem.
>
>
>Juergen
However, it is more than that. The immediate instructions have a different interface, and it makes more sense to use the extra bytes to shuffle the bits around for the legacy forms:
Write:
    mov %rax,%rdx
    shr $32,%rdx
    wrmsr(ns)
Read:
    rdmsr
    shl $32,%rdx
    or %rdx,%rax
For the write case, this also means that two separate trap points are needed.
As far as Xen (the only user of pv msrs) is concerned, note that it only paravirtualizes a very small number of MSRs, and some of those are fairly performance sensitive, so not going through the Xen framework for MSRs known to be either native or null on Xen would definitely be a win.
                
            On 5/13/2025 3:24 PM, H. Peter Anvin wrote:
> On May 12, 2025 11:06:02 PM PDT, "Jürgen Groß" <jgross@suse.com> wrote:
>> On 13.05.25 07:55, Xin Li wrote:
>>> On 5/12/2025 4:24 AM, Juergen Gross wrote:
>>>> Now with the mentioned patch really attached. :-)
>>>>
>>>
>>> Does it allow patching with an instruction more than 6 bytes long?
>>>
>>> The immediate form MSR instructions are 9 bytes long.
>>
>> Yes, shouldn't be a problem.
>>
>>
>> Juergen
> 
> However, it is more than that. The immediate instructions have a different interface, and it makes more sense to use the extra bytes to shuffle the bits around for the legacy forms:
> 
> Write:
> 
>      mov %rax,%rdx
>      shr $32,%rdx
>      wrmsr(ns)
> 
> Read:
> 
>      rdmsr
>      shl $32,%rdx
>      or %rdx,%rax
> 
> For the write case, this also means that two separate trap points are needed.
> 
> As far as Xen (the only user of pv msrs), note that it only paravirtualizes a very small number of MSRs, and some of those are fairly performance sensitive, so not going through the Xen framework for MSRs known to be either native or null on Xen would definitely be a win.
> 
> 
Hi Juergen,
I have an update on this thread after working on it.
If we continue down the path of maintaining pvops MSR APIs as this patch
series does, it seems we’ll need to duplicate the ALTERNATIVE code in
three different places.
1) The MSR access primitives defined in <asm/msr.h>, which is used when
    CONFIG_PARAVIRT=n.
2) The pvops native MSR functions pv_native_{rd,wr}msr{,_safe}() defined
    in arch/x86/kernel/paravirt.c, used when CONFIG_PARAVIRT=y on bare
    metal.
3) The pvops Xen MSR functions paravirt_{read,write}_msr{,_safe}()
    defined in <asm/paravirt.h>, used when CONFIG_PARAVIRT_XXL=y.
hpa had mentioned to me earlier that this would be a maintenance burden
— something I only truly realized once I got hands-on with it.
Maybe you have something in mind to address it?
Also adding PeterZ to the To list because he cares about it.
Thanks!
     Xin
                
            On 5/15/25 00:32, Xin Li wrote:
> 
> Hi Juergen,
> 
> I have some update on this thread while working on it.
> 
> If we continue down the path of maintaining pvops MSR APIs as this patch
> series does, it seems we’ll need to duplicate the ALTERNATIVE code in
> three different places.
> 
> 1) The MSR access primitives defined in <asm/msr.h>, which is used when
>     CONFIG_PARAVIRT=n.
> 
> 2) The pvops native MSR functions pv_native_{rd,wr}msr{,_safe}() defined
>     in arch/x86/kernel/paravirt.c, used when CONFIG_PARAVIRT=y on bare
>     metal.
> 
> 3) The pvops Xen MSR functions paravirt_{read,write}_msr{,_safe}()
>     defined in <asm/paravirt.h>, used when CONFIG_PARAVIRT_XXL=y.
> 
> hpa had mentioned to me earlier that this would be a maintenance burden
> — something I only truly realized once I got hands-on with it.
> 
> Maybe you have something in mind to address it?
> 
> Also add PeterZ to the To list because he cares it.
> 
Having the code being duplicated is definitely not a good thing; 
although I'm not one of the x86 maintainers anymore, I would consider it 
a strong reason to NAK such a patchset.
At one point I was considering augmenting the alternatives framework to 
be able to call an ad hoc subroutine to generate the code. It would be 
useful in cases like this, where if PV is enabled it can make a callout 
to the currently-active PV code to query the desired code to be output.
There are 16 unused bits in the alternatives table (not counting the 14 
unused flag bits), which could be used for an enumeration of such 
subroutines, optionally split into 8 bits of function enumeration and 8 
bits of private data. In this case, the "replacement" pointer becomes 
available as a private pointer; possibly to a metadata structure used by 
the subroutine.
This could also be used to significantly enhance the static-immediate 
framework, by being able to have explicit code which handles the 
transformations instead of needing to rely on assembly hacks. That way 
we might even be able to do that kind of transformation for any 
ro_after_init value.
I think the biggest concern is how this would affect objtool, since 
objtool would now not have any kind of direct visibility into the 
possibly generated code. How to best feed the information objtool needs 
to it would be my biggest question (in part because I don't know what 
objtool would actually need.)
	-hpa
                
            On 5/12/2025 11:06 PM, Jürgen Groß wrote:
> On 13.05.25 07:55, Xin Li wrote:
>> On 5/12/2025 4:24 AM, Juergen Gross wrote:
>>> Now with the mentioned patch really attached. :-)
>>>
>>
>> Does it allow patching with an instruction more than 6 bytes long?
>>
>> The immediate form MSR instructions are 9 bytes long.
> 
> Yes, shouldn't be a problem.
> 
Excellent, I will give it a try.
© 2016 - 2025 Red Hat, Inc.