[PATCH v2 08/12] x86/extable: Add support for immediate form MSR instructions

Juergen Gross posted 12 patches 2 months, 2 weeks ago
[PATCH v2 08/12] x86/extable: Add support for immediate form MSR instructions
Posted by Juergen Gross 2 months, 2 weeks ago
From: "Xin Li (Intel)" <xin@zytor.com>

Signed-off-by: Xin Li (Intel) <xin@zytor.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
V2:
- new patch, taken from the RFC v2 MSR refactor series by Xin Li
---
 arch/x86/include/asm/msr.h | 18 ++++++++++++++++++
 arch/x86/mm/extable.c      | 39 +++++++++++++++++++++++++++++++++-----
 2 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 71f41af11591..cf5205300266 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -56,6 +56,24 @@ static inline void do_trace_read_msr(u32 msr, u64 val, int failed) {}
 static inline void do_trace_rdpmc(u32 msr, u64 val, int failed) {}
 #endif
 
+/*
+ * Called only from an MSR fault handler, the instruction pointer points to
+ * the MSR access instruction that caused the fault.
+ */
+static __always_inline bool is_msr_imm_insn(void *ip)
+{
+	/*
+	 * A full decoder for immediate form MSR instructions appears excessive.
+	 */
+#ifdef CONFIG_X86_64
+	const u8 msr_imm_insn_prefix[] = { 0xc4, 0xe7 };
+
+	return !memcmp(ip, msr_imm_insn_prefix, sizeof(msr_imm_insn_prefix));
+#else
+	return false;
+#endif
+}
+
 /*
  * __rdmsr() and __wrmsr() are the two primitives which are the bare minimum MSR
  * accessors and should not have any tracing or other functionality piggybacking
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 2fdc1f1f5adb..c021e4dbc69d 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -166,23 +166,52 @@ static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
 static bool ex_handler_msr(const struct exception_table_entry *fixup,
 			   struct pt_regs *regs, bool wrmsr, bool safe, int reg)
 {
+	bool imm_insn = is_msr_imm_insn((void *)regs->ip);
+	u32 msr;
+
+	if (imm_insn)
+		/*
+		 * The 32-bit immediate specifying a MSR is encoded into
+		 * byte 5 ~ 8 of an immediate form MSR instruction.
+		 */
+		msr = *(u32 *)(regs->ip + 5);
+	else
+		msr = (u32)regs->cx;
+
 	if (__ONCE_LITE_IF(!safe && wrmsr)) {
-		pr_warn("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
-			(unsigned int)regs->cx, (unsigned int)regs->dx,
-			(unsigned int)regs->ax,  regs->ip, (void *)regs->ip);
+		/*
+		 * To maintain consistency with existing RDMSR and WRMSR(NS) instructions,
+		 * the register operand for immediate form MSR instructions is ALWAYS
+		 * encoded as RAX in <asm/msr.h> for reading or writing the MSR value.
+		 */
+		u64 msr_val = regs->ax;
+
+		if (!imm_insn) {
+			/*
+			 * On processors that support the Intel 64 architecture, the
+			 * high-order 32 bits of each of RAX and RDX are ignored.
+			 */
+			msr_val &= 0xffffffff;
+			msr_val |= (u64)regs->dx << 32;
+		}
+
+		pr_warn("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%016llx) at rIP: 0x%lx (%pS)\n",
+			msr, msr_val, regs->ip, (void *)regs->ip);
 		show_stack_regs(regs);
 	}
 
 	if (__ONCE_LITE_IF(!safe && !wrmsr)) {
 		pr_warn("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
-			(unsigned int)regs->cx, regs->ip, (void *)regs->ip);
+			msr, regs->ip, (void *)regs->ip);
 		show_stack_regs(regs);
 	}
 
 	if (!wrmsr) {
 		/* Pretend that the read succeeded and returned 0. */
 		regs->ax = 0;
-		regs->dx = 0;
+
+		if (!imm_insn)
+			regs->dx = 0;
 	}
 
 	if (safe)
-- 
2.51.0
Re: [PATCH v2 08/12] x86/extable: Add support for immediate form MSR instructions
Posted by Peter Zijlstra 2 months, 2 weeks ago
On Tue, Sep 30, 2025 at 09:03:52AM +0200, Juergen Gross wrote:
> From: "Xin Li (Intel)" <xin@zytor.com>
> 
> Signed-off-by: Xin Li (Intel) <xin@zytor.com>
> Signed-off-by: Juergen Gross <jgross@suse.com>
> ---
> V2:
> - new patch, taken from the RFC v2 MSR refactor series by Xin Li
> ---
>  arch/x86/include/asm/msr.h | 18 ++++++++++++++++++
>  arch/x86/mm/extable.c      | 39 +++++++++++++++++++++++++++++++++-----
>  2 files changed, 52 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
> index 71f41af11591..cf5205300266 100644
> --- a/arch/x86/include/asm/msr.h
> +++ b/arch/x86/include/asm/msr.h
> @@ -56,6 +56,24 @@ static inline void do_trace_read_msr(u32 msr, u64 val, int failed) {}
>  static inline void do_trace_rdpmc(u32 msr, u64 val, int failed) {}
>  #endif
>  
> +/*
> + * Called only from an MSR fault handler, the instruction pointer points to
> + * the MSR access instruction that caused the fault.
> + */
> +static __always_inline bool is_msr_imm_insn(void *ip)
> +{
> +	/*
> +	 * A full decoder for immediate form MSR instructions appears excessive.
> +	 */
> +#ifdef CONFIG_X86_64
> +	const u8 msr_imm_insn_prefix[] = { 0xc4, 0xe7 };
> +
> +	return !memcmp(ip, msr_imm_insn_prefix, sizeof(msr_imm_insn_prefix));

This seems fragile. Those two bytes are basically the first two bytes of
VEX3 and only indicate VEX3.map7. Which is not very specific, but when
combined with the fact that this is an MSR exception, might just work.

Trouble is that it is also possible to encode the immediate form using
EVEX. If a toolchain were to do that, we'd fail to detect it.

(And there is segment prefix stuffing possible I suppose)

> +#else
> +	return false;
> +#endif
> +}
> +
>  /*
>   * __rdmsr() and __wrmsr() are the two primitives which are the bare minimum MSR
>   * accessors and should not have any tracing or other functionality piggybacking
> diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
> index 2fdc1f1f5adb..c021e4dbc69d 100644
> --- a/arch/x86/mm/extable.c
> +++ b/arch/x86/mm/extable.c
> @@ -166,23 +166,52 @@ static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
>  static bool ex_handler_msr(const struct exception_table_entry *fixup,
>  			   struct pt_regs *regs, bool wrmsr, bool safe, int reg)
>  {
> +	bool imm_insn = is_msr_imm_insn((void *)regs->ip);
> +	u32 msr;
> +
> +	if (imm_insn)
> +		/*
> +		 * The 32-bit immediate specifying a MSR is encoded into
> +		 * byte 5 ~ 8 of an immediate form MSR instruction.
> +		 */
> +		msr = *(u32 *)(regs->ip + 5);
> +	else
> +		msr = (u32)regs->cx;

This seems to have fallen subject to the US tariff induced {} shortage
or something.

Also, EVEX form will have them in other bytes (one further, on account
of EVEX being 4 bytes, rather than 3).

Given this really isn't a fast path or anything, how about we just use
the instruction decoder? Something like:

	struct insn insn;
	u32 msr = (u32)regs->cx;

	ret = insn_decode_kernel(&insn, (void *)regs->ip);
	if (!ret && insn.vex_prefix.nbytes)
		msr = insn.immediate.value;

should do, I suppose. Isn't that both simpler and more robust?
Re: [PATCH v2 08/12] x86/extable: Add support for immediate form MSR instructions
Posted by Jürgen Groß 2 months, 2 weeks ago
On 30.09.25 10:14, Peter Zijlstra wrote:
> On Tue, Sep 30, 2025 at 09:03:52AM +0200, Juergen Gross wrote:
>> From: "Xin Li (Intel)" <xin@zytor.com>
>>
>> Signed-off-by: Xin Li (Intel) <xin@zytor.com>
>> Signed-off-by: Juergen Gross <jgross@suse.com>
>> ---
>> V2:
>> - new patch, taken from the RFC v2 MSR refactor series by Xin Li
>> ---
>>   arch/x86/include/asm/msr.h | 18 ++++++++++++++++++
>>   arch/x86/mm/extable.c      | 39 +++++++++++++++++++++++++++++++++-----
>>   2 files changed, 52 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
>> index 71f41af11591..cf5205300266 100644
>> --- a/arch/x86/include/asm/msr.h
>> +++ b/arch/x86/include/asm/msr.h
>> @@ -56,6 +56,24 @@ static inline void do_trace_read_msr(u32 msr, u64 val, int failed) {}
>>   static inline void do_trace_rdpmc(u32 msr, u64 val, int failed) {}
>>   #endif
>>   
>> +/*
>> + * Called only from an MSR fault handler, the instruction pointer points to
>> + * the MSR access instruction that caused the fault.
>> + */
>> +static __always_inline bool is_msr_imm_insn(void *ip)
>> +{
>> +	/*
>> +	 * A full decoder for immediate form MSR instructions appears excessive.
>> +	 */
>> +#ifdef CONFIG_X86_64
>> +	const u8 msr_imm_insn_prefix[] = { 0xc4, 0xe7 };
>> +
>> +	return !memcmp(ip, msr_imm_insn_prefix, sizeof(msr_imm_insn_prefix));
> 
> This seems fragile. Those two bytes are basically the first two bytes of
> VEX3 and only indicate VEX3.map7. Which is not very specific, but when
> combined with the fact that this is an MSR exception, might just work.
> 
> Trouble is that it is also possible to encode the immediate form using
> EVEX. If a toolchain were to do that, we'd fail to detect it.
> 
> (And there is segment prefix stuffing possible I suppose)
> 
>> +#else
>> +	return false;
>> +#endif
>> +}
>> +
>>   /*
>>    * __rdmsr() and __wrmsr() are the two primitives which are the bare minimum MSR
>>    * accessors and should not have any tracing or other functionality piggybacking
>> diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
>> index 2fdc1f1f5adb..c021e4dbc69d 100644
>> --- a/arch/x86/mm/extable.c
>> +++ b/arch/x86/mm/extable.c
>> @@ -166,23 +166,52 @@ static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
>>   static bool ex_handler_msr(const struct exception_table_entry *fixup,
>>   			   struct pt_regs *regs, bool wrmsr, bool safe, int reg)
>>   {
>> +	bool imm_insn = is_msr_imm_insn((void *)regs->ip);
>> +	u32 msr;
>> +
>> +	if (imm_insn)
>> +		/*
>> +		 * The 32-bit immediate specifying a MSR is encoded into
>> +		 * byte 5 ~ 8 of an immediate form MSR instruction.
>> +		 */
>> +		msr = *(u32 *)(regs->ip + 5);
>> +	else
>> +		msr = (u32)regs->cx;
> 
> This seems to have fallen subject to the US tariff induced {} shortage
> or something.
> 
> Also, EVEX form will have them in other bytes (one further, on account
> of EVEX being 4 bytes, rather than 3).
> 
> Given this really isn't a fast path or anything, how about we just use
> the instruction decoder? Something like:
> 
> 	struct insn insn;
> 	u32 msr = (u32)regs->cx;
> 
> 	ret = insn_decode_kernel(&insn, (void *)regs->ip);
> 	if (!ret && insn.vex_prefix.nbytes)
> 		msr = insn.immediate.value;
> 
> should do, I suppose. Isn't that both simpler and more robust?
> 

Works for me.


Juergen