Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
This is on top of the FRED series for the wrmsrns() cleanup, but otherwise
unrelated.
The code generation isn't entirely ideal
Function                 old     new   delta
init_fred                255     274     +19
vmx_set_reg              248     256      +8
enter_state_helper.cold 1014    1018      +4
__start_xen             8893    8897      +4
but made worse by the prior codegen for wrmsrns(MSR_STAR, ...) being mad:
mov $0xc0000081,%ecx
mov $0xe023e008,%edx
movabs $0xe023e00800000000,%rax
cs wrmsr
The two sources of code expansion come from the compiler not being able to
construct %eax and %edx separately, and not being able to propagate constants.
Loading 0 is possibly common enough to warrant another specialisation where we
can use "a" (0), "d" (0) and forgo the MOV+SHR.
I'm probably overthinking things. The addition will be in the noise in
practice, and Intel are sure the advantage of MSR_IMM will not be.
---
xen/arch/x86/include/asm/alternative.h | 7 ++++
xen/arch/x86/include/asm/msr.h | 39 ++++++++++++++++++++-
xen/include/public/arch-x86/cpufeatureset.h | 1 +
3 files changed, 46 insertions(+), 1 deletion(-)
diff --git a/xen/arch/x86/include/asm/alternative.h b/xen/arch/x86/include/asm/alternative.h
index 0482bbf7cbf1..fe87b15ec72c 100644
--- a/xen/arch/x86/include/asm/alternative.h
+++ b/xen/arch/x86/include/asm/alternative.h
@@ -151,6 +151,13 @@ extern void alternative_instructions(void);
ALTERNATIVE(oldinstr, newinstr, feature) \
:: input )
+#define alternative_input_2(oldinstr, newinstr1, feature1, \
+ newinstr2, feature2, input...) \
+ asm_inline volatile ( \
+ ALTERNATIVE_2(oldinstr, newinstr1, feature1, \
+ newinstr2, feature2) \
+ :: input )
+
/* Like alternative_input, but with a single output argument */
#define alternative_io(oldinstr, newinstr, feature, output, input...) \
asm_inline volatile ( \
diff --git a/xen/arch/x86/include/asm/msr.h b/xen/arch/x86/include/asm/msr.h
index 01f510315ffe..434fcac854e1 100644
--- a/xen/arch/x86/include/asm/msr.h
+++ b/xen/arch/x86/include/asm/msr.h
@@ -38,9 +38,46 @@ static inline void wrmsrl(unsigned int msr, uint64_t val)
wrmsr(msr, lo, hi);
}
+/*
+ * Non-serialising WRMSR with a compile-time constant index, when available.
+ * Falls back to plain WRMSRNS, or to a serialising WRMSR.
+ */
+static always_inline void __wrmsrns_imm(uint32_t msr, uint64_t val)
+{
+ /*
+ * For best performance, WRMSRNS %r64, $msr is recommended. For
+ * compatibility, we need to fall back to plain WRMSRNS, or to WRMSR.
+ *
+ * The combined ABI is awkward, because WRMSRNS $imm takes a single r64,
+ * whereas WRMSR{,NS} takes a split edx:eax pair.
+ *
+ * Always use WRMSRNS %rax, $imm, because it has the most in common with
+ * the legacy forms. When MSR_IMM isn't available, emit setup logic for
+ * %ecx and %edx too.
+ */
+ alternative_input_2(
+ "mov $%c[msr], %%ecx\n\t"
+ "mov %%rax, %%rdx\n\t"
+ "shr $32, %%rdx\n\t"
+ ".byte 0x2e; wrmsr",
+
+ /* WRMSRNS %rax, $msr */
+ ".byte 0xc4,0xe7,0x7a,0xf6,0xc0; .long %c[msr]", X86_FEATURE_MSR_IMM,
+
+ "mov $%c[msr], %%ecx\n\t"
+ "mov %%rax, %%rdx\n\t"
+ "shr $32, %%rdx\n\t"
+ ".byte 0x0f,0x01,0xc6", X86_FEATURE_WRMSRNS,
+
+ [msr] "i" (msr), "a" (val) : "rcx", "rdx");
+}
+
/* Non-serialising WRMSR, when available. Falls back to a serialising WRMSR. */
-static inline void wrmsrns(uint32_t msr, uint64_t val)
+static always_inline void wrmsrns(uint32_t msr, uint64_t val)
{
+ if ( __builtin_constant_p(msr) )
+ return __wrmsrns_imm(msr, val);
+
/*
* WRMSR is 2 bytes. WRMSRNS is 3 bytes. Pad WRMSR with a redundant CS
* prefix to avoid a trailing NOP.
diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h
index 9cd778586f10..af69cf3822eb 100644
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -352,6 +352,7 @@ XEN_CPUFEATURE(MCDT_NO, 13*32+ 5) /*A MCDT_NO */
XEN_CPUFEATURE(UC_LOCK_DIS, 13*32+ 6) /* UC-lock disable */
/* Intel-defined CPU features, CPUID level 0x00000007:1.ecx, word 14 */
+XEN_CPUFEATURE(MSR_IMM, 14*32+ 5) /* {RD,WR}MSR $imm32 */
/* Intel-defined CPU features, CPUID level 0x00000007:1.edx, word 15 */
XEN_CPUFEATURE(AVX_VNNI_INT8, 15*32+ 4) /*A AVX-VNNI-INT8 Instructions */
--
2.39.5
On 09.08.2025 00:20, Andrew Cooper wrote:
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> ---
> CC: Jan Beulich <JBeulich@suse.com>
> CC: Roger Pau Monné <roger.pau@citrix.com>
>
> This is on top of the FRED series for the wrmsrns() cleanup, but otherwise
> unrelated.
>
> The code generation isn't entirely ideal
>
> Function                 old     new   delta
> init_fred                255     274     +19
> vmx_set_reg              248     256      +8
> enter_state_helper.cold 1014    1018      +4
> __start_xen             8893    8897      +4
>
> but made worse by the prior codegen for wrmsrns(MSR_STAR, ...) being mad:
>
> mov $0xc0000081,%ecx
> mov $0xe023e008,%edx
> movabs $0xe023e00800000000,%rax
> cs wrmsr
>
> The two sources of code expansion come from the compiler not being able to
> construct %eax and %edx separately, and not being able to propagate constants.
>
> Loading 0 is possibly common enough to warrant another specialisation where we
> can use "a" (0), "d" (0) and forgo the MOV+SHR.
>
> I'm probably overthinking things. The addition will be in the noise in
> practice, and Intel are sure the advantage of MSR_IMM will not be.
It's not entirely clear to me what the overall effects are now with your
02/22 reply on the FRED series. Nevertheless a nit or two here.
> --- a/xen/arch/x86/include/asm/msr.h
> +++ b/xen/arch/x86/include/asm/msr.h
> @@ -38,9 +38,46 @@ static inline void wrmsrl(unsigned int msr, uint64_t val)
> wrmsr(msr, lo, hi);
> }
>
> +/*
> + * Non-serialising WRMSR with a compile-time constant index, when available.
> + * Falls back to plain WRMSRNS, or to a serialising WRMSR.
> + */
> +static always_inline void __wrmsrns_imm(uint32_t msr, uint64_t val)
> +{
> + /*
> + * For best performance, WRMSRNS %r64, $msr is recommended. For
> + * compatibility, we need to fall back to plain WRMSRNS, or to WRMSR.
> + *
> + * The combined ABI is awkward, because WRMSRNS $imm takes a single r64,
> + * whereas WRMSR{,NS} takes a split edx:eax pair.
> + *
> + * Always use WRMSRNS %rax, $imm, because it has the most in common with
> + * the legacy forms. When MSR_IMM isn't available, emit setup logic for
> + * %ecx and %edx too.
> + */
> + alternative_input_2(
> + "mov $%c[msr], %%ecx\n\t"
Simply %[msr] here?
And then, might it make sense to pull out this and ...
> + "mov %%rax, %%rdx\n\t"
> + "shr $32, %%rdx\n\t"
> + ".byte 0x2e; wrmsr",
> +
> + /* WRMSRNS %rax, $msr */
> + ".byte 0xc4,0xe7,0x7a,0xf6,0xc0; .long %c[msr]", X86_FEATURE_MSR_IMM,
> +
> + "mov $%c[msr], %%ecx\n\t"
... this, to ...
> + "mov %%rax, %%rdx\n\t"
> + "shr $32, %%rdx\n\t"
> + ".byte 0x0f,0x01,0xc6", X86_FEATURE_WRMSRNS,
> +
> + [msr] "i" (msr), "a" (val) : "rcx", "rdx");
[msr] "i" (msr), "a" (val), "c" (msr) : "rdx");
allowing the compiler to actually know what's put in %ecx? That'll make
original and 2nd replacement code 10 bytes, better balancing with the 9
bytes of the 1st replacement. And I'd guess that the potentially dead
MOV to %ecx would be hidden in the noise as well.
Then, seeing your use of a CS: prefix on WRMSR, why not also add one to
the 1st replacement, thus not requiring any NOP padding?
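I.e., taking both suggestions together, something like this (illustration
only, untested), with all three forms coming out at 10 bytes:

    alternative_input_2(
        "mov %%rax, %%rdx\n\t"
        "shr $32, %%rdx\n\t"
        ".byte 0x2e; wrmsr",

        /* CS WRMSRNS %rax, $msr */
        ".byte 0x2e,0xc4,0xe7,0x7a,0xf6,0xc0; .long %c[msr]", X86_FEATURE_MSR_IMM,

        "mov %%rax, %%rdx\n\t"
        "shr $32, %%rdx\n\t"
        ".byte 0x0f,0x01,0xc6", X86_FEATURE_WRMSRNS,

        [msr] "i" (msr), "a" (val), "c" (msr) : "rdx");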
Jan
On 11/08/2025 9:16 am, Jan Beulich wrote:
> On 09.08.2025 00:20, Andrew Cooper wrote:
>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> ---
>> CC: Jan Beulich <JBeulich@suse.com>
>> CC: Roger Pau Monné <roger.pau@citrix.com>
>>
>> This is on top of the FRED series for the wrmsrns() cleanup, but otherwise
>> unrelated.
>>
>> The code generation isn't entirely ideal
>>
>> Function                 old     new   delta
>> init_fred                255     274     +19
>> vmx_set_reg              248     256      +8
>> enter_state_helper.cold 1014    1018      +4
>> __start_xen              8893    8897     +4
>>
>> but made worse by the prior codegen for wrmsrns(MSR_STAR, ...) being mad:
>>
>> mov $0xc0000081,%ecx
>> mov $0xe023e008,%edx
>> movabs $0xe023e00800000000,%rax
>> cs wrmsr
>>
>> The two sources of code expansion come from the compiler not being able to
>> construct %eax and %edx separately, and not being able to propagate constants.
>>
>> Loading 0 is possibly common enough to warrant another specialisation where we
>> can use "a" (0), "d" (0) and forgo the MOV+SHR.
>>
>> I'm probably overthinking things. The addition will be in the noise in
>> practice, and Intel are sure the advantage of MSR_IMM will not be.
> It's not entirely clear to me what the overall effects are now with your
> 02/22 reply on the FRED series.
The delta gets larger now that 2/22 isn't quite as bad as it was.
> Nevertheless a nit or two here.
>
>> --- a/xen/arch/x86/include/asm/msr.h
>> +++ b/xen/arch/x86/include/asm/msr.h
>> @@ -38,9 +38,46 @@ static inline void wrmsrl(unsigned int msr, uint64_t val)
>> wrmsr(msr, lo, hi);
>> }
>>
>> +/*
>> + * Non-serialising WRMSR with a compile-time constant index, when available.
>> + * Falls back to plain WRMSRNS, or to a serialising WRMSR.
>> + */
>> +static always_inline void __wrmsrns_imm(uint32_t msr, uint64_t val)
>> +{
>> + /*
>> + * For best performance, WRMSRNS %r64, $msr is recommended. For
>> + * compatibility, we need to fall back to plain WRMSRNS, or to WRMSR.
>> + *
>> + * The combined ABI is awkward, because WRMSRNS $imm takes a single r64,
>> + * whereas WRMSR{,NS} takes a split edx:eax pair.
>> + *
>> + * Always use WRMSRNS %rax, $imm, because it has the most in common with
>> + * the legacy forms. When MSR_IMM isn't available, emit setup logic for
>> + * %ecx and %edx too.
>> + */
>> + alternative_input_2(
>> + "mov $%c[msr], %%ecx\n\t"
> Simply %[msr] here?
>
> And then, might it make sense to pull out this and ...
>
>> + "mov %%rax, %%rdx\n\t"
>> + "shr $32, %%rdx\n\t"
>> + ".byte 0x2e; wrmsr",
>> +
>> + /* WRMSRNS %rax, $msr */
>> + ".byte 0xc4,0xe7,0x7a,0xf6,0xc0; .long %c[msr]", X86_FEATURE_MSR_IMM,
>> +
>> + "mov $%c[msr], %%ecx\n\t"
> ... this, to ...
>
>> + "mov %%rax, %%rdx\n\t"
>> + "shr $32, %%rdx\n\t"
>> + ".byte 0x0f,0x01,0xc6", X86_FEATURE_WRMSRNS,
>> +
>> + [msr] "i" (msr), "a" (val) : "rcx", "rdx");
> [msr] "i" (msr), "a" (val), "c" (msr) : "rdx");
>
> allowing the compiler to actually know what's put in %ecx? That'll make
> original and 2nd replacement code 10 bytes, better balancing with the 9
> bytes of the 1st replacement. And I'd guess that the potentially dead
> MOV to %ecx would be hidden in the noise as well.
I considered that, but what can the compiler do as a result of knowing %ecx?
That said, we do need an RDMSR form (which I desperately want to make
foo = rdmsr(MSR_BAR) but my cleanup series from 2019 got nowhere), and
in a read+write case I suppose the compiler could deduplicate the setup
of %ecx.
A dead mov $imm, %ecx probably is as close to a nop as makes no difference.
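For clarity, the interface shape I'm after is just this (hypothetical
sketch, not what's currently in tree):

    /* Hypothetical: rdmsr() returning the value, rather than taking outputs. */
    static inline uint64_t rdmsr(uint32_t msr)
    {
        uint32_t lo, hi;

        asm volatile ( "rdmsr" : "=a" (lo), "=d" (hi) : "c" (msr) );

        return lo | ((uint64_t)hi << 32);
    }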
~Andrew
On 11.08.2025 11:50, Andrew Cooper wrote:
> On 11/08/2025 9:16 am, Jan Beulich wrote:
>> On 09.08.2025 00:20, Andrew Cooper wrote:
>>> --- a/xen/arch/x86/include/asm/msr.h
>>> +++ b/xen/arch/x86/include/asm/msr.h
>>> @@ -38,9 +38,46 @@ static inline void wrmsrl(unsigned int msr, uint64_t val)
>>> wrmsr(msr, lo, hi);
>>> }
>>>
>>> +/*
>>> + * Non-serialising WRMSR with a compile-time constant index, when available.
>>> + * Falls back to plain WRMSRNS, or to a serialising WRMSR.
>>> + */
>>> +static always_inline void __wrmsrns_imm(uint32_t msr, uint64_t val)
>>> +{
>>> + /*
>>> + * For best performance, WRMSRNS %r64, $msr is recommended. For
>>> + * compatibility, we need to fall back to plain WRMSRNS, or to WRMSR.
>>> + *
>>> + * The combined ABI is awkward, because WRMSRNS $imm takes a single r64,
>>> + * whereas WRMSR{,NS} takes a split edx:eax pair.
>>> + *
>>> + * Always use WRMSRNS %rax, $imm, because it has the most in common with
>>> + * the legacy forms. When MSR_IMM isn't available, emit setup logic for
>>> + * %ecx and %edx too.
>>> + */
>>> + alternative_input_2(
>>> + "mov $%c[msr], %%ecx\n\t"
>> Simply %[msr] here?
>>
>> And then, might it make sense to pull out this and ...
>>
>>> + "mov %%rax, %%rdx\n\t"
>>> + "shr $32, %%rdx\n\t"
>>> + ".byte 0x2e; wrmsr",
>>> +
>>> + /* WRMSRNS %rax, $msr */
>>> + ".byte 0xc4,0xe7,0x7a,0xf6,0xc0; .long %c[msr]", X86_FEATURE_MSR_IMM,
>>> +
>>> + "mov $%c[msr], %%ecx\n\t"
>> ... this, to ...
>>
>>> + "mov %%rax, %%rdx\n\t"
>>> + "shr $32, %%rdx\n\t"
>>> + ".byte 0x0f,0x01,0xc6", X86_FEATURE_WRMSRNS,
>>> +
>>> + [msr] "i" (msr), "a" (val) : "rcx", "rdx");
>> [msr] "i" (msr), "a" (val), "c" (msr) : "rdx");
>>
>> allowing the compiler to actually know what's put in %ecx? That'll make
>> original and 2nd replacement code 10 bytes, better balancing with the 9
>> bytes of the 1st replacement. And I'd guess that the potentially dead
>> MOV to %ecx would be hidden in the noise as well.
>
> I considered that, but what can the compiler do as a result of knowing %ecx?
For example ...
> That said, we do need an RDMSR form (which I desperately want to make
> foo = rdmsr(MSR_BAR) but my cleanup series from 2019 got nowhere), and
> in a read+write case I suppose the compiler could deduplicate the setup
> of %ecx.
... this. But also simply to use a good pattern (exposing as much as possible
to the compiler), so there are more good instances of code for future cloning
from. (In size-optimizing builds, the compiler could further favor ADD/SUB
over MOV when the two MSRs accessed are relatively close together.)
Jan
On 11/08/2025 11:06 am, Jan Beulich wrote:
> On 11.08.2025 11:50, Andrew Cooper wrote:
>> On 11/08/2025 9:16 am, Jan Beulich wrote:
>>> On 09.08.2025 00:20, Andrew Cooper wrote:
>>>> + "mov %%rax, %%rdx\n\t"
>>>> + "shr $32, %%rdx\n\t"
>>>> + ".byte 0x0f,0x01,0xc6", X86_FEATURE_WRMSRNS,
>>>> +
>>>> + [msr] "i" (msr), "a" (val) : "rcx", "rdx");
>>> [msr] "i" (msr), "a" (val), "c" (msr) : "rdx");
>>>
>>> allowing the compiler to actually know what's put in %ecx? That'll make
>>> original and 2nd replacement code 10 bytes, better balancing with the 9
>>> bytes of the 1st replacement. And I'd guess that the potentially dead
>>> MOV to %ecx would be hidden in the noise as well.
>> I considered that, but what can the compiler do as a result of knowing %ecx?
> For example ...
>
>> That said, we do need an RDMSR form (which I desperately want to make
>> foo = rdmsr(MSR_BAR) but my cleanup series from 2019 got nowhere), and
>> in a read+write case I suppose the compiler could deduplicate the setup
>> of %ecx.
> ... this. But also simply to use a good pattern (exposing as much as possible
> to the compiler), so there are more good instances of code for future cloning
> from. (In size-optimizing builds, the compiler could further favor ADD/SUB
> over MOV when the two MSRs accessed are relatively close together.)

I have seen the compiler do this in the past, but couldn't reproduce it
for this work.

We specifically do not want any conversion to ADD/SUB, because that
takes our "close to a nop" and makes it not so.

~Andrew
On 11.08.2025 12:16, Andrew Cooper wrote:
> On 11/08/2025 11:06 am, Jan Beulich wrote:
>> On 11.08.2025 11:50, Andrew Cooper wrote:
>>> On 11/08/2025 9:16 am, Jan Beulich wrote:
>>>> On 09.08.2025 00:20, Andrew Cooper wrote:
>>>>> + "mov %%rax, %%rdx\n\t"
>>>>> + "shr $32, %%rdx\n\t"
>>>>> + ".byte 0x0f,0x01,0xc6", X86_FEATURE_WRMSRNS,
>>>>> +
>>>>> + [msr] "i" (msr), "a" (val) : "rcx", "rdx");
>>>> [msr] "i" (msr), "a" (val), "c" (msr) : "rdx");
>>>>
>>>> allowing the compiler to actually know what's put in %ecx? That'll make
>>>> original and 2nd replacement code 10 bytes, better balancing with the 9
>>>> bytes of the 1st replacement. And I'd guess that the potentially dead
>>>> MOV to %ecx would be hidden in the noise as well.
>>> I considered that, but what can the compiler do as a result of knowing %ecx?
>> For example ...
>>
>>> That said, we do need an RDMSR form (which I desperately want to make
>>> foo = rdmsr(MSR_BAR) but my cleanup series from 2019 got nowhere), and
>>> in a read+write case I suppose the compiler could deduplicate the setup
>>> of %ecx.
>> ... this. But also simply to use a good pattern (exposing as much as possible
>> to the compiler), so there are more good instances of code for future cloning
>> from. (In size-optimizing builds, the compiler could further favor ADD/SUB
>> over MOV when the two MSRs accessed are relatively close together.)
>
> I have seen the compiler do this in the past, but couldn't reproduce it
> for this work.
>
> We specifically do not want any conversion to ADD/SUB, because that
> takes our "close to a nop" and makes it not so.

Imo when size optimizing, size matters and "close-to-a-NOP" doesn't (as much).

Jan