[PATCH 2/4] x86/xstate: Rework XSAVE/XRSTOR given a newer toolchain baseline

Posted by Andrew Cooper 1 month, 1 week ago
The new toolchain baseline knows all the mnemonics, so a plain memory operand
can be used, rather than needing to hard-code the ModRM byte as (%rdi).

For xrstor(), use asm goto rather than hiding the increment of the faults
variable inside the .fixup section.  Remove the loop and replace it with a
goto retry pattern.  Put the domain_crash() into the default case for fault
handling, and provide a concrete error message rather than leaving it as an
exercise for extra code diving.
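
In outline, the fault handling becomes the following (a minimal,
self-contained sketch with made-up names; a plain conditional branch stands
in for the _ASM_EXTABLE() fixup, and the hunk below has the real XRSTOR
operands):

    static int attempts;

    static int try_op(void)
    {
        asm goto ( "cmpl $2, %0\n\t"
                   "jb %l[fault]"            /* plays the extable's role */
                   : : "m" (attempts) : "cc" : fault );
        return 0;                            /* success falls straight through */

     fault:
        return -1;
    }

    static void do_op(void)
    {
     retry:
        if ( try_op() == 0 )
            return;

        switch ( ++attempts )
        {
        case 1: /* Stage 1: adjust state. */ goto retry;
        case 2: /* Stage 2: reset state.  */ goto retry;
        default: /* Stage 3: crash.       */ return;
        }
    }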

No functional change.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
---
 xen/arch/x86/xstate.c | 77 ++++++++++++++++++++-----------------------
 1 file changed, 36 insertions(+), 41 deletions(-)

diff --git a/xen/arch/x86/xstate.c b/xen/arch/x86/xstate.c
index 384f78bd5281..4215a83efefb 100644
--- a/xen/arch/x86/xstate.c
+++ b/xen/arch/x86/xstate.c
@@ -310,21 +310,21 @@ void xsave(struct vcpu *v, uint64_t mask)
     uint32_t hmask = mask >> 32;
     uint32_t lmask = mask;
     unsigned int fip_width = v->domain->arch.x87_fip_width;
-#define XSAVE(pfx) \
-        if ( v->arch.xcr0_accum & XSTATE_XSAVES_ONLY ) \
-            asm volatile ( ".byte " pfx "0x0f,0xc7,0x2f\n" /* xsaves */ \
-                           : "=m" (*ptr) \
-                           : "a" (lmask), "d" (hmask), "D" (ptr) ); \
-        else \
-            alternative_io(".byte " pfx "0x0f,0xae,0x27\n", /* xsave */ \
-                           ".byte " pfx "0x0f,0xae,0x37\n", /* xsaveopt */ \
-                           X86_FEATURE_XSAVEOPT, \
-                           "=m" (*ptr), \
-                           "a" (lmask), "d" (hmask), "D" (ptr))
+
+#define XSAVE(pfx)                                                      \
+    if ( v->arch.xcr0_accum & XSTATE_XSAVES_ONLY )                      \
+        asm volatile ( "xsaves %0"                                      \
+                       : "=m" (*ptr)                                    \
+                       : "a" (lmask), "d" (hmask) );                    \
+    else                                                                \
+        alternative_io("xsave %0",                                      \
+                       "xsaveopt %0", X86_FEATURE_XSAVEOPT,             \
+                       "=m" (*ptr),                                     \
+                       "a" (lmask), "d" (hmask))
 
     if ( fip_width == 8 || !(mask & X86_XCR0_X87) )
     {
-        XSAVE("0x48,");
+        XSAVE("rex64 ");
     }
     else if ( fip_width == 4 )
     {
@@ -349,7 +349,7 @@ void xsave(struct vcpu *v, uint64_t mask)
 
         ptr->fpu_sse.fip.addr = bad_fip;
 
-        XSAVE("0x48,");
+        XSAVE("rex64 ");
 
         /* FIP/FDP not updated? Restore the old FIP value. */
         if ( ptr->fpu_sse.fip.addr == bad_fip )
@@ -384,7 +384,7 @@ void xrstor(struct vcpu *v, uint64_t mask)
     uint32_t hmask = mask >> 32;
     uint32_t lmask = mask;
     struct xsave_struct *ptr = v->arch.xsave_area;
-    unsigned int faults, prev_faults;
+    unsigned int faults = 0;
 
     /*
      * Some CPUs don't save/restore FDP/FIP/FOP unless an exception
@@ -405,22 +405,15 @@ void xrstor(struct vcpu *v, uint64_t mask)
      * possibility, which may occur if the block was passed to us by control
      * tools or through VCPUOP_initialise, by silently adjusting state.
      */
-    for ( prev_faults = faults = 0; ; prev_faults = faults )
-    {
+ retry:
     switch ( __builtin_expect(ptr->fpu_sse.x[FPU_WORD_SIZE_OFFSET], 8) )
     {
-        BUILD_BUG_ON(sizeof(faults) != 4); /* Clang doesn't support %z in asm. */
-#define _xrstor(insn)                                               \
-        asm volatile ( "1: .byte " insn "\n"                        \
-                       "3:\n"                                       \
-                       "   .section .fixup,\"ax\"\n"                \
-                       "2: incl %[faults]\n"                        \
-                       "   jmp 3b\n"                                \
-                       "   .previous\n"                             \
-                       _ASM_EXTABLE(1b, 2b)                         \
-                       : [mem] "+m" (*ptr), [faults] "+g" (faults)  \
-                       : [lmask] "a" (lmask), [hmask] "d" (hmask),  \
-                         [ptr] "D" (ptr) )
+#define _xrstor(insn)                                       \
+        asm_inline volatile goto (                          \
+            "1: " insn " %0\n"                              \
+            _ASM_EXTABLE(1b, %l[fault])                     \
+            :: "m" (*ptr), "a" (lmask), "d" (hmask)         \
+            :: fault )
 
 #define XRSTOR(pfx) \
         if ( v->arch.xcr0_accum & XSTATE_XSAVES_ONLY ) \
@@ -432,13 +425,13 @@ void xrstor(struct vcpu *v, uint64_t mask)
                 ptr->xsave_hdr.xcomp_bv = ptr->xsave_hdr.xstate_bv | \
                                           XSTATE_COMPACTION_ENABLED; \
             } \
-            _xrstor(pfx "0x0f,0xc7,0x1f"); /* xrstors */ \
+            _xrstor(pfx "xrstors"); \
         } \
         else \
-            _xrstor(pfx "0x0f,0xae,0x2f") /* xrstor */
+            _xrstor(pfx "xrstor")
 
     default:
-        XRSTOR("0x48,");
+        XRSTOR("rex64 ");
         break;
 
     case 4: case 2:
@@ -449,8 +442,10 @@ void xrstor(struct vcpu *v, uint64_t mask)
 #undef _xrstor
     }
 
-    if ( likely(faults == prev_faults) )
-        break;
+    return;
+
+ fault:
+    faults++;
 
 #ifndef NDEBUG
     gprintk(XENLOG_WARNING, "fault#%u: mxcsr=%08x\n",
@@ -489,17 +484,17 @@ void xrstor(struct vcpu *v, uint64_t mask)
             ptr->xsave_hdr.xcomp_bv = 0;
         }
         memset(ptr->xsave_hdr.reserved, 0, sizeof(ptr->xsave_hdr.reserved));
-        continue;
+        goto retry;
 
     case 2: /* Stage 2: Reset all state. */
         ptr->fpu_sse.mxcsr = MXCSR_DEFAULT;
         ptr->xsave_hdr.xstate_bv = 0;
         ptr->xsave_hdr.xcomp_bv = v->arch.xcr0_accum & XSTATE_XSAVES_ONLY
             ? XSTATE_COMPACTION_ENABLED : 0;
-        continue;
-    }
+        goto retry;
 
-        domain_crash(current->domain);
+    default: /* Stage 3: Nothing else to do. */
+        domain_crash(v->domain, "Uncorrectable XRSTOR fault\n");
         return;
     }
 }
@@ -1041,17 +1036,17 @@ uint64_t read_bndcfgu(void)
 
     if ( cpu_has_xsavec )
     {
-        asm ( ".byte 0x0f,0xc7,0x27\n" /* xsavec */
+        asm ( "xsavec %0"
               : "=m" (*xstate)
-              : "a" (X86_XCR0_BNDCSR), "d" (0), "D" (xstate) );
+              : "a" (X86_XCR0_BNDCSR), "d" (0) );
 
         bndcsr = (void *)(xstate + 1);
     }
     else
     {
-        asm ( ".byte 0x0f,0xae,0x27\n" /* xsave */
+        asm ( "xsave %0"
               : "=m" (*xstate)
-              : "a" (X86_XCR0_BNDCSR), "d" (0), "D" (xstate) );
+              : "a" (X86_XCR0_BNDCSR), "d" (0) );
 
         bndcsr = (void *)xstate + xstate_offsets[ilog2(X86_XCR0_BNDCSR)];
     }
-- 
2.39.5


Re: [PATCH 2/4] x86/xstate: Rework XSAVE/XRSTOR given a newer toolchain baseline
Posted by Jan Beulich 1 month ago
On 30.12.2025 14:54, Andrew Cooper wrote:
> --- a/xen/arch/x86/xstate.c
> +++ b/xen/arch/x86/xstate.c
> @@ -310,21 +310,21 @@ void xsave(struct vcpu *v, uint64_t mask)
>      uint32_t hmask = mask >> 32;
>      uint32_t lmask = mask;
>      unsigned int fip_width = v->domain->arch.x87_fip_width;
> -#define XSAVE(pfx) \
> -        if ( v->arch.xcr0_accum & XSTATE_XSAVES_ONLY ) \
> -            asm volatile ( ".byte " pfx "0x0f,0xc7,0x2f\n" /* xsaves */ \
> -                           : "=m" (*ptr) \
> -                           : "a" (lmask), "d" (hmask), "D" (ptr) ); \
> -        else \
> -            alternative_io(".byte " pfx "0x0f,0xae,0x27\n", /* xsave */ \
> -                           ".byte " pfx "0x0f,0xae,0x37\n", /* xsaveopt */ \
> -                           X86_FEATURE_XSAVEOPT, \
> -                           "=m" (*ptr), \
> -                           "a" (lmask), "d" (hmask), "D" (ptr))
> +
> +#define XSAVE(pfx)                                                      \
> +    if ( v->arch.xcr0_accum & XSTATE_XSAVES_ONLY )                      \
> +        asm volatile ( "xsaves %0"                                      \
> +                       : "=m" (*ptr)                                    \
> +                       : "a" (lmask), "d" (hmask) );                    \
> +    else                                                                \
> +        alternative_io("xsave %0",                                      \
> +                       "xsaveopt %0", X86_FEATURE_XSAVEOPT,             \
> +                       "=m" (*ptr),                                     \
> +                       "a" (lmask), "d" (hmask))

While no doubt neater to read this way, there's a subtle latent issue here:
"m" doesn't exclude RIP-relative addressing, yet that addressing form can't
be used in replacement code (up and until we leverage your decode-lite to
actually be able to fix up the displacement). Sadly "o" as a constraint
doesn't look to be any different in this regard (I think it should be, as
adding a "small integer" may already bring the displacement beyond the
permitted range, but their definition of "offsettable" allows this).

This issue is latent until such time that (a) a caller appears passing in
the address of a Xen-internal variable and (b) we make LTO work again.
Since the breakage would be impossible to notice at build time, I think we
would be better off if we avoided it from the beginning. Which may mean
sacrificing on code gen, by using "r" and then "(%0)" as the insn operand.
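
Concretely, something along these lines (just a sketch, using a named
operand rather than %0 for clarity; the point is that a register-indirect
ModRM can be copied verbatim by the patching code):

    alternative_io("xsave (%[ptr])",
                   "xsaveopt (%[ptr])", X86_FEATURE_XSAVEOPT,
                   "=m" (*ptr),
                   [ptr] "r" (ptr), "a" (lmask), "d" (hmask))

The "=m" (*ptr) output would remain solely to express the dependency; the
addressing form used by the insn itself can then never end up RIP-relative.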

> @@ -489,17 +484,17 @@ void xrstor(struct vcpu *v, uint64_t mask)
>              ptr->xsave_hdr.xcomp_bv = 0;
>          }
>          memset(ptr->xsave_hdr.reserved, 0, sizeof(ptr->xsave_hdr.reserved));
> -        continue;
> +        goto retry;
>  
>      case 2: /* Stage 2: Reset all state. */
>          ptr->fpu_sse.mxcsr = MXCSR_DEFAULT;
>          ptr->xsave_hdr.xstate_bv = 0;
>          ptr->xsave_hdr.xcomp_bv = v->arch.xcr0_accum & XSTATE_XSAVES_ONLY
>              ? XSTATE_COMPACTION_ENABLED : 0;
> -        continue;
> -    }
> +        goto retry;
>  
> -        domain_crash(current->domain);
> +    default: /* Stage 3: Nothing else to do. */
> +        domain_crash(v->domain, "Uncorrectable XRSTOR fault\n");
>          return;

There's an unexplained change here as to which domain is being crashed.
You switch to crashing the subject domain, yet if that's not also the
requester, it isn't "guilty" in causing the observed fault.

Jan
Re: [PATCH 2/4] x86/xstate: Rework XSAVE/XRSTOR given a newer toolchain baseline
Posted by Andrew Cooper 1 month ago
On 05/01/2026 3:46 pm, Jan Beulich wrote:
> On 30.12.2025 14:54, Andrew Cooper wrote:
>> --- a/xen/arch/x86/xstate.c
>> +++ b/xen/arch/x86/xstate.c
>> @@ -310,21 +310,21 @@ void xsave(struct vcpu *v, uint64_t mask)
>>      uint32_t hmask = mask >> 32;
>>      uint32_t lmask = mask;
>>      unsigned int fip_width = v->domain->arch.x87_fip_width;
>> -#define XSAVE(pfx) \
>> -        if ( v->arch.xcr0_accum & XSTATE_XSAVES_ONLY ) \
>> -            asm volatile ( ".byte " pfx "0x0f,0xc7,0x2f\n" /* xsaves */ \
>> -                           : "=m" (*ptr) \
>> -                           : "a" (lmask), "d" (hmask), "D" (ptr) ); \
>> -        else \
>> -            alternative_io(".byte " pfx "0x0f,0xae,0x27\n", /* xsave */ \
>> -                           ".byte " pfx "0x0f,0xae,0x37\n", /* xsaveopt */ \
>> -                           X86_FEATURE_XSAVEOPT, \
>> -                           "=m" (*ptr), \
>> -                           "a" (lmask), "d" (hmask), "D" (ptr))
>> +
>> +#define XSAVE(pfx)                                                      \
>> +    if ( v->arch.xcr0_accum & XSTATE_XSAVES_ONLY )                      \
>> +        asm volatile ( "xsaves %0"                                      \
>> +                       : "=m" (*ptr)                                    \
>> +                       : "a" (lmask), "d" (hmask) );                    \
>> +    else                                                                \
>> +        alternative_io("xsave %0",                                      \
>> +                       "xsaveopt %0", X86_FEATURE_XSAVEOPT,             \
>> +                       "=m" (*ptr),                                     \
>> +                       "a" (lmask), "d" (hmask))
> While no doubt neater to read this way, there's a subtle latent issue here:
> "m" doesn't exclude RIP-relative addressing, yet that addressing form can't
> be used in replacement code (up until we leverage your decode-lite to
> actually be able to fix up the displacement).

I guess I'll fix that first.

I'm not interested in trying to keep playing games to work around
deficiencies in our alternatives infrastructure.

>  Sadly "o" as a constraint
> doesn't look to be any different in this regard (I think it should be, as
> adding a "small integer" may already bring the displacement beyond the
> permitted range, but their definition of "offsettable" allows this).
>
> This issue is latent until such time that (a) a caller appears passing in
> the address of a Xen-internal variable and (b) we make LTO work again.
> Since the breakage would be impossible to notice at build time, I think we
> would be better off if we avoided it from the beginning. Which may mean
> sacrificing on code gen, by using "r" and then "(%0)" as the insn operand.

Even with LTO, I don't see any plausible case where we have build-time
struct vcpu's to pass in.

>
>> @@ -489,17 +484,17 @@ void xrstor(struct vcpu *v, uint64_t mask)
>>              ptr->xsave_hdr.xcomp_bv = 0;
>>          }
>>          memset(ptr->xsave_hdr.reserved, 0, sizeof(ptr->xsave_hdr.reserved));
>> -        continue;
>> +        goto retry;
>>  
>>      case 2: /* Stage 2: Reset all state. */
>>          ptr->fpu_sse.mxcsr = MXCSR_DEFAULT;
>>          ptr->xsave_hdr.xstate_bv = 0;
>>          ptr->xsave_hdr.xcomp_bv = v->arch.xcr0_accum & XSTATE_XSAVES_ONLY
>>              ? XSTATE_COMPACTION_ENABLED : 0;
>> -        continue;
>> -    }
>> +        goto retry;
>>  
>> -        domain_crash(current->domain);
>> +    default: /* Stage 3: Nothing else to do. */
>> +        domain_crash(v->domain, "Uncorrectable XRSTOR fault\n");
>>          return;
> There's an unexplained change here as to which domain is being crashed.
> You switch to crashing the subject domain, yet if that's not also the
> requester, it isn't "guilty" in causing the observed fault.

So dom0 should be crashed because there's bad data in the migration stream?

v is always curr.  XRSTOR can't be used correctly outside of the subject
context, and indeed the Stage 2 logic above is definitely buggy when v
!= curr.

~Andrew

Re: [PATCH 2/4] x86/xstate: Rework XSAVE/XRSTOR given a newer toolchain baseline
Posted by Jan Beulich 1 month ago
On 05.01.2026 18:45, Andrew Cooper wrote:
> On 05/01/2026 3:46 pm, Jan Beulich wrote:
>> On 30.12.2025 14:54, Andrew Cooper wrote:
>>> --- a/xen/arch/x86/xstate.c
>>> +++ b/xen/arch/x86/xstate.c
>>> @@ -310,21 +310,21 @@ void xsave(struct vcpu *v, uint64_t mask)
>>>      uint32_t hmask = mask >> 32;
>>>      uint32_t lmask = mask;
>>>      unsigned int fip_width = v->domain->arch.x87_fip_width;
>>> -#define XSAVE(pfx) \
>>> -        if ( v->arch.xcr0_accum & XSTATE_XSAVES_ONLY ) \
>>> -            asm volatile ( ".byte " pfx "0x0f,0xc7,0x2f\n" /* xsaves */ \
>>> -                           : "=m" (*ptr) \
>>> -                           : "a" (lmask), "d" (hmask), "D" (ptr) ); \
>>> -        else \
>>> -            alternative_io(".byte " pfx "0x0f,0xae,0x27\n", /* xsave */ \
>>> -                           ".byte " pfx "0x0f,0xae,0x37\n", /* xsaveopt */ \
>>> -                           X86_FEATURE_XSAVEOPT, \
>>> -                           "=m" (*ptr), \
>>> -                           "a" (lmask), "d" (hmask), "D" (ptr))
>>> +
>>> +#define XSAVE(pfx)                                                      \
>>> +    if ( v->arch.xcr0_accum & XSTATE_XSAVES_ONLY )                      \
>>> +        asm volatile ( "xsaves %0"                                      \
>>> +                       : "=m" (*ptr)                                    \
>>> +                       : "a" (lmask), "d" (hmask) );                    \
>>> +    else                                                                \
>>> +        alternative_io("xsave %0",                                      \
>>> +                       "xsaveopt %0", X86_FEATURE_XSAVEOPT,             \
>>> +                       "=m" (*ptr),                                     \
>>> +                       "a" (lmask), "d" (hmask))
>> While no doubt neater to read this way, there's a subtle latent issue here:
>> "m" doesn't exclude RIP-relative addressing, yet that addressing form can't
>> be used in replacement code (up until we leverage your decode-lite to
>> actually be able to fix up the displacement).
> 
> I guess I'll fix that first.
> 
> I'm not interested in trying to keep playing games to work around
> deficiencies in our alternatives infrastructure.
> 
>>  Sadly "o" as a constraint
>> doesn't look to be any different in this regard (I think it should be, as
>> adding a "small integer" may already bring the displacement beyond the
>> permitted range, but their definition of "offsettable" allows this).
>>
>> This issue is latent until such time that (a) a caller appears passing in
>> the address of a Xen-internal variable and (b) we make LTO work again.
>> Since the breakage would be impossible to notice at build time, I think we
>> would be better off if we avoided it from the beginning. Which may mean
>> sacrificing on code gen, by using "r" and then "(%0)" as the insn operand.
> 
> Even with LTO, I don't see any plausible case where we have build-time
> struct vcpu's to pass in.

Sure, but struct vcpu * also isn't a great parameter for this kind of
function.

>>> @@ -489,17 +484,17 @@ void xrstor(struct vcpu *v, uint64_t mask)
>>>              ptr->xsave_hdr.xcomp_bv = 0;
>>>          }
>>>          memset(ptr->xsave_hdr.reserved, 0, sizeof(ptr->xsave_hdr.reserved));
>>> -        continue;
>>> +        goto retry;
>>>  
>>>      case 2: /* Stage 2: Reset all state. */
>>>          ptr->fpu_sse.mxcsr = MXCSR_DEFAULT;
>>>          ptr->xsave_hdr.xstate_bv = 0;
>>>          ptr->xsave_hdr.xcomp_bv = v->arch.xcr0_accum & XSTATE_XSAVES_ONLY
>>>              ? XSTATE_COMPACTION_ENABLED : 0;
>>> -        continue;
>>> -    }
>>> +        goto retry;
>>>  
>>> -        domain_crash(current->domain);
>>> +    default: /* Stage 3: Nothing else to do. */
>>> +        domain_crash(v->domain, "Uncorrectable XRSTOR fault\n");
>>>          return;
>> There's an unexplained change here as to which domain is being crashed.
>> You switch to crashing the subject domain, yet if that's not also the
>> requester, it isn't "guilty" in causing the observed fault.
> 
> So dom0 should be crashed because there's bad data in the migration stream?

Well, I'm not saying the behavior needs to stay like this, or that it's
the best of all possible options. But in principle Dom0 could sanitize the
migration stream before passing it to Xen. So it is still first and foremost
Dom0 which is to blame.

> v is always curr.

Not quite - see xstate_set_init(). And for some of the callers of
hvm_update_guest_cr() I also don't think they always act on current. In
particular hvm_vcpu_reset_state() never does, I suppose (not least
because of the vcpu_pause() in its sole caller).

>  XRSTOR can't be used correctly outside of the subject context,

Then are you suggesting e.g. xstate_set_init() is buggy?

> and indeed the Stage 2 logic above is definitely buggy when v != curr.

Apparently I'm blind, as I don't see an issue there. It's all v's data
which is being fiddled with.

Jan

Re: [PATCH 2/4] x86/xstate: Rework XSAVE/XRSTOR given a newer toolchain baseline
Posted by Andrew Cooper 1 month ago
On 06/01/2026 7:59 am, Jan Beulich wrote:
>>>> @@ -489,17 +484,17 @@ void xrstor(struct vcpu *v, uint64_t mask)
>>>>              ptr->xsave_hdr.xcomp_bv = 0;
>>>>          }
>>>>          memset(ptr->xsave_hdr.reserved, 0, sizeof(ptr->xsave_hdr.reserved));
>>>> -        continue;
>>>> +        goto retry;
>>>>  
>>>>      case 2: /* Stage 2: Reset all state. */
>>>>          ptr->fpu_sse.mxcsr = MXCSR_DEFAULT;
>>>>          ptr->xsave_hdr.xstate_bv = 0;
>>>>          ptr->xsave_hdr.xcomp_bv = v->arch.xcr0_accum & XSTATE_XSAVES_ONLY
>>>>              ? XSTATE_COMPACTION_ENABLED : 0;
>>>> -        continue;
>>>> -    }
>>>> +        goto retry;
>>>>  
>>>> -        domain_crash(current->domain);
>>>> +    default: /* Stage 3: Nothing else to do. */
>>>> +        domain_crash(v->domain, "Uncorrectable XRSTOR fault\n");
>>>>          return;
>>> There's an unexplained change here as to which domain is being crashed.
>>> You switch to crashing the subject domain, yet if that's not also the
>>> requester, it isn't "guilty" in causing the observed fault.
>> So dom0 should be crashed because there's bad data in the migration stream?
> Well, I'm not saying the behavior needs to stay like this, or that it's
> the best of all possible options. But in principle Dom0 could sanitize the
> migration stream before passing it to Xen. So it is still first and foremost
> Dom0 which is to blame.

BNDCFGU contains a pointer which, for PV context, needs access_ok(), not
just a regular canonical check.  Most supervisor states are in a similar
position.
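
As a sketch of the kind of check a PV load would need ("bndcfgu" here
stands for the hypothetical incoming value, and the size is a placeholder):

    /* BNDCFGU: bound directory base in bits 63:12, enable in bit 0. */
    unsigned long base = bndcfgu & ~0xfffUL;

    if ( !access_ok(base, 8) )  /* a bare canonical check isn't enough */
        return -EINVAL;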

Just because Xen has managed to get away without such checks (by not yet
supporting a state where it matters), I don't agree that it's safe to
trust dom0 to do this.


For this case, it's v's xstate buffer which cannot be loaded, so it's v
which cannot be context switched into, and must be crashed.  More below.


>> v is always curr.
> Not quite - see xstate_set_init().

Also more below.

> And for some of the callers of
> hvm_update_guest_cr() I also don't think they always act on current. In
>> particular hvm_vcpu_reset_state() never does, I suppose (not least
> because of the vcpu_pause() in its sole caller).

We discussed the need to not be remotely poking register state like
that.  But I don't see where the connection is between
hvm_update_guest_cr() and xsave()/xrstor().

Tangent: hvm_vcpu_reset_state() is terribly named as it's attempting to
put the vCPU into the INIT state, not the #RESET state.

But it only operates on the xstate header in memory while the target is
de-scheduled.  It's not using XSAVE/XRSTOR to load the results into
registers as far as I can tell.

>
>>   XRSTOR can't be used correctly outside of the subject context,
> Then are you suggesting e.g. xstate_set_init() is buggy?

No, but it switches into enough of v's context to function.  Really it's
neither current nor remote context.

But its single caller is adjust_bnd() in the emulator, so it's always
actually current context with a no-op on xcr0.

As said on Matrix, I think it's going to be necessary to remove MPX to
continue the XSAVE cleanup.

~Andrew

Re: [PATCH 2/4] x86/xstate: Rework XSAVE/XRSTOR given a newer toolchain baseline
Posted by Jan Beulich 1 month ago
On 08.01.2026 22:08, Andrew Cooper wrote:
> On 06/01/2026 7:59 am, Jan Beulich wrote:
>>>>> @@ -489,17 +484,17 @@ void xrstor(struct vcpu *v, uint64_t mask)
>>>>>              ptr->xsave_hdr.xcomp_bv = 0;
>>>>>          }
>>>>>          memset(ptr->xsave_hdr.reserved, 0, sizeof(ptr->xsave_hdr.reserved));
>>>>> -        continue;
>>>>> +        goto retry;
>>>>>  
>>>>>      case 2: /* Stage 2: Reset all state. */
>>>>>          ptr->fpu_sse.mxcsr = MXCSR_DEFAULT;
>>>>>          ptr->xsave_hdr.xstate_bv = 0;
>>>>>          ptr->xsave_hdr.xcomp_bv = v->arch.xcr0_accum & XSTATE_XSAVES_ONLY
>>>>>              ? XSTATE_COMPACTION_ENABLED : 0;
>>>>> -        continue;
>>>>> -    }
>>>>> +        goto retry;
>>>>>  
>>>>> -        domain_crash(current->domain);
>>>>> +    default: /* Stage 3: Nothing else to do. */
>>>>> +        domain_crash(v->domain, "Uncorrectable XRSTOR fault\n");
>>>>>          return;
>>>> There's an unexplained change here as to which domain is being crashed.
>>>> You switch to crashing the subject domain, yet if that's not also the
>>>> requester, it isn't "guilty" in causing the observed fault.
>>> So dom0 should be crashed because there's bad data in the migration stream?
>> Well, I'm not saying the behavior needs to stay like this, or that it's
>> the best of all possible options. But in principle Dom0 could sanitize the
>> migration stream before passing it to Xen. So it is still first and foremost
>> Dom0 which is to blame.
> 
> BNDCFGU contains a pointer which, for PV context, needs access_ok(), not
> just a regular canonical check.  Most supervisor states are in a similar
> position.

Yes, so exposing them to PV would require extra care. Note that MPX isn't
exposed to PV.

> Just because Xen has managed to get away without such checks (by not yet
> supporting a state where it matters), I don't agree that it's safe to
> trust dom0 to do this.

Yet the guest itself can't have put a non-canonical value in place, can it?
Its attempts to load it into hardware would have faulted. So it's still
not the target domain which is to be blamed for a fault resulting from
XRSTOR encountering bogus pointers.

> For this case, it's v's xstate buffer which cannot be loaded, so it's v
> which cannot be context switched into, and must be crashed.  More below.

Well, yes, as said - that's one possible way of treating things. My main
request is not so much to undo the change as to properly justify it in
the description. (Or maybe that really wants to be a separate change, in
particular if you wanted the changed behavior to also be backported.)

>>> v is always curr.
>> Not quite - see xstate_set_init().
> 
> Also more below.
> 
>> And for some of the callers of
>> hvm_update_guest_cr() I also don't think they always act on current. In
>> particular hvm_vcpu_reset_state() never does, I suppose (not least
>> because of the vcpu_pause() in its sole caller).
> 
> We discussed the need to not be remotely poking register state like
> that.  But I don't see where the connection is between
> hvm_update_guest_cr() and xsave()/xrstor().

Taking svm_update_guest_cr() as an example: it calls svm_fpu_enter(), which
in turn calls vcpu_restore_fpu_lazy(). But yes, that's explicitly only
when v == current. I fear I didn't look closely enough when writing the
earlier reply, sorry.

> Tangent: hvm_vcpu_reset_state() is terribly named as it's attempting to
> put the vCPU into the INIT state, not the #RESET state.
> 
> But it only operates on the xstate header in memory while the target is
> de-scheduled.  It's not using XSAVE/XRSTOR to load the results into
> registers as far as I can tell.

Iirc I mentioned hvm_vcpu_reset_state() because it calls
hvm_update_guest_cr() several times.

>>>   XRSTOR can't be used correctly outside of the subject context,
>> Then are you suggesting e.g. xstate_set_init() is buggy?
> 
> No, but it switches into enough of v's context to function.  Really it's
> neither current nor remote context.
> 
> But its single caller is adjust_bnd() in the emulator, so it's always
> actually current context with a no-op on xcr0.

That's its single present caller. Who knows what else we might need it for.
It had better operate correctly in the more general case.

> As said on Matrix, I think it's going to be necessary to remove MPX to
> continue the XSAVE cleanup.

Possibly, yes.

Jan

Re: [PATCH 2/4] x86/xstate: Rework XSAVE/XRSTOR given a newer toolchain baseline
Posted by Andrew Cooper 1 month, 1 week ago
On 30/12/2025 1:54 pm, Andrew Cooper wrote:
> diff --git a/xen/arch/x86/xstate.c b/xen/arch/x86/xstate.c
> index 384f78bd5281..4215a83efefb 100644
> --- a/xen/arch/x86/xstate.c
> +++ b/xen/arch/x86/xstate.c
> @@ -310,21 +310,21 @@ void xsave(struct vcpu *v, uint64_t mask)
>      uint32_t hmask = mask >> 32;
>      uint32_t lmask = mask;
>      unsigned int fip_width = v->domain->arch.x87_fip_width;
> -#define XSAVE(pfx) \
> -        if ( v->arch.xcr0_accum & XSTATE_XSAVES_ONLY ) \
> -            asm volatile ( ".byte " pfx "0x0f,0xc7,0x2f\n" /* xsaves */ \
> -                           : "=m" (*ptr) \
> -                           : "a" (lmask), "d" (hmask), "D" (ptr) ); \
> -        else \
> -            alternative_io(".byte " pfx "0x0f,0xae,0x27\n", /* xsave */ \
> -                           ".byte " pfx "0x0f,0xae,0x37\n", /* xsaveopt */ \
> -                           X86_FEATURE_XSAVEOPT, \
> -                           "=m" (*ptr), \
> -                           "a" (lmask), "d" (hmask), "D" (ptr))
> +
> +#define XSAVE(pfx)                                                      \
> +    if ( v->arch.xcr0_accum & XSTATE_XSAVES_ONLY )                      \
> +        asm volatile ( "xsaves %0"                                      \
> +                       : "=m" (*ptr)                                    \
> +                       : "a" (lmask), "d" (hmask) );                    \
> +    else                                                                \
> +        alternative_io("xsave %0",                                      \
> +                       "xsaveopt %0", X86_FEATURE_XSAVEOPT,             \
> +                       "=m" (*ptr),                                     \
> +                       "a" (lmask), "d" (hmask))

This loses the pfx.  I've fixed up locally and double checked the
disassembly.

~Andrew

Re: [PATCH 2/4] x86/xstate: Rework XSAVE/XRSTOR given a newer toolchain baseline
Posted by Jan Beulich 1 month ago
On 02.01.2026 17:01, Andrew Cooper wrote:
> On 30/12/2025 1:54 pm, Andrew Cooper wrote:
>> --- a/xen/arch/x86/xstate.c
>> +++ b/xen/arch/x86/xstate.c
>> @@ -310,21 +310,21 @@ void xsave(struct vcpu *v, uint64_t mask)
>>      uint32_t hmask = mask >> 32;
>>      uint32_t lmask = mask;
>>      unsigned int fip_width = v->domain->arch.x87_fip_width;
>> -#define XSAVE(pfx) \
>> -        if ( v->arch.xcr0_accum & XSTATE_XSAVES_ONLY ) \
>> -            asm volatile ( ".byte " pfx "0x0f,0xc7,0x2f\n" /* xsaves */ \
>> -                           : "=m" (*ptr) \
>> -                           : "a" (lmask), "d" (hmask), "D" (ptr) ); \
>> -        else \
>> -            alternative_io(".byte " pfx "0x0f,0xae,0x27\n", /* xsave */ \
>> -                           ".byte " pfx "0x0f,0xae,0x37\n", /* xsaveopt */ \
>> -                           X86_FEATURE_XSAVEOPT, \
>> -                           "=m" (*ptr), \
>> -                           "a" (lmask), "d" (hmask), "D" (ptr))
>> +
>> +#define XSAVE(pfx)                                                      \
>> +    if ( v->arch.xcr0_accum & XSTATE_XSAVES_ONLY )                      \
>> +        asm volatile ( "xsaves %0"                                      \
>> +                       : "=m" (*ptr)                                    \
>> +                       : "a" (lmask), "d" (hmask) );                    \
>> +    else                                                                \
>> +        alternative_io("xsave %0",                                      \
>> +                       "xsaveopt %0", X86_FEATURE_XSAVEOPT,             \
>> +                       "=m" (*ptr),                                     \
>> +                       "a" (lmask), "d" (hmask))
> 
> This loses the pfx.  I've fixed up locally and double checked the
> disassembly.

Question being: Do we want to stick to using the prefix form, when gas
specifically has been offering a kind-of-suffix form instead from the
very beginning (xsaves and xsaves64)?

If we wanted to stick to the prefixes, I'd favor a form where the use
sites don't need to supply the separating blank (i.e. the macro itself
would insert it, as doing so with an empty prefix results in merely
an indentation "issue" in the generated assembly). Thoughts?
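
I.e. something like this (sketch), keeping the prefix variant:

#define XSAVE(pfx)                                                      \
    if ( v->arch.xcr0_accum & XSTATE_XSAVES_ONLY )                      \
        asm volatile ( pfx " xsaves %0"                                 \
                       : "=m" (*ptr)                                    \
                       : "a" (lmask), "d" (hmask) );                    \
    else                                                                \
        alternative_io(pfx " xsave %0",                                 \
                       pfx " xsaveopt %0", X86_FEATURE_XSAVEOPT,        \
                       "=m" (*ptr),                                     \
                       "a" (lmask), "d" (hmask))

with the use sites then being XSAVE("rex64") and XSAVE("").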

Jan

Re: [PATCH 2/4] x86/xstate: Rework XSAVE/XRSTOR given a newer toolchain baseline
Posted by Andrew Cooper 1 month ago
On 05/01/2026 3:16 pm, Jan Beulich wrote:
> On 02.01.2026 17:01, Andrew Cooper wrote:
>> On 30/12/2025 1:54 pm, Andrew Cooper wrote:
>>> --- a/xen/arch/x86/xstate.c
>>> +++ b/xen/arch/x86/xstate.c
>>> @@ -310,21 +310,21 @@ void xsave(struct vcpu *v, uint64_t mask)
>>>      uint32_t hmask = mask >> 32;
>>>      uint32_t lmask = mask;
>>>      unsigned int fip_width = v->domain->arch.x87_fip_width;
>>> -#define XSAVE(pfx) \
>>> -        if ( v->arch.xcr0_accum & XSTATE_XSAVES_ONLY ) \
>>> -            asm volatile ( ".byte " pfx "0x0f,0xc7,0x2f\n" /* xsaves */ \
>>> -                           : "=m" (*ptr) \
>>> -                           : "a" (lmask), "d" (hmask), "D" (ptr) ); \
>>> -        else \
>>> -            alternative_io(".byte " pfx "0x0f,0xae,0x27\n", /* xsave */ \
>>> -                           ".byte " pfx "0x0f,0xae,0x37\n", /* xsaveopt */ \
>>> -                           X86_FEATURE_XSAVEOPT, \
>>> -                           "=m" (*ptr), \
>>> -                           "a" (lmask), "d" (hmask), "D" (ptr))
>>> +
>>> +#define XSAVE(pfx)                                                      \
>>> +    if ( v->arch.xcr0_accum & XSTATE_XSAVES_ONLY )                      \
>>> +        asm volatile ( "xsaves %0"                                      \
>>> +                       : "=m" (*ptr)                                    \
>>> +                       : "a" (lmask), "d" (hmask) );                    \
>>> +    else                                                                \
>>> +        alternative_io("xsave %0",                                      \
>>> +                       "xsaveopt %0", X86_FEATURE_XSAVEOPT,             \
>>> +                       "=m" (*ptr),                                     \
>>> +                       "a" (lmask), "d" (hmask))
>> This loses the pfx.  I've fixed up locally and double checked the
>> disassembly.
> Question being: Do we want to stick to using the prefix form, when gas
> specifically has been offering a kind-of-suffix form instead from the
> very beginning (xsaves and xsaves64)?
>
> If we wanted to stick to the prefixes, I'd favor a form where the use
> sites don't need to supply the separating blank (i.e. the macro itself
> would insert it, as doing so with an empty prefix results in merely
> an indentation "issue" in the generated assembly). Thoughts?

I don't expect this macro to survive the fixes to use the compressed
format.  From that point of view, "closest to the original" is least churn.

One problem with using a suffix form is that you could feed in "opt"
instead of "64" and break things in rather more subtle ways.

I'll adjust the position of the space, but I think this can keep on
using prefixes in the short term.

~Andrew

Re: [PATCH 2/4] x86/xstate: Rework XSAVE/XRSTOR given a newer toolchain baseline
Posted by Jan Beulich 1 month ago
On 05.01.2026 17:55, Andrew Cooper wrote:
> On 05/01/2026 3:16 pm, Jan Beulich wrote:
>> On 02.01.2026 17:01, Andrew Cooper wrote:
>>> On 30/12/2025 1:54 pm, Andrew Cooper wrote:
>>>> --- a/xen/arch/x86/xstate.c
>>>> +++ b/xen/arch/x86/xstate.c
>>>> @@ -310,21 +310,21 @@ void xsave(struct vcpu *v, uint64_t mask)
>>>>      uint32_t hmask = mask >> 32;
>>>>      uint32_t lmask = mask;
>>>>      unsigned int fip_width = v->domain->arch.x87_fip_width;
>>>> -#define XSAVE(pfx) \
>>>> -        if ( v->arch.xcr0_accum & XSTATE_XSAVES_ONLY ) \
>>>> -            asm volatile ( ".byte " pfx "0x0f,0xc7,0x2f\n" /* xsaves */ \
>>>> -                           : "=m" (*ptr) \
>>>> -                           : "a" (lmask), "d" (hmask), "D" (ptr) ); \
>>>> -        else \
>>>> -            alternative_io(".byte " pfx "0x0f,0xae,0x27\n", /* xsave */ \
>>>> -                           ".byte " pfx "0x0f,0xae,0x37\n", /* xsaveopt */ \
>>>> -                           X86_FEATURE_XSAVEOPT, \
>>>> -                           "=m" (*ptr), \
>>>> -                           "a" (lmask), "d" (hmask), "D" (ptr))
>>>> +
>>>> +#define XSAVE(pfx)                                                      \
>>>> +    if ( v->arch.xcr0_accum & XSTATE_XSAVES_ONLY )                      \
>>>> +        asm volatile ( "xsaves %0"                                      \
>>>> +                       : "=m" (*ptr)                                    \
>>>> +                       : "a" (lmask), "d" (hmask) );                    \
>>>> +    else                                                                \
>>>> +        alternative_io("xsave %0",                                      \
>>>> +                       "xsaveopt %0", X86_FEATURE_XSAVEOPT,             \
>>>> +                       "=m" (*ptr),                                     \
>>>> +                       "a" (lmask), "d" (hmask))
>>> This loses the pfx.  I've fixed up locally and double checked the
>>> disassembly.
>> Question being: Do we want to stick to using the prefix form, when gas
>> specifically has been offering a kind-of-suffix form instead from the
>> very beginning (xsaves and xsaves64)?
>>
>> If we wanted to stick to the prefixes, I'd favor a form where the use
>> sites don't need to supply the separating blank (i.e. the macro itself
>> would insert it, as doing so with an empty prefix results in merely
>> an indentation "issue" in the generated assembly). Thoughts?
> 
> I don't expect this macro to survive the fixes to use the compressed
> format.  From that point of view, "closest to the original" is least churn.
> 
> One problem with using a suffix form is that you could feed in "opt"
> instead of "64" and break things in rather more subtle ways.

Except that there's no XSAVESOPT nor XSAVEOPTOPT.

> I'll adjust the position of the space, but I think this can keep on
> using prefixes in the short term.

Okay, I wanted the alternative to at least be considered.

Jan