[PATCH v4] x86/vmx: save guest non-register state in hvm_hw_cpu

Tamas K Lengyel posted 1 patch 2 years, 1 month ago
Test gitlab-ci passed
Patches applied successfully (tree, apply log)
git fetch https://gitlab.com/xen-project/patchew/xen tags/patchew/a8828d68c308fa7ecbfe4387ca753ee0f80a2a7d.1647897016.git.tamas.lengyel@intel.com
xen/arch/x86/hvm/hvm.c                 |  4 ++--
xen/arch/x86/hvm/vmx/vmx.c             | 17 ++++++++++++++++-
xen/include/public/arch-x86/hvm/save.h | 13 +++++++++++++
3 files changed, 31 insertions(+), 3 deletions(-)
[PATCH v4] x86/vmx: save guest non-register state in hvm_hw_cpu
Posted by Tamas K Lengyel 2 years, 1 month ago
During VM forking and resetting a failed vmentry has been observed due
to the guest non-register state going out-of-sync with the guest register
state. For example, a VM fork reset right after a STI instruction can trigger
the failed entry. This is due to the guest non-register state not being saved
from the parent VM, thus the reset operation only copies the register state.

Fix this by including the guest non-register state in hvm_hw_cpu so that when
its copied from the parent VM the vCPU state remains in sync.

SVM is not currently wired-in as VM forking is VMX only and saving non-register
state during normal save/restore/migration operation hasn't been needed. If
deemed necessary in the future it can be wired in by adding a svm-substructure
to hvm_hw_cpu.

Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
v4: Correct setting and checking new flag value in hvm.c
v3: Add XEN_X86_VMX flag and vmx-substructure in hvm_hw_cpu
v2: Include all CPU non-register state and fold the ops into vmx_vmcs_save &
    vmx_vmcs_restore.
Note: no sanity checking is performed on the fields to reduce the cycles during
      fuzzing.
---
 xen/arch/x86/hvm/hvm.c                 |  4 ++--
 xen/arch/x86/hvm/vmx/vmx.c             | 17 ++++++++++++++++-
 xen/include/public/arch-x86/hvm/save.h | 13 +++++++++++++
 3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 709a4191ef..c502d0851e 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -894,7 +894,7 @@ static int cf_check hvm_save_cpu_ctxt(struct vcpu *v, hvm_domain_context_t *h)
     if ( v->fpu_initialised )
     {
         memcpy(ctxt.fpu_regs, v->arch.fpu_ctxt, sizeof(ctxt.fpu_regs));
-        ctxt.flags = XEN_X86_FPU_INITIALISED;
+        ctxt.flags |= XEN_X86_FPU_INITIALISED;
     }
 
     return hvm_save_entry(CPU, v->vcpu_id, h, &ctxt);
@@ -1025,7 +1025,7 @@ static int cf_check hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
         return -EINVAL;
     }
 
-    if ( (ctxt.flags & ~XEN_X86_FPU_INITIALISED) != 0 )
+    if ( (ctxt.flags & ~(XEN_X86_FPU_INITIALISED | XEN_X86_VMX)) != 0 )
     {
         gprintk(XENLOG_ERR, "bad flags value in CPU context: %#x\n",
                 ctxt.flags);
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index c075370f64..6da3842d6e 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -713,7 +713,7 @@ static void vmx_restore_dr(struct vcpu *v)
 
 static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
 {
-    unsigned long ev;
+    unsigned long ev, activity_state, intr_info;
 
     vmx_vmcs_enter(v);
 
@@ -721,6 +721,10 @@ static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
     __vmread(GUEST_SYSENTER_ESP, &c->sysenter_esp);
     __vmread(GUEST_SYSENTER_EIP, &c->sysenter_eip);
 
+    __vmread(GUEST_ACTIVITY_STATE, &activity_state);
+    __vmread(GUEST_INTERRUPTIBILITY_INFO, &intr_info);
+    __vmread(GUEST_PENDING_DBG_EXCEPTIONS, &c->vmx.pending_dbg);
+
     __vmread(VM_ENTRY_INTR_INFO, &ev);
     if ( (ev & INTR_INFO_VALID_MASK) &&
          hvm_event_needs_reinjection(MASK_EXTR(ev, INTR_INFO_INTR_TYPE_MASK),
@@ -732,6 +736,10 @@ static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
     }
 
     vmx_vmcs_exit(v);
+
+    c->vmx.activity_state = activity_state;
+    c->vmx.interruptibility_info = intr_info;
+    c->flags |= XEN_X86_VMX;
 }
 
 static int vmx_restore_cr0_cr3(
@@ -807,6 +815,13 @@ static int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
 
     __vmwrite(GUEST_DR7, c->dr7);
 
+    if ( c->flags & XEN_X86_VMX )
+    {
+        __vmwrite(GUEST_ACTIVITY_STATE, c->vmx.activity_state);
+        __vmwrite(GUEST_INTERRUPTIBILITY_INFO, c->vmx.interruptibility_info);
+        __vmwrite(GUEST_PENDING_DBG_EXCEPTIONS, c->vmx.pending_dbg);
+    }
+
     if ( c->pending_valid &&
          hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
     {
diff --git a/xen/include/public/arch-x86/hvm/save.h b/xen/include/public/arch-x86/hvm/save.h
index 773a380bc2..0f728aa5d9 100644
--- a/xen/include/public/arch-x86/hvm/save.h
+++ b/xen/include/public/arch-x86/hvm/save.h
@@ -52,6 +52,7 @@ DECLARE_HVM_SAVE_TYPE(HEADER, 1, struct hvm_save_header);
  * Compat:
  *     - Pre-3.4 didn't have msr_tsc_aux
  *     - Pre-4.7 didn't have fpu_initialised
+ *     - Pre-4.17 didn't have non-register state
  */
 
 struct hvm_hw_cpu {
@@ -163,9 +164,21 @@ struct hvm_hw_cpu {
     uint32_t error_code;
 
 #define _XEN_X86_FPU_INITIALISED        0
+#define _XEN_X86_VMX                    1
 #define XEN_X86_FPU_INITIALISED         (1U<<_XEN_X86_FPU_INITIALISED)
+#define XEN_X86_VMX                     (1U<<_XEN_X86_VMX)
     uint32_t flags;
     uint32_t pad0;
+
+    /* non-register state */
+    union {
+        /* if flags & XEN_X86_VMX */
+        struct {
+            uint32_t activity_state;
+            uint32_t interruptibility_info;
+            uint64_t pending_dbg;
+        } vmx;
+    };
 };
 
 struct hvm_hw_cpu_compat {
-- 
2.25.1
Re: [PATCH v4] x86/vmx: save guest non-register state in hvm_hw_cpu
Posted by Andrew Cooper 2 years, 1 month ago
On 21/03/2022 21:12, Tamas K Lengyel wrote:
> During VM forking and resetting a failed vmentry has been observed due
> to the guest non-register state going out-of-sync with the guest register
> state. For example, a VM fork reset right after a STI instruction can trigger
> the failed entry. This is due to the guest non-register state not being saved
> from the parent VM, thus the reset operation only copies the register state.
>
> Fix this by including the guest non-register state in hvm_hw_cpu so that when
> its copied from the parent VM the vCPU state remains in sync.
>
> SVM is not currently wired-in as VM forking is VMX only and saving non-register
> state during normal save/restore/migration operation hasn't been needed. If
> deemed necessary in the future it can be wired in by adding a svm-substructure
> to hvm_hw_cpu.

I disagree here.  This bug isn't really to do with fuzzing; it's to do
with our APIs being able to get/set vCPU state correctly, and fuzzing is
the example which demonstrated the breakage.

This will also cause very subtle bugs for the guest-transparent
migration work, and the live update work, both of which want to shift
vcpu state behind a VM which hasn't actively entered Xen via hypercall.

(Quick tangent.  Seriously, can the SDM be fixed so it actually
enumerates the Activity States.)

Xen doesn't currently support any situation where Intel's idea of
Activity State is anything other than 0.  This in turn is buggy, because
we don't encode VPF_blocked anywhere.  An activity state of hlt might
not be best place to encode this, because notably absent from Intel's
activity state is mwait.  We'll also terminate things like schedop_poll
early.

Next, AMD does have various fields from interruptibility.  If you want
me to write the patch then fine, but they should not be excluded from a
patch like this.  AMD do not have separate bits for STI and MovSS, due
to microarchitectural differences, but the single INTERRUPT_SHADOW bit
does need managing, as does the blocked-by-NMI bit which is emulated on
SVM and older Intel CPUs.

Minor tangent, blocked-by-SMI needs cross-linking with vm-entry
controls, so I'm not sure it is something we ought to expose.

Next, it turns out that MSR_DEBUGCTL isn't included anywhere (not even
the MSR list).  It is important, because it forms part of the VMEntry
cross-check with PENDING_DBG and TF.

Next, we also don't preserve PDPTRs.  This is another area where Intel
and AMD CPUs behave differently, but under Intel's architecture, the
guest kernel can clobber the 32bit PAE block of pointers and everything
will function correctly until the next mov into cr3.  There are
definitely bugs in Xen's emulated pagewalk in this area.

So there are a lot of errors with hvm_hw_cpu and I bet I haven't found
them all.

As discussed at multiple previous XenSummits, the current load/save mess
needs moving out of Xen, and that would be the correct time to fix the
other errors, but it probably is too much for this case.


At a minimum, there shouldn't be a VMX specific union, because we are
talking about architecture-agnostic properties of the vCPU.  It's fine
for the format to follow Intel's bit layout for the subset of bits we
tolerate saving/restoring, but this needs discussing in the header, and
needs rejecting on set.  An extra check/reject is 0% overhead for
forking, so I don't find that a credible argument against doing it.

I'm not sure if we want to include the activity state at the moment
without a better idea of how to encode VPF_blocked, but I think DEBUGCTL
does need including.  This in turn involves transmitting the LBR MSRs too.

~Andrew
Re: [PATCH v4] x86/vmx: save guest non-register state in hvm_hw_cpu
Posted by Tamas K Lengyel 2 years, 1 month ago
On Tue, Mar 22, 2022 at 11:11 AM Andrew Cooper
<Andrew.Cooper3@citrix.com> wrote:
>
> On 21/03/2022 21:12, Tamas K Lengyel wrote:
> > During VM forking and resetting a failed vmentry has been observed due
> > to the guest non-register state going out-of-sync with the guest register
> > state. For example, a VM fork reset right after a STI instruction can trigger
> > the failed entry. This is due to the guest non-register state not being saved
> > from the parent VM, thus the reset operation only copies the register state.
> >
> > Fix this by including the guest non-register state in hvm_hw_cpu so that when
> > its copied from the parent VM the vCPU state remains in sync.
> >
> > SVM is not currently wired-in as VM forking is VMX only and saving non-register
> > state during normal save/restore/migration operation hasn't been needed. If
> > deemed necessary in the future it can be wired in by adding a svm-substructure
> > to hvm_hw_cpu.
>
> I disagree here.  This bug isn't really to do with fuzzing; it's to do
> with our APIs being able to get/set vCPU state correctly, and fuzzing is
> the example which demonstrated the breakage.
>
> This will also cause very subtle bugs for the guest-transparent
> migration work, and the live update work, both of which want to shift
> vcpu state behind a VM which hasn't actively entered Xen via hypercall.
>
> (Quick tangent.  Seriously, can the SDM be fixed so it actually
> enumerates the Activity States.)
>
> Xen doesn't currently support any situation where Intel's idea of
> Activity State is anything other than 0.  This in turn is buggy, because
> we don't encode VPF_blocked anywhere.  An activity state of hlt might
> not be best place to encode this, because notably absent from Intel's
> activity state is mwait.  We'll also terminate things like schedop_poll
> early.
>
> Next, AMD does have various fields from interruptibility.  If you want
> me to write the patch then fine, but they should not be excluded from a
> patch like this.  AMD do not have separate bits for STI and MovSS, due
> to microarchitectural differences, but the single INTERRUPT_SHADOW bit
> does need managing, as does the blocked-by-NMI bit which is emulated on
> SVM and older Intel CPUs.
>
> Minor tangent, blocked-by-SMI needs cross-linking with vm-entry
> controls, so I'm not sure it is something we ought to expose.
>
> Next, it turns out that MSR_DEBUGCTL isn't included anywhere (not even
> the MSR list).  It is important, because it forms part of the VMEntry
> cross-check with PENDING_DBG and TF.
>
> Next, we also don't preserve PDPTRs.  This is another area where Intel
> and AMD CPUs behave differently, but under Intel's architecture, the
> guest kernel can clobber the 32bit PAE block of pointers and everything
> will function correctly until the next mov into cr3.  There are
> definitely bugs in Xen's emulated pagewalk in this area.
>
> So there are a lot of errors with hvm_hw_cpu and I bet I haven't found
> them all.
>
> As discussed at multiple previous XenSummits, the current load/save mess
> needs moving out of Xen, and that would be the correct time to fix the
> other errors, but it probably is too much for this case.
>
>
> At a minimum, there shouldn't be a VMX specific union, because we are
> talking about architecture-agnostic properties of the vCPU.  It's fine
> for the format to follow Intel's bit layout for the subset of bits we
> tolerate saving/restoring, but this needs discussing in the header, and
> needs rejecting on set.  An extra check/reject is 0% overhead for
> forking, so I don't find that a credible argument against doing it.

Sure, during the fork itself it's negligible. It's during fork_reset,
which we do thousands of times per second where that sanity checking
is both unneeded (we know the setting is getting copied from the
parent) and will quickly add up if we need to do a bunch of bitshifts
to ensure only the valid bits are restored (plus converted in case the
format will be the non-vmx version).

> I'm not sure if we want to include the activity state at the moment
> without a better idea of how to encode VPF_blocked, but I think DEBUGCTL
> does need including.  This in turn involves transmitting the LBR MSRs too.

I don't really have much to add here. If there are concerns in regards
to the side-effect of this on the pre-existing save/restore/migration
route then what I can do is add only a Xen-internal function that will
allow the fork_reset to copy the non-register state from the parent
and not expose it via the public header. That works for me just as
well, my use-case doesn't have a requirement for these bits to be
exposed externally.

Tamas