xen/arch/x86/hvm/hvm.c | 4 ++-- xen/arch/x86/hvm/vmx/vmx.c | 17 ++++++++++++++++- xen/include/public/arch-x86/hvm/save.h | 13 +++++++++++++ 3 files changed, 31 insertions(+), 3 deletions(-)
During VM forking and resetting a failed vmentry has been observed due
to the guest non-register state going out-of-sync with the guest register
state. For example, a VM fork reset right after a STI instruction can trigger
the failed entry. This is due to the guest non-register state not being saved
from the parent VM, thus the reset operation only copies the register state.
Fix this by including the guest non-register state in hvm_hw_cpu so that when
it's copied from the parent VM the vCPU state remains in sync.
SVM is not currently wired-in as VM forking is VMX only and saving non-register
state during normal save/restore/migration operation hasn't been needed. If
deemed necessary in the future it can be wired in by adding a svm-substructure
to hvm_hw_cpu.
Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
v4: Correct setting and checking new flag value in hvm.c
v3: Add XEN_X86_VMX flag and vmx-substructure in hvm_hw_cpu
v2: Include all CPU non-register state and fold the ops into vmx_vmcs_save &
vmx_vmcs_restore.
Note: no sanity checking is performed on the fields to reduce the cycles during
fuzzing.
---
xen/arch/x86/hvm/hvm.c | 4 ++--
xen/arch/x86/hvm/vmx/vmx.c | 17 ++++++++++++++++-
xen/include/public/arch-x86/hvm/save.h | 13 +++++++++++++
3 files changed, 31 insertions(+), 3 deletions(-)
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 709a4191ef..c502d0851e 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -894,7 +894,7 @@ static int cf_check hvm_save_cpu_ctxt(struct vcpu *v, hvm_domain_context_t *h)
if ( v->fpu_initialised )
{
memcpy(ctxt.fpu_regs, v->arch.fpu_ctxt, sizeof(ctxt.fpu_regs));
- ctxt.flags = XEN_X86_FPU_INITIALISED;
+ ctxt.flags |= XEN_X86_FPU_INITIALISED;
}
return hvm_save_entry(CPU, v->vcpu_id, h, &ctxt);
@@ -1025,7 +1025,7 @@ static int cf_check hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
return -EINVAL;
}
- if ( (ctxt.flags & ~XEN_X86_FPU_INITIALISED) != 0 )
+ if ( (ctxt.flags & ~(XEN_X86_FPU_INITIALISED | XEN_X86_VMX)) != 0 )
{
gprintk(XENLOG_ERR, "bad flags value in CPU context: %#x\n",
ctxt.flags);
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index c075370f64..6da3842d6e 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -713,7 +713,7 @@ static void vmx_restore_dr(struct vcpu *v)
static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
{
- unsigned long ev;
+ unsigned long ev, activity_state, intr_info;
vmx_vmcs_enter(v);
@@ -721,6 +721,10 @@ static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
__vmread(GUEST_SYSENTER_ESP, &c->sysenter_esp);
__vmread(GUEST_SYSENTER_EIP, &c->sysenter_eip);
+ __vmread(GUEST_ACTIVITY_STATE, &activity_state);
+ __vmread(GUEST_INTERRUPTIBILITY_INFO, &intr_info);
+ __vmread(GUEST_PENDING_DBG_EXCEPTIONS, &c->vmx.pending_dbg);
+
__vmread(VM_ENTRY_INTR_INFO, &ev);
if ( (ev & INTR_INFO_VALID_MASK) &&
hvm_event_needs_reinjection(MASK_EXTR(ev, INTR_INFO_INTR_TYPE_MASK),
@@ -732,6 +736,10 @@ static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
}
vmx_vmcs_exit(v);
+
+ c->vmx.activity_state = activity_state;
+ c->vmx.interruptibility_info = intr_info;
+ c->flags |= XEN_X86_VMX;
}
static int vmx_restore_cr0_cr3(
@@ -807,6 +815,13 @@ static int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
__vmwrite(GUEST_DR7, c->dr7);
+ if ( c->flags & XEN_X86_VMX )
+ {
+ __vmwrite(GUEST_ACTIVITY_STATE, c->vmx.activity_state);
+ __vmwrite(GUEST_INTERRUPTIBILITY_INFO, c->vmx.interruptibility_info);
+ __vmwrite(GUEST_PENDING_DBG_EXCEPTIONS, c->vmx.pending_dbg);
+ }
+
if ( c->pending_valid &&
hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
{
diff --git a/xen/include/public/arch-x86/hvm/save.h b/xen/include/public/arch-x86/hvm/save.h
index 773a380bc2..0f728aa5d9 100644
--- a/xen/include/public/arch-x86/hvm/save.h
+++ b/xen/include/public/arch-x86/hvm/save.h
@@ -52,6 +52,7 @@ DECLARE_HVM_SAVE_TYPE(HEADER, 1, struct hvm_save_header);
* Compat:
* - Pre-3.4 didn't have msr_tsc_aux
* - Pre-4.7 didn't have fpu_initialised
+ * - Pre-4.17 didn't have non-register state
*/
struct hvm_hw_cpu {
@@ -163,9 +164,21 @@ struct hvm_hw_cpu {
uint32_t error_code;
#define _XEN_X86_FPU_INITIALISED 0
+#define _XEN_X86_VMX 1
#define XEN_X86_FPU_INITIALISED (1U<<_XEN_X86_FPU_INITIALISED)
+#define XEN_X86_VMX (1U<<_XEN_X86_VMX)
uint32_t flags;
uint32_t pad0;
+
+ /* non-register state */
+ union {
+ /* if flags & XEN_X86_VMX */
+ struct {
+ uint32_t activity_state;
+ uint32_t interruptibility_info;
+ uint64_t pending_dbg;
+ } vmx;
+ };
};
struct hvm_hw_cpu_compat {
--
2.25.1
On 21/03/2022 21:12, Tamas K Lengyel wrote: > During VM forking and resetting a failed vmentry has been observed due > to the guest non-register state going out-of-sync with the guest register > state. For example, a VM fork reset right after a STI instruction can trigger > the failed entry. This is due to the guest non-register state not being saved > from the parent VM, thus the reset operation only copies the register state. > > Fix this by including the guest non-register state in hvm_hw_cpu so that when > its copied from the parent VM the vCPU state remains in sync. > > SVM is not currently wired-in as VM forking is VMX only and saving non-register > state during normal save/restore/migration operation hasn't been needed. If > deemed necessary in the future it can be wired in by adding a svm-substructure > to hvm_hw_cpu. I disagree here. This bug isn't really to do with fuzzing; it's to do with our APIs being able to get/set vCPU state correctly, and fuzzing is the example which demonstrated the breakage. This will also cause very subtle bugs for the guest-transparent migration work, and the live update work, both of which want to shift vcpu state behind a VM which hasn't actively entered Xen via hypercall. (Quick tangent. Seriously, can the SDM be fixed so it actually enumerates the Activity States.) Xen doesn't currently support any situation where Intel's idea of Activity State is anything other than 0. This in turn is buggy, because we don't encode VPF_blocked anywhere. An activity state of hlt might not be best place to encode this, because notably absent from Intel's activity state is mwait. We'll also terminate things like schedop_poll early. Next, AMD does have various fields from interruptibility. If you want me to write the patch then fine, but they should not be excluded from a patch like this. 
AMD do not have separate bits for STI and MovSS, due to microarchitectural differences, but the single INTERRUPT_SHADOW bit does need managing, as does the blocked-by-NMI bit which is emulated on SVM and older Intel CPUs. Minor tangent, blocked-by-SMI needs cross-linking with vm-entry controls, so I'm not sure it is something we ought to expose. Next, it turns out that MSR_DEBUGCTL isn't included anywhere (not even the MSR list). It is important, because it forms part of the VMEntry cross-check with PENDING_DBG and TF. Next, we also don't preserve PDPTRs. This is another area where Intel and AMD CPUs behave differently, but under Intel's architecture, the guest kernel can clobber the 32bit PAE block of pointers and everything will function correctly until the next mov into cr3. There are definitely bugs in Xen's emulated pagewalk in this area. So there are a lot of errors with hvm_hw_cpu and I bet I haven't found them all. As discussed at multiple previous XenSummits, the current load/save mess needs moving out of Xen, and that would be the correct time to fix the other errors, but it probably is too much for this case. At a minimum, there shouldn't be a VMX specific union, because we are talking about architecture-agnostic properties of the vCPU. It's fine for the format to follow Intel's bit layout for the subset of bits we tolerate saving/restoring, but this needs discussing in the header, and needs rejecting on set. An extra check/reject is 0% overhead for forking, so I don't find that a credible argument against doing it. I'm not sure if we want to include the activity state at the moment without a better idea of how to encode VPF_blocked, but I think DEBUGCTL does need including. This in turn involves transmitting the LBR MSRs too. ~Andrew
On Tue, Mar 22, 2022 at 11:11 AM Andrew Cooper <Andrew.Cooper3@citrix.com> wrote: > > On 21/03/2022 21:12, Tamas K Lengyel wrote: > > During VM forking and resetting a failed vmentry has been observed due > > to the guest non-register state going out-of-sync with the guest register > > state. For example, a VM fork reset right after a STI instruction can trigger > > the failed entry. This is due to the guest non-register state not being saved > > from the parent VM, thus the reset operation only copies the register state. > > > > Fix this by including the guest non-register state in hvm_hw_cpu so that when > > its copied from the parent VM the vCPU state remains in sync. > > > > SVM is not currently wired-in as VM forking is VMX only and saving non-register > > state during normal save/restore/migration operation hasn't been needed. If > > deemed necessary in the future it can be wired in by adding a svm-substructure > > to hvm_hw_cpu. > > I disagree here. This bug isn't really to do with fuzzing; it's to do > with our APIs being able to get/set vCPU state correctly, and fuzzing is > the example which demonstrated the breakage. > > This will also cause very subtle bugs for the guest-transparent > migration work, and the live update work, both of which want to shift > vcpu state behind a VM which hasn't actively entered Xen via hypercall. > > (Quick tangent. Seriously, can the SDM be fixed so it actually > enumerates the Activity States.) > > Xen doesn't currently support any situation where Intel's idea of > Activity State is anything other than 0. This in turn is buggy, because > we don't encode VPF_blocked anywhere. An activity state of hlt might > not be best place to encode this, because notably absent from Intel's > activity state is mwait. We'll also terminate things like schedop_poll > early. > > Next, AMD does have various fields from interruptibility. If you want > me to write the patch then fine, but they should not be excluded from a > patch like this. 
AMD do not have separate bits for STI and MovSS, due > to microarchitectural differences, but the single INTERRUPT_SHADOW bit > does need managing, as does the blocked-by-NMI bit which is emulated on > SVM and older Intel CPUs. > > Minor tangent, blocked-by-SMI needs cross-linking with vm-entry > controls, so I'm not sure it is something we ought to expose. > > Next, it turns out that MSR_DEBUGCTL isn't included anywhere (not even > the MSR list). It is important, because it forms part of the VMEntry > cross-check with PENDING_DBG and TF. > > Next, we also don't preserve PDPTRs. This is another area where Intel > and AMD CPUs behave differently, but under Intel's architecture, the > guest kernel can clobber the 32bit PAE block of pointers and everything > will function correctly until the next mov into cr3. There are > definitely bugs in Xen's emulated pagewalk in this area. > > So there are a lot of errors with hvm_hw_cpu and I bet I haven't found > them all. > > As discussed at multiple previous XenSummits, the current load/save mess > needs moving out of Xen, and that would be the correct time to fix the > other errors, but it probably is too much for this case. > > > At a minimum, there shouldn't be a VMX specific union, because we are > talking about architecture-agnostic properties of the vCPU. It's fine > for the format to follow Intel's bit layout for the subset of bits we > tolerate saving/restoring, but this needs discussing in the header, and > needs rejecting on set. An extra check/reject is 0% overhead for > forking, so I don't find that a credible argument against doing it. Sure, during the fork itself it's negligible. It's during fork_reset, which we do thousands of times per second where that sanity checking is both unneeded (we know the setting is getting copied from the parent) and will quickly add up if we need to do a bunch of bitshifts to ensure only the valid bits are restored (plus converted in case the format will be the non-vmx version). 
> I'm not sure if we want to include the activity state at the moment > without a better idea of how to encode VPF_blocked, but I think DEBUGCTL > does need including. This in turn involves transmitting the LBR MSRs too. I don't really have much to add here. If there are concerns in regards to the side-effect of this on the pre-existing save/restore/migration route then what I can do is add only a Xen-internal function that will allow the fork_reset to copy the non-register state from the parent and not expose it via the public header. That works for me just as well, my use-case doesn't have a requirement for these bits to be exposed externally. Tamas
© 2016 - 2024 Red Hat, Inc.