[v10] KVM: x86: hyper-v: Fine-grained TLB flush + L2 TLB flush features

[PATCH v10 20/39] KVM: nVMX: hyper-v: Enable L2 TLB flush

Posted by Vitaly Kuznetsov 3 years, 6 months ago

Enable L2 TLB flush feature on nVMX when:
- Enlightened VMCS is in use.
- The feature flag is enabled in eVMCS.
- The feature flag is enabled in partition assist page.

Perform synthetic vmexit to L1 after processing TLB flush call upon
request (HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH).

Note: nested_evmcs_l2_tlb_flush_enabled() uses cached VP assist page copy
which gets updated from nested_vmx_handle_enlightened_vmptrld(). This is
also guaranteed to happen post migration with eVMCS backed L2 running.

Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
 arch/x86/kvm/vmx/evmcs.c  | 17 +++++++++++++++++
 arch/x86/kvm/vmx/evmcs.h  | 10 ++++++++++
 arch/x86/kvm/vmx/nested.c | 22 ++++++++++++++++++++++
 3 files changed, 49 insertions(+)

diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index 635a0c81ff1d..f7fedc3ed247 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -6,6 +6,7 @@
 #include "../hyperv.h"
 #include "../cpuid.h"
 #include "evmcs.h"
+#include "nested.h"
 #include "vmcs.h"
 #include "vmx.h"
 #include "trace.h"
@@ -501,6 +502,22 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+
+	if (!hv_vcpu || !evmcs)
+		return false;
+
+	if (!evmcs->hv_enlightenments_control.nested_flush_hypercall)
+		return false;
+
+	return hv_vcpu->vp_assist_page.nested_control.features.directhypercall;
+}
+
 void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
 {
+	nested_vmx_vmexit(vcpu, HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH, 0, 0);
 }
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index 7ad56fbc4b4d..dd1589336e79 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -63,6 +63,15 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs);
 #define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (0)
 #define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
 
+/*
+ * Note, Hyper-V isn't actually stealing bit 28 from Intel, just abusing it by
+ * pairing it with architecturally impossible exit reasons.  Bit 28 is set only
+ * on SMI exits to a SMI transfer monitor (STM) and if and only if a MTF VM-Exit
+ * is pending.  I.e. it will never be set by hardware for non-SMI exits (there
+ * are only three), nor will it ever be set unless the VMM is an STM.
+ */
+#define HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH 0x10000031
+
 struct evmcs_field {
 	u16 offset;
 	u16 clean_field;
@@ -241,6 +250,7 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu,
 			uint16_t *vmcs_version);
 void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
 int nested_evmcs_check_controls(struct vmcs12 *vmcs12);
+bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu);
 void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
 
 #endif /* __KVM_X86_VMX_EVMCS_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 0634518a6719..1451a7a2c488 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1132,6 +1132,17 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	/*
+	 * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
+	 * L2's VP_ID upon request from the guest. Make sure we check for
+	 * pending entries for the case when the request got misplaced (e.g.
+	 * a transition from L2->L1 happened while processing L2 TLB flush
+	 * request or vice versa). kvm_hv_vcpu_flush_tlb() will not flush
+	 * anything if there are no requests in the corresponding buffer.
+	 */
+	if (to_hv_vcpu(vcpu))
+		kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
+
 	/*
 	 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
 	 * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
@@ -3267,6 +3278,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 
 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
 {
+	/*
+	 * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy
+	 * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory
+	 * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post
+	 * migration.
+	 */
 	if (!nested_get_evmcs_page(vcpu)) {
 		pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
 				     __func__);
@@ -6143,6 +6160,11 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
 		 * Handle L2's bus locks in L0 directly.
 		 */
 		return true;
+	case EXIT_REASON_VMCALL:
+		/* Hyper-V L2 TLB flush hypercall is handled by L0 */
+		return guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
+			nested_evmcs_l2_tlb_flush_enabled(vcpu) &&
+			kvm_hv_is_tlb_flush_hcall(vcpu);
 	default:
 		break;
 	}
-- 
2.37.3

Re: [PATCH v10 20/39] KVM: nVMX: hyper-v: Enable L2 TLB flush

Posted by Sean Christopherson 3 years, 6 months ago

On Wed, Sep 21, 2022, Vitaly Kuznetsov wrote:
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 0634518a6719..1451a7a2c488 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -1132,6 +1132,17 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  
> +	/*
> +	 * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
> +	 * L2's VP_ID upon request from the guest. Make sure we check for
> +	 * pending entries for the case when the request got misplaced (e.g.

Kind of a nit, but I'd prefer to avoid "misplaced", as that implies KVM puts entries
into the wrong FIFO.  The issue isn't that KVM puts entries in the wrong FIFO,
it's that the FIFO is filled asynchronously be other vCPUs and so it's possible
to switch to a FIFO that has valid entries without a pending request.

And thinking about this, KVM_REQ_HV_TLB_FLUSH shouldn't be handled in
kvm_service_local_tlb_flush_requests().  My initial reaction to this patch is that
queueing the request here is too late because the switch has already happened,
i.e. nVMX has already called kvm_service_local_tlb_flush_requests() and so the
request 

But making the request for the _new_ context is correct _and_ necessary, e.g. given

	vCPU0			vCPU1
	FIFO[L1].insert
	FIFO[L1].insert
				L1 => L2 transition
	FIFO[L1].insert
	FIFO[L1].insert
	KVM_REQ_HV_TLB_FLUSH

if nVMX made the request for the old contex, then this would happen

	vCPU0			vCPU1
	FIFO[L1].insert
	FIFO[L1].insert
				KVM_REQ_HV_TLB_FLUSH
				service FIFO[L1]
				L1 => L2 transition
	FIFO[L1].insert
	FIFO[L1].insert
	KVM_REQ_HV_TLB_FLUSH
				service FIFO[L2]
				...
				KVM_REQ_HV_TLB_FLUSH
				service FIFO[L2]
				L2 => L1 transition

				Run L1 with FIFO[L1] entries!!!

whereas what is being done in this patch is:

	vCPU0			vCPU1
	FIFO[L1].insert
	FIFO[L1].insert
				L1 => L2 transition
				KVM_REQ_HV_TLB_FLUSH
				service FIFO[2]
	FIFO[L1].insert
	FIFO[L1].insert
	KVM_REQ_HV_TLB_FLUSH
				service FIFO[L2]
				...
				L2 => L1 transition
				KVM_REQ_HV_TLB_FLUSH
				service FIFO[L1]

which is correct and ensures that KVM will always consume FIFO entries prior to
running the associated context.

In other words, unlike KVM_REQ_TLB_FLUSH_CURRENT and KVM_REQ_TLB_FLUSH_GUEST,
KVM_REQ_HV_TLB_FLUSH is not a "local" request.  It's much more like KVM_REQ_TLB_FLUSH
in that it can come from other vCPUs, i.e. is effectively a "remote" request.

So rather than handle KVM_REQ_TLB_FLUSH in the "local" path, it should be handled
only in the request path.  Handling the request in kvm_service_local_tlb_flush_requests()
won't break anything, but conceptually it's wrong and as a result it's misleading
because it implies that nested transitions could also be handled by forcing
kvm_service_local_tlb_flush_requests() to service flushes for the current, i.e.
previous, context on nested transitions, but that wouldn't work (see example above).

I.e. we should end up with something like this:

		/*
		 * Note, the order matters here, as flushing "all" TLB entries
		 * also flushes the "current" TLB entries, and flushing "guest"
		 * TLB entries is a superset of Hyper-V's fine-grained flushing.
		 * I.e. servicing the flush "all" will clear any request to
		 * flush "current", and flushing "guest" will clear any request
		 * to service Hyper-V's fine-grained flush.
		 */
		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
			kvm_vcpu_flush_tlb_all(vcpu);

		kvm_service_local_tlb_flush_requests(vcpu);

		/*
		 * Fall back to a "full" guest flush if Hyper-V's precise
		 * flushing fails.  Note, Hyper-V's flushing is per-vCPU, but
		 * the flushes are considered "remote" and not "local" because
		 * the requests can be initiated from other vCPUs.
		 */
		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu) &&
		    kvm_hv_vcpu_flush_tlb(vcpu))
			kvm_vcpu_flush_tlb_guest(vcpu);

> +	 * a transition from L2->L1 happened while processing L2 TLB flush
> +	 * request or vice versa). kvm_hv_vcpu_flush_tlb() will not flush
> +	 * anything if there are no requests in the corresponding buffer.
> +	 */
> +	if (to_hv_vcpu(vcpu))

This should be:

	if (to_hv_vcpu(vcpu) && enable_ept)

otherwise KVM will fall back to flushing the guest, which is the entire TLB, when
EPT is disabled.  I'm guessing this applies to SVM+NPT as well.

> +		kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);

Re: [PATCH v10 20/39] KVM: nVMX: hyper-v: Enable L2 TLB flush

Posted by Sean Christopherson 3 years, 6 months ago

On Wed, Sep 21, 2022, Vitaly Kuznetsov wrote:
> diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
> index 7ad56fbc4b4d..dd1589336e79 100644
> --- a/arch/x86/kvm/vmx/evmcs.h
> +++ b/arch/x86/kvm/vmx/evmcs.h
> @@ -63,6 +63,15 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs);
>  #define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (0)
>  #define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
>  
> +/*
> + * Note, Hyper-V isn't actually stealing bit 28 from Intel, just abusing it by
> + * pairing it with architecturally impossible exit reasons.  Bit 28 is set only
> + * on SMI exits to a SMI transfer monitor (STM) and if and only if a MTF VM-Exit
> + * is pending.  I.e. it will never be set by hardware for non-SMI exits (there
> + * are only three), nor will it ever be set unless the VMM is an STM.
> + */
> +#define HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH 0x10000031

This definition should go into hyperv-tlfs.h since it's take verbatim from the TLFS.

https://learn.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/nested-virtualization#synthetic-vm-exit