[v11] KVM TDX basic feature support

[PATCH v11 033/113] KVM: x86/mmu: Track shadow MMIO value on a per-VM basis

Posted by isaku.yamahata@intel.com 3 years ago

From: Isaku Yamahata <isaku.yamahata@intel.com>

TDX will use a different shadow PTE entry value for MMIO from VMX.  Add
members to kvm_arch and track value for MMIO per-VM instead of global
variables.  By using the per-VM EPT entry value for MMIO, the existing VMX
logic is kept working.  Introduce a separate setter function so that guest
TD can override later.

Also require mmio spte cachcing for TDX.  Actually this is true case
because TDX require EPT and KVM EPT allows mmio spte caching.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/mmu.h              |  1 +
 arch/x86/kvm/mmu/mmu.c          |  7 ++++---
 arch/x86/kvm/mmu/spte.c         | 10 ++++++++--
 arch/x86/kvm/mmu/spte.h         |  4 ++--
 arch/x86/kvm/mmu/tdp_mmu.c      | 14 +++++++++++---
 6 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 73c987b3d2b6..807da4b95aba 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1243,6 +1243,8 @@ struct kvm_arch {
 	 */
 	spinlock_t mmu_unsync_pages_lock;
 
+	u64 shadow_mmio_value;
+
 	struct list_head assigned_dev_head;
 	struct iommu_domain *iommu_domain;
 	bool iommu_noncoherent;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index a45f7a96b821..50d240d52697 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -101,6 +101,7 @@ static inline u8 kvm_get_shadow_phys_bits(void)
 }
 
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
+void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value);
 void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask);
 void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
 
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 59befdfeec23..8d3d7deebdd0 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2450,7 +2450,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 				return kvm_mmu_prepare_zap_page(kvm, child,
 								invalid_list);
 		}
-	} else if (is_mmio_spte(pte)) {
+	} else if (is_mmio_spte(kvm, pte)) {
 		mmu_spte_clear_no_track(spte);
 	}
 	return 0;
@@ -4119,7 +4119,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 	if (WARN_ON(reserved))
 		return -EINVAL;
 
-	if (is_mmio_spte(spte)) {
+	if (is_mmio_spte(vcpu->kvm, spte)) {
 		gfn_t gfn = get_mmio_spte_gfn(spte);
 		unsigned int access = get_mmio_spte_access(spte);
 
@@ -4628,7 +4628,7 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu)
 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
 			   unsigned int access)
 {
-	if (unlikely(is_mmio_spte(*sptep))) {
+	if (unlikely(is_mmio_spte(vcpu->kvm, *sptep))) {
 		if (gfn != get_mmio_spte_gfn(*sptep)) {
 			mmu_spte_clear_no_track(sptep);
 			return true;
@@ -6111,6 +6111,7 @@ int kvm_mmu_init_vm(struct kvm *kvm)
 	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
 	int r;
 
+	kvm->arch.shadow_mmio_value = shadow_mmio_value;
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
 	INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index cc0bc058fb25..a23e9205fc42 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -74,10 +74,10 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
 	u64 spte = generation_mmio_spte_mask(gen);
 	u64 gpa = gfn << PAGE_SHIFT;
 
-	WARN_ON_ONCE(!shadow_mmio_value);
+	WARN_ON_ONCE(!vcpu->kvm->arch.shadow_mmio_value);
 
 	access &= shadow_mmio_access_mask;
-	spte |= shadow_mmio_value | access;
+	spte |= vcpu->kvm->arch.shadow_mmio_value | access;
 	spte |= gpa | shadow_nonpresent_or_rsvd_mask;
 	spte |= (gpa & shadow_nonpresent_or_rsvd_mask)
 		<< SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
@@ -413,6 +413,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
+void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value)
+{
+	kvm->arch.shadow_mmio_value = mmio_value;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_value);
+
 void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask)
 {
 	/* shadow_me_value must be a subset of shadow_me_mask */
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 471378ee9071..256395eb593f 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -251,9 +251,9 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
 	return to_shadow_page(__pa(sptep));
 }
 
-static inline bool is_mmio_spte(u64 spte)
+static inline bool is_mmio_spte(struct kvm *kvm, u64 spte)
 {
-	return (spte & shadow_mmio_mask) == shadow_mmio_value &&
+	return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value &&
 	       likely(enable_mmio_caching);
 }
 
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 6111e3e9266d..dffacb7eb15a 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -19,6 +19,14 @@ int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
 {
 	struct workqueue_struct *wq;
 
+	/*
+	 * TDs require mmio_caching to clear suppress_ve bit of SPTE for GPA
+	 * of MMIO so that TD can convert #VE triggered by MMIO into
+	 * TDG.VP.VMCALL<MMIO>.
+	 */
+	if (kvm->arch.vm_type == KVM_X86_TDX_VM && !enable_mmio_caching)
+		return -EOPNOTSUPP;
+
 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
 		return 0;
 
@@ -587,8 +595,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 		 * impact the guest since both the former and current SPTEs
 		 * are nonpresent.
 		 */
-		if (WARN_ON(!is_mmio_spte(old_spte) &&
-			    !is_mmio_spte(new_spte) &&
+		if (WARN_ON(!is_mmio_spte(kvm, old_spte) &&
+			    !is_mmio_spte(kvm, new_spte) &&
 			    !is_removed_spte(new_spte)))
 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
 			       "should not be replaced with another,\n"
@@ -1114,7 +1122,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
 	}
 
 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
-	if (unlikely(is_mmio_spte(new_spte))) {
+	if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
 		vcpu->stat.pf_mmio_spte_created++;
 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
 				     new_spte);
-- 
2.25.1

Re: [PATCH v11 033/113] KVM: x86/mmu: Track shadow MMIO value on a per-VM basis

Posted by Huang, Kai 3 years ago

On Thu, 2023-01-12 at 08:31 -0800, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> TDX will use a different shadow PTE entry value for MMIO from VMX.  Add
> members to kvm_arch and track value for MMIO per-VM instead of global
> variables.  By using the per-VM EPT entry value for MMIO, the existing VMX
> logic is kept working.  Introduce a separate setter function so that guest
> TD can override later.

The guest TD itself cannot override.  It is KVM who overrides it for a TDX
guest.
 
> 
> Also require mmio spte cachcing for TDX.  Actually this is true case
			 ^
			 spell check.

> because TDX require EPT and KVM EPT allows mmio spte caching.

The second sentence doesn't make sense.  IIUC "TDX requires EPT + EPT _allows_
MMIO caching = TDX _allows_ MMIO caching", which is different from "TDX
_requires_ MMIO caching).

> 
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  2 ++
>  arch/x86/kvm/mmu.h              |  1 +
>  arch/x86/kvm/mmu/mmu.c          |  7 ++++---
>  arch/x86/kvm/mmu/spte.c         | 10 ++++++++--
>  arch/x86/kvm/mmu/spte.h         |  4 ++--
>  arch/x86/kvm/mmu/tdp_mmu.c      | 14 +++++++++++---
>  6 files changed, 28 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 73c987b3d2b6..807da4b95aba 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1243,6 +1243,8 @@ struct kvm_arch {
>  	 */
>  	spinlock_t mmu_unsync_pages_lock;
>  
> +	u64 shadow_mmio_value;
> +
>  	struct list_head assigned_dev_head;
>  	struct iommu_domain *iommu_domain;
>  	bool iommu_noncoherent;
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index a45f7a96b821..50d240d52697 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -101,6 +101,7 @@ static inline u8 kvm_get_shadow_phys_bits(void)
>  }
>  
>  void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
> +void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value);
>  void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask);
>  void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
>  
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 59befdfeec23..8d3d7deebdd0 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -2450,7 +2450,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
>  				return kvm_mmu_prepare_zap_page(kvm, child,
>  								invalid_list);
>  		}
> -	} else if (is_mmio_spte(pte)) {
> +	} else if (is_mmio_spte(kvm, pte)) {
>  		mmu_spte_clear_no_track(spte);
>  	}
>  	return 0;
> @@ -4119,7 +4119,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
>  	if (WARN_ON(reserved))
>  		return -EINVAL;
>  
> -	if (is_mmio_spte(spte)) {
> +	if (is_mmio_spte(vcpu->kvm, spte)) {
>  		gfn_t gfn = get_mmio_spte_gfn(spte);
>  		unsigned int access = get_mmio_spte_access(spte);
>  
> @@ -4628,7 +4628,7 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu)
>  static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
>  			   unsigned int access)
>  {
> -	if (unlikely(is_mmio_spte(*sptep))) {
> +	if (unlikely(is_mmio_spte(vcpu->kvm, *sptep))) {
>  		if (gfn != get_mmio_spte_gfn(*sptep)) {
>  			mmu_spte_clear_no_track(sptep);
>  			return true;
> @@ -6111,6 +6111,7 @@ int kvm_mmu_init_vm(struct kvm *kvm)
>  	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
>  	int r;
>  
> +	kvm->arch.shadow_mmio_value = shadow_mmio_value;
>  	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
>  	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
>  	INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
> diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
> index cc0bc058fb25..a23e9205fc42 100644
> --- a/arch/x86/kvm/mmu/spte.c
> +++ b/arch/x86/kvm/mmu/spte.c
> @@ -74,10 +74,10 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
>  	u64 spte = generation_mmio_spte_mask(gen);
>  	u64 gpa = gfn << PAGE_SHIFT;
>  
> -	WARN_ON_ONCE(!shadow_mmio_value);
> +	WARN_ON_ONCE(!vcpu->kvm->arch.shadow_mmio_value);
>  
>  	access &= shadow_mmio_access_mask;
> -	spte |= shadow_mmio_value | access;
> +	spte |= vcpu->kvm->arch.shadow_mmio_value | access;
>  	spte |= gpa | shadow_nonpresent_or_rsvd_mask;
>  	spte |= (gpa & shadow_nonpresent_or_rsvd_mask)
>  		<< SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
> @@ -413,6 +413,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
>  
> +void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value)
> +{
> +	kvm->arch.shadow_mmio_value = mmio_value;
> +}
> +EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_value);
> +
>  void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask)
>  {
>  	/* shadow_me_value must be a subset of shadow_me_mask */
> diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
> index 471378ee9071..256395eb593f 100644
> --- a/arch/x86/kvm/mmu/spte.h
> +++ b/arch/x86/kvm/mmu/spte.h
> @@ -251,9 +251,9 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
>  	return to_shadow_page(__pa(sptep));
>  }
>  
> -static inline bool is_mmio_spte(u64 spte)
> +static inline bool is_mmio_spte(struct kvm *kvm, u64 spte)
>  {
> -	return (spte & shadow_mmio_mask) == shadow_mmio_value &&
> +	return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value &&
>  	       likely(enable_mmio_caching);
>  }
>  
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 6111e3e9266d..dffacb7eb15a 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -19,6 +19,14 @@ int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
>  {
>  	struct workqueue_struct *wq;
>  
> +	/*
> +	 * TDs require mmio_caching to clear suppress_ve bit of SPTE for GPA
> +	 * of MMIO so that TD can convert #VE triggered by MMIO into
> +	 * TDG.VP.VMCALL<MMIO>.
> +	 */
> +	if (kvm->arch.vm_type == KVM_X86_TDX_VM && !enable_mmio_caching)
> +		return -EOPNOTSUPP;

SEV-ES does the check in hardware_setup:

void __init sev_hardware_setup(void)
{
	...
	/*
         * SEV-ES requires MMIO caching as KVM doesn't have access to the guest
         * instruction stream, i.e. can't emulate in response to a #NPF and
         * instead relies on #NPF(RSVD) being reflected into the guest as #VC
         * (the guest can then do a #VMGEXIT to request MMIO emulation).
         */
        if (!enable_mmio_caching)
                goto out;

	...
}

TDX should be done in the same way.

And IMO this chunk really doesn't belong to this patch -- I interpret this patch
as a "infrastructure patch to track shadow MMIO value on per-VM basis" (which
even should have no functional change IMHO), but this chunk is clearly doing
more than that.

> +
>  	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
>  		return 0;
>  
> @@ -587,8 +595,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
>  		 * impact the guest since both the former and current SPTEs
>  		 * are nonpresent.
>  		 */
> -		if (WARN_ON(!is_mmio_spte(old_spte) &&
> -			    !is_mmio_spte(new_spte) &&
> +		if (WARN_ON(!is_mmio_spte(kvm, old_spte) &&
> +			    !is_mmio_spte(kvm, new_spte) &&
>  			    !is_removed_spte(new_spte)))
>  			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
>  			       "should not be replaced with another,\n"
> @@ -1114,7 +1122,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
>  	}
>  
>  	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
> -	if (unlikely(is_mmio_spte(new_spte))) {
> +	if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
>  		vcpu->stat.pf_mmio_spte_created++;
>  		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
>  				     new_spte);

Re: [PATCH v11 033/113] KVM: x86/mmu: Track shadow MMIO value on a per-VM basis

Posted by Isaku Yamahata 2 years, 11 months ago

On Mon, Jan 16, 2023 at 11:16:04AM +0000,
"Huang, Kai" <kai.huang@intel.com> wrote:

> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index 6111e3e9266d..dffacb7eb15a 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -19,6 +19,14 @@ int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
> >  {
> >  	struct workqueue_struct *wq;
> >  
> > +	/*
> > +	 * TDs require mmio_caching to clear suppress_ve bit of SPTE for GPA
> > +	 * of MMIO so that TD can convert #VE triggered by MMIO into
> > +	 * TDG.VP.VMCALL<MMIO>.
> > +	 */
> > +	if (kvm->arch.vm_type == KVM_X86_TDX_VM && !enable_mmio_caching)
> > +		return -EOPNOTSUPP;
> 
> SEV-ES does the check in hardware_setup:
> 
> void __init sev_hardware_setup(void)
> {
> 	...
> 	/*
>          * SEV-ES requires MMIO caching as KVM doesn't have access to the guest
>          * instruction stream, i.e. can't emulate in response to a #NPF and
>          * instead relies on #NPF(RSVD) being reflected into the guest as #VC
>          * (the guest can then do a #VMGEXIT to request MMIO emulation).
>          */
>         if (!enable_mmio_caching)
>                 goto out;
> 
> 	...
> }
> 
> TDX should be done in the same way.
> 
> And IMO this chunk really doesn't belong to this patch -- I interpret this patch
> as a "infrastructure patch to track shadow MMIO value on per-VM basis" (which
> even should have no functional change IMHO), but this chunk is clearly doing
> more than that.

It's cleaner to do in hardware_setup().  So I moved the logic into
hardware_setup() and an independent patch.
-- 
Isaku Yamahata <isaku.yamahata@gmail.com>