TDX: Dynamic PAMT + S-EPT Hugepage

[RFC PATCH v5 08/45] KVM: x86/mmu: Propagate mirror SPTE removal to S-EPT in handle_changed_spte()

Posted by Sean Christopherson 1 week, 3 days ago

Invoke .remove_external_spte() in handle_changed_spte() as appropriate
instead of relying on callers to do the right thing.  Relying on callers
to invoke .remove_external_spte() is confusing and brittle, e.g. subtly
relies tdp_mmu_set_spte_atomic() never removing SPTEs, and removing an
S-EPT entry in tdp_mmu_set_spte() is bizarre (yeah, the VM is bugged so
it doesn't matter in practice, but it's still weird).

Implementing rules-based logic in a common chokepoint will also make it
easier to reason about the correctness of splitting hugepages when support
for S-EPT hugepages comes along.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/tdp_mmu.c | 43 +++++++++++++-------------------------
 1 file changed, 14 insertions(+), 29 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 8743cd020d12..27ac520f2a89 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -359,25 +359,6 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 }
 
-static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
-				 int level)
-{
-	/*
-	 * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
-	 * PTs are removed in a special order, involving free_external_spt().
-	 * But remove_external_spte() will be called on non-leaf PTEs via
-	 * __tdp_mmu_zap_root(), so avoid the error the former would return
-	 * in this case.
-	 */
-	if (!is_last_spte(old_spte, level))
-		return;
-
-	/* Zapping leaf spte is allowed only when write lock is held. */
-	lockdep_assert_held_write(&kvm->mmu_lock);
-
-	kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte);
-}
-
 /**
  * handle_removed_pt() - handle a page table removed from the TDP structure
  *
@@ -473,11 +454,6 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
 		}
 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), sptep, gfn,
 				    old_spte, FROZEN_SPTE, level, shared);
-
-		if (is_mirror_sp(sp)) {
-			KVM_BUG_ON(shared, kvm);
-			remove_external_spte(kvm, gfn, old_spte, level);
-		}
 	}
 
 	if (is_mirror_sp(sp) &&
@@ -590,10 +566,21 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
 	 * the paging structure.  Note the WARN on the PFN changing without the
 	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
 	 * pages are kernel allocations and should never be migrated.
+	 *
+	 * When removing leaf entries from a mirror, immediately propagate the
+	 * changes to the external page tables.  Note, non-leaf mirror entries
+	 * are handled by handle_removed_pt(), as TDX requires that all leaf
+	 * entries are removed before the owning page table.  Note #2, writes
+	 * to make mirror PTEs shadow-present are propagated to external page
+	 * tables by __tdp_mmu_set_spte_atomic(), as KVM needs to ensure the
+	 * external page table was successfully updated before marking the
+	 * mirror SPTE present.
 	 */
 	if (was_present && !was_leaf &&
 	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
 		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
+	else if (was_leaf && is_mirror_sptep(sptep) && !is_leaf)
+		kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte);
 }
 
 static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
@@ -725,12 +712,10 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
 
 	/*
 	 * Users that do non-atomic setting of PTEs don't operate on mirror
-	 * roots, so don't handle it and bug the VM if it's seen.
+	 * roots.  Bug the VM as this path doesn't propagate such writes to the
+	 * external page tables.
 	 */
-	if (is_mirror_sptep(sptep)) {
-		KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
-		remove_external_spte(kvm, gfn, old_spte, level);
-	}
+	KVM_BUG_ON(is_mirror_sptep(sptep) && is_shadow_present_pte(new_spte), kvm);
 
 	return old_spte;
 }
-- 
2.53.0.rc1.217.geba53bf80e-goog

Re: [RFC PATCH v5 08/45] KVM: x86/mmu: Propagate mirror SPTE removal to S-EPT in handle_changed_spte()

Posted by Yan Zhao 4 days, 7 hours ago

On Wed, Jan 28, 2026 at 05:14:40PM -0800, Sean Christopherson wrote:
> Invoke .remove_external_spte() in handle_changed_spte() as appropriate
> instead of relying on callers to do the right thing.  Relying on callers
> to invoke .remove_external_spte() is confusing and brittle, e.g. subtly
> relies tdp_mmu_set_spte_atomic() never removing SPTEs, and removing an
> S-EPT entry in tdp_mmu_set_spte() is bizarre (yeah, the VM is bugged so
> it doesn't matter in practice, but it's still weird).
> 
> Implementing rules-based logic in a common chokepoint will also make it
> easier to reason about the correctness of splitting hugepages when support
> for S-EPT hugepages comes along.
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/kvm/mmu/tdp_mmu.c | 43 +++++++++++++-------------------------
>  1 file changed, 14 insertions(+), 29 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 8743cd020d12..27ac520f2a89 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -359,25 +359,6 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>  	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
>  }
>  
> -static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
> -				 int level)
> -{
> -	/*
> -	 * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
> -	 * PTs are removed in a special order, involving free_external_spt().
> -	 * But remove_external_spte() will be called on non-leaf PTEs via
> -	 * __tdp_mmu_zap_root(), so avoid the error the former would return
> -	 * in this case.
> -	 */
> -	if (!is_last_spte(old_spte, level))
> -		return;
> -
> -	/* Zapping leaf spte is allowed only when write lock is held. */
> -	lockdep_assert_held_write(&kvm->mmu_lock);
> -
> -	kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte);
> -}
> -
>  /**
>   * handle_removed_pt() - handle a page table removed from the TDP structure
>   *
> @@ -473,11 +454,6 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
>  		}
>  		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), sptep, gfn,
>  				    old_spte, FROZEN_SPTE, level, shared);
> -
> -		if (is_mirror_sp(sp)) {
> -			KVM_BUG_ON(shared, kvm);
> -			remove_external_spte(kvm, gfn, old_spte, level);
> -		}
>  	}
>  
>  	if (is_mirror_sp(sp) &&
> @@ -590,10 +566,21 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
>  	 * the paging structure.  Note the WARN on the PFN changing without the
>  	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
>  	 * pages are kernel allocations and should never be migrated.
> +	 *
> +	 * When removing leaf entries from a mirror, immediately propagate the
> +	 * changes to the external page tables.  Note, non-leaf mirror entries
> +	 * are handled by handle_removed_pt(), as TDX requires that all leaf
> +	 * entries are removed before the owning page table.  Note #2, writes
> +	 * to make mirror PTEs shadow-present are propagated to external page
> +	 * tables by __tdp_mmu_set_spte_atomic(), as KVM needs to ensure the
> +	 * external page table was successfully updated before marking the
> +	 * mirror SPTE present.
>  	 */
>  	if (was_present && !was_leaf &&
>  	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
>  		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
> +	else if (was_leaf && is_mirror_sptep(sptep) && !is_leaf)
Should we check !is_present instead of !is_leaf?
e.g. a transition from a present leaf entry to a present non-leaf entry could
also trigger this if case.

Besides, need "KVM_BUG_ON(shared, kvm)" in this case.
> +		kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte);
>  }
>  
>  static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
> @@ -725,12 +712,10 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
>  
>  	/*
>  	 * Users that do non-atomic setting of PTEs don't operate on mirror
> -	 * roots, so don't handle it and bug the VM if it's seen.
> +	 * roots.  Bug the VM as this path doesn't propagate such writes to the
> +	 * external page tables.
>  	 */
> -	if (is_mirror_sptep(sptep)) {
> -		KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
> -		remove_external_spte(kvm, gfn, old_spte, level);
> -	}
> +	KVM_BUG_ON(is_mirror_sptep(sptep) && is_shadow_present_pte(new_spte), kvm);
>  
>  	return old_spte;
>  }
> -- 
> 2.53.0.rc1.217.geba53bf80e-goog
>

Re: [RFC PATCH v5 08/45] KVM: x86/mmu: Propagate mirror SPTE removal to S-EPT in handle_changed_spte()

Posted by Sean Christopherson 3 days, 14 hours ago

On Wed, Feb 04, 2026, Yan Zhao wrote:
> On Wed, Jan 28, 2026 at 05:14:40PM -0800, Sean Christopherson wrote:
> > @@ -590,10 +566,21 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
> >  	 * the paging structure.  Note the WARN on the PFN changing without the
> >  	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
> >  	 * pages are kernel allocations and should never be migrated.
> > +	 *
> > +	 * When removing leaf entries from a mirror, immediately propagate the
> > +	 * changes to the external page tables.  Note, non-leaf mirror entries
> > +	 * are handled by handle_removed_pt(), as TDX requires that all leaf
> > +	 * entries are removed before the owning page table.  Note #2, writes
> > +	 * to make mirror PTEs shadow-present are propagated to external page
> > +	 * tables by __tdp_mmu_set_spte_atomic(), as KVM needs to ensure the
> > +	 * external page table was successfully updated before marking the
> > +	 * mirror SPTE present.
> >  	 */
> >  	if (was_present && !was_leaf &&
> >  	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
> >  		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
> > +	else if (was_leaf && is_mirror_sptep(sptep) && !is_leaf)
> Should we check !is_present instead of !is_leaf?
> e.g. a transition from a present leaf entry to a present non-leaf entry could
> also trigger this if case.

No, the !is_leaf check is very intentional.  At this point in the series, S-EPT
doesn't support hugepages.  If KVM manages to install a leaf SPTE and replaces
that SPTE with a non-leaf SPTE, then we absolutely want the KVM_BUG_ON() in
tdx_sept_remove_private_spte() to fire:

	/* TODO: handle large pages. */
	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
		return -EIO;

And then later on, when S-EPT gains support for hugepages, "KVM: TDX: Add core
support for splitting/demoting 2MiB S-EPT to 4KiB" doesn't need to touch code
outside of arch/x86/kvm/vmx/tdx.c, because everything has already been plumbed
in.

> Besides, need "KVM_BUG_ON(shared, kvm)" in this case.

Eh, we have lockdep_assert_held_write() in the S-EPT paths that require mmu_lock
to be held for write.  I don't think a KVM_BUG_ON() here would add meaningful
value.

Re: [RFC PATCH v5 08/45] KVM: x86/mmu: Propagate mirror SPTE removal to S-EPT in handle_changed_spte()

Posted by Yan Zhao 3 days, 11 hours ago

On Wed, Feb 04, 2026 at 06:23:38PM -0800, Sean Christopherson wrote:
> On Wed, Feb 04, 2026, Yan Zhao wrote:
> > On Wed, Jan 28, 2026 at 05:14:40PM -0800, Sean Christopherson wrote:
> > > @@ -590,10 +566,21 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
> > >  	 * the paging structure.  Note the WARN on the PFN changing without the
> > >  	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
> > >  	 * pages are kernel allocations and should never be migrated.
> > > +	 *
> > > +	 * When removing leaf entries from a mirror, immediately propagate the
> > > +	 * changes to the external page tables.  Note, non-leaf mirror entries
> > > +	 * are handled by handle_removed_pt(), as TDX requires that all leaf
> > > +	 * entries are removed before the owning page table.  Note #2, writes
> > > +	 * to make mirror PTEs shadow-present are propagated to external page
> > > +	 * tables by __tdp_mmu_set_spte_atomic(), as KVM needs to ensure the
> > > +	 * external page table was successfully updated before marking the
> > > +	 * mirror SPTE present.
> > >  	 */
> > >  	if (was_present && !was_leaf &&
> > >  	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
> > >  		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
> > > +	else if (was_leaf && is_mirror_sptep(sptep) && !is_leaf)
> > Should we check !is_present instead of !is_leaf?
> > e.g. a transition from a present leaf entry to a present non-leaf entry could
> > also trigger this if case.
> 
> No, the !is_leaf check is very intentional.  At this point in the series, S-EPT
> doesn't support hugepages.  If KVM manages to install a leaf SPTE and replaces
> that SPTE with a non-leaf SPTE, then we absolutely want the KVM_BUG_ON() in
> tdx_sept_remove_private_spte() to fire:
> 
> 	/* TODO: handle large pages. */
> 	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
> 		return -EIO;
But the op is named remove_external_spte().
And the check of "level != PG_LEVEL_4K" is for removing large leaf entries.
Relying on this check is tricky and confusing.

> And then later on, when S-EPT gains support for hugepages, "KVM: TDX: Add core
> support for splitting/demoting 2MiB S-EPT to 4KiB" doesn't need to touch code
> outside of arch/x86/kvm/vmx/tdx.c, because everything has already been plumbed
> in.
I haven't looked at the later patches for huge pages, but plumbing here directly
for splitting does not look right when it's invoked under shared mmu_lock.
See the comment below.
 
> > Besides, need "KVM_BUG_ON(shared, kvm)" in this case.
> 
> Eh, we have lockdep_assert_held_write() in the S-EPT paths that require mmu_lock
> to be held for write.  I don't think a KVM_BUG_ON() here would add meaningful
> value.
Hmm, I think KVM_BUG_ON(shared, kvm) is still useful.
If KVM invokes remove_external_spte() under shared mmu_lock, it needs to freeze
the entry first, similar to the sequence in __tdp_mmu_set_spte_atomic().

i.e., invoking external x86 ops in handle_changed_spte() for mirror roots should
be !shared only.

Relying on the TDX code's lockdep_assert_held_write() for warning seems less
clear than having an explicit check here.

Re: [RFC PATCH v5 08/45] KVM: x86/mmu: Propagate mirror SPTE removal to S-EPT in handle_changed_spte()

Posted by Sean Christopherson 2 days, 18 hours ago

On Thu, Feb 05, 2026, Yan Zhao wrote:
> On Wed, Feb 04, 2026 at 06:23:38PM -0800, Sean Christopherson wrote:
> > On Wed, Feb 04, 2026, Yan Zhao wrote:
> > > On Wed, Jan 28, 2026 at 05:14:40PM -0800, Sean Christopherson wrote:
> > > > @@ -590,10 +566,21 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
> > > >  	 * the paging structure.  Note the WARN on the PFN changing without the
> > > >  	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
> > > >  	 * pages are kernel allocations and should never be migrated.
> > > > +	 *
> > > > +	 * When removing leaf entries from a mirror, immediately propagate the
> > > > +	 * changes to the external page tables.  Note, non-leaf mirror entries
> > > > +	 * are handled by handle_removed_pt(), as TDX requires that all leaf
> > > > +	 * entries are removed before the owning page table.  Note #2, writes
> > > > +	 * to make mirror PTEs shadow-present are propagated to external page
> > > > +	 * tables by __tdp_mmu_set_spte_atomic(), as KVM needs to ensure the
> > > > +	 * external page table was successfully updated before marking the
> > > > +	 * mirror SPTE present.
> > > >  	 */
> > > >  	if (was_present && !was_leaf &&
> > > >  	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
> > > >  		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
> > > > +	else if (was_leaf && is_mirror_sptep(sptep) && !is_leaf)
> > > Should we check !is_present instead of !is_leaf?
> > > e.g. a transition from a present leaf entry to a present non-leaf entry could
> > > also trigger this if case.
> > 
> > No, the !is_leaf check is very intentional.  At this point in the series, S-EPT
> > doesn't support hugepages.  If KVM manages to install a leaf SPTE and replaces
> > that SPTE with a non-leaf SPTE, then we absolutely want the KVM_BUG_ON() in
> > tdx_sept_remove_private_spte() to fire:
> > 
> > 	/* TODO: handle large pages. */
> > 	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
> > 		return -EIO;
> But the op is named remove_external_spte().
> And the check of "level != PG_LEVEL_4K" is for removing large leaf entries.

I agree that the naming at this point in the series is unfortunate, but I don't
see it as outright wrong.  That the TDP MMU could theoretically replace the leaf
SPTE with a non-leaf SPTE doesn't change the fact that the old leaf SPTE *is*
being removed.

> Relying on this check is tricky and confusing.

If it's still confusing at the end of the series, then I'm happy to discuss how
we can make it less confusion.  But as of this point in the series, I unfortunately
don't see a better way to achieve my end goals (reducing the number of kvm_x86_ops
hooks, and reducing how many TDX specific details bleed into common MMU code).

There are "different" ways to incrementally move from where were at today, to where
I want KVM to be, but I don't see them as "better".  I.e. AFAICT, there's no way
to move incrementally with reviewable patches while also maintaining perfect/ideal
naming and flow.

> > And then later on, when S-EPT gains support for hugepages, "KVM: TDX: Add core
> > support for splitting/demoting 2MiB S-EPT to 4KiB" doesn't need to touch code
> > outside of arch/x86/kvm/vmx/tdx.c, because everything has already been plumbed
> > in.
> I haven't looked at the later patches for huge pages,

Please do.  As above, I don't think it's realistic to completely avoid some amount
of "eww" in the intermediate stages.

> but plumbing here directly for splitting does not look right when it's
> invoked under shared mmu_lock.
> See the comment below.
>  
> > > Besides, need "KVM_BUG_ON(shared, kvm)" in this case.
> > 
> > Eh, we have lockdep_assert_held_write() in the S-EPT paths that require mmu_lock
> > to be held for write.  I don't think a KVM_BUG_ON() here would add meaningful
> > value.
> Hmm, I think KVM_BUG_ON(shared, kvm) is still useful.
> If KVM invokes remove_external_spte() under shared mmu_lock, it needs to freeze
> the entry first, similar to the sequence in __tdp_mmu_set_spte_atomic().
> 
> i.e., invoking external x86 ops in handle_changed_spte() for mirror roots should
> be !shared only.

Sure, but...

> Relying on the TDX code's lockdep_assert_held_write() for warning seems less
> clear than having an explicit check here.

...that's TDX's responsibility to enforce, and I don't see any justification for
something more than a lockdep assertion.  As I've said elsewhere, several times,
at some point we have to commit to getting the code right.  Adding KVM_BUG_ON() in
Every. Single. Call. does not yield more maintainable code.  There are myriad
things KVM can screw up, many of which have far, far more harmful impact than
calling an S-EPT hook with mmu_lock held for read instead of write.

The bar for adding a KVM_BUG_ON() is not simply "this shouldn't happen".  It's,
this shouldn't happen *and* at least one of (not a complete list):

  - we've either screwed this up badly more than once
  - it's really hard to get right
  - we might not notice if we do screw it up
  - KVM might corrupt data if we continue on

Re: [RFC PATCH v5 08/45] KVM: x86/mmu: Propagate mirror SPTE removal to S-EPT in handle_changed_spte()

Posted by Yan Zhao 2 days, 14 hours ago

On Thu, Feb 05, 2026 at 02:33:16PM -0800, Sean Christopherson wrote:
> On Thu, Feb 05, 2026, Yan Zhao wrote:
> > On Wed, Feb 04, 2026 at 06:23:38PM -0800, Sean Christopherson wrote:
> > > On Wed, Feb 04, 2026, Yan Zhao wrote:
> > > > On Wed, Jan 28, 2026 at 05:14:40PM -0800, Sean Christopherson wrote:
> > > > > @@ -590,10 +566,21 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
> > > > >  	 * the paging structure.  Note the WARN on the PFN changing without the
> > > > >  	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
> > > > >  	 * pages are kernel allocations and should never be migrated.
> > > > > +	 *
> > > > > +	 * When removing leaf entries from a mirror, immediately propagate the
> > > > > +	 * changes to the external page tables.  Note, non-leaf mirror entries
> > > > > +	 * are handled by handle_removed_pt(), as TDX requires that all leaf
> > > > > +	 * entries are removed before the owning page table.  Note #2, writes
> > > > > +	 * to make mirror PTEs shadow-present are propagated to external page
> > > > > +	 * tables by __tdp_mmu_set_spte_atomic(), as KVM needs to ensure the
> > > > > +	 * external page table was successfully updated before marking the
> > > > > +	 * mirror SPTE present.
> > > > >  	 */
> > > > >  	if (was_present && !was_leaf &&
> > > > >  	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
> > > > >  		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
> > > > > +	else if (was_leaf && is_mirror_sptep(sptep) && !is_leaf)
> > > > Should we check !is_present instead of !is_leaf?
> > > > e.g. a transition from a present leaf entry to a present non-leaf entry could
> > > > also trigger this if case.
> > > 
> > > No, the !is_leaf check is very intentional.  At this point in the series, S-EPT
> > > doesn't support hugepages.  If KVM manages to install a leaf SPTE and replaces
> > > that SPTE with a non-leaf SPTE, then we absolutely want the KVM_BUG_ON() in
> > > tdx_sept_remove_private_spte() to fire:
> > > 
> > > 	/* TODO: handle large pages. */
> > > 	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
> > > 		return -EIO;
> > But the op is named remove_external_spte().
> > And the check of "level != PG_LEVEL_4K" is for removing large leaf entries.
> 
> I agree that the naming at this point in the series is unfortunate, but I don't
> see it as outright wrong.  That the TDP MMU could theoretically replace the leaf
> SPTE with a non-leaf SPTE doesn't change the fact that the old leaf SPTE *is*
> being removed.
Hmm, I can't agree with that. But I won't insist if you think it's ok :)


> > Relying on this check is tricky and confusing.
> 
> If it's still confusing at the end of the series, then I'm happy to discuss how
> we can make it less confusion.  But as of this point in the series, I unfortunately
> don't see a better way to achieve my end goals (reducing the number of kvm_x86_ops
> hooks, and reducing how many TDX specific details bleed into common MMU code).
> 
> There are "different" ways to incrementally move from where were at today, to where
> I want KVM to be, but I don't see them as "better".  I.e. AFAICT, there's no way
> to move incrementally with reviewable patches while also maintaining perfect/ideal
> naming and flow.
Ok.

> > > And then later on, when S-EPT gains support for hugepages, "KVM: TDX: Add core
> > > support for splitting/demoting 2MiB S-EPT to 4KiB" doesn't need to touch code
> > > outside of arch/x86/kvm/vmx/tdx.c, because everything has already been plumbed
> > > in.
> > I haven't looked at the later patches for huge pages,
> 
> Please do.  As above, I don't think it's realistic to completely avoid some amount
> of "eww" in the intermediate stages.
WIP. Found some issues. Will comment after investigating further.

> > but plumbing here directly for splitting does not look right when it's
> > invoked under shared mmu_lock.
> > See the comment below.
> >  
> > > > Besides, need "KVM_BUG_ON(shared, kvm)" in this case.
> > > 
> > > Eh, we have lockdep_assert_held_write() in the S-EPT paths that require mmu_lock
> > > to be held for write.  I don't think a KVM_BUG_ON() here would add meaningful
> > > value.
> > Hmm, I think KVM_BUG_ON(shared, kvm) is still useful.
> > If KVM invokes remove_external_spte() under shared mmu_lock, it needs to freeze
> > the entry first, similar to the sequence in __tdp_mmu_set_spte_atomic().
> > 
> > i.e., invoking external x86 ops in handle_changed_spte() for mirror roots should
> > be !shared only.
> 
> Sure, but...
> 
> > Relying on the TDX code's lockdep_assert_held_write() for warning seems less
> > clear than having an explicit check here.
> 
> ...that's TDX's responsibility to enforce, and I don't see any justification for
> something more than a lockdep assertion.  As I've said elsewhere, several times,
My concern is that handle_changed_spte() can be invoked by callers other than
tdp_mmu_set_spte(). e.g.,

tdp_mmu_set_spte_atomic
  | __tdp_mmu_set_spte_atomic
  |     | kvm_x86_call(set_external_spte)
  | handle_changed_spte
        | kvm_x86_call(set_external_spte)

When !is_frozen_spte(new_spte) and was_leaf, set_external_spte() may be invoked
twice for splitting under shared mmu_lock.

Therefore, I think it would be better to check for !shared and only invoke
set_external_spte() when !shared.

BTW: in the patch log, you mentioned that

: Invoke .remove_external_spte() in handle_changed_spte() as appropriate
: instead of relying on callers to do the right thing.  Relying on callers
: to invoke .remove_external_spte() is confusing and brittle, e.g. subtly
: relies tdp_mmu_set_spte_atomic() never removing SPTEs, and removing an
: S-EPT entry in tdp_mmu_set_spte() is bizarre (yeah, the VM is bugged so
: it doesn't matter in practice, but it's still weird).

However, when tdp_mmu_set_spte_atomic() removes SPTEs in the future, it will
still need to follow the sequence in __tdp_mmu_set_spte_atomic() :
1. freeze, 2. set_external_spte(). 3. set the new_spte 4. handle_changed_spte().

So, do you think we should leave set_external_spte() in tdp_mmu_set_spte() for
exclusive mmu_lock scenarios instead of moving it to handle_changed_spte()?

> at some point we have to commit to getting the code right.  Adding KVM_BUG_ON() in
> Every. Single. Call. does not yield more maintainable code.  There are myriad
> things KVM can screw up, many of which have far, far more harmful impact than
> calling an S-EPT hook with mmu_lock held for read instead of write.
> 
> The bar for adding a KVM_BUG_ON() is not simply "this shouldn't happen".  It's,
> this shouldn't happen *and* at least one of (not a complete list):
> 
>   - we've either screwed this up badly more than once
>   - it's really hard to get right
>   - we might not notice if we do screw it up
>   - KVM might corrupt data if we continue on
Thanks for sharing this bar.

Re: [RFC PATCH v5 08/45] KVM: x86/mmu: Propagate mirror SPTE removal to S-EPT in handle_changed_spte()

Posted by Sean Christopherson 1 day, 23 hours ago

On Fri, Feb 06, 2026, Yan Zhao wrote:
> On Thu, Feb 05, 2026 at 02:33:16PM -0800, Sean Christopherson wrote:
> > On Thu, Feb 05, 2026, Yan Zhao wrote:
> > > > > >  	if (was_present && !was_leaf &&
> > > > > >  	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
> > > > > >  		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
> > > > > > +	else if (was_leaf && is_mirror_sptep(sptep) && !is_leaf)
> > > > > Should we check !is_present instead of !is_leaf?
> > > > > e.g. a transition from a present leaf entry to a present non-leaf entry could
> > > > > also trigger this if case.
> > > > 
> > > > No, the !is_leaf check is very intentional.  At this point in the series, S-EPT
> > > > doesn't support hugepages.  If KVM manages to install a leaf SPTE and replaces
> > > > that SPTE with a non-leaf SPTE, then we absolutely want the KVM_BUG_ON() in
> > > > tdx_sept_remove_private_spte() to fire:
> > > > 
> > > > 	/* TODO: handle large pages. */
> > > > 	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
> > > > 		return -EIO;
> > > But the op is named remove_external_spte().
> > > And the check of "level != PG_LEVEL_4K" is for removing large leaf entries.
> > 
> > I agree that the naming at this point in the series is unfortunate, but I don't
> > see it as outright wrong.  That the TDP MMU could theoretically replace the leaf
> > SPTE with a non-leaf SPTE doesn't change the fact that the old leaf SPTE *is*
> > being removed.
> Hmm, I can't agree with that. But I won't insist if you think it's ok :)

If the code is read through a TDX lens, then I agree, it's seems wrong.  Because
then you *know* that TDX doesn't support back-to-back remove()=>add() operations
to handle a page split.

But from a TDP MMU perspective, this is entirely logical (ignoring that
link_external_spt() is gone at this point in the series).

	else if (was_leaf && is_mirror_sptep(sptep) && !is_leaf) {
		kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte);

		/*
		 * Link the new page table if a hugepage is being split, i.e.
		 * if a leaf SPTE is being replaced with a non-leaf SPTE.
		 */
		if (is_present)
			kvm_x86_call(link_external_spt)(kvm, gfn, level, ...);
	}

And that is *exactly* why I want to get rid of the hyper-specific kvm_x86_ops
hooks.  They bleed all kinds of implementation details all over the TDP MMU, which
makes it difficult to read and understand the relevant TDP MMU code if you don't
already know the TDX rules.  And I absolutely do not want to effectively require
others to understand TDX's rules to be able to make changes to the TDP MMU.

> > > Relying on the TDX code's lockdep_assert_held_write() for warning seems less
> > > clear than having an explicit check here.
> > 
> > ...that's TDX's responsibility to enforce, and I don't see any justification for
> > something more than a lockdep assertion.  As I've said elsewhere, several times,
> My concern is that handle_changed_spte() can be invoked by callers other than
> tdp_mmu_set_spte(). e.g.,
> 
> tdp_mmu_set_spte_atomic
>   | __tdp_mmu_set_spte_atomic
>   |     | kvm_x86_call(set_external_spte)
>   | handle_changed_spte
>         | kvm_x86_call(set_external_spte)
> 
> When !is_frozen_spte(new_spte) and was_leaf, set_external_spte() may be invoked
> twice for splitting under shared mmu_lock.

But we don't support that yet.  I was going to punt dealing with that to the future,
but now that I've looked at it, I honestly think the problem is that I didn't go
far enough with the cleanup.  I shouldn't have hedged.

What I said in "KVM: TDX: Drop kvm_x86_ops.link_external_spt(), use .set_external_spte()
for all":

 : Ideally, KVM would provide a single pair of hooks to set S-EPT entries,
 : one hook for setting SPTEs under write-lock and another for settings SPTEs
 : under read-lock (e.g. to ensure the entire operation is "atomic", to allow
 : for failure, etc.).  Sadly, TDX's requirement that all child S-EPT entries
 : are removed before the parent makes that impractical: the TDP MMU
 : deliberately prunes non-leaf SPTEs and _then_ processes its children, thus
 : making it quite important for the TDP MMU to differentiate between zapping
 : leaf and non-leaf S-EPT entries.

isn't quite right.  Ideally, KVM would provide *one* hook to set S-EPT entries.
Because the API to actually *set* the external SPTE shouldn't be any different
for read-lock vs. write-lost.  I.e. the child S-EPT case remains the sole
exception.  More below.

> Therefore, I think it would be better to check for !shared and only invoke
> set_external_spte() when !shared.

No.  I am dead set against conditionally invoking set_external_spte() based on
shared vs. exclusive.  The rules for how SPTE updates are forwarded to TDX, and
how TDX reacts to those updates, need to be defined purely on state transitions,
i.e. on old_spte vs. new_spte.

Actually, even that isn't a strong enough statement.  *All* updates need to be
forwarded to TDX, and then TDX can react as appropriate.

> BTW: in the patch log, you mentioned that
> 
> : Invoke .remove_external_spte() in handle_changed_spte() as appropriate
> : instead of relying on callers to do the right thing.  Relying on callers
> : to invoke .remove_external_spte() is confusing and brittle, e.g. subtly
> : relies tdp_mmu_set_spte_atomic() never removing SPTEs, and removing an
> : S-EPT entry in tdp_mmu_set_spte() is bizarre (yeah, the VM is bugged so
> : it doesn't matter in practice, but it's still weird).
> 
> However, when tdp_mmu_set_spte_atomic() removes SPTEs in the future, it will
> still need to follow the sequence in __tdp_mmu_set_spte_atomic() :
> 1. freeze, 2. set_external_spte(). 3. set the new_spte 4. handle_changed_spte().
> 
> So, do you think we should leave set_external_spte() in tdp_mmu_set_spte() for
> exclusive mmu_lock scenarios instead of moving it to handle_changed_spte()?

Nope, as above, 100% the opposite.  Over ~3 patches, e.g.

 1. Drop the KVM_BUG_ON()s or move them to TDX
 2. Morph the !is_frozen_spte() check into a KVM_MMU_WARN_ON()
 3. Rework the code to rely on __handle_changed_spte() to propagate writes to S-EPT

That way, the _only_ path that updates external SPTEs is this:

	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
	else if (is_mirror_sptep(sptep))
		return kvm_x86_call(set_external_spte)(kvm, gfn, old_spte,
						       new_spte, level);

which is fully aligned with handle_changed_spte()'s role for !mirror roots: it
exists to react to changes (the sole exception to that being aging SPTEs, which
is a special case).

Compile-tested only.

---
 arch/x86/kvm/mmu/tdp_mmu.c | 118 ++++++++++++++++++-------------------
 1 file changed, 59 insertions(+), 59 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 847f2fcb6740..33a321aedac0 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -464,7 +464,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
 }
 
 /**
- * handle_changed_spte - handle bookkeeping associated with an SPTE change
+ * __handle_changed_spte - handle bookkeeping associated with an SPTE change
  * @kvm: kvm instance
  * @as_id: the address space of the paging structure the SPTE was a part of
  * @sptep: pointer to the SPTE
@@ -480,9 +480,9 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
  * dirty logging updates are handled in common code, not here (see make_spte()
  * and fast_pf_fix_direct_spte()).
  */
-static void handle_changed_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
-				gfn_t gfn, u64 old_spte, u64 new_spte,
-				int level, bool shared)
+static int __handle_changed_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+				 gfn_t gfn, u64 old_spte, u64 new_spte,
+				 int level, bool shared)
 {
 	bool was_present = is_shadow_present_pte(old_spte);
 	bool is_present = is_shadow_present_pte(new_spte);
@@ -518,7 +518,7 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
 	}
 
 	if (old_spte == new_spte)
-		return;
+		return 0;
 
 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
 
@@ -547,7 +547,7 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
 			       "a temporary frozen SPTE.\n"
 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
 			       as_id, gfn, old_spte, new_spte, level);
-		return;
+		return 0;
 	}
 
 	if (is_leaf != was_leaf)
@@ -559,30 +559,31 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
 	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
 	 * pages are kernel allocations and should never be migrated.
 	 *
-	 * When modifying leaf entries in mirrored page tables, propagate the
-	 * changes to the external SPTE.  Bug the VM on failure, as callers
-	 * aren't prepared to handle errors, e.g. due to lock contention in the
-	 * TDX-Module.  Note, changes to non-leaf mirror SPTEs are handled by
-	 * handle_removed_pt() (the TDX-Module requires that child entries are
-	 * removed before the parent SPTE), and changes to non-present mirror
-	 * SPTEs are handled by __tdp_mmu_set_spte_atomic() (KVM needs to set
-	 * the external SPTE while the mirror SPTE is frozen so that installing
-	 * a new SPTE is effectively an atomic operation).
+	 * When modifying leaf entries in mirrored page tables, propagate all
+	 * changes to the external SPTE.
 	 */
 	if (was_present && !was_leaf &&
 	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
 		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
-	else if (was_leaf && is_mirror_sptep(sptep))
-		KVM_BUG_ON(kvm_x86_call(set_external_spte)(kvm, gfn, old_spte,
-							   new_spte, level), kvm);
+	else if (is_mirror_sptep(sptep))
+		return kvm_x86_call(set_external_spte)(kvm, gfn, old_spte,
+						       new_spte, level);
+
+	return 0;
+}
+
+static void handle_changed_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+				gfn_t gfn, u64 old_spte, u64 new_spte,
+				int level, bool shared)
+{
+	KVM_BUG_ON(__handle_changed_spte(kvm, as_id, sptep, gfn, old_spte,
+					 new_spte, level, shared), kvm);
 }
 
 static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
 							 struct tdp_iter *iter,
 							 u64 new_spte)
 {
-	u64 *raw_sptep = rcu_dereference(iter->sptep);
-
 	/*
 	 * The caller is responsible for ensuring the old SPTE is not a FROZEN
 	 * SPTE.  KVM should never attempt to zap or manipulate a FROZEN SPTE,
@@ -591,40 +592,6 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
 	 */
 	WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
 
-	if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
-		int ret;
-
-		/*
-		 * KVM doesn't currently support zapping or splitting mirror
-		 * SPTEs while holding mmu_lock for read.
-		 */
-		if (KVM_BUG_ON(is_shadow_present_pte(iter->old_spte), kvm) ||
-		    KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
-			return -EBUSY;
-
-		/*
-		 * Temporarily freeze the SPTE until the external PTE operation
-		 * has completed, e.g. so that concurrent faults don't attempt
-		 * to install a child PTE in the external page table before the
-		 * parent PTE has been written.
-		 */
-		if (!try_cmpxchg64(raw_sptep, &iter->old_spte, FROZEN_SPTE))
-			return -EBUSY;
-
-		/*
-		 * Update the external PTE.  On success, set the mirror SPTE to
-		 * the desired value.  On failure, restore the old SPTE so that
-		 * the SPTE isn't frozen in perpetuity.
-		 */
-		ret = kvm_x86_call(set_external_spte)(kvm, iter->gfn, iter->old_spte,
-						      new_spte, iter->level);
-		if (ret)
-			__kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte);
-		else
-			__kvm_tdp_mmu_write_spte(iter->sptep, new_spte);
-		return ret;
-	}
-
 	/*
 	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
 	 * does not hold the mmu_lock.  On failure, i.e. if a different logical
@@ -632,7 +599,7 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
 	 * the current value, so the caller operates on fresh data, e.g. if it
 	 * retries tdp_mmu_set_spte_atomic()
 	 */
-	if (!try_cmpxchg64(raw_sptep, &iter->old_spte, new_spte))
+	if (!try_cmpxchg64(rcu_dereference(iter->sptep), &iter->old_spte, new_spte))
 		return -EBUSY;
 
 	return 0;
@@ -663,14 +630,44 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
 
 	lockdep_assert_held_read(&kvm->mmu_lock);
 
-	ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
+	/* KVM should never freeze SPTEs using higher level APIs. */
+	KVM_MMU_WARN_ON(is_frozen_spte(new_spte));
+
+	/*
+	  * Temporarily freeze the SPTE until the external PTE operation has
+	  * completed (unless the new SPTE itself will be frozen), e.g. so that
+	  * concurrent faults don't attempt to install a child PTE in the
+	  * external page table before the parent PTE has been written, or try
+	  * to re-install a page table before the old one was removed.
+	  */
+	if (is_mirror_sptep(iter->sptep))
+		ret = __tdp_mmu_set_spte_atomic(kvm, iter, FROZEN_SPTE);
+	else
+		ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
 	if (ret)
 		return ret;
 
-	handle_changed_spte(kvm, iter->as_id, iter->sptep, iter->gfn,
-			    iter->old_spte, new_spte, iter->level, true);
+	ret = __handle_changed_spte(kvm, iter->as_id, iter->sptep, iter->gfn,
+				    iter->old_spte, new_spte, iter->level, true);
 
-	return 0;
+	/*
+	 * Unfreeze the mirror SPTE.  If updating the external SPTE failed,
+	 * restore the old SPTE so that the SPTE isn't frozen in perpetuity,
+	 * otherwise set the mirror SPTE to the new desired value.
+	 */
+	if (is_mirror_sptep(iter->sptep)) {
+		if (ret)
+			__kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte);
+		else
+			__kvm_tdp_mmu_write_spte(iter->sptep, new_spte);
+	} else {
+		/*
+		 * Bug the VM if handling the change failed, as failure is only
+		 * allowed if KVM couldn't update the external SPTE.
+		 */
+		KVM_BUG_ON(ret, kvm);
+	}
+	return ret;
 }
 
 /*
@@ -1325,6 +1322,9 @@ static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
 {
 	u64 new_spte;
 
+	if (WARN_ON_ONCE(is_mirror_sptep(iter->sptep)))
+		return;
+
 	if (spte_ad_enabled(iter->old_spte)) {
 		iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
 								shadow_accessed_mask);

base-commit: c86be62d601ede14cbad1d0fb5af9b67d4172342
--