Drop the dedicated .link_external_spt() for linking non-leaf S-EPT pages,
and instead funnel everything through .set_external_spte(). Using separate
hooks doesn't help prevent TDP MMU details from bleeding into TDX, and vice
versa; to the contrary, dedicated callbacks will result in _more_ pollution
when hugepage support is added, e.g. will require the TDP MMU to know
details about the splitting rules for TDX that aren't all that relevant to
the TDP MMU.
Ideally, KVM would provide a single pair of hooks to set S-EPT entries,
one hook for setting SPTEs under write-lock and another for setting SPTEs
under read-lock (e.g. to ensure the entire operation is "atomic", to allow
for failure, etc.). Sadly, TDX's requirement that all child S-EPT entries
are removed before the parent makes that impractical: the TDP MMU
deliberately prunes non-leaf SPTEs and _then_ processes their children, thus
making it quite important for the TDP MMU to differentiate between zapping
leaf and non-leaf S-EPT entries.
However, that's the _only_ case that's truly special, and even that case
could be shoehorned into a single hook; it just wouldn't be a net
positive.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/kvm-x86-ops.h | 1 -
arch/x86/include/asm/kvm_host.h | 3 --
arch/x86/kvm/mmu/tdp_mmu.c | 37 +++---------------
arch/x86/kvm/vmx/tdx.c | 61 ++++++++++++++++++++----------
4 files changed, 48 insertions(+), 54 deletions(-)
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index c18a033bee7e..57eb1f4832ae 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -94,7 +94,6 @@ KVM_X86_OP_OPTIONAL_RET0(set_tss_addr)
KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
KVM_X86_OP(load_mmu_pgd)
-KVM_X86_OP_OPTIONAL_RET0(link_external_spt)
KVM_X86_OP_OPTIONAL_RET0(set_external_spte)
KVM_X86_OP_OPTIONAL_RET0(free_external_spt)
KVM_X86_OP_OPTIONAL(remove_external_spte)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e441f270f354..d12ca0f8a348 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1853,9 +1853,6 @@ struct kvm_x86_ops {
void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
int root_level);
- /* Update external mapping with page table link. */
- int (*link_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
- void *external_spt);
/* Update the external page table from spte getting set. */
int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
u64 mirror_spte);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 0feda295859a..56ad056e6042 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -495,31 +495,17 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
-static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
-{
- if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
- struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
-
- WARN_ON_ONCE(sp->role.level + 1 != level);
- WARN_ON_ONCE(sp->gfn != gfn);
- return sp->external_spt;
- }
-
- return NULL;
-}
-
static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
gfn_t gfn, u64 *old_spte,
u64 new_spte, int level)
{
- bool was_present = is_shadow_present_pte(*old_spte);
- bool is_present = is_shadow_present_pte(new_spte);
- bool is_leaf = is_present && is_last_spte(new_spte, level);
- int ret = 0;
-
- KVM_BUG_ON(was_present, kvm);
+ int ret;
lockdep_assert_held(&kvm->mmu_lock);
+
+ if (KVM_BUG_ON(is_shadow_present_pte(*old_spte), kvm))
+ return -EIO;
+
/*
* We need to lock out other updates to the SPTE until the external
* page table has been modified. Use FROZEN_SPTE similar to
@@ -528,18 +514,7 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp
if (!try_cmpxchg64(rcu_dereference(sptep), old_spte, FROZEN_SPTE))
return -EBUSY;
- /*
- * Use different call to either set up middle level
- * external page table, or leaf.
- */
- if (is_leaf) {
- ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte);
- } else {
- void *external_spt = get_external_spt(gfn, new_spte, level);
-
- KVM_BUG_ON(!external_spt, kvm);
- ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt);
- }
+ ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte);
if (ret)
__kvm_tdp_mmu_write_spte(sptep, *old_spte);
else
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 5688c77616e3..30494f9ceb31 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1664,18 +1664,58 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
return 0;
}
+static struct page *tdx_spte_to_external_spt(struct kvm *kvm, gfn_t gfn,
+ u64 new_spte, enum pg_level level)
+{
+ struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
+
+ if (KVM_BUG_ON(!sp->external_spt, kvm) ||
+ KVM_BUG_ON(sp->role.level + 1 != level, kvm) ||
+ KVM_BUG_ON(sp->gfn != gfn, kvm))
+ return NULL;
+
+ return virt_to_page(sp->external_spt);
+}
+
+static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, u64 mirror_spte)
+{
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u64 err, entry, level_state;
+ struct page *external_spt;
+
+ external_spt = tdx_spte_to_external_spt(kvm, gfn, mirror_spte, level);
+ if (!external_spt)
+ return -EIO;
+
+ err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, level, external_spt,
+ &entry, &level_state);
+ if (unlikely(tdx_operand_busy(err)))
+ return -EBUSY;
+
+ if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
+ return -EIO;
+
+ return 0;
+}
+
static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
enum pg_level level, u64 mirror_spte)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
kvm_pfn_t pfn = spte_to_pfn(mirror_spte);
+ if (KVM_BUG_ON(!is_shadow_present_pte(mirror_spte), kvm))
+ return -EIO;
+
+ if (!is_last_spte(mirror_spte, level))
+ return tdx_sept_link_private_spt(kvm, gfn, level, mirror_spte);
+
/* TODO: handle large pages. */
if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
return -EIO;
- WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) ||
- (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
+ WARN_ON_ONCE((mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
/*
* Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory()
@@ -1695,23 +1735,7 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
return tdx_mem_page_aug(kvm, gfn, level, pfn);
}
-static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, void *private_spt)
-{
- gpa_t gpa = gfn_to_gpa(gfn);
- struct page *page = virt_to_page(private_spt);
- u64 err, entry, level_state;
- err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, level, page, &entry,
- &level_state);
- if (unlikely(tdx_operand_busy(err)))
- return -EBUSY;
-
- if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
- return -EIO;
-
- return 0;
-}
/*
* Ensure shared and private EPTs to be flushed on all vCPUs.
@@ -3592,7 +3616,6 @@ void __init tdx_hardware_setup(void)
*/
vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));
- vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
--
2.53.0.rc1.217.geba53bf80e-goog
On Wed, Jan 28, 2026 at 05:14:37PM -0800, Sean Christopherson wrote:
> Drop the dedicated .link_external_spt() for linking non-leaf S-EPT pages,
> and instead funnel everything through .set_external_spte(). Using separate
> hooks doesn't help prevent TDP MMU details from bleeding into TDX, and vice
> versa; to the contrary, dedicated callbacks will result in _more_ pollution
> when hugepage support is added, e.g. will require the TDP MMU to know
> details about the splitting rules for TDX that aren't all that relevant to
> the TDP MMU.
>
> Ideally, KVM would provide a single pair of hooks to set S-EPT entries,
> one hook for setting SPTEs under write-lock and another for settings SPTEs
> under read-lock (e.g. to ensure the entire operation is "atomic", to allow
> for failure, etc.). Sadly, TDX's requirement that all child S-EPT entries
> are removed before the parent makes that impractical: the TDP MMU
> deliberately prunes non-leaf SPTEs and _then_ processes its children, thus
> making it quite important for the TDP MMU to differentiate between zapping
> leaf and non-leaf S-EPT entries.
>
> However, that's the _only_ case that's truly special, and even that case
> could be shoehorned into a single hook; it's just wouldn't be a net
> positive.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> arch/x86/include/asm/kvm-x86-ops.h | 1 -
> arch/x86/include/asm/kvm_host.h | 3 --
> arch/x86/kvm/mmu/tdp_mmu.c | 37 +++---------------
> arch/x86/kvm/vmx/tdx.c | 61 ++++++++++++++++++++----------
> 4 files changed, 48 insertions(+), 54 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> index c18a033bee7e..57eb1f4832ae 100644
> --- a/arch/x86/include/asm/kvm-x86-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-ops.h
> @@ -94,7 +94,6 @@ KVM_X86_OP_OPTIONAL_RET0(set_tss_addr)
> KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
> KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
> KVM_X86_OP(load_mmu_pgd)
> -KVM_X86_OP_OPTIONAL_RET0(link_external_spt)
> KVM_X86_OP_OPTIONAL_RET0(set_external_spte)
> KVM_X86_OP_OPTIONAL_RET0(free_external_spt)
> KVM_X86_OP_OPTIONAL(remove_external_spte)
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index e441f270f354..d12ca0f8a348 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1853,9 +1853,6 @@ struct kvm_x86_ops {
> void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
> int root_level);
>
> - /* Update external mapping with page table link. */
> - int (*link_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> - void *external_spt);
> /* Update the external page table from spte getting set. */
> int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> u64 mirror_spte);
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 0feda295859a..56ad056e6042 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -495,31 +495,17 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
> call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
> }
>
> -static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
> -{
> - if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
> - struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
> -
> - WARN_ON_ONCE(sp->role.level + 1 != level);
> - WARN_ON_ONCE(sp->gfn != gfn);
> - return sp->external_spt;
> - }
> -
> - return NULL;
> -}
> -
> static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
> gfn_t gfn, u64 *old_spte,
> u64 new_spte, int level)
> {
> - bool was_present = is_shadow_present_pte(*old_spte);
> - bool is_present = is_shadow_present_pte(new_spte);
> - bool is_leaf = is_present && is_last_spte(new_spte, level);
> - int ret = 0;
> -
> - KVM_BUG_ON(was_present, kvm);
> + int ret;
>
> lockdep_assert_held(&kvm->mmu_lock);
> +
> + if (KVM_BUG_ON(is_shadow_present_pte(*old_spte), kvm))
> + return -EIO;
Why not move this check of is_shadow_present_pte() to tdx_sept_set_private_spte()
as well?
Or also check !is_shadow_present_pte(new_spte) in TDP MMU?
> * We need to lock out other updates to the SPTE until the external
> * page table has been modified. Use FROZEN_SPTE similar to
> @@ -528,18 +514,7 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp
> if (!try_cmpxchg64(rcu_dereference(sptep), old_spte, FROZEN_SPTE))
> return -EBUSY;
>
> - /*
> - * Use different call to either set up middle level
> - * external page table, or leaf.
> - */
> - if (is_leaf) {
> - ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte);
> - } else {
> - void *external_spt = get_external_spt(gfn, new_spte, level);
> -
> - KVM_BUG_ON(!external_spt, kvm);
> - ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt);
> - }
> + ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte);
> if (ret)
> __kvm_tdp_mmu_write_spte(sptep, *old_spte);
> else
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 5688c77616e3..30494f9ceb31 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -1664,18 +1664,58 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
> return 0;
> }
>
> +static struct page *tdx_spte_to_external_spt(struct kvm *kvm, gfn_t gfn,
> + u64 new_spte, enum pg_level level)
> +{
> + struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
> +
> + if (KVM_BUG_ON(!sp->external_spt, kvm) ||
> + KVM_BUG_ON(sp->role.level + 1 != level, kvm) ||
> + KVM_BUG_ON(sp->gfn != gfn, kvm))
> + return NULL;
Could we remove the KVM_BUG_ON()s, and ...
> + return virt_to_page(sp->external_spt);
> +}
> +
> +static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
> + enum pg_level level, u64 mirror_spte)
> +{
> + gpa_t gpa = gfn_to_gpa(gfn);
> + u64 err, entry, level_state;
> + struct page *external_spt;
> +
> + external_spt = tdx_spte_to_external_spt(kvm, gfn, mirror_spte, level);
> + if (!external_spt)
add a KVM_BUG_ON() here?
It could save KVM_BUG_ON()s and have KVM_BUG_ON() match -EIO :)
And as Rick also mentioned, better to remove external in external_spt, e.g.
something like pt_page.
And mirror_spte --> new_spte?
> + return -EIO;
> +
> + err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, level, external_spt,
> + &entry, &level_state);
> + if (unlikely(tdx_operand_busy(err)))
> + return -EBUSY;
> +
> + if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
> + return -EIO;
> +
> + return 0;
> +}
> +
> static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
> enum pg_level level, u64 mirror_spte)
> {
> struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> kvm_pfn_t pfn = spte_to_pfn(mirror_spte);
>
> + if (KVM_BUG_ON(!is_shadow_present_pte(mirror_spte), kvm))
> + return -EIO;
> +
> + if (!is_last_spte(mirror_spte, level))
> + return tdx_sept_link_private_spt(kvm, gfn, level, mirror_spte);
> +
> /* TODO: handle large pages. */
> if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
> return -EIO;
>
> - WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) ||
> - (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
> + WARN_ON_ONCE((mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
Also check this for tdx_sept_link_private_spt()?
> /*
> * Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory()
> @@ -1695,23 +1735,7 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
> return tdx_mem_page_aug(kvm, gfn, level, pfn);
> }
>
> -static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
> - enum pg_level level, void *private_spt)
> -{
> - gpa_t gpa = gfn_to_gpa(gfn);
> - struct page *page = virt_to_page(private_spt);
> - u64 err, entry, level_state;
>
> - err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, level, page, &entry,
> - &level_state);
> - if (unlikely(tdx_operand_busy(err)))
> - return -EBUSY;
> -
> - if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
> - return -EIO;
> -
> - return 0;
> -}
>
> /*
> * Ensure shared and private EPTs to be flushed on all vCPUs.
> @@ -3592,7 +3616,6 @@ void __init tdx_hardware_setup(void)
> */
> vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));
>
> - vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
> vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
> vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
> vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
> --
> 2.53.0.rc1.217.geba53bf80e-goog
>
On Tue, Feb 03, 2026, Yan Zhao wrote:
> On Wed, Jan 28, 2026 at 05:14:37PM -0800, Sean Christopherson wrote:
> > static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
> > gfn_t gfn, u64 *old_spte,
> > u64 new_spte, int level)
> > {
> > - bool was_present = is_shadow_present_pte(*old_spte);
> > - bool is_present = is_shadow_present_pte(new_spte);
> > - bool is_leaf = is_present && is_last_spte(new_spte, level);
> > - int ret = 0;
> > -
> > - KVM_BUG_ON(was_present, kvm);
> > + int ret;
> >
> > lockdep_assert_held(&kvm->mmu_lock);
> > +
> > + if (KVM_BUG_ON(is_shadow_present_pte(*old_spte), kvm))
> > + return -EIO;
> Why not move this check of is_shadow_present_pte() to tdx_sept_set_private_spte()
> as well?
The series gets there eventually, but as of this commit, @old_spte isn't plumbed
into tdx_sept_set_private_spte().
> Or also check !is_shadow_present_pte(new_spte) in TDP MMU?
Not sure I understand this suggestion.
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index 5688c77616e3..30494f9ceb31 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -1664,18 +1664,58 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
> > return 0;
> > }
> >
> > +static struct page *tdx_spte_to_external_spt(struct kvm *kvm, gfn_t gfn,
> > + u64 new_spte, enum pg_level level)
> > +{
> > + struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
> > +
> > + if (KVM_BUG_ON(!sp->external_spt, kvm) ||
> > + KVM_BUG_ON(sp->role.level + 1 != level, kvm) ||
> > + KVM_BUG_ON(sp->gfn != gfn, kvm))
> > + return NULL;
> Could we remove the KVM_BUG_ON()s, and ...
>
> > + return virt_to_page(sp->external_spt);
> > +}
> > +
> > +static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
> > + enum pg_level level, u64 mirror_spte)
> > +{
> > + gpa_t gpa = gfn_to_gpa(gfn);
> > + u64 err, entry, level_state;
> > + struct page *external_spt;
> > +
> > + external_spt = tdx_spte_to_external_spt(kvm, gfn, mirror_spte, level);
> > + if (!external_spt)
> add a KVM_BUG_ON() here?
> It could save KVM_BUG_ON()s and have KVM_BUG_ON() match -EIO :)
We could, but I don't want to, because if we're going to bother with sanity checks,
I want the resulting WARNs to be precise. I.e. I want the WARN to capture *why*
tdx_spte_to_external_spt() failed, to make debug/triage easier.
> And as Rick also mentioned, better to remove external in external_spt, e.g.
> something like pt_page.
Yeah, maybe sept_spt?
> And mirror_spte --> new_spte?
Hmm, ya, I made that change later, but it can probably be shifted here.
> > - WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) ||
> > - (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
> > + WARN_ON_ONCE((mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
> Also check this for tdx_sept_link_private_spt()?
Eh, we could, but I don't think it's necessary. make_nonleaf_spte() is hardcoded
to set full permissions (and I don't see that changing any time soon), whereas
leaf SPTE protections are much more dynamic.
On Tue, Feb 03, 2026 at 08:05:05PM +0000, Sean Christopherson wrote:
> On Tue, Feb 03, 2026, Yan Zhao wrote:
> > On Wed, Jan 28, 2026 at 05:14:37PM -0800, Sean Christopherson wrote:
> > > static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
> > > gfn_t gfn, u64 *old_spte,
> > > u64 new_spte, int level)
> > > {
> > > - bool was_present = is_shadow_present_pte(*old_spte);
> > > - bool is_present = is_shadow_present_pte(new_spte);
> > > - bool is_leaf = is_present && is_last_spte(new_spte, level);
> > > - int ret = 0;
> > > -
> > > - KVM_BUG_ON(was_present, kvm);
> > > + int ret;
> > >
> > > lockdep_assert_held(&kvm->mmu_lock);
> > > +
> > > + if (KVM_BUG_ON(is_shadow_present_pte(*old_spte), kvm))
> > > + return -EIO;
> > Why not move this check of is_shadow_present_pte() to tdx_sept_set_private_spte()
> > as well?
>
> The series gets there eventually, but as of this commit, @old_spte isn't plumbed
> into tdx_sept_set_private_spte().
>
> > Or also check !is_shadow_present_pte(new_spte) in TDP MMU?
>
> Not sure I understand this suggestion.
Sorry. The accurate expression should be
"what about moving !is_shadow_present_pte(new_spte) to TDP MMU as well?".
>
> > > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > > index 5688c77616e3..30494f9ceb31 100644
> > > --- a/arch/x86/kvm/vmx/tdx.c
> > > +++ b/arch/x86/kvm/vmx/tdx.c
> > > @@ -1664,18 +1664,58 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
> > > return 0;
> > > }
> > >
> > > +static struct page *tdx_spte_to_external_spt(struct kvm *kvm, gfn_t gfn,
> > > + u64 new_spte, enum pg_level level)
> > > +{
> > > + struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
> > > +
> > > + if (KVM_BUG_ON(!sp->external_spt, kvm) ||
> > > + KVM_BUG_ON(sp->role.level + 1 != level, kvm) ||
> > > + KVM_BUG_ON(sp->gfn != gfn, kvm))
> > > + return NULL;
> > Could we remove the KVM_BUG_ON()s, and ...
> >
> > > + return virt_to_page(sp->external_spt);
> > > +}
> > > +
> > > +static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
> > > + enum pg_level level, u64 mirror_spte)
> > > +{
> > > + gpa_t gpa = gfn_to_gpa(gfn);
> > > + u64 err, entry, level_state;
> > > + struct page *external_spt;
> > > +
> > > + external_spt = tdx_spte_to_external_spt(kvm, gfn, mirror_spte, level);
> > > + if (!external_spt)
> > add a KVM_BUG_ON() here?
> > It could save KVM_BUG_ON()s and have KVM_BUG_ON() match -EIO :)
>
> We could, but I don't want to, because if we're going to bother with sanity checks,
> I want the resulting WARNs to be precise. I.e. I want the WARN to capture *why*
> tdx_spte_to_external_spt() failed, to make debug/triage easier.
Ok.
> > And as Rick also mentioned, better to remove external in external_spt, e.g.
> > something like pt_page.
>
> Yeah, maybe sept_spt?
Hmm, here sept_spt is of type struct page, while sp->spt and sp->external_spt
represents VA. Not sure if it will cause confusion.
But I don't have strong opinion :)
> > And mirror_spte --> new_spte?
>
> Hmm, ya, I made that change later, but it can probably be shifted here.
>
> > > - WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) ||
> > > - (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
> > > + WARN_ON_ONCE((mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
> > Also check this for tdx_sept_link_private_spt()?
>
> Eh, we could, but I don't think it's necessary. make_nonleaf_spte() is hardcoded
> to set full permissions (and I don't see that changing any time soon), whereas
> leaf SPTE protections are much more dynamic.
Makes sense.
On Wed, Feb 04, 2026, Yan Zhao wrote:
> On Tue, Feb 03, 2026 at 08:05:05PM +0000, Sean Christopherson wrote:
> > On Tue, Feb 03, 2026, Yan Zhao wrote:
> > > On Wed, Jan 28, 2026 at 05:14:37PM -0800, Sean Christopherson wrote:
> > > > static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
> > > > gfn_t gfn, u64 *old_spte,
> > > > u64 new_spte, int level)
> > > > {
> > > > - bool was_present = is_shadow_present_pte(*old_spte);
> > > > - bool is_present = is_shadow_present_pte(new_spte);
> > > > - bool is_leaf = is_present && is_last_spte(new_spte, level);
> > > > - int ret = 0;
> > > > -
> > > > - KVM_BUG_ON(was_present, kvm);
> > > > + int ret;
> > > >
> > > > lockdep_assert_held(&kvm->mmu_lock);
> > > > +
> > > > + if (KVM_BUG_ON(is_shadow_present_pte(*old_spte), kvm))
> > > > + return -EIO;
> > > Why not move this check of is_shadow_present_pte() to tdx_sept_set_private_spte()
> > > as well?
> >
> > The series gets there eventually, but as of this commit, @old_spte isn't plumbed
> > into tdx_sept_set_private_spte().
> >
> > > Or also check !is_shadow_present_pte(new_spte) in TDP MMU?
> >
> > Not sure I understand this suggestion.
> Sorry. The accurate expression should be
> "what about moving !is_shadow_present_pte(new_spte) to TDP MMU as well?".
It's already there, in __tdp_mmu_set_spte_atomic():
/*
* KVM doesn't currently support zapping or splitting mirror
* SPTEs while holding mmu_lock for read.
*/
if (KVM_BUG_ON(is_shadow_present_pte(iter->old_spte), kvm) ||
KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
return -EBUSY;
> > > And as Rick also mentioned, better to remove external in external_spt, e.g.
> > > something like pt_page.
> >
> > Yeah, maybe sept_spt?
> Hmm, here sept_spt is of type struct page, while sp->spt and sp->external_spt
> represents VA. Not sure if it will cause confusion.
How about sept_pt?
On Thu, Feb 05, 2026 at 03:14:29PM -0800, Sean Christopherson wrote:
> On Wed, Feb 04, 2026, Yan Zhao wrote:
> > On Tue, Feb 03, 2026 at 08:05:05PM +0000, Sean Christopherson wrote:
> > > On Tue, Feb 03, 2026, Yan Zhao wrote:
> > > > On Wed, Jan 28, 2026 at 05:14:37PM -0800, Sean Christopherson wrote:
> > > > > static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
> > > > > gfn_t gfn, u64 *old_spte,
> > > > > u64 new_spte, int level)
> > > > > {
> > > > > - bool was_present = is_shadow_present_pte(*old_spte);
> > > > > - bool is_present = is_shadow_present_pte(new_spte);
> > > > > - bool is_leaf = is_present && is_last_spte(new_spte, level);
> > > > > - int ret = 0;
> > > > > -
> > > > > - KVM_BUG_ON(was_present, kvm);
> > > > > + int ret;
> > > > >
> > > > > lockdep_assert_held(&kvm->mmu_lock);
> > > > > +
> > > > > + if (KVM_BUG_ON(is_shadow_present_pte(*old_spte), kvm))
> > > > > + return -EIO;
> > > > Why not move this check of is_shadow_present_pte() to tdx_sept_set_private_spte()
> > > > as well?
> > >
> > > The series gets there eventually, but as of this commit, @old_spte isn't plumbed
> > > into tdx_sept_set_private_spte().
> > >
> > > > Or also check !is_shadow_present_pte(new_spte) in TDP MMU?
> > >
> > > Not sure I understand this suggestion.
> > Sorry. The accurate expression should be
> > "what about moving !is_shadow_present_pte(new_spte) to TDP MMU as well?".
>
> It's already there, in __tdp_mmu_set_spte_atomic():
>
> /*
> * KVM doesn't currently support zapping or splitting mirror
> * SPTEs while holding mmu_lock for read.
> */
> if (KVM_BUG_ON(is_shadow_present_pte(iter->old_spte), kvm) ||
> KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
> return -EBUSY;
Ok. I was wondering why we don't include it directly in this patch, but it
doesn't matter.
> > > > And as Rick also mentioned, better to remove external in external_spt, e.g.
> > > > something like pt_page.
> > >
> > > Yeah, maybe sept_spt?
> > Hmm, here sept_spt is of type struct page, while sp->spt and sp->external_spt
> > represents VA. Not sure if it will cause confusion.
>
> How about sept_pt?
LGTM.
On Wed, 2026-01-28 at 17:14 -0800, Sean Christopherson wrote:
> Drop the dedicated .link_external_spt() for linking non-leaf S-EPT
> pages, and instead funnel everything through .set_external_spte().
> Using separate hooks doesn't help prevent TDP MMU details from
> bleeding into TDX, and vice versa; to the contrary, dedicated
> callbacks will result in _more_ pollution when hugepage support is
> added, e.g. will require the TDP MMU to know details about the
> splitting rules for TDX that aren't all that relevant to
> the TDP MMU.
>
> Ideally, KVM would provide a single pair of hooks to set S-EPT
> entries, one hook for setting SPTEs under write-lock and another for
> settings SPTEs under read-lock (e.g. to ensure the entire operation
> is "atomic", to allow for failure, etc.). Sadly, TDX's requirement
> that all child S-EPT entries are removed before the parent makes that
> impractical: the TDP MMU deliberately prunes non-leaf SPTEs and
> _then_ processes its children, thus making it quite important for the
> TDP MMU to differentiate between zapping leaf and non-leaf S-EPT
> entries.
>
> However, that's the _only_ case that's truly special, and even that
> case could be shoehorned into a single hook; it's just wouldn't be a
> net positive.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
It has better handling of the external_spt == NULL case too, by
actually returning an error, but one naming nit below to take or leave.
Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> ---
> arch/x86/include/asm/kvm-x86-ops.h | 1 -
> arch/x86/include/asm/kvm_host.h | 3 --
> arch/x86/kvm/mmu/tdp_mmu.c | 37 +++---------------
> arch/x86/kvm/vmx/tdx.c | 61 ++++++++++++++++++++--------
> --
> 4 files changed, 48 insertions(+), 54 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm-x86-ops.h
> b/arch/x86/include/asm/kvm-x86-ops.h
> index c18a033bee7e..57eb1f4832ae 100644
> --- a/arch/x86/include/asm/kvm-x86-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-ops.h
> @@ -94,7 +94,6 @@ KVM_X86_OP_OPTIONAL_RET0(set_tss_addr)
> KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
> KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
> KVM_X86_OP(load_mmu_pgd)
> -KVM_X86_OP_OPTIONAL_RET0(link_external_spt)
> KVM_X86_OP_OPTIONAL_RET0(set_external_spte)
> KVM_X86_OP_OPTIONAL_RET0(free_external_spt)
> KVM_X86_OP_OPTIONAL(remove_external_spte)
> diff --git a/arch/x86/include/asm/kvm_host.h
> b/arch/x86/include/asm/kvm_host.h
> index e441f270f354..d12ca0f8a348 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1853,9 +1853,6 @@ struct kvm_x86_ops {
> void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
> int root_level);
>
> - /* Update external mapping with page table link. */
> - int (*link_external_spt)(struct kvm *kvm, gfn_t gfn, enum
> pg_level level,
> - void *external_spt);
> /* Update the external page table from spte getting set. */
> int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum
> pg_level level,
> u64 mirror_spte);
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 0feda295859a..56ad056e6042 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -495,31 +495,17 @@ static void handle_removed_pt(struct kvm *kvm,
> tdp_ptep_t pt, bool shared)
> call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
> }
>
> -static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
> -{
> - if (is_shadow_present_pte(new_spte) &&
> !is_last_spte(new_spte, level)) {
> - struct kvm_mmu_page *sp =
> spte_to_child_sp(new_spte);
> -
> - WARN_ON_ONCE(sp->role.level + 1 != level);
> - WARN_ON_ONCE(sp->gfn != gfn);
> - return sp->external_spt;
> - }
> -
> - return NULL;
> -}
> -
> static int __must_check set_external_spte_present(struct kvm *kvm,
> tdp_ptep_t sptep,
> gfn_t gfn, u64
> *old_spte,
> u64 new_spte, int
> level)
> {
> - bool was_present = is_shadow_present_pte(*old_spte);
> - bool is_present = is_shadow_present_pte(new_spte);
> - bool is_leaf = is_present && is_last_spte(new_spte, level);
> - int ret = 0;
> -
> - KVM_BUG_ON(was_present, kvm);
> + int ret;
>
> lockdep_assert_held(&kvm->mmu_lock);
> +
> + if (KVM_BUG_ON(is_shadow_present_pte(*old_spte), kvm))
> + return -EIO;
> +
> /*
> * We need to lock out other updates to the SPTE until the
> external
> * page table has been modified. Use FROZEN_SPTE similar to
> @@ -528,18 +514,7 @@ static int __must_check
> set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp
> if (!try_cmpxchg64(rcu_dereference(sptep), old_spte,
> FROZEN_SPTE))
> return -EBUSY;
>
> - /*
> - * Use different call to either set up middle level
> - * external page table, or leaf.
> - */
> - if (is_leaf) {
> - ret = kvm_x86_call(set_external_spte)(kvm, gfn,
> level, new_spte);
> - } else {
> - void *external_spt = get_external_spt(gfn, new_spte,
> level);
> -
> - KVM_BUG_ON(!external_spt, kvm);
> - ret = kvm_x86_call(link_external_spt)(kvm, gfn,
> level, external_spt);
> - }
> + ret = kvm_x86_call(set_external_spte)(kvm, gfn, level,
> new_spte);
> if (ret)
> __kvm_tdp_mmu_write_spte(sptep, *old_spte);
> else
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 5688c77616e3..30494f9ceb31 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -1664,18 +1664,58 @@ static int tdx_mem_page_aug(struct kvm *kvm,
> gfn_t gfn,
> return 0;
> }
>
> +static struct page *tdx_spte_to_external_spt(struct kvm *kvm, gfn_t
> gfn,
> + u64 new_spte, enum
> pg_level level)
> +{
> + struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
> +
> + if (KVM_BUG_ON(!sp->external_spt, kvm) ||
> + KVM_BUG_ON(sp->role.level + 1 != level, kvm) ||
> + KVM_BUG_ON(sp->gfn != gfn, kvm))
> + return NULL;
> +
> + return virt_to_page(sp->external_spt);
> +}
> +
> +static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
> + enum pg_level level, u64
> mirror_spte)
> +{
> + gpa_t gpa = gfn_to_gpa(gfn);
> + u64 err, entry, level_state;
> + struct page *external_spt;
> +
> + external_spt = tdx_spte_to_external_spt(kvm, gfn,
> mirror_spte, level);
The "external" abstraction wraps the "S-EPT" knowledge and naming (for
maybe increasingly dubious reasons), but in the TDX code, inside the
abstraction, it uses the sept naming. so I might have called it
sept_pt.
© 2016 - 2026 Red Hat, Inc.