[RFC PATCH v5 20/45] KVM: x86/mmu: Allocate/free S-EPT pages using tdx_{alloc,free}_control_page()

Sean Christopherson posted 45 patches 1 week, 3 days ago
[RFC PATCH v5 20/45] KVM: x86/mmu: Allocate/free S-EPT pages using tdx_{alloc,free}_control_page()
Posted by Sean Christopherson 1 week, 3 days ago
Now that kvm_mmu_memory_cache supports custom page allocators, wire up the
S-EPT cache to use tdx_{alloc,free}_control_page() (arguably S-EPT pages
aren't "control" pages, but they're not guest pages either).  Using the
TDX APIs will make S-EPT pages naturally play nice with Dynamic PAMT, by
virtue of adding/removing PAMT entries when S-EPT pages are allocated and
freed, as opposed to when they are added/removed from the S-EPT tree.

Inserting PAMT entries on allocation does mean KVM will create
unnecessary PAMT entries, e.g. once a vCPU stops faulting in memory, the
remaining pages in the MMU cache will go unused.  But in practice, odds
are very good the containing 2MiB page will have other in-use S-EPT pages,
i.e. will create PAMT entries anyways.  And _if_ creating PAMT entries on
allocation is problematic for memory consumption, that can be resolved by
tweaking KVM's cache size.

Suggested-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/kvm-x86-ops.h |  2 ++
 arch/x86/include/asm/kvm_host.h    | 18 +++++++++---------
 arch/x86/kvm/mmu/mmu.c             |  6 ++++--
 arch/x86/kvm/mmu/mmu_internal.h    | 11 -----------
 arch/x86/kvm/mmu/tdp_mmu.c         |  5 +++--
 arch/x86/kvm/vmx/tdx.c             | 13 ++++++++++++-
 6 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index c17cedc485c9..17dddada69fc 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -94,6 +94,8 @@ KVM_X86_OP_OPTIONAL_RET0(set_tss_addr)
 KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
 KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
 KVM_X86_OP(load_mmu_pgd)
+KVM_X86_OP_OPTIONAL(alloc_external_sp)
+KVM_X86_OP_OPTIONAL(free_external_sp)
 KVM_X86_OP_OPTIONAL_RET0(set_external_spte)
 KVM_X86_OP_OPTIONAL(remove_external_spte)
 KVM_X86_OP_OPTIONAL(reclaim_external_sp)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b35a07ed11fb..6e84dbc89e79 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -867,10 +867,7 @@ struct kvm_vcpu_arch {
 	struct kvm_mmu_memory_cache mmu_shadow_page_cache;
 	struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
-	/*
-	 * This cache is to allocate external page table. E.g. private EPT used
-	 * by the TDX module.
-	 */
+	/* Used to allocate S-EPT pages (gifted to the TDX-Module). */
 	struct kvm_mmu_memory_cache mmu_external_spt_cache;
 
 	/*
@@ -1853,18 +1850,21 @@ struct kvm_x86_ops {
 	void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
 			     int root_level);
 
-	/* Update the external page table from spte getting set. */
+	/*
+	 * Callbacks to allocate and free external page tables, a.k.a. S-EPT,
+	 * and to propagate changes in mirror page tables to the external page
+	 * tables.
+	 */
+	unsigned long (*alloc_external_sp)(gfp_t gfp);
+	void (*free_external_sp)(unsigned long addr);
 	int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
 				 u64 mirror_spte);
-
-	/* Update external page tables for page table about to be freed. */
 	void (*reclaim_external_sp)(struct kvm *kvm, gfn_t gfn,
 				    struct kvm_mmu_page *sp);
-
-	/* Update external page table from spte getting removed, and flush TLB. */
 	void (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
 				     u64 mirror_spte);
 
+
 	bool (*has_wbinvd_exit)(void);
 
 	u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3911ac9bddfd..9b5a6861e2a4 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6690,11 +6690,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
 	vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
 
-	vcpu->arch.mmu_shadow_page_cache.init_value =
-		SHADOW_NONPRESENT_VALUE;
+	vcpu->arch.mmu_shadow_page_cache.init_value = SHADOW_NONPRESENT_VALUE;
 	if (!vcpu->arch.mmu_shadow_page_cache.init_value)
 		vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
 
+	vcpu->arch.mmu_external_spt_cache.page_get = kvm_x86_ops.alloc_external_sp;
+	vcpu->arch.mmu_external_spt_cache.page_free = kvm_x86_ops.free_external_sp;
+
 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
 	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
 
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 73cdcbccc89e..6bb97f660793 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -157,17 +157,6 @@ static inline bool is_mirror_sp(const struct kvm_mmu_page *sp)
 	return sp->role.is_mirror;
 }
 
-static inline void kvm_mmu_alloc_external_spt(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
-{
-	/*
-	 * external_spt is allocated for TDX module to hold private EPT mappings,
-	 * TDX module will initialize the page by itself.
-	 * Therefore, KVM does not need to initialize or access external_spt.
-	 * KVM only interacts with sp->spt for private EPT operations.
-	 */
-	sp->external_spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_external_spt_cache);
-}
-
 static inline gfn_t kvm_gfn_root_bits(const struct kvm *kvm, const struct kvm_mmu_page *root)
 {
 	/*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 18764dbc97ea..01e3e4f4baa5 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -55,7 +55,8 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 
 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
 {
-	free_page((unsigned long)sp->external_spt);
+	if (sp->external_spt)
+		kvm_x86_call(free_external_sp)((unsigned long)sp->external_spt);
 	free_page((unsigned long)sp->spt);
 	kmem_cache_free(mmu_page_header_cache, sp);
 }
@@ -1246,7 +1247,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		sp = tdp_mmu_alloc_sp(vcpu);
 		tdp_mmu_init_child_sp(sp, &iter);
 		if (is_mirror_sp(sp))
-			kvm_mmu_alloc_external_spt(vcpu, sp);
+			sp->external_spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_external_spt_cache);
 
 		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
 
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 323aae4300a1..0946eba2de23 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1790,7 +1790,9 @@ static void tdx_sept_reclaim_private_sp(struct kvm *kvm, gfn_t gfn,
 	 * TD's hkid is freed, when the TD is being torn down.
 	 *
 	 * If the S-EPT PTE can't be removed for any reason, intentionally leak
-	 * the page to prevent the kernel from accessing the encrypted page.
+	 * the page to prevent the kernel from accessing the encrypted page,
+	 * and if Dynamic PAMT is enabled, to avoid inducing a failure on
+	 * removal of the still-used PAMT entry.
 	 */
 	if (KVM_BUG_ON(is_hkid_assigned(to_kvm_tdx(kvm)), kvm) ||
 	    tdx_reclaim_page(virt_to_page(sp->external_spt)))
@@ -3600,6 +3602,15 @@ void __init tdx_hardware_setup(void)
 	 */
 	vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));
 
+	/*
+	 * TDX uses the external_spt cache to allocate S-EPT page table pages,
+	 * which (a) don't need to be initialized by KVM as the TDX-Module will
+	 * initialize the page (using the guest's encryption key), and (b) need
+	 * to use a custom allocator to be compatible with Dynamic PAMT.
+	 */
+	vt_x86_ops.alloc_external_sp = tdx_alloc_control_page;
+	vt_x86_ops.free_external_sp = tdx_free_control_page;
+
 	vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
 	vt_x86_ops.reclaim_external_sp = tdx_sept_reclaim_private_sp;
 	vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
-- 
2.53.0.rc1.217.geba53bf80e-goog
Re: [RFC PATCH v5 20/45] KVM: x86/mmu: Allocate/free S-EPT pages using tdx_{alloc,free}_control_page()
Posted by Yan Zhao 2 days, 12 hours ago
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 18764dbc97ea..01e3e4f4baa5 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -55,7 +55,8 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
>  
>  static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
>  {
> -	free_page((unsigned long)sp->external_spt);
> +	if (sp->external_spt)
> +		kvm_x86_call(free_external_sp)((unsigned long)sp->external_spt);
>  	free_page((unsigned long)sp->spt);
>  	kmem_cache_free(mmu_page_header_cache, sp);
>  }
Strictly speaking, external_spt is not a control page. Its alloc/free are
different from normal control pages managed by TDX's code.

(1) alloc
tdx_alloc_control_page
  __tdx_alloc_control_page
    __tdx_pamt_get 
      spin_lock(&pamt_lock)   ==> under process context
      spin_unlock(&pamt_lock)

(2) free
tdp_mmu_free_sp_rcu_callback
  tdp_mmu_free_sp
    kvm_x86_call(free_external_sp)
     tdx_free_control_page
        __tdx_free_control_page
          __tdx_pamt_put
            spin_lock(&pamt_lock)   ==> under softirq context
            spin_unlock(&pamt_lock)

So, invoking __tdx_pamt_put() in the RCU callback triggers deadlock warning
(see the bottom for details).

> +	/*
> +	 * TDX uses the external_spt cache to allocate S-EPT page table pages,
> +	 * which (a) don't need to be initialized by KVM as the TDX-Module will
> +	 * initialize the page (using the guest's encryption key), and (b) need
> +	 * to use a custom allocator to be compatible with Dynamic PAMT.
> +	 */
> +	vt_x86_ops.alloc_external_sp = tdx_alloc_control_page;
> +	vt_x86_ops.free_external_sp = tdx_free_control_page;
> +
>  	vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
>  	vt_x86_ops.reclaim_external_sp = tdx_sept_reclaim_private_sp;
>  	vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;

 ================================
 WARNING: inconsistent lock state
 6.19.0-rc6-upstream+ #1078 Tainted: G S   U
 --------------------------------
 inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
 swapper/7/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
 ffffffff9067b6f8 (pamt_lock){+.?.}-{3:3}, at: __tdx_pamt_put+0x80/0xf0
 {SOFTIRQ-ON-W} state was registered at:
   __lock_acquire+0x405/0xc10
   lock_acquire.part.0+0x9c/0x210
   lock_acquire+0x5e/0x100
   _raw_spin_lock+0x37/0x80
   __tdx_pamt_get+0xb8/0x150
   __tdx_alloc_control_page+0x2e/0x60
   __tdx_td_init+0x65/0x740 [kvm_intel]
   tdx_td_init+0x147/0x240 [kvm_intel]
   tdx_vm_ioctl+0x125/0x260 [kvm_intel]
   vt_mem_enc_ioctl+0x17/0x30 [kvm_intel]
   kvm_arch_vm_ioctl+0x4e0/0xb40 [kvm]
   kvm_vm_ioctl+0x4f4/0xaf0 [kvm]
   __x64_sys_ioctl+0x9d/0xf0
   x64_sys_call+0xf38/0x1da0
   do_syscall_64+0xc5/0xfc0
   entry_SYSCALL_64_after_hwframe+0x77/0x7f
 irq event stamp: 252814
 hardirqs last  enabled at (252814): [<ffffffff8fa6f41a>] _raw_spin_unlock_irqrestore+0x5a/0x80
 hardirqs last disabled at (252813): [<ffffffff8fa6f096>] _raw_spin_lock_irqsave+0x76/0x90
 softirqs last  enabled at (252798): [<ffffffff8e60f139>] handle_softirqs+0x309/0x460
 softirqs last disabled at (252805): [<ffffffff8e60f401>] __irq_exit_rcu+0xe1/0x160

 other info that might help us debug this:
  Possible unsafe locking scenario:

        CPU0
        ----
   lock(pamt_lock);
   <Interrupt>
     lock(pamt_lock);

  *** DEADLOCK ***

 1 lock held by swapper/7/0:
  #0: ffffffff9077d660 (rcu_callback){....}-{0:0}, at: rcu_do_batch+0x153/0x620

 stack backtrace:
 CPU: 7 UID: 0 PID: 0 Comm: swapper/7 Tainted: G S   U              6.19.0-rc6-upstream+ #1078 PREEMPT(voluntary)  b8f4b38003dc2ca73352cf9d3d544aa826c4f5a9
 Tainted: [S]=CPU_OUT_OF_SPEC, [U]=USER
 Hardware name: Intel Corporation ArcherCity/ArcherCity, BIOS EGSDCRB1.SYS.0101.D29.2303301937 03/30/2023
 Call Trace:
  <IRQ>
  show_stack+0x49/0x60
  dump_stack_lvl+0x6f/0xb0
  dump_stack+0x10/0x16
  print_usage_bug.part.0+0x264/0x350
  mark_lock_irq+0x4d6/0x9e0
  ? stack_trace_save+0x4a/0x70
  ? save_trace+0x66/0x2b0
  mark_lock+0x1cf/0x6a0
  mark_usage+0x4c/0x130
  __lock_acquire+0x405/0xc10
  ? __this_cpu_preempt_check+0x13/0x20
  lock_acquire.part.0+0x9c/0x210
  ? __tdx_pamt_put+0x80/0xf0
  lock_acquire+0x5e/0x100
  ? __tdx_pamt_put+0x80/0xf0
  _raw_spin_lock+0x37/0x80
  ? __tdx_pamt_put+0x80/0xf0
  __tdx_pamt_put+0x80/0xf0
  ? __this_cpu_preempt_check+0x13/0x20
  ? sched_clock_noinstr+0x9/0x10
  __tdx_free_control_page+0x22/0x40
  tdx_free_control_page+0x38/0x50 [kvm_intel c135d3571385e160f086f9f6195fc72e4b6aa2b1]
  tdp_mmu_free_sp_rcu_callback+0x24/0x50 [kvm 3932b137c28c130169e7e3615041bcec6cefc090]
  ? rcu_do_batch+0x1dc/0x620
  rcu_do_batch+0x1e1/0x620
  ? rcu_do_batch+0x153/0x620
  rcu_core+0x37d/0x4d0
  rcu_core_si+0xe/0x20
  handle_softirqs+0xdc/0x460
  ? hrtimer_interrupt+0x154/0x290
  __irq_exit_rcu+0xe1/0x160
  irq_exit_rcu+0xe/0x30
  sysvec_apic_timer_interrupt+0xc0/0xf0
  </IRQ>
  <TASK>
  asm_sysvec_apic_timer_interrupt+0x1b/0x20
 RIP: 0010:cpuidle_enter_state+0x122/0x7a0
Re: [RFC PATCH v5 20/45] KVM: x86/mmu: Allocate/free S-EPT pages using tdx_{alloc,free}_control_page()
Posted by Sean Christopherson 2 days, 7 hours ago
On Fri, Feb 06, 2026, Yan Zhao wrote:
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index 18764dbc97ea..01e3e4f4baa5 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -55,7 +55,8 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
> >  
> >  static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
> >  {
> > -	free_page((unsigned long)sp->external_spt);
> > +	if (sp->external_spt)
> > +		kvm_x86_call(free_external_sp)((unsigned long)sp->external_spt);
> >  	free_page((unsigned long)sp->spt);
> >  	kmem_cache_free(mmu_page_header_cache, sp);
> >  }
> Strictly speaking, external_spt is not a control page. Its alloc/free are
> different from normal control pages managed by TDX's code.

Yeah, I called that out in the changelog.  I'm definitely not wedded to
tdx_{alloc,free}_control_page(), but I am very much against tdx_{alloc,free}_page().

  (arguably S-EPT pages aren't "control" pages, but they're not guest pages either)

> (1) alloc
> tdx_alloc_control_page
>   __tdx_alloc_control_page
>     __tdx_pamt_get 
>       spin_lock(&pamt_lock)   ==> under process context
>       spin_unlock(&pamt_lock)
> 
> (2) free
> tdp_mmu_free_sp_rcu_callback
>   tdp_mmu_free_sp
>     kvm_x86_call(free_external_sp)
>      tdx_free_control_page
>         __tdx_free_control_page
>           __tdx_pamt_put
>             spin_lock(&pamt_lock)   ==> under softirq context
>             spin_unlock(&pamt_lock)
> 
> So, invoking __tdx_pamt_put() in the RCU callback triggers deadlock warning
> (see the bottom for details).

Hrm.  I can think of two options.  Option #1 would be to use a raw spinlock and
disable IRQs:

diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 823ec092b4e4..6348085d7dcb 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -2246,7 +2246,7 @@ static u64 tdh_phymem_pamt_remove(u64 pfn, u64 *pamt_pa_array)
 }
 
 /* Serializes adding/removing PAMT memory */
-static DEFINE_SPINLOCK(pamt_lock);
+static DEFINE_RAW_SPINLOCK(pamt_lock);
 
 /* Bump PAMT refcount for the given page and allocate PAMT memory if needed */
 int __tdx_pamt_get(u64 pfn, struct tdx_pamt_cache *cache)
@@ -2272,7 +2272,7 @@ int __tdx_pamt_get(u64 pfn, struct tdx_pamt_cache *cache)
        if (ret)
                goto out_free;
 
-       scoped_guard(spinlock, &pamt_lock) {
+       scoped_guard(raw_spinlock_irqsave, &pamt_lock) {
                /*
                 * Lost race to other tdx_pamt_add(). Other task has already allocated
                 * PAMT memory for the HPA.
@@ -2348,7 +2348,7 @@ void __tdx_pamt_put(u64 pfn)
        if (!atomic_dec_and_test(pamt_refcount))
                return;
 
-       scoped_guard(spinlock, &pamt_lock) {
+       scoped_guard(raw_spinlock_irqsave, &pamt_lock) {
                /* Lost race with tdx_pamt_get(). */
                if (atomic_read(pamt_refcount))
                        return;

--

Option #2 would be to immediately free the page in tdx_sept_reclaim_private_sp(),
so that pages that freed via handle_removed_pt() don't defer freeing the S-EPT
page table (which, IIUC, is safe since the TDX-Module forces TLB flushes and exits).

I really, really don't like this option (if it even works).

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index ae7b9beb3249..4726011ad624 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -2014,7 +2014,15 @@ static void tdx_sept_reclaim_private_sp(struct kvm *kvm, gfn_t gfn,
         */
        if (KVM_BUG_ON(is_hkid_assigned(to_kvm_tdx(kvm)), kvm) ||
            tdx_reclaim_page(virt_to_page(sp->external_spt)))
-               sp->external_spt = NULL;
+               goto out;
+
+       /*
+        * Immediately free the control page, as the TDX subsystem doesn't
+        * support freeing pages from RCU callbacks.
+        */
+       tdx_free_control_page((unsigned long)sp->external_spt);
+out:
+       sp->external_spt = NULL;
 }
 
 void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
--
Re: [RFC PATCH v5 20/45] KVM: x86/mmu: Allocate/free S-EPT pages using tdx_{alloc,free}_control_page()
Posted by Huang, Kai 5 days, 10 hours ago
On Wed, 2026-01-28 at 17:14 -0800, Sean Christopherson wrote:
> Now that kvm_mmu_memory_cache supports custom page allocators, wire up the
> S-EPT cache to use tdx_{alloc,free}_control_page() (arguably S-EPT pages
> aren't "control" pages, but they're not guest pages either).  Using the
> TDX APIs will make S-EPT pages naturally play nice with Dynamic PAMT, by
> virtue of adding/removing PAMT entries when S-EPT pages are allocated and
> freed, as opposed to when they are added/removed from the S-EPT tree.
> 
> Inserting PAMT entries on allocation does mean KVM will create
> unnecessary PAMT entries, e.g. once a vCPU stops faulting in memory, the
> remaining pages in the MMU cache will go unused.  But in practice, odds
> are very good the containing 2MiB page will have other in-use S-EPT pages,
> i.e. will create PAMT entries anyways.  And _if_ creating PAMT entries on
> allocation is problematic for memory consumption, that can be resolved by
> tweaking KVM's cache size.
> 
> Suggested-by: Kai Huang <kai.huang@intel.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>

Reviewed-by: Kai Huang <kai.huang@intel.com>

Some nits below ..


[...]

>  	int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
>  				 u64 mirror_spte);
> -
> -	/* Update external page tables for page table about to be freed. */
>  	void (*reclaim_external_sp)(struct kvm *kvm, gfn_t gfn,
>  				    struct kvm_mmu_page *sp);
> -
> -	/* Update external page table from spte getting removed, and flush TLB. */

The above two comments are still useful to me.

Not sure why do you want to remove them, especially in _this_ patch?

>  	void (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
>  				     u64 mirror_spte);
>  
> +

Unintentional change?

>  	bool (*has_wbinvd_exit)(void);
>  
>  	u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 3911ac9bddfd..9b5a6861e2a4 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -6690,11 +6690,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
>  	vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
>  	vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
>  
> -	vcpu->arch.mmu_shadow_page_cache.init_value =
> -		SHADOW_NONPRESENT_VALUE;
> +	vcpu->arch.mmu_shadow_page_cache.init_value = SHADOW_NONPRESENT_VALUE;
>  	if (!vcpu->arch.mmu_shadow_page_cache.init_value)
>  		vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;

Ditto.  Not sure this adjustment is intentional?
Re: [RFC PATCH v5 20/45] KVM: x86/mmu: Allocate/free S-EPT pages using tdx_{alloc,free}_control_page()
Posted by Sean Christopherson 5 days, 1 hour ago
On Tue, Feb 03, 2026, Kai Huang wrote:
> On Wed, 2026-01-28 at 17:14 -0800, Sean Christopherson wrote:
> >  	int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> >  				 u64 mirror_spte);
> > -
> > -	/* Update external page tables for page table about to be freed. */
> >  	void (*reclaim_external_sp)(struct kvm *kvm, gfn_t gfn,
> >  				    struct kvm_mmu_page *sp);
> > -
> > -	/* Update external page table from spte getting removed, and flush TLB. */
> 
> The above two comments are still useful to me.
> 
> Not sure why do you want to remove them, especially in _this_ patch?

My intent was to replace the individual comments with a more generic comment for
all of the "external" hooks.  For things like "and flush TLB", IMO those comments
belong at the call sites, not at this point.  E.g. _KVM_ doesn't require a TLB
flush in all cases.  And so for the definition of the hooks, I would prefer a more
generic comment, so that if there are details that matter to the usage, they are
documented there.

> >  	void (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> >  				     u64 mirror_spte);
> >  
> > +
> 
> Unintentional change?

Ya.

> 
> >  	bool (*has_wbinvd_exit)(void);
> >  
> >  	u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 3911ac9bddfd..9b5a6861e2a4 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -6690,11 +6690,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
> >  	vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
> >  	vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
> >  
> > -	vcpu->arch.mmu_shadow_page_cache.init_value =
> > -		SHADOW_NONPRESENT_VALUE;
> > +	vcpu->arch.mmu_shadow_page_cache.init_value = SHADOW_NONPRESENT_VALUE;
> >  	if (!vcpu->arch.mmu_shadow_page_cache.init_value)
> >  		vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
> 
> Ditto.  Not sure this adjustment is intentional?

Heh, I'm pretty sure it was intentional, but yeah, doesn't belong here.
Re: [RFC PATCH v5 20/45] KVM: x86/mmu: Allocate/free S-EPT pages using tdx_{alloc,free}_control_page()
Posted by Huang, Kai 5 days ago
On Tue, 2026-02-03 at 12:17 -0800, Sean Christopherson wrote:
> On Tue, Feb 03, 2026, Kai Huang wrote:
> > On Wed, 2026-01-28 at 17:14 -0800, Sean Christopherson wrote:
> > >   	int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> > >   				 u64 mirror_spte);
> > > -
> > > -	/* Update external page tables for page table about to be freed. */
> > >   	void (*reclaim_external_sp)(struct kvm *kvm, gfn_t gfn,
> > >   				    struct kvm_mmu_page *sp);
> > > -
> > > -	/* Update external page table from spte getting removed, and flush TLB. */
> > 
> > The above two comments are still useful to me.
> > 
> > Not sure why do you want to remove them, especially in _this_ patch?
> 
> My intent was to replace the individual comments with a more generic comment for
> all of the "external" hooks.  For things like "and flush TLB", IMO those comments
> belong at the call sites, not at this point.  E.g. _KVM_ doesn't require a TLB
> flush in all cases.  And so for the definition of the hooks, I would prefer a more
> generic comment, so that if there are details that matter to the usage, they are
> documented there.

I see.  You actually mentioned "propagate changes in mirror page tables to
the external page tables" in the new comment, so all make sense to me now.