[PATCH v3 12/16] x86/virt/tdx: Add helpers to allow for pre-allocating pages

Posted by Rick Edgecombe 1 week, 6 days ago
In the KVM fault path, page tables and private pages need to be
installed under a spin lock. This means that the operations around
installing PAMT pages for them will not be able to allocate pages.

Create a small structure to allow passing a list of pre-allocated pages
that PAMT operations can use. Have the structure keep a count so that it
can be stored on KVM's vCPU structure and "topped up" for each fault.
This is consistent with how KVM manages similar caches and fits better
than allocating and freeing all of the possibly needed pages each time.

Adding this structure duplicates a fancier one that lives in KVM,
'struct kvm_mmu_memory_cache'. While the struct itself would be easy to
expose, the functions that operate on it are a bit big to put in a
header, which would be needed in order to use them from the core kernel.
So don't pursue this option.

To avoid the problem of the core kernel needing to link against
functionality in KVM, a function pointer could be passed; however, this
makes the code convoluted when what is needed is barely more than a
linked list. So create a tiny, simpler version of KVM's
kvm_mmu_memory_cache to use for PAMT pages.
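
A minimal sketch of the intended flow (illustrative only, using the
helpers and per-vCPU cache added in this series):

	/* Outside the MMU spin lock: refill the per-vCPU cache. */
	if (topup_tdx_prealloc_page(&tdx->prealloc, min_fault_cache_size))
		return -ENOMEM;

	/* Later, under the MMU spin lock: consume without allocating. */
	ret = tdx_pamt_get(page, &tdx->prealloc);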

Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
---
 arch/x86/include/asm/tdx.h  | 43 ++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/tdx.c      | 16 +++++++++++---
 arch/x86/kvm/vmx/tdx.h      |  2 +-
 arch/x86/virt/vmx/tdx/tdx.c | 22 +++++++++++++------
 virt/kvm/kvm_main.c         |  2 --
 5 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 439dd5c5282e..e108b48af2c3 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -17,6 +17,7 @@
 #include <uapi/asm/mce.h>
 #include <asm/tdx_global_metadata.h>
 #include <linux/pgtable.h>
+#include <linux/memory.h>
 
 /*
  * Used by the #VE exception handler to gather the #VE exception
@@ -116,7 +117,46 @@ int tdx_guest_keyid_alloc(void);
 u32 tdx_get_nr_guest_keyids(void);
 void tdx_guest_keyid_free(unsigned int keyid);
 
-int tdx_pamt_get(struct page *page);
+int tdx_dpamt_entry_pages(void);
+
+/*
+ * Simple structure for pre-allocating Dynamic
+ * PAMT pages outside of locks.
+ */
+struct tdx_prealloc {
+	struct list_head page_list;
+	int cnt;
+};
+
+static inline struct page *get_tdx_prealloc_page(struct tdx_prealloc *prealloc)
+{
+	struct page *page;
+
+	page = list_first_entry_or_null(&prealloc->page_list, struct page, lru);
+	if (page) {
+		list_del(&page->lru);
+		prealloc->cnt--;
+	}
+
+	return page;
+}
+
+static inline int topup_tdx_prealloc_page(struct tdx_prealloc *prealloc, unsigned int min_size)
+{
+	while (prealloc->cnt < min_size) {
+		struct page *page = alloc_page(GFP_KERNEL);
+
+		if (!page)
+			return -ENOMEM;
+
+		list_add(&page->lru, &prealloc->page_list);
+		prealloc->cnt++;
+	}
+
+	return 0;
+}
+
+int tdx_pamt_get(struct page *page, struct tdx_prealloc *prealloc);
 void tdx_pamt_put(struct page *page);
 
 struct page *tdx_alloc_page(void);
@@ -192,6 +232,7 @@ static inline int tdx_enable(void)  { return -ENODEV; }
 static inline u32 tdx_get_nr_guest_keyids(void) { return 0; }
 static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; }
 static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; }
+static inline int tdx_dpamt_entry_pages(void) { return 0; }
 #endif	/* CONFIG_INTEL_TDX_HOST */
 
 #endif /* !__ASSEMBLER__ */
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 6c9e11be9705..b274d350165c 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1593,16 +1593,26 @@ static void tdx_unpin(struct kvm *kvm, struct page *page)
 static void *tdx_alloc_external_fault_cache(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_tdx *tdx = to_tdx(vcpu);
+	struct page *page = get_tdx_prealloc_page(&tdx->prealloc);
 
-	return kvm_mmu_memory_cache_alloc(&tdx->mmu_external_spt_cache);
+	if (!page)
+		return NULL;
+
+	return page_address(page);
 }
 
 static int tdx_topup_external_fault_cache(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_tdx *tdx = to_tdx(vcpu);
+	struct tdx_prealloc *prealloc = &tdx->prealloc;
+	int min_fault_cache_size;
 
-	return kvm_mmu_topup_memory_cache(&tdx->mmu_external_spt_cache,
-					  PT64_ROOT_MAX_LEVEL);
+	/* External page tables */
+	min_fault_cache_size = PT64_ROOT_MAX_LEVEL;
+	/* Dynamic PAMT pages (if enabled) */
+	min_fault_cache_size += tdx_dpamt_entry_pages() * PT64_ROOT_MAX_LEVEL;
+
+	return topup_tdx_prealloc_page(prealloc, min_fault_cache_size);
 }
 
 static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index cd7993ef056e..68bb841c1b6c 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -71,7 +71,7 @@ struct vcpu_tdx {
 	u64 map_gpa_next;
 	u64 map_gpa_end;
 
-	struct kvm_mmu_memory_cache mmu_external_spt_cache;
+	struct tdx_prealloc prealloc;
 };
 
 void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err);
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index c25e238931a7..b4edc3ee495c 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -1999,13 +1999,23 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
 EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
 
 /* Number PAMT pages to be provided to TDX module per 2M region of PA */
-static int tdx_dpamt_entry_pages(void)
+int tdx_dpamt_entry_pages(void)
 {
 	if (!tdx_supports_dynamic_pamt(&tdx_sysinfo))
 		return 0;
 
 	return tdx_sysinfo.tdmr.pamt_4k_entry_size * PTRS_PER_PTE / PAGE_SIZE;
 }
+EXPORT_SYMBOL_GPL(tdx_dpamt_entry_pages);
+
+static struct page *alloc_dpamt_page(struct tdx_prealloc *prealloc)
+{
+	if (prealloc)
+		return get_tdx_prealloc_page(prealloc);
+
+	return alloc_page(GFP_KERNEL);
+}
+
 
 /*
  * The TDX spec treats the registers like an array, as they are ordered
@@ -2032,12 +2042,12 @@ static u64 *dpamt_args_array_ptr(struct tdx_module_args *args)
 	return (u64 *)((u8 *)args + offsetof(struct tdx_module_args, rdx));
 }
 
-static int alloc_pamt_array(u64 *pa_array)
+static int alloc_pamt_array(u64 *pa_array, struct tdx_prealloc *prealloc)
 {
 	struct page *page;
 
 	for (int i = 0; i < tdx_dpamt_entry_pages(); i++) {
-		page = alloc_page(GFP_KERNEL);
+		page = alloc_dpamt_page(prealloc);
 		if (!page)
 			return -ENOMEM;
 		pa_array[i] = page_to_phys(page);
@@ -2111,7 +2121,7 @@ static u64 tdh_phymem_pamt_remove(unsigned long hpa, u64 *pamt_pa_array)
 static DEFINE_SPINLOCK(pamt_lock);
 
 /* Bump PAMT refcount for the given page and allocate PAMT memory if needed */
-int tdx_pamt_get(struct page *page)
+int tdx_pamt_get(struct page *page, struct tdx_prealloc *prealloc)
 {
 	unsigned long hpa = ALIGN_DOWN(page_to_phys(page), PMD_SIZE);
 	u64 pamt_pa_array[MAX_DPAMT_ARG_SIZE];
@@ -2122,7 +2132,7 @@ int tdx_pamt_get(struct page *page)
 	if (!tdx_supports_dynamic_pamt(&tdx_sysinfo))
 		return 0;
 
-	ret = alloc_pamt_array(pamt_pa_array);
+	ret = alloc_pamt_array(pamt_pa_array, prealloc);
 	if (ret)
 		return ret;
 
@@ -2228,7 +2238,7 @@ struct page *tdx_alloc_page(void)
 	if (!page)
 		return NULL;
 
-	if (tdx_pamt_get(page)) {
+	if (tdx_pamt_get(page, NULL)) {
 		__free_page(page);
 		return NULL;
 	}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f05e6d43184b..fee108988028 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -404,7 +404,6 @@ int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
 {
 	return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_topup_memory_cache);
 
 int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
 {
@@ -437,7 +436,6 @@ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 	BUG_ON(!p);
 	return p;
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_memory_cache_alloc);
 #endif
 
 static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
-- 
2.51.0
Re: [PATCH v3 12/16] x86/virt/tdx: Add helpers to allow for pre-allocating pages
Posted by Dave Hansen 5 days, 16 hours ago
On 9/18/25 16:22, Rick Edgecombe wrote:
> +/*
> + * Simple structure for pre-allocating Dynamic
> + * PAMT pages outside of locks.
> + */
> +struct tdx_prealloc {
> +	struct list_head page_list;
> +	int cnt;
> +};

This is compact and all. But it's really just an open-coded, simplified
version of what mempool_t plus mempool_init_page_pool() would do.

Could you take a look at that and double check that it's not a good fit
here, please?
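
For reference, a rough sketch of what that might look like (assuming the
standard mempool API; the pool placement and sizing here are just
illustrative):

	#include <linux/mempool.h>

	mempool_t pamt_pool;

	/* Outside the lock: reserve enough order-0 pages. */
	int ret = mempool_init_page_pool(&pamt_pool, min_fault_cache_size, 0);

	/* Under the lock: draw from the reserve without sleeping. */
	struct page *page = mempool_alloc(&pamt_pool, GFP_NOWAIT);
	...
	mempool_free(page, &pamt_pool);
	mempool_exit(&pamt_pool);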
Re: [PATCH v3 12/16] x86/virt/tdx: Add helpers to allow for pre-allocating pages
Posted by Edgecombe, Rick P 5 days, 15 hours ago
On Fri, 2025-09-26 at 08:19 -0700, Dave Hansen wrote:
> On 9/18/25 16:22, Rick Edgecombe wrote:
> > +/*
> > + * Simple structure for pre-allocating Dynamic
> > + * PAMT pages outside of locks.
> > + */
> > +struct tdx_prealloc {
> > +	struct list_head page_list;
> > +	int cnt;
> > +};
> 
> This is compact and all. But it's really just an open-coded,
> simplified version of what mempool_t plus mempool_init_page_pool()
> would do.
> 
> Could you take a look at that and double check that it's not a good
> fit here, please?

Yes! I searched and was surprised there wasn't something like this. I
will give this one a try.
Re: [PATCH v3 12/16] x86/virt/tdx: Add helpers to allow for pre-allocating pages
Posted by Yan Zhao 6 days, 6 hours ago
On Thu, Sep 18, 2025 at 04:22:20PM -0700, Rick Edgecombe wrote:
> In the KVM fault path pagei, tables and private pages need to be
> installed under a spin lock. This means that the operations around
> installing PAMT pages for them will not be able to allocate pages.
> 
> Create a small structure to allow passing a list of pre-allocated pages
> that PAMT operations can use. Have the structure keep a count such that
> it can be stored on KVM's vCPU structure, and "topped up" for each fault.
> This is consistent with how KVM manages similar caches and will fit better
> than allocating and freeing all possible needed pages each time.
> 
> Adding this structure duplicates a fancier one that lives in KVM 'struct
> kvm_mmu_memory_cache'. While the struct itself is easy to expose, the
> functions that operate on it are a bit big to put in a header, which
> would be needed to use them from the core kernel. So don't pursue this
> option.
> 
> To avoid problem of needing the kernel to link to functionality in KVM,
> a function pointer could be passed, however this makes the code
> convoluted, when what is needed is barely more than a linked list. So
> create a tiny, simpler version of KVM's kvm_mmu_memory_cache to use for
> PAMT pages.
> 
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> ---
>  arch/x86/include/asm/tdx.h  | 43 ++++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/vmx/tdx.c      | 16 +++++++++++---
>  arch/x86/kvm/vmx/tdx.h      |  2 +-
>  arch/x86/virt/vmx/tdx/tdx.c | 22 +++++++++++++------
>  virt/kvm/kvm_main.c         |  2 --
>  5 files changed, 72 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> index 439dd5c5282e..e108b48af2c3 100644
> --- a/arch/x86/include/asm/tdx.h
> +++ b/arch/x86/include/asm/tdx.h
> @@ -17,6 +17,7 @@
>  #include <uapi/asm/mce.h>
>  #include <asm/tdx_global_metadata.h>
>  #include <linux/pgtable.h>
> +#include <linux/memory.h>
>  
>  /*
>   * Used by the #VE exception handler to gather the #VE exception
> @@ -116,7 +117,46 @@ int tdx_guest_keyid_alloc(void);
>  u32 tdx_get_nr_guest_keyids(void);
>  void tdx_guest_keyid_free(unsigned int keyid);
>  
> -int tdx_pamt_get(struct page *page);
> +int tdx_dpamt_entry_pages(void);
> +
> +/*
> + * Simple structure for pre-allocating Dynamic
> + * PAMT pages outside of locks.
> + */
> +struct tdx_prealloc {
> +	struct list_head page_list;
> +	int cnt;
> +};
> +
> +static inline struct page *get_tdx_prealloc_page(struct tdx_prealloc *prealloc)
> +{
> +	struct page *page;
> +
> +	page = list_first_entry_or_null(&prealloc->page_list, struct page, lru);
> +	if (page) {
> +		list_del(&page->lru);
> +		prealloc->cnt--;
> +	}
> +
> +	return page;
> +}
> +
> +static inline int topup_tdx_prealloc_page(struct tdx_prealloc *prealloc, unsigned int min_size)
> +{
> +	while (prealloc->cnt < min_size) {
> +		struct page *page = alloc_page(GFP_KERNEL);
> +
> +		if (!page)
> +			return -ENOMEM;
> +
> +		list_add(&page->lru, &prealloc->page_list);
> +		prealloc->cnt++;
> +	}
> +
> +	return 0;
> +}
> +
> +int tdx_pamt_get(struct page *page, struct tdx_prealloc *prealloc);
>  void tdx_pamt_put(struct page *page);
>  
>  struct page *tdx_alloc_page(void);
> @@ -192,6 +232,7 @@ static inline int tdx_enable(void)  { return -ENODEV; }
>  static inline u32 tdx_get_nr_guest_keyids(void) { return 0; }
>  static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; }
>  static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; }
> +static inline int tdx_dpamt_entry_pages(void) { return 0; }
>  #endif	/* CONFIG_INTEL_TDX_HOST */
>  
>  #endif /* !__ASSEMBLER__ */
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 6c9e11be9705..b274d350165c 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -1593,16 +1593,26 @@ static void tdx_unpin(struct kvm *kvm, struct page *page)
>  static void *tdx_alloc_external_fault_cache(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_tdx *tdx = to_tdx(vcpu);
> +	struct page *page = get_tdx_prealloc_page(&tdx->prealloc);
>  
> -	return kvm_mmu_memory_cache_alloc(&tdx->mmu_external_spt_cache);
> +	if (!page)
> +		return NULL;
> +
> +	return page_address(page);
>  }
>  
>  static int tdx_topup_external_fault_cache(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_tdx *tdx = to_tdx(vcpu);
> +	struct tdx_prealloc *prealloc = &tdx->prealloc;
> +	int min_fault_cache_size;
>  
> -	return kvm_mmu_topup_memory_cache(&tdx->mmu_external_spt_cache,
> -					  PT64_ROOT_MAX_LEVEL);
> +	/* External page tables */
> +	min_fault_cache_size = PT64_ROOT_MAX_LEVEL;

min_fault_cache_size = PT64_ROOT_MAX_LEVEL - 1?
We don't need to allocate page for the root page.

> +	/* Dynamic PAMT pages (if enabled) */
> +	min_fault_cache_size += tdx_dpamt_entry_pages() * PT64_ROOT_MAX_LEVEL;
> +
What about commenting that it's
tdx_dpamt_entry_pages() * ((PT64_ROOT_MAX_LEVEL - 1) + 1) ?
i.e.,
(PT64_ROOT_MAX_LEVEL  - 1) for page table pages, and 1 for guest private page.
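
Something like this, perhaps (just a sketch of the suggested accounting):

	/* External page tables: one per non-root S-EPT level */
	min_fault_cache_size = PT64_ROOT_MAX_LEVEL - 1;
	/*
	 * Dynamic PAMT pages (if enabled): one set per page table page,
	 * plus one more set for the guest private page itself.
	 */
	min_fault_cache_size += tdx_dpamt_entry_pages() *
				((PT64_ROOT_MAX_LEVEL - 1) + 1);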


> +	return topup_tdx_prealloc_page(prealloc, min_fault_cache_size);
>  }
>  
>  static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index cd7993ef056e..68bb841c1b6c 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -71,7 +71,7 @@ struct vcpu_tdx {
>  	u64 map_gpa_next;
>  	u64 map_gpa_end;
>  
> -	struct kvm_mmu_memory_cache mmu_external_spt_cache;
> +	struct tdx_prealloc prealloc;
>  };
>  
>  void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err);
> diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> index c25e238931a7..b4edc3ee495c 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.c
> +++ b/arch/x86/virt/vmx/tdx/tdx.c
> @@ -1999,13 +1999,23 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
>  EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
>  
>  /* Number PAMT pages to be provided to TDX module per 2M region of PA */
> -static int tdx_dpamt_entry_pages(void)
> +int tdx_dpamt_entry_pages(void)
>  {
>  	if (!tdx_supports_dynamic_pamt(&tdx_sysinfo))
>  		return 0;
>  
>  	return tdx_sysinfo.tdmr.pamt_4k_entry_size * PTRS_PER_PTE / PAGE_SIZE;
>  }
> +EXPORT_SYMBOL_GPL(tdx_dpamt_entry_pages);
> +
> +static struct page *alloc_dpamt_page(struct tdx_prealloc *prealloc)
> +{
> +	if (prealloc)
> +		return get_tdx_prealloc_page(prealloc);
> +
> +	return alloc_page(GFP_KERNEL);
> +}
> +
>  
>  /*
>   * The TDX spec treats the registers like an array, as they are ordered
> @@ -2032,12 +2042,12 @@ static u64 *dpamt_args_array_ptr(struct tdx_module_args *args)
>  	return (u64 *)((u8 *)args + offsetof(struct tdx_module_args, rdx));
>  }
>  
> -static int alloc_pamt_array(u64 *pa_array)
> +static int alloc_pamt_array(u64 *pa_array, struct tdx_prealloc *prealloc)
>  {
>  	struct page *page;
>  
>  	for (int i = 0; i < tdx_dpamt_entry_pages(); i++) {
> -		page = alloc_page(GFP_KERNEL);
> +		page = alloc_dpamt_page(prealloc);
>  		if (!page)
>  			return -ENOMEM;
>  		pa_array[i] = page_to_phys(page);
> @@ -2111,7 +2121,7 @@ static u64 tdh_phymem_pamt_remove(unsigned long hpa, u64 *pamt_pa_array)
>  static DEFINE_SPINLOCK(pamt_lock);
>  
>  /* Bump PAMT refcount for the given page and allocate PAMT memory if needed */
> -int tdx_pamt_get(struct page *page)
> +int tdx_pamt_get(struct page *page, struct tdx_prealloc *prealloc)
>  {
>  	unsigned long hpa = ALIGN_DOWN(page_to_phys(page), PMD_SIZE);
>  	u64 pamt_pa_array[MAX_DPAMT_ARG_SIZE];
> @@ -2122,7 +2132,7 @@ int tdx_pamt_get(struct page *page)
>  	if (!tdx_supports_dynamic_pamt(&tdx_sysinfo))
>  		return 0;
>  
> -	ret = alloc_pamt_array(pamt_pa_array);
> +	ret = alloc_pamt_array(pamt_pa_array, prealloc);
>  	if (ret)
>  		return ret;
>  
> @@ -2228,7 +2238,7 @@ struct page *tdx_alloc_page(void)
>  	if (!page)
>  		return NULL;
>  
> -	if (tdx_pamt_get(page)) {
> +	if (tdx_pamt_get(page, NULL)) {
>  		__free_page(page);
>  		return NULL;
>  	}
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index f05e6d43184b..fee108988028 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -404,7 +404,6 @@ int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
>  {
>  	return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
>  }
> -EXPORT_SYMBOL_GPL(kvm_mmu_topup_memory_cache);
>  
>  int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
>  {
> @@ -437,7 +436,6 @@ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
>  	BUG_ON(!p);
>  	return p;
>  }
> -EXPORT_SYMBOL_GPL(kvm_mmu_memory_cache_alloc);
>  #endif
>  
>  static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
> -- 
> 2.51.0
>
Re: [PATCH v3 12/16] x86/virt/tdx: Add helpers to allow for pre-allocating pages
Posted by Edgecombe, Rick P 5 days, 9 hours ago
On Fri, 2025-09-26 at 09:44 +0800, Yan Zhao wrote:
> > -	return kvm_mmu_topup_memory_cache(&tdx-
> > >mmu_external_spt_cache,
> > -					  PT64_ROOT_MAX_LEVEL);
> > +	/* External page tables */
> > +	min_fault_cache_size = PT64_ROOT_MAX_LEVEL;
> 
> min_fault_cache_size = PT64_ROOT_MAX_LEVEL - 1?
> We don't need to allocate page for the root page.

Why change it in this patch?

> 
> > +	/* Dynamic PAMT pages (if enabled) */
> > +	min_fault_cache_size += tdx_dpamt_entry_pages() *
> > PT64_ROOT_MAX_LEVEL;
> > +
> What about commenting that it's
> tdx_dpamt_entry_pages() * ((PT64_ROOT_MAX_LEVEL - 1) + 1) ?
> i.e.,
> (PT64_ROOT_MAX_LEVEL  - 1) for page table pages, and 1 for guest
> private page.

Yes the comment could be improved. I'll enhance it. Thanks.
Re: [PATCH v3 12/16] x86/virt/tdx: Add helpers to allow for pre-allocating pages
Posted by Yan Zhao 4 days, 6 hours ago
On Sat, Sep 27, 2025 at 06:05:12AM +0800, Edgecombe, Rick P wrote:
> On Fri, 2025-09-26 at 09:44 +0800, Yan Zhao wrote:
> > > -	return kvm_mmu_topup_memory_cache(&tdx-
> > > >mmu_external_spt_cache,
> > > -					  PT64_ROOT_MAX_LEVEL);
> > > +	/* External page tables */
> > > +	min_fault_cache_size = PT64_ROOT_MAX_LEVEL;
> > 
> > min_fault_cache_size = PT64_ROOT_MAX_LEVEL - 1?
> > We don't need to allocate page for the root page.
> 
> Why change it in this patch?
The size previously determined by the TDP MMU core was just unnecessarily 1
more than required (I guess due to historical reasons).

But since the size is now determined by TDX itself, I think it can be
accurate.

> > > +	/* Dynamic PAMT pages (if enabled) */
> > > +	min_fault_cache_size += tdx_dpamt_entry_pages() *
> > > PT64_ROOT_MAX_LEVEL;
> > > +
> > What about commenting that it's
> > tdx_dpamt_entry_pages() * ((PT64_ROOT_MAX_LEVEL - 1) + 1) ?
> > i.e.,
> > (PT64_ROOT_MAX_LEVEL  - 1) for page table pages, and 1 for guest
> > private page.
> 
> Yes the comment could be improved. I'll enhance it. Thanks.
Re: [PATCH v3 12/16] x86/virt/tdx: Add helpers to allow for pre-allocating pages
Posted by Huang, Kai 1 week, 2 days ago
> +/*
> + * Simple structure for pre-allocating Dynamic
> + * PAMT pages outside of locks.
> + */
> +struct tdx_prealloc {
> +	struct list_head page_list;
> +	int cnt;
> +};
> +
> +static inline struct page *get_tdx_prealloc_page(struct tdx_prealloc *prealloc)
> +{
> +	struct page *page;
> +
> +	page = list_first_entry_or_null(&prealloc->page_list, struct page, lru);
> +	if (page) {
> +		list_del(&page->lru);
> +		prealloc->cnt--;
> +	}
> +
> +	return page;
> +}
> +
> +static inline int topup_tdx_prealloc_page(struct tdx_prealloc *prealloc, unsigned int min_size)
> +{
> +	while (prealloc->cnt < min_size) {
> +		struct page *page = alloc_page(GFP_KERNEL);
> +
> +		if (!page)
> +			return -ENOMEM;
> +
> +		list_add(&page->lru, &prealloc->page_list);
> +		prealloc->cnt++;
> +	}
> +
> +	return 0;
> +}

Since 'struct tdx_prealloc' replaces the KVM standard 'struct
kvm_mmu_memory_cache' for external page table, and it is allowed to fail
in "topup" operation, why not just call tdx_alloc_page() to "topup" page
for external page table here?

I don't think we need to keep all "DPAMT pages" in the pool, right?

If tdx_alloc_page() succeeds, then the "DPAMT pages" are also "topup"ed,
and PAMT entries for the 2M range of the SEPT page is ready too.

This at least avoids having to export tdx_dpamt_entry_pages(), which is
not nice IMHO.  And I think it should yield simpler code.

One more thinking:

I also have been thinking whether we can continue to use the KVM standard
'struct kvm_mmu_memory_cache' for S-EPT pages.  Below is one more idea for
your reference.

In the previous discussion I think we concluded the 'kmem_cache' doesn't
work nicely with DPAMT (due to the ctor() cannot fail etc).  And when we
don't use 'kmem_cache', KVM just call __get_free_page() to topup objects.
But we need tdx_alloc_page() for allocation here, so this is the problem.

If we add two callbacks for object allocation/free to 'struct
kvm_mmu_memory_cache', then we can have place to hook tdx_alloc_page().

Something like the below:

diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 827ecc0b7e10..5dbd80773689 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -88,6 +88,8 @@ struct kvm_mmu_memory_cache {
        gfp_t gfp_custom;
        u64 init_value;
        struct kmem_cache *kmem_cache;
+       void*(*obj_alloc)(gfp_t gfp);
+       void (*obj_free)(void *);
        int capacity;
        int nobjs;
        void **objects;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6c07dd423458..df2c2100d656 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -355,7 +355,10 @@ static inline void *mmu_memory_cache_alloc_obj(struct
kvm_mmu_memory_cache *mc,
        if (mc->kmem_cache)
                return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
 
-       page = (void *)__get_free_page(gfp_flags);
+       if (mc->obj_alloc)
+               page = mc->obj_alloc(gfp_flags);
+       else
+               page = (void *)__get_free_page(gfp_flags);
        if (page && mc->init_value)
                memset64(page, mc->init_value, PAGE_SIZE / sizeof(u64));
        return page;
@@ -415,6 +418,8 @@ void kvm_mmu_free_memory_cache(struct
kvm_mmu_memory_cache *mc)
        while (mc->nobjs) {
                if (mc->kmem_cache)
                        kmem_cache_free(mc->kmem_cache, mc->objects[--mc-
>nobjs]);
+               else if (mc->obj_free)
+                       mc->obj_free(mc->objects[--mc->nobjs]);
                else
                        free_page((unsigned long)mc->objects[--mc-
>nobjs]);
        }

Re: [PATCH v3 12/16] x86/virt/tdx: Add helpers to allow for pre-allocating pages
Posted by Edgecombe, Rick P 5 days, 8 hours ago
On Mon, 2025-09-22 at 11:20 +0000, Huang, Kai wrote:
> Since 'struct tdx_prealloc' replaces the KVM standard 'struct
> kvm_mmu_memory_cache' for external page table, and it is allowed to
> fail in "topup" operation, why not just call tdx_alloc_page() to
> "topup" page for external page table here?

I sympathize with the intuition. It would be nice to just prep
everything and then operate on it like normal.

We want this to not have to be totally redone for huge pages. In the
huge pages case, we could do this approach for the page tables, but for
the private page itself, we don't know whether we need 4KB PAMT backing
or not. So we don't fully know whether a TDX private page needs PAMT
4KB backing or not before the fault.

So we would need, like, separate pools for page tables and private
pages. Or someway to unwind the wrong guess of small page. At that
point I don't think it's simpler.

> 
> I don't think we need to keep all "DPAMT pages" in the pool, right?

Not sure what you mean by this.

> 
> If tdx_alloc_page() succeeds, then the "DPAMT pages" are also
> "topup"ed, and PAMT entries for the 2M range of the SEPT page is
> ready too.
> 
> This at least avoids having to export tdx_dpamt_entry_pages(), which
> is not nice IMHO.  And I think it should yield simpler code.

I mean less exports is better, but I don't follow what is so egregious.
It's not called from core KVM code.

> 
> One more thinking:
> 
> I also have been thinking whether we can continue to use the KVM
> standard 'struct kvm_mmu_memory_cache' for S-EPT pages.  Below is one
> more idea for your reference.

The point of the new struct was to hand it to the arch/x86 side of the
house. If we don't need to do that, then yes we could have options. And
Dave suggested another struct that could be used to hand off the cache.

> 
> In the previous discussion I think we concluded the 'kmem_cache'
> doesn't work nicely with DPAMT (due to the ctor() cannot fail etc). 
> And when we don't use 'kmem_cache', KVM just call __get_free_page()
> to topup objects.
> But we need tdx_alloc_page() for allocation here, so this is the
> problem.
> 
> If we add two callbacks for object allocation/free to 'struct
> kvm_mmu_memory_cache', then we can have place to hook
> tdx_alloc_page().

kvm_mmu_memory_cache has a lot of options at this point. All we really
need is a list. I'm not sure it makes sense to keep cramming things
into it?

> 
> Something like the below:
> 
> diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
> index 827ecc0b7e10..5dbd80773689 100644
> --- a/include/linux/kvm_types.h
> +++ b/include/linux/kvm_types.h
> @@ -88,6 +88,8 @@ struct kvm_mmu_memory_cache {
>         gfp_t gfp_custom;
>         u64 init_value;
>         struct kmem_cache *kmem_cache;
> +       void*(*obj_alloc)(gfp_t gfp);
> +       void (*obj_free)(void *);
>         int capacity;
>         int nobjs;
>         void **objects;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 6c07dd423458..df2c2100d656 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -355,7 +355,10 @@ static inline void
> *mmu_memory_cache_alloc_obj(struct
> kvm_mmu_memory_cache *mc,
>         if (mc->kmem_cache)
>                 return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
>  
> -       page = (void *)__get_free_page(gfp_flags);
> +       if (mc->obj_alloc)
> +               page = mc->obj_alloc(gfp_flags);
> +       else
> +               page = (void *)__get_free_page(gfp_flags);
>         if (page && mc->init_value)
>                 memset64(page, mc->init_value, PAGE_SIZE /
> sizeof(u64));
>         return page;
> @@ -415,6 +418,8 @@ void kvm_mmu_free_memory_cache(struct
> kvm_mmu_memory_cache *mc)
>         while (mc->nobjs) {
>                 if (mc->kmem_cache)
>                         kmem_cache_free(mc->kmem_cache, mc->objects[-
> -mc-
> > nobjs]);
> +               else if (mc->obj_free)
> +                       mc->obj_free(mc->objects[--mc->nobjs]);
>                 else
>                         free_page((unsigned long)mc->objects[--mc-
> > nobjs]);
>         }

Re: [PATCH v3 12/16] x86/virt/tdx: Add helpers to allow for pre-allocating pages
Posted by Huang, Kai 3 days, 8 hours ago
On Fri, 2025-09-26 at 23:47 +0000, Edgecombe, Rick P wrote:
> On Mon, 2025-09-22 at 11:20 +0000, Huang, Kai wrote:
> > Since 'struct tdx_prealloc' replaces the KVM standard 'struct
> > kvm_mmu_memory_cache' for external page table, and it is allowed to
> > fail in "topup" operation, why not just call tdx_alloc_page() to
> > "topup" page for external page table here?
> 
> I sympathize with the intuition. It would be nice to just prep
> everything and then operate on it like normal.
> 
> We want this to not have to be totally redone for huge pages. In the
> huge pages case, we could do this approach for the page tables, but for
> the private page itself, we don't know whether we need 4KB PAMT backing
> or not. So we don't fully know whether a TDX private page needs PAMT
> 4KB backing or not before the fault.
> 
> So we would need, like, separate pools for page tables and private
> pages. 
> 

The private page itself comes from the guest_memfd via page cache, so we
cannot use tdx_alloc_page() for it anyway but need to use tdx_pamt_get()
explicitly.

I don't know all the details of exactly how this interacts with huge pages
here -- my guess would be that we only call tdx_pamt_get() when we find that
the private page is 4K rather than 2M [*], but I don't see how this conflicts
with using tdx_alloc_page() for the page table itself.

The point is that the page table itself is always 4K, so it is no different
from the control pages.

[*] this is actually an optimization but not a must for supporting
hugepage with DPAMT AFAICT.  Theoretically, we can always allocate DPAMT
pages upfront for hugepage at allocation time in guest_memfd regardless
whether it is actually mapped as hugepage in S-EPT, and we can free DPAMT
pages when we promote 4K pages to a hugepage in S-EPT to effectively
reduce DPAMT pages for hugepage.

> Or someway to unwind the wrong guess of small page. At that
> point I don't think it's simpler.
> 
> > 
> > I don't think we need to keep all "DPAMT pages" in the pool, right?
> 
> Not sure what you mean by this.

I mean we don't need to keep DPAMT pages in the list of 'struct
tdx_prealloc'.  tdx_pamt_put() gets the DPAMT pages back from the TDX
module and just frees them.

> 
> > 
> > If tdx_alloc_page() succeeds, then the "DPAMT pages" are also
> > "topup"ed, and PAMT entries for the 2M range of the SEPT page is
> > ready too.
> > 
> > This at least avoids having to export tdx_dpamt_entry_pages(), which
> > is not nice IMHO.  And I think it should yield simpler code.
> 
> I mean less exports is better, but I don't follow what is so egregious.
> It's not called from core KVM code.
> 
> > 
> > One more thinking:
> > 
> > I also have been thinking whether we can continue to use the KVM
> > standard 'struct kvm_mmu_memory_cache' for S-EPT pages.  Below is one
> > more idea for your reference.
> 
> The point of the new struct was to hand it to the arch/x86 side of the
> house. If we don't need to do that, then yes we could have options. And
> Dave suggested another struct that could be used to hand off the cache.
> 
> > 
> > In the previous discussion I think we concluded the 'kmem_cache'
> > doesn't work nicely with DPAMT (due to the ctor() cannot fail etc). 
> > And when we don't use 'kmem_cache', KVM just call __get_free_page()
> > to topup objects.
> > But we need tdx_alloc_page() for allocation here, so this is the
> > problem.
> > 
> > If we add two callbacks for object allocation/free to 'struct
> > kvm_mmu_memory_cache', then we can have place to hook
> > tdx_alloc_page().
> 
> kvm_mmu_memory_cache has a lot of options at this point. All we really
> need is a list. I'm not sure it makes sense to keep cramming things
> into it?

It comes down to whether we want to continue to reuse
'kvm_mmu_memory_cache' (which is already implemented in KVM), or we want
to use a different infrastructure for tracking S-EPT pages.

Anyway just my 2cents for your reference.
Re: [PATCH v3 12/16] x86/virt/tdx: Add helpers to allow for pre-allocating pages
Posted by Huang, Kai 2 days, 19 hours ago
On Sun, 2025-09-28 at 22:56 +0000, Huang, Kai wrote:
> On Fri, 2025-09-26 at 23:47 +0000, Edgecombe, Rick P wrote:
> > On Mon, 2025-09-22 at 11:20 +0000, Huang, Kai wrote:
> > > Since 'struct tdx_prealloc' replaces the KVM standard 'struct
> > > kvm_mmu_memory_cache' for external page table, and it is allowed to
> > > fail in "topup" operation, why not just call tdx_alloc_page() to
> > > "topup" page for external page table here?
> > 
> > I sympathize with the intuition. It would be nice to just prep
> > everything and then operate on it like normal.
> > 
> > We want this to not have to be totally redone for huge pages. In the
> > huge pages case, we could do this approach for the page tables, but for
> > the private page itself, we don't know whether we need 4KB PAMT backing
> > or not. So we don't fully know whether a TDX private page needs PAMT
> > 4KB backing or not before the fault.
> > 
> > So we would need, like, separate pools for page tables and private
> > pages. 
> > 
> 
> The private page itself comes from the guest_memfd via page cache, so we
> cannot use tdx_alloc_page() for it anyway but need to use tdx_pamt_get()
> explicitly.
> 
> I don't know all the details of exactly how this interacts with huge pages
> here -- my guess would be that we only call tdx_pamt_get() when we find that
> the private page is 4K rather than 2M [*], but I don't see how this conflicts
> with using tdx_alloc_page() for the page table itself.
> 
> The point is that the page table itself is always 4K, so it is no different
> from the control pages.
> 
> [*] this is actually an optimization but not a must for supporting
> hugepage with DPAMT AFAICT.  Theoretically, we can always allocate DPAMT
> pages upfront for hugepage at allocation time in guest_memfd regardless
> whether it is actually mapped as hugepage in S-EPT, and we can free DPAMT
> pages when we promote 4K pages to a hugepage in S-EPT to effectively
> reduce DPAMT pages for hugepage.

On second thought, please ignore the [*], since I am not sure whether
allocating DPAMT pages upfront for hugepages is a good idea.  I suppose
hugepages and 4K pages can be converted to each other at runtime, so
perhaps it's better to handle DPAMT pages when KVM actually maps the leaf
TDX private page.

So it seems inevitable that we need to manage a pool for the leaf TDX
private pages -- for hugepage support at least, since without hugepages we
may be able to avoid this pool (e.g., theoretically, we could do
tdx_pamt_get() after kvm_mmu_faultin_pfn(), where fault->pfn is ready).

Given you said "We want this to not have to be totally redone for huge
pages.", I can see why you want to use a single pool for both page tables
and the leaf TDX private pages.

But maybe another strategy is to use the simplest approach for the initial
DPAMT support without huge page support first, and leave this to the
hugepage support series.  I don't know.  Just my 2 cents.
Re: [PATCH v3 12/16] x86/virt/tdx: Add helpers to allow for pre-allocating pages
Posted by Kiryl Shutsemau 1 week, 5 days ago
On Thu, Sep 18, 2025 at 04:22:20PM -0700, Rick Edgecombe wrote:
> In the KVM fault path pagei, tables and private pages need to be

Typo.

> installed under a spin lock. This means that the operations around
> installing PAMT pages for them will not be able to allocate pages.
> 

> +static inline struct page *get_tdx_prealloc_page(struct tdx_prealloc *prealloc)
> +{
> +	struct page *page;
> +
> +	page = list_first_entry_or_null(&prealloc->page_list, struct page, lru);
> +	if (page) {
> +		list_del(&page->lru);
> +		prealloc->cnt--;
> +	}
> +

For prealloc->cnt == 0, kvm_mmu_memory_cache_alloc() does allocation
with GFP_ATOMIC after WARN().

Do we want the same here?


-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [PATCH v3 12/16] x86/virt/tdx: Add helpers to allow for pre-allocating pages
Posted by Edgecombe, Rick P 12 hours ago
On Fri, 2025-09-19 at 10:55 +0100, Kiryl Shutsemau wrote:
> > installed under a spin lock. This means that the operations around
> > installing PAMT pages for them will not be able to allocate pages.
> > 
> 
> > +static inline struct page *get_tdx_prealloc_page(struct tdx_prealloc
> > *prealloc)
> > +{
> > +	struct page *page;
> > +
> > +	page = list_first_entry_or_null(&prealloc->page_list, struct page,
> > lru);
> > +	if (page) {
> > +		list_del(&page->lru);
> > +		prealloc->cnt--;
> > +	}
> > +
> 
> For prealloc->cnt == 0, kvm_mmu_memory_cache_alloc() does allocation
> with GFP_ATOMIC after WARN().
> 
> Do we want the same here?

Uhh, hmm. I'd hope to keep it as simple as possible, but I suppose that this is
a regression from the current upstream level of safety. I'll add it.
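
Something like this, maybe (just a sketch, mirroring the GFP_ATOMIC
fallback in kvm_mmu_memory_cache_alloc()):

	page = list_first_entry_or_null(&prealloc->page_list, struct page, lru);
	if (page) {
		list_del(&page->lru);
		prealloc->cnt--;
	} else {
		/* Cache was under-sized; warn and fall back to an atomic allocation. */
		WARN_ON_ONCE(1);
		page = alloc_page(GFP_ATOMIC);
	}

	return page;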