[RFC, PATCH 02/12] x86/virt/tdx: Allocate reference counters for PAMT memory

Kirill A. Shutemov posted 12 patches 9 months, 1 week ago
There is a newer version of this series
[RFC, PATCH 02/12] x86/virt/tdx: Allocate reference counters for PAMT memory
Posted by Kirill A. Shutemov 9 months, 1 week ago
The PAMT memory holds metadata for TDX-protected memory. With Dynamic
PAMT, PAMT_4K is allocated on demand. The kernel supplies the TDX module
with a page pair that covers 2M of host physical memory.

The kernel must provide this page pair before using pages from the range
for TDX. If this is not done, any SEAMCALL that attempts to use the
memory will fail.

Allocate reference counters for every 2M range to track PAMT memory
usage. This is necessary to accurately determine when PAMT memory needs
to be allocated and when it can be freed.

This allocation will consume 2MiB for every 1TiB of physical memory.

Tracking PAMT memory usage on the kernel side duplicates what TDX module
does.  It is possible to avoid this by lazily allocating PAMT memory on
SEAMCALL failure and freeing it based on hints provided by the TDX
module when the last user of PAMT memory is no longer present.

However, this approach complicates serialization.

The TDX module takes locks when dealing with PAMT: a shared lock on any
SEAMCALL that uses explicit HPA and an exclusive lock on PAMT.ADD and
PAMT.REMOVE. Any SEAMCALL that uses explicit HPA as an operand may fail
if it races with PAMT.ADD/REMOVE.

Since PAMT is a global resource, to prevent failure the kernel would
need global locking (per-TD is not sufficient). Or, it has to retry on
TDX_OPERATOR_BUSY.

Both options are not ideal, and tracking PAMT usage on the kernel side
seems like a reasonable alternative.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 arch/x86/virt/vmx/tdx/tdx.c | 113 +++++++++++++++++++++++++++++++++++-
 1 file changed, 111 insertions(+), 2 deletions(-)

diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index c8bfd765e451..00e07a0c908a 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -29,6 +29,7 @@
 #include <linux/acpi.h>
 #include <linux/suspend.h>
 #include <linux/idr.h>
+#include <linux/vmalloc.h>
 #include <asm/page.h>
 #include <asm/special_insns.h>
 #include <asm/msr-index.h>
@@ -50,6 +51,8 @@ static DEFINE_PER_CPU(bool, tdx_lp_initialized);
 
 static struct tdmr_info_list tdx_tdmr_list;
 
+static atomic_t *pamt_refcounts;
+
 static enum tdx_module_status_t tdx_module_status;
 static DEFINE_MUTEX(tdx_module_lock);
 
@@ -1035,9 +1038,108 @@ static int config_global_keyid(void)
 	return ret;
 }
 
+atomic_t *tdx_get_pamt_refcount(unsigned long hpa)
+{
+	return &pamt_refcounts[hpa / PMD_SIZE];
+}
+EXPORT_SYMBOL_GPL(tdx_get_pamt_refcount);
+
+static int pamt_refcount_populate(pte_t *pte, unsigned long addr, void *data)
+{
+	unsigned long vaddr;
+	pte_t entry;
+
+	if (!pte_none(ptep_get(pte)))
+		return 0;
+
+	vaddr = __get_free_page(GFP_KERNEL | __GFP_ZERO);
+	if (!vaddr)
+		return -ENOMEM;
+
+	entry = pfn_pte(PFN_DOWN(__pa(vaddr)), PAGE_KERNEL);
+
+	spin_lock(&init_mm.page_table_lock);
+	if (pte_none(ptep_get(pte)))
+		set_pte_at(&init_mm, addr, pte, entry);
+	else
+		free_page(vaddr);
+	spin_unlock(&init_mm.page_table_lock);
+
+	return 0;
+}
+
+static int pamt_refcount_depopulate(pte_t *pte, unsigned long addr,
+				    void *data)
+{
+	unsigned long vaddr;
+
+	vaddr = (unsigned long)__va(PFN_PHYS(pte_pfn(ptep_get(pte))));
+
+	spin_lock(&init_mm.page_table_lock);
+	if (!pte_none(ptep_get(pte))) {
+		pte_clear(&init_mm, addr, pte);
+		free_page(vaddr);
+	}
+	spin_unlock(&init_mm.page_table_lock);
+
+	return 0;
+}
+
+static int alloc_tdmr_pamt_refcount(struct tdmr_info *tdmr)
+{
+	unsigned long start, end;
+
+	start = (unsigned long)tdx_get_pamt_refcount(tdmr->base);
+	end = (unsigned long)tdx_get_pamt_refcount(tdmr->base + tdmr->size);
+	start = round_down(start, PAGE_SIZE);
+	end = round_up(end, PAGE_SIZE);
+
+	return apply_to_page_range(&init_mm, start, end - start,
+				   pamt_refcount_populate, NULL);
+}
+
+static int init_pamt_metadata(void)
+{
+	size_t size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts);
+	struct vm_struct *area;
+
+	if (!tdx_supports_dynamic_pamt(&tdx_sysinfo))
+		return 0;
+
+	/*
+	 * Reserve vmalloc range for PAMT reference counters. It covers all
+	 * physical address space up to max_pfn. It is going to be populated
+	 * from init_tdmr() only for present memory that available for TDX use.
+	 */
+	area = get_vm_area(size, VM_IOREMAP);
+	if (!area)
+		return -ENOMEM;
+
+	pamt_refcounts = area->addr;
+	return 0;
+}
+
+static void free_pamt_metadata(void)
+{
+	size_t size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts);
+
+	size = round_up(size, PAGE_SIZE);
+	apply_to_existing_page_range(&init_mm,
+				     (unsigned long)pamt_refcounts,
+				     size, pamt_refcount_depopulate,
+				     NULL);
+	vfree(pamt_refcounts);
+	pamt_refcounts = NULL;
+}
+
 static int init_tdmr(struct tdmr_info *tdmr)
 {
 	u64 next;
+	int ret;
+
+	ret = alloc_tdmr_pamt_refcount(tdmr);
+	if (ret)
+		return ret;
 
 	/*
 	 * Initializing a TDMR can be time consuming.  To avoid long
@@ -1048,7 +1150,6 @@ static int init_tdmr(struct tdmr_info *tdmr)
 		struct tdx_module_args args = {
 			.rcx = tdmr->base,
 		};
-		int ret;
 
 		ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
 		if (ret)
@@ -1134,10 +1235,15 @@ static int init_tdx_module(void)
 	if (ret)
 		goto err_reset_pamts;
 
+	/* Reserve vmalloc range for PAMT reference counters */
+	ret = init_pamt_metadata();
+	if (ret)
+		goto err_reset_pamts;
+
 	/* Initialize TDMRs to complete the TDX module initialization */
 	ret = init_tdmrs(&tdx_tdmr_list);
 	if (ret)
-		goto err_reset_pamts;
+		goto err_free_pamt_metadata;
 
 	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
 
@@ -1149,6 +1255,9 @@ static int init_tdx_module(void)
 	put_online_mems();
 	return ret;
 
+err_free_pamt_metadata:
+	free_pamt_metadata();
+
 err_reset_pamts:
 	/*
 	 * Part of PAMTs may already have been initialized by the
-- 
2.47.2
Re: [RFC, PATCH 02/12] x86/virt/tdx: Allocate reference counters for PAMT memory
Posted by Chao Gao 9 months ago
>+static int init_pamt_metadata(void)
>+{
>+	size_t size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts);
>+	struct vm_struct *area;
>+
>+	if (!tdx_supports_dynamic_pamt(&tdx_sysinfo))
>+		return 0;
>+
>+	/*
>+	 * Reserve vmalloc range for PAMT reference counters. It covers all
>+	 * physical address space up to max_pfn. It is going to be populated
>+	 * from init_tdmr() only for present memory that available for TDX use.
>+	 */
>+	area = get_vm_area(size, VM_IOREMAP);
>+	if (!area)
>+		return -ENOMEM;
>+
>+	pamt_refcounts = area->addr;
>+	return 0;
>+}
>+
>+static void free_pamt_metadata(void)
>+{
>+	size_t size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts);
>+

Shouldn't the free path also be gated by tdx_supports_dynamic_pamt()?

There is a possibility that pamt_refcounts could be NULL here, e.g., the
TDX module doesn't support dynamic PAMT and init_tdmrs() encountered an
error.  I am assuming that apply_to_existing_page_range() below will cause
issues if pamt_refcounts is NULL, e.g., unmap mappings set up by others.

>+	size = round_up(size, PAGE_SIZE);
>+	apply_to_existing_page_range(&init_mm,
>+				     (unsigned long)pamt_refcounts,
>+				     size, pamt_refcount_depopulate,
>+				     NULL);
>+	vfree(pamt_refcounts);
>+	pamt_refcounts = NULL;
>+}
>+
> static int init_tdmr(struct tdmr_info *tdmr)
> {
> 	u64 next;
>+	int ret;
>+
>+	ret = alloc_tdmr_pamt_refcount(tdmr);
>+	if (ret)
>+		return ret;
> 
> 	/*
> 	 * Initializing a TDMR can be time consuming.  To avoid long
>@@ -1048,7 +1150,6 @@ static int init_tdmr(struct tdmr_info *tdmr)
> 		struct tdx_module_args args = {
> 			.rcx = tdmr->base,
> 		};
>-		int ret;
> 
> 		ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
> 		if (ret)
>@@ -1134,10 +1235,15 @@ static int init_tdx_module(void)
> 	if (ret)
> 		goto err_reset_pamts;
> 
>+	/* Reserve vmalloc range for PAMT reference counters */
>+	ret = init_pamt_metadata();
>+	if (ret)
>+		goto err_reset_pamts;
>+
> 	/* Initialize TDMRs to complete the TDX module initialization */
> 	ret = init_tdmrs(&tdx_tdmr_list);
> 	if (ret)
>-		goto err_reset_pamts;
>+		goto err_free_pamt_metadata;
> 
> 	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
> 
>@@ -1149,6 +1255,9 @@ static int init_tdx_module(void)
> 	put_online_mems();
> 	return ret;
> 
>+err_free_pamt_metadata:
>+	free_pamt_metadata();
>+
> err_reset_pamts:
> 	/*
> 	 * Part of PAMTs may already have been initialized by the
>-- 
>2.47.2
>
Re: [RFC, PATCH 02/12] x86/virt/tdx: Allocate reference counters for PAMT memory
Posted by Kirill A. Shutemov 9 months ago
On Fri, May 09, 2025 at 05:52:16PM +0800, Chao Gao wrote:
> >+static int init_pamt_metadata(void)
> >+{
> >+	size_t size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts);
> >+	struct vm_struct *area;
> >+
> >+	if (!tdx_supports_dynamic_pamt(&tdx_sysinfo))
> >+		return 0;
> >+
> >+	/*
> >+	 * Reserve vmalloc range for PAMT reference counters. It covers all
> >+	 * physical address space up to max_pfn. It is going to be populated
> >+	 * from init_tdmr() only for present memory that available for TDX use.
> >+	 */
> >+	area = get_vm_area(size, VM_IOREMAP);
> >+	if (!area)
> >+		return -ENOMEM;
> >+
> >+	pamt_refcounts = area->addr;
> >+	return 0;
> >+}
> >+
> >+static void free_pamt_metadata(void)
> >+{
> >+	size_t size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts);
> >+
> 
> Shouldn't the free path also be gated by tdx_supports_dynamic_pamt()?

True. Missed this.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [RFC, PATCH 02/12] x86/virt/tdx: Allocate reference counters for PAMT memory
Posted by Huang, Kai 9 months, 1 week ago
> +static atomic_t *pamt_refcounts;
> +
>  static enum tdx_module_status_t tdx_module_status;
>  static DEFINE_MUTEX(tdx_module_lock);
>  
> @@ -1035,9 +1038,108 @@ static int config_global_keyid(void)
>  	return ret;
>  }
>  
> +atomic_t *tdx_get_pamt_refcount(unsigned long hpa)
> +{
> +	return &pamt_refcounts[hpa / PMD_SIZE];
> +}
> +EXPORT_SYMBOL_GPL(tdx_get_pamt_refcount);

It's not quite clear why this function needs to be exported in this patch.  IMO
it's better to move the export to the patch which actually needs it.

Looking at patch 5, tdx_pamt_get()/put() use it, and they are in KVM code.  But
I think we should just put them here in this file.  tdx_alloc_page() and
tdx_free_page() should be in this file too.

And instead of exporting tdx_get_pamt_refcount(), the TDX core code here can
export tdx_alloc_page() and tdx_free_page(), providing two high level helpers to
allow the TDX users (e.g., KVM) to allocate/free TDX private pages.  How PAMT
pages are allocated is then hidden in the core TDX code.

> +
> +static int pamt_refcount_populate(pte_t *pte, unsigned long addr, void *data)
> +{
> +	unsigned long vaddr;
> +	pte_t entry;
> +
> +	if (!pte_none(ptep_get(pte)))
> +		return 0;
> +
> +	vaddr = __get_free_page(GFP_KERNEL | __GFP_ZERO);
> +	if (!vaddr)
> +		return -ENOMEM;
> +
> +	entry = pfn_pte(PFN_DOWN(__pa(vaddr)), PAGE_KERNEL);
> +
> +	spin_lock(&init_mm.page_table_lock);
> +	if (pte_none(ptep_get(pte)))
> +		set_pte_at(&init_mm, addr, pte, entry);
> +	else
> +		free_page(vaddr);
> +	spin_unlock(&init_mm.page_table_lock);
> +
> +	return 0;
> +}
> +
> +static int pamt_refcount_depopulate(pte_t *pte, unsigned long addr,
> +				    void *data)
> +{
> +	unsigned long vaddr;
> +
> +	vaddr = (unsigned long)__va(PFN_PHYS(pte_pfn(ptep_get(pte))));
> +
> +	spin_lock(&init_mm.page_table_lock);
> +	if (!pte_none(ptep_get(pte))) {
> +		pte_clear(&init_mm, addr, pte);
> +		free_page(vaddr);
> +	}
> +	spin_unlock(&init_mm.page_table_lock);
> +
> +	return 0;
> +}
> +
> +static int alloc_tdmr_pamt_refcount(struct tdmr_info *tdmr)
> +{
> +	unsigned long start, end;
> +
> +	start = (unsigned long)tdx_get_pamt_refcount(tdmr->base);
> +	end = (unsigned long)tdx_get_pamt_refcount(tdmr->base + tdmr->size);
> +	start = round_down(start, PAGE_SIZE);
> +	end = round_up(end, PAGE_SIZE);
> +
> +	return apply_to_page_range(&init_mm, start, end - start,
> +				   pamt_refcount_populate, NULL);
> +}

IIUC, populating refcount based on TDMR will slightly waste memory.  The reason
is IIUC we don't need to populate the refcount for a 2M range if the range is
completely marked as reserved in TDMR, because it's not possible for the kernel
to use such range for TDX.

Populating based on the list of TDX memory blocks should be better.  In
practice, the difference should be unnoticeable, but conceptually, using TDX
memory blocks is better.

> +
> +static int init_pamt_metadata(void)
> +{
> +	size_t size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts);
> +	struct vm_struct *area;
> +
> +	if (!tdx_supports_dynamic_pamt(&tdx_sysinfo))
> +		return 0;
> +
> +	/*
> +	 * Reserve vmalloc range for PAMT reference counters. It covers all
> +	 * physical address space up to max_pfn. It is going to be populated
> +	 * from init_tdmr() only for present memory that available for TDX use.
> +	 */
> +	area = get_vm_area(size, VM_IOREMAP);
> +	if (!area)
> +		return -ENOMEM;
> +
> +	pamt_refcounts = area->addr;
> +	return 0;
> +}
> +
> +static void free_pamt_metadata(void)
> +{
> +	size_t size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts);
> +
> +	size = round_up(size, PAGE_SIZE);
> +	apply_to_existing_page_range(&init_mm,
> +				     (unsigned long)pamt_refcounts,
> +				     size, pamt_refcount_depopulate,
> +				     NULL);
> +	vfree(pamt_refcounts);
> +	pamt_refcounts = NULL;
> +}
> +
>  static int init_tdmr(struct tdmr_info *tdmr)
>  {
>  	u64 next;
> +	int ret;
> +
> +	ret = alloc_tdmr_pamt_refcount(tdmr);
> +	if (ret)
> +		return ret;
>  
>  	/*
>  	 * Initializing a TDMR can be time consuming.  To avoid long
> @@ -1048,7 +1150,6 @@ static int init_tdmr(struct tdmr_info *tdmr)
>  		struct tdx_module_args args = {
>  			.rcx = tdmr->base,
>  		};
> -		int ret;
>  
>  		ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
>  		if (ret)
> @@ -1134,10 +1235,15 @@ static int init_tdx_module(void)
>  	if (ret)
>  		goto err_reset_pamts;
>  
> +	/* Reserve vmalloc range for PAMT reference counters */
> +	ret = init_pamt_metadata();
> +	if (ret)
> +		goto err_reset_pamts;
> +
>  	/* Initialize TDMRs to complete the TDX module initialization */
>  	ret = init_tdmrs(&tdx_tdmr_list);
>  	if (ret)
> -		goto err_reset_pamts;
> +		goto err_free_pamt_metadata;
>  
>  	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
>  
> @@ -1149,6 +1255,9 @@ static int init_tdx_module(void)
>  	put_online_mems();
>  	return ret;
>  
> +err_free_pamt_metadata:
> +	free_pamt_metadata();
> +
>  err_reset_pamts:
>  	/*
>  	 * Part of PAMTs may already have been initialized by the

Re: [RFC, PATCH 02/12] x86/virt/tdx: Allocate reference counters for PAMT memory
Posted by kirill.shutemov@linux.intel.com 9 months ago
On Mon, May 05, 2025 at 11:05:12AM +0000, Huang, Kai wrote:
> 
> > +static atomic_t *pamt_refcounts;
> > +
> >  static enum tdx_module_status_t tdx_module_status;
> >  static DEFINE_MUTEX(tdx_module_lock);
> >  
> > @@ -1035,9 +1038,108 @@ static int config_global_keyid(void)
> >  	return ret;
> >  }
> >  
> > +atomic_t *tdx_get_pamt_refcount(unsigned long hpa)
> > +{
> > +	return &pamt_refcounts[hpa / PMD_SIZE];
> > +}
> > +EXPORT_SYMBOL_GPL(tdx_get_pamt_refcount);
> 
> It's not quite clear why this function needs to be exported in this patch.  IMO
> it's better to move the export to the patch which actually needs it.
> 
> Looking at patch 5, tdx_pamt_get()/put() use it, and they are in KVM code.  But
> I think we should just put them here in this file.  tdx_alloc_page() and
> tdx_free_page() should be in this file too.
> 
> And instead of exporting tdx_get_pamt_refcount(), the TDX core code here can
> export tdx_alloc_page() and tdx_free_page(), providing two high level helpers to
> allow the TDX users (e.g., KVM) to allocate/free TDX private pages.  How PAMT
> pages are allocated is then hidden in the core TDX code.

We would still need tdx_get_pamt_refcount() to handle case when we need to
bump refcount for page allocated elsewhere.

> > +
> > +static int pamt_refcount_populate(pte_t *pte, unsigned long addr, void *data)
> > +{
> > +	unsigned long vaddr;
> > +	pte_t entry;
> > +
> > +	if (!pte_none(ptep_get(pte)))
> > +		return 0;
> > +
> > +	vaddr = __get_free_page(GFP_KERNEL | __GFP_ZERO);
> > +	if (!vaddr)
> > +		return -ENOMEM;
> > +
> > +	entry = pfn_pte(PFN_DOWN(__pa(vaddr)), PAGE_KERNEL);
> > +
> > +	spin_lock(&init_mm.page_table_lock);
> > +	if (pte_none(ptep_get(pte)))
> > +		set_pte_at(&init_mm, addr, pte, entry);
> > +	else
> > +		free_page(vaddr);
> > +	spin_unlock(&init_mm.page_table_lock);
> > +
> > +	return 0;
> > +}
> > +
> > +static int pamt_refcount_depopulate(pte_t *pte, unsigned long addr,
> > +				    void *data)
> > +{
> > +	unsigned long vaddr;
> > +
> > +	vaddr = (unsigned long)__va(PFN_PHYS(pte_pfn(ptep_get(pte))));
> > +
> > +	spin_lock(&init_mm.page_table_lock);
> > +	if (!pte_none(ptep_get(pte))) {
> > +		pte_clear(&init_mm, addr, pte);
> > +		free_page(vaddr);
> > +	}
> > +	spin_unlock(&init_mm.page_table_lock);
> > +
> > +	return 0;
> > +}
> > +
> > +static int alloc_tdmr_pamt_refcount(struct tdmr_info *tdmr)
> > +{
> > +	unsigned long start, end;
> > +
> > +	start = (unsigned long)tdx_get_pamt_refcount(tdmr->base);
> > +	end = (unsigned long)tdx_get_pamt_refcount(tdmr->base + tdmr->size);
> > +	start = round_down(start, PAGE_SIZE);
> > +	end = round_up(end, PAGE_SIZE);
> > +
> > +	return apply_to_page_range(&init_mm, start, end - start,
> > +				   pamt_refcount_populate, NULL);
> > +}
> 
> IIUC, populating refcount based on TDMR will slightly waste memory.  The reason
> is IIUC we don't need to populate the refcount for a 2M range if the range is
> completely marked as reserved in TDMR, because it's not possible for the kernel
> to use such range for TDX.
> 
> Populating based on the list of TDX memory blocks should be better.  In
> practice, the difference should be unnoticeable, but conceptually, using TDX
> memory blocks is better.

Okay, I will look into this after dealing with huge pages.

> > +
> > +static int init_pamt_metadata(void)
> > +{
> > +	size_t size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts);
> > +	struct vm_struct *area;
> > +
> > +	if (!tdx_supports_dynamic_pamt(&tdx_sysinfo))
> > +		return 0;
> > +
> > +	/*
> > +	 * Reserve vmalloc range for PAMT reference counters. It covers all
> > +	 * physical address space up to max_pfn. It is going to be populated
> > +	 * from init_tdmr() only for present memory that available for TDX use.
> > +	 */
> > +	area = get_vm_area(size, VM_IOREMAP);
> > +	if (!area)
> > +		return -ENOMEM;
> > +
> > +	pamt_refcounts = area->addr;
> > +	return 0;
> > +}
> > +
> > +static void free_pamt_metadata(void)
> > +{
> > +	size_t size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts);
> > +
> > +	size = round_up(size, PAGE_SIZE);
> > +	apply_to_existing_page_range(&init_mm,
> > +				     (unsigned long)pamt_refcounts,
> > +				     size, pamt_refcount_depopulate,
> > +				     NULL);
> > +	vfree(pamt_refcounts);
> > +	pamt_refcounts = NULL;
> > +}
> > +
> >  static int init_tdmr(struct tdmr_info *tdmr)
> >  {
> >  	u64 next;
> > +	int ret;
> > +
> > +	ret = alloc_tdmr_pamt_refcount(tdmr);
> > +	if (ret)
> > +		return ret;
> >  
> >  	/*
> >  	 * Initializing a TDMR can be time consuming.  To avoid long
> > @@ -1048,7 +1150,6 @@ static int init_tdmr(struct tdmr_info *tdmr)
> >  		struct tdx_module_args args = {
> >  			.rcx = tdmr->base,
> >  		};
> > -		int ret;
> >  
> >  		ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
> >  		if (ret)
> > @@ -1134,10 +1235,15 @@ static int init_tdx_module(void)
> >  	if (ret)
> >  		goto err_reset_pamts;
> >  
> > +	/* Reserve vmalloc range for PAMT reference counters */
> > +	ret = init_pamt_metadata();
> > +	if (ret)
> > +		goto err_reset_pamts;
> > +
> >  	/* Initialize TDMRs to complete the TDX module initialization */
> >  	ret = init_tdmrs(&tdx_tdmr_list);
> >  	if (ret)
> > -		goto err_reset_pamts;
> > +		goto err_free_pamt_metadata;
> >  
> >  	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
> >  
> > @@ -1149,6 +1255,9 @@ static int init_tdx_module(void)
> >  	put_online_mems();
> >  	return ret;
> >  
> > +err_free_pamt_metadata:
> > +	free_pamt_metadata();
> > +
> >  err_reset_pamts:
> >  	/*
> >  	 * Part of PAMTs may already have been initialized by the
> 

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [RFC, PATCH 02/12] x86/virt/tdx: Allocate reference counters for PAMT memory
Posted by Huang, Kai 9 months ago
On Thu, 2025-05-08 at 16:03 +0300, kirill.shutemov@linux.intel.com wrote:
> On Mon, May 05, 2025 at 11:05:12AM +0000, Huang, Kai wrote:
> > 
> > > +static atomic_t *pamt_refcounts;
> > > +
> > >   static enum tdx_module_status_t tdx_module_status;
> > >   static DEFINE_MUTEX(tdx_module_lock);
> > >   
> > > @@ -1035,9 +1038,108 @@ static int config_global_keyid(void)
> > >   	return ret;
> > >   }
> > >   
> > > +atomic_t *tdx_get_pamt_refcount(unsigned long hpa)
> > > +{
> > > +	return &pamt_refcounts[hpa / PMD_SIZE];
> > > +}
> > > +EXPORT_SYMBOL_GPL(tdx_get_pamt_refcount);
> > 
> > It's not quite clear why this function needs to be exported in this patch.  IMO
> > it's better to move the export to the patch which actually needs it.
> > 
> > Looking at patch 5, tdx_pamt_get()/put() use it, and they are in KVM code.  But
> > I think we should just put them here in this file.  tdx_alloc_page() and
> > tdx_free_page() should be in this file too.
> > 
> > And instead of exporting tdx_get_pamt_refcount(), the TDX core code here can
> > export tdx_alloc_page() and tdx_free_page(), providing two high level helpers to
> > allow the TDX users (e.g., KVM) to allocate/free TDX private pages.  How PAMT
> > pages are allocated is then hidden in the core TDX code.
> 
> We would still need tdx_get_pamt_refcount() to handle case when we need to
> bump refcount for page allocated elsewhere.

Hmm I am not sure I am following this.  What "page allocated" are you referring
to?  I am probably missing something, but if the caller wants a TDX page then it
should just call tdx_alloc_page() which handles refcount bumping internally. 
No?
Re: [RFC, PATCH 02/12] x86/virt/tdx: Allocate reference counters for PAMT memory
Posted by kirill.shutemov@linux.intel.com 9 months ago
On Fri, May 09, 2025 at 01:06:05AM +0000, Huang, Kai wrote:
> On Thu, 2025-05-08 at 16:03 +0300, kirill.shutemov@linux.intel.com wrote:
> > On Mon, May 05, 2025 at 11:05:12AM +0000, Huang, Kai wrote:
> > > 
> > > > +static atomic_t *pamt_refcounts;
> > > > +
> > > >   static enum tdx_module_status_t tdx_module_status;
> > > >   static DEFINE_MUTEX(tdx_module_lock);
> > > >   
> > > > @@ -1035,9 +1038,108 @@ static int config_global_keyid(void)
> > > >   	return ret;
> > > >   }
> > > >   
> > > > +atomic_t *tdx_get_pamt_refcount(unsigned long hpa)
> > > > +{
> > > > +	return &pamt_refcounts[hpa / PMD_SIZE];
> > > > +}
> > > > +EXPORT_SYMBOL_GPL(tdx_get_pamt_refcount);
> > > 
> > > It's not quite clear why this function needs to be exported in this patch.  IMO
> > > it's better to move the export to the patch which actually needs it.
> > > 
> > > Looking at patch 5, tdx_pamt_get()/put() use it, and they are in KVM code.  But
> > > I think we should just put them here in this file.  tdx_alloc_page() and
> > > tdx_free_page() should be in this file too.
> > > 
> > > And instead of exporting tdx_get_pamt_refcount(), the TDX core code here can
> > > export tdx_alloc_page() and tdx_free_page(), providing two high level helpers to
> > > allow the TDX users (e.g., KVM) to allocate/free TDX private pages.  How PAMT
> > > pages are allocated is then hidden in the core TDX code.
> > 
> > We would still need tdx_get_pamt_refcount() to handle case when we need to
> > bump refcount for page allocated elsewhere.
> 
> Hmm I am not sure I am following this.  What "page allocated" are you referring
> to?  I am probably missing something, but if the caller wants a TDX page then it
> should just call tdx_alloc_page() which handles refcount bumping internally. 
> No?

Pages that get mapped to the guest is allocated externally via
guest_memfd and we need bump refcount for them.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [RFC, PATCH 02/12] x86/virt/tdx: Allocate reference counters for PAMT memory
Posted by Huang, Kai 9 months ago
On Mon, 2025-05-12 at 12:53 +0300, kirill.shutemov@linux.intel.com wrote:
> On Fri, May 09, 2025 at 01:06:05AM +0000, Huang, Kai wrote:
> > On Thu, 2025-05-08 at 16:03 +0300, kirill.shutemov@linux.intel.com wrote:
> > > On Mon, May 05, 2025 at 11:05:12AM +0000, Huang, Kai wrote:
> > > > 
> > > > > +static atomic_t *pamt_refcounts;
> > > > > +
> > > > >   static enum tdx_module_status_t tdx_module_status;
> > > > >   static DEFINE_MUTEX(tdx_module_lock);
> > > > >   
> > > > > @@ -1035,9 +1038,108 @@ static int config_global_keyid(void)
> > > > >   	return ret;
> > > > >   }
> > > > >   
> > > > > +atomic_t *tdx_get_pamt_refcount(unsigned long hpa)
> > > > > +{
> > > > > +	return &pamt_refcounts[hpa / PMD_SIZE];
> > > > > +}
> > > > > +EXPORT_SYMBOL_GPL(tdx_get_pamt_refcount);
> > > > 
> > > > It's not quite clear why this function needs to be exported in this patch.  IMO
> > > > it's better to move the export to the patch which actually needs it.
> > > > 
> > > > Looking at patch 5, tdx_pamt_get()/put() use it, and they are in KVM code.  But
> > > > I think we should just put them here in this file.  tdx_alloc_page() and
> > > > tdx_free_page() should be in this file too.
> > > > 
> > > > And instead of exporting tdx_get_pamt_refcount(), the TDX core code here can
> > > > export tdx_alloc_page() and tdx_free_page(), providing two high level helpers to
> > > > allow the TDX users (e.g., KVM) to allocate/free TDX private pages.  How PAMT
> > > > pages are allocated is then hidden in the core TDX code.
> > > 
> > > We would still need tdx_get_pamt_refcount() to handle case when we need to
> > > bump refcount for page allocated elsewhere.
> > 
> > Hmm I am not sure I am following this.  What "page allocated" are you referring
> > to?  I am probably missing something, but if the caller wants a TDX page then it
> > should just call tdx_alloc_page() which handles refcount bumping internally. 
> > No?
> 
> Pages that get mapped to the guest is allocated externally via
> guest_memfd and we need bump refcount for them.

Oh right.  TDX private pages can also be in page cache.

It's better to have a way to consolidate page allocation for TDX but with page
cache I don't see a simple straightforward way to do that.

For now, I think we can just export tdx_pamt_{get|put}() in the core TDX code.
We can also provide tdx_{alloc|free}_page() wrappers (e.g., static inline in
<asm/tdx.h>) for kernel TDX memory allocation so that they can be used for TDX
Connect too.