[PATCH v3 22/24] x86/tdx: Add/Remove DPAMT pages for guest private memory to demote

Yan Zhao posted 24 patches 1 month ago
[PATCH v3 22/24] x86/tdx: Add/Remove DPAMT pages for guest private memory to demote
Posted by Yan Zhao 1 month ago
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>

When Dynamic PAMT is enabled and splitting a 2MB mapping to 512 4KB
mappings, SEAMCALL TDH.MEM.PAGE.DEMOTE takes the Dynamic PAMT page pair in
registers R12 and R13. The Dynamic PAMT page pair is used to store physical
memory metadata for the 2MB guest private memory after its S-EPT mapping is
split to 4KB successfully.

Pass prealloc_split_cache (the per-VM split cache) to SEAMCALL wrapper
tdh_mem_page_demote() for dequeuing Dynamic PAMT pages from the cache.
Protect the cache dequeuing in KVM with prealloc_split_cache_lock.

Inside wrapper tdh_mem_page_demote(), dequeue the Dynamic PAMT pages into
the guest_memory_pamt_page array and copy the page address to R12 and R13.

Invoke SEAMCALL TDH_MEM_PAGE_DEMOTE using seamcall_saved_ret() to handle
registers above R11.

Free the Dynamic PAMT pages if SEAMCALL TDH_MEM_PAGE_DEMOTE fails, since in
that case the guest private memory is still mapped at the 2MB level.

Opportunistically, rename dpamt_args_array_ptr() to
dpamt_args_array_ptr_rdx() for tdh_phymem_pamt_{add/remove} and invoke
dpamt_args_array_ptr_r12() in tdh_mem_page_demote() for populating
registers starting from R12.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Co-developed-by: Yan Zhao <yan.y.zhao@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
v3:
- Split out as a new patch.
- Get pages from the preallocated cache, corresponding to DPAMT v4.
---
 arch/x86/include/asm/tdx.h  |  1 +
 arch/x86/kvm/vmx/tdx.c      |  5 ++-
 arch/x86/virt/vmx/tdx/tdx.c | 76 ++++++++++++++++++++++++++-----------
 3 files changed, 59 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index abe484045132..5fc7498392fd 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -251,6 +251,7 @@ u64 tdh_mng_create(struct tdx_td *td, u16 hkid);
 u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp);
 u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data);
 u64 tdh_mem_page_demote(struct tdx_td *td, u64 gpa, int level, struct page *new_sept_page,
+			struct tdx_prealloc *prealloc,
 			u64 *ext_err1, u64 *ext_err2);
 u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2);
 u64 tdh_mr_finalize(struct tdx_td *td);
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index ec47bd799274..a11ff02a4f30 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -2021,8 +2021,11 @@ static int tdx_sept_split_private_spte(struct kvm *kvm, gfn_t gfn, enum pg_level
 	if (KVM_BUG_ON(ret, kvm))
 		return -EIO;
 
+	spin_lock(&kvm_tdx->prealloc_split_cache_lock);
 	err = tdh_do_no_vcpus(tdh_mem_page_demote, kvm, &kvm_tdx->td, gpa,
-			      tdx_level, new_sept_page, &entry, &level_state);
+			      tdx_level, new_sept_page,
+			      &kvm_tdx->prealloc_split_cache, &entry, &level_state);
+	spin_unlock(&kvm_tdx->prealloc_split_cache_lock);
 	if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_DEMOTE, entry, level_state, kvm)) {
 		tdx_pamt_put(new_sept_page);
 		return -EIO;
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 76963c563906..9917e4e7705f 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -1848,25 +1848,69 @@ u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
 }
 EXPORT_SYMBOL_GPL(tdh_mng_rd);
 
+static int alloc_pamt_array(u64 *pa_array, struct tdx_prealloc *prealloc);
+static void free_pamt_array(u64 *pa_array);
+/*
+ * The TDX spec treats the registers like an array, as they are ordered
+ * in the struct. The array size is limited by the number of registers,
+ * so define the max size it could be for worst case allocations and sanity
+ * checking.
+ */
+#define MAX_TDX_ARG_SIZE(reg) ((sizeof(struct tdx_module_args) - \
+			       offsetof(struct tdx_module_args, reg)) / sizeof(u64))
+#define TDX_ARG_INDEX(reg) (offsetof(struct tdx_module_args, reg) / \
+			    sizeof(u64))
+/*
+ * Treat the struct registers like an array that starts at R12, per the
+ * TDX spec. Do some sanity checks, and return an indexable type.
+ */
+static u64 *dpamt_args_array_ptr_r12(struct tdx_module_array_args *args)
+{
+	WARN_ON_ONCE(tdx_dpamt_entry_pages() > MAX_TDX_ARG_SIZE(r12));
+
+	return &args->args_array[TDX_ARG_INDEX(r12)];
+}
+
 u64 tdh_mem_page_demote(struct tdx_td *td, u64 gpa, int level, struct page *new_sept_page,
+			struct tdx_prealloc *prealloc,
 			u64 *ext_err1, u64 *ext_err2)
 {
-	struct tdx_module_args args = {
-		.rcx = gpa | level,
-		.rdx = tdx_tdr_pa(td),
-		.r8 = page_to_phys(new_sept_page),
+	bool dpamt = tdx_supports_dynamic_pamt(&tdx_sysinfo) && level == TDX_PS_2M;
+	u64 guest_memory_pamt_page[MAX_TDX_ARG_SIZE(r12)];
+	struct tdx_module_array_args args = {
+		.args.rcx = gpa | level,
+		.args.rdx = tdx_tdr_pa(td),
+		.args.r8 = page_to_phys(new_sept_page),
 	};
 	u64 ret;
 
 	if (!tdx_supports_demote_nointerrupt(&tdx_sysinfo))
 		return TDX_SW_ERROR;
 
+	if (dpamt) {
+		u64 *args_array = dpamt_args_array_ptr_r12(&args);
+
+		if (alloc_pamt_array(guest_memory_pamt_page, prealloc))
+			return TDX_SW_ERROR;
+
+		/*
+		 * Copy PAMT page PAs of the guest memory into the struct per the
+		 * TDX ABI
+		 */
+		memcpy(args_array, guest_memory_pamt_page,
+		       tdx_dpamt_entry_pages() * sizeof(*args_array));
+	}
+
 	/* Flush the new S-EPT page to be added */
 	tdx_clflush_page(new_sept_page);
-	ret = seamcall_ret(TDH_MEM_PAGE_DEMOTE, &args);
 
-	*ext_err1 = args.rcx;
-	*ext_err2 = args.rdx;
+	ret = seamcall_saved_ret(TDH_MEM_PAGE_DEMOTE, &args.args);
+
+	*ext_err1 = args.args.rcx;
+	*ext_err2 = args.args.rdx;
+
+	if (dpamt && ret)
+		free_pamt_array(guest_memory_pamt_page);
 
 	return ret;
 }
@@ -2104,23 +2148,11 @@ static struct page *alloc_dpamt_page(struct tdx_prealloc *prealloc)
 	return alloc_page(GFP_KERNEL_ACCOUNT);
 }
 
-
-/*
- * The TDX spec treats the registers like an array, as they are ordered
- * in the struct. The array size is limited by the number or registers,
- * so define the max size it could be for worst case allocations and sanity
- * checking.
- */
-#define MAX_TDX_ARG_SIZE(reg) (sizeof(struct tdx_module_args) - \
-			       offsetof(struct tdx_module_args, reg))
-#define TDX_ARG_INDEX(reg) (offsetof(struct tdx_module_args, reg) / \
-			    sizeof(u64))
-
 /*
  * Treat struct the registers like an array that starts at RDX, per
  * TDX spec. Do some sanitychecks, and return an indexable type.
  */
-static u64 *dpamt_args_array_ptr(struct tdx_module_array_args *args)
+static u64 *dpamt_args_array_ptr_rdx(struct tdx_module_array_args *args)
 {
 	WARN_ON_ONCE(tdx_dpamt_entry_pages() > MAX_TDX_ARG_SIZE(rdx));
 
@@ -2188,7 +2220,7 @@ static u64 tdh_phymem_pamt_add(struct page *page, u64 *pamt_pa_array)
 	struct tdx_module_array_args args = {
 		.args.rcx = pamt_2mb_arg(page)
 	};
-	u64 *dpamt_arg_array = dpamt_args_array_ptr(&args);
+	u64 *dpamt_arg_array = dpamt_args_array_ptr_rdx(&args);
 
 	/* Copy PAMT page PA's into the struct per the TDX ABI */
 	memcpy(dpamt_arg_array, pamt_pa_array,
@@ -2216,7 +2248,7 @@ static u64 tdh_phymem_pamt_remove(struct page *page, u64 *pamt_pa_array)
 	struct tdx_module_array_args args = {
 		.args.rcx = pamt_2mb_arg(page),
 	};
-	u64 *args_array = dpamt_args_array_ptr(&args);
+	u64 *args_array = dpamt_args_array_ptr_rdx(&args);
 	u64 ret;
 
 	ret = seamcall_ret(TDH_PHYMEM_PAMT_REMOVE, &args.args);
-- 
2.43.2
Re: [PATCH v3 22/24] x86/tdx: Add/Remove DPAMT pages for guest private memory to demote
Posted by Huang, Kai 2 weeks, 6 days ago
On Tue, 2026-01-06 at 18:24 +0800, Yan Zhao wrote:
>  u64 tdh_mem_page_demote(struct tdx_td *td, u64 gpa, int level, struct page *new_sept_page,
> +			struct tdx_prealloc *prealloc,
>  			u64 *ext_err1, u64 *ext_err2)
>  {
> -	struct tdx_module_args args = {
> -		.rcx = gpa | level,
> -		.rdx = tdx_tdr_pa(td),
> -		.r8 = page_to_phys(new_sept_page),
> +	bool dpamt = tdx_supports_dynamic_pamt(&tdx_sysinfo) && level == TDX_PS_2M;

The spec of TDH.MEM.PAGE.DEMOTE says:

  If the TDX Module is configured to use Dynamic PAMT and the large page
  level is 1 (2MB), R12 contains the host physical address of a new 
  PAMT page (HKID bits must be 0).

It says "... is configured to use Dynamic PAMT ...", but not ".. Dynamic
PAMT is supported ..".

tdx_supports_dynamic_pamt() only reports whether the module "supports"
DPAMT.  Although in the DPAMT series the kernel always enables DPAMT when
it is supported, I think it's better to have a comment point out this fact
so we don't need to go to that series to figure out.

> +	u64 guest_memory_pamt_page[MAX_TDX_ARG_SIZE(r12)];
> +	struct tdx_module_array_args args = {
> +		.args.rcx = gpa | level,
> +		.args.rdx = tdx_tdr_pa(td),
> +		.args.r8 = page_to_phys(new_sept_page),
>  	};
>  	u64 ret;
>  
>  	if (!tdx_supports_demote_nointerrupt(&tdx_sysinfo))
>  		return TDX_SW_ERROR;
>  
> +	if (dpamt) {
> +		u64 *args_array = dpamt_args_array_ptr_r12(&args);
> +
> +		if (alloc_pamt_array(guest_memory_pamt_page, prealloc))
> +			return TDX_SW_ERROR;
> +
> +		/*
> +		 * Copy PAMT page PAs of the guest memory into the struct per the
> +		 * TDX ABI
> +		 */
> +		memcpy(args_array, guest_memory_pamt_page,
> +		       tdx_dpamt_entry_pages() * sizeof(*args_array));
> +	}
Re: [PATCH v3 22/24] x86/tdx: Add/Remove DPAMT pages for guest private memory to demote
Posted by Yan Zhao 2 weeks, 6 days ago
On Mon, Jan 19, 2026 at 06:52:46PM +0800, Huang, Kai wrote:
> On Tue, 2026-01-06 at 18:24 +0800, Yan Zhao wrote:
> >  u64 tdh_mem_page_demote(struct tdx_td *td, u64 gpa, int level, struct page *new_sept_page,
> > +			struct tdx_prealloc *prealloc,
> >  			u64 *ext_err1, u64 *ext_err2)
> >  {
> > -	struct tdx_module_args args = {
> > -		.rcx = gpa | level,
> > -		.rdx = tdx_tdr_pa(td),
> > -		.r8 = page_to_phys(new_sept_page),
> > +	bool dpamt = tdx_supports_dynamic_pamt(&tdx_sysinfo) && level == TDX_PS_2M;
> 
> The spec of TDH.MEM.PAGE.DEMOTE says:
> 
>   If the TDX Module is configured to use Dynamic PAMT and the large page
>   level is 1 (2MB), R12 contains the host physical address of a new 
>   PAMT page (HKID bits must be 0).
> 
> It says "... is configured to use Dynamic PAMT ...", but not ".. Dynamic
> PAMT is supported ..".
Good catch.

> tdx_supports_dynamic_pamt() only reports whether the module "supports"
> DPAMT.  Although in the DPAMT series the kernel always enables DPAMT when
> it is supported, I think it's better to have a comment point out this fact
> so we don't need to go to that series to figure out.
Will add the comment. Thanks!

> > +	u64 guest_memory_pamt_page[MAX_TDX_ARG_SIZE(r12)];
> > +	struct tdx_module_array_args args = {
> > +		.args.rcx = gpa | level,
> > +		.args.rdx = tdx_tdr_pa(td),
> > +		.args.r8 = page_to_phys(new_sept_page),
> >  	};
> >  	u64 ret;
> >  
> >  	if (!tdx_supports_demote_nointerrupt(&tdx_sysinfo))
> >  		return TDX_SW_ERROR;
> >  
> > +	if (dpamt) {
> > +		u64 *args_array = dpamt_args_array_ptr_r12(&args);
> > +
> > +		if (alloc_pamt_array(guest_memory_pamt_page, prealloc))
> > +			return TDX_SW_ERROR;
> > +
> > +		/*
> > +		 * Copy PAMT page PAs of the guest memory into the struct per the
> > +		 * TDX ABI
> > +		 */
> > +		memcpy(args_array, guest_memory_pamt_page,
> > +		       tdx_dpamt_entry_pages() * sizeof(*args_array));
> > +	}