[PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory

Yan Zhao posted 2 patches 2 weeks, 4 days ago
[PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Yan Zhao 2 weeks, 4 days ago
From: Sean Christopherson <seanjc@google.com>

Remove the completely unnecessary assumption that memory mapped into a TDX
guest is backed by refcounted struct page memory. From KVM's point of view,
TDH_MEM_PAGE_ADD and TDH_MEM_PAGE_AUG are glorified writes to PTEs, so they
have no business placing requirements on how KVM and guest_memfd manage
memory.

Rip out the misguided struct page assumptions/constraints and instead have
the two SEAMCALL wrapper APIs take PFN directly. This ensures that for
future huge page support in S-EPT, the kernel doesn't pick up even worse
assumptions like "a hugepage must be contained in a single folio".

Use "kvm_pfn_t pfn" for type safety. Using this KVM type is appropriate
since APIs tdh_mem_page_add() and tdh_mem_page_aug() are exported to KVM
only.

[ Yan: Replace "u64 pfn" with "kvm_pfn_t pfn" ]

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
 arch/x86/include/asm/tdx.h  |  5 +++--
 arch/x86/kvm/vmx/tdx.c      |  7 +++----
 arch/x86/virt/vmx/tdx/tdx.c | 20 +++++++++++++-------
 3 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index a149740b24e8..f3f0b1872176 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -6,6 +6,7 @@
 #include <linux/init.h>
 #include <linux/bits.h>
 #include <linux/mmzone.h>
+#include <linux/kvm_types.h>
 
 #include <asm/errno.h>
 #include <asm/ptrace.h>
@@ -195,10 +196,10 @@ static inline int pg_level_to_tdx_sept_level(enum pg_level level)
 
 u64 tdh_vp_enter(struct tdx_vp *vp, struct tdx_module_args *args);
 u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page);
-u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, kvm_pfn_t pfn, struct page *source, u64 *ext_err1, u64 *ext_err2);
 u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2);
 u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page);
-u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, kvm_pfn_t pfn, u64 *ext_err1, u64 *ext_err2);
 u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2);
 u64 tdh_mng_key_config(struct tdx_td *td);
 u64 tdh_mng_create(struct tdx_td *td, u16 hkid);
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 1e47c194af53..1f1abc5b5655 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1624,8 +1624,8 @@ static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level,
 	    KVM_BUG_ON(!kvm_tdx->page_add_src, kvm))
 		return -EIO;
 
-	err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
-			       kvm_tdx->page_add_src, &entry, &level_state);
+	err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn, kvm_tdx->page_add_src,
+			       &entry, &level_state);
 	if (unlikely(tdx_operand_busy(err)))
 		return -EBUSY;
 
@@ -1640,12 +1640,11 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
 {
 	int tdx_level = pg_level_to_tdx_sept_level(level);
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-	struct page *page = pfn_to_page(pfn);
 	gpa_t gpa = gfn_to_gpa(gfn);
 	u64 entry, level_state;
 	u64 err;
 
-	err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
+	err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, pfn, &entry, &level_state);
 	if (unlikely(tdx_operand_busy(err)))
 		return -EBUSY;
 
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index cb9b3210ab71..a9dd75190c67 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -30,7 +30,6 @@
 #include <linux/suspend.h>
 #include <linux/syscore_ops.h>
 #include <linux/idr.h>
-#include <linux/kvm_types.h>
 #include <asm/page.h>
 #include <asm/special_insns.h>
 #include <asm/msr-index.h>
@@ -1568,6 +1567,11 @@ static void tdx_clflush_page(struct page *page)
 	clflush_cache_range(page_to_virt(page), PAGE_SIZE);
 }
 
+static void tdx_clflush_pfn(kvm_pfn_t pfn)
+{
+	clflush_cache_range(__va(PFN_PHYS(pfn)), PAGE_SIZE);
+}
+
 noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args)
 {
 	args->rcx = td->tdvpr_pa;
@@ -1588,17 +1592,18 @@ u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page)
 }
 EXPORT_SYMBOL_FOR_KVM(tdh_mng_addcx);
 
-u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2)
+u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, kvm_pfn_t pfn, struct page *source,
+		     u64 *ext_err1, u64 *ext_err2)
 {
 	struct tdx_module_args args = {
 		.rcx = gpa,
 		.rdx = tdx_tdr_pa(td),
-		.r8 = page_to_phys(page),
+		.r8 = PFN_PHYS(pfn),
 		.r9 = page_to_phys(source),
 	};
 	u64 ret;
 
-	tdx_clflush_page(page);
+	tdx_clflush_pfn(pfn);
 	ret = seamcall_ret(TDH_MEM_PAGE_ADD, &args);
 
 	*ext_err1 = args.rcx;
@@ -1639,16 +1644,17 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
 }
 EXPORT_SYMBOL_FOR_KVM(tdh_vp_addcx);
 
-u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
+u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, kvm_pfn_t pfn,
+		     u64 *ext_err1, u64 *ext_err2)
 {
 	struct tdx_module_args args = {
 		.rcx = gpa | level,
 		.rdx = tdx_tdr_pa(td),
-		.r8 = page_to_phys(page),
+		.r8 = PFN_PHYS(pfn),
 	};
 	u64 ret;
 
-	tdx_clflush_page(page);
+	tdx_clflush_pfn(pfn);
 	ret = seamcall_ret(TDH_MEM_PAGE_AUG, &args);
 
 	*ext_err1 = args.rcx;
-- 
2.43.2
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Ackerley Tng 3 days, 16 hours ago
Yan Zhao <yan.y.zhao@intel.com> writes:

>
> [...snip...]
>
> -u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2)
> +u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, kvm_pfn_t pfn, struct page *source,
> +		     u64 *ext_err1, u64 *ext_err2)
>  {
>  	struct tdx_module_args args = {
>  		.rcx = gpa,
>  		.rdx = tdx_tdr_pa(td),
> -		.r8 = page_to_phys(page),
> +		.r8 = PFN_PHYS(pfn),
>  		.r9 = page_to_phys(source),

Perhaps in some future patch, are we considering also passing pfn
instead of struct page for source? Would we also update
kvm_tdx->page_add_src to be a kvm_pfn_t?

>  	};
>  	u64 ret;
>
> -	tdx_clflush_page(page);
> +	tdx_clflush_pfn(pfn);
>  	ret = seamcall_ret(TDH_MEM_PAGE_ADD, &args);
>
>  	*ext_err1 = args.rcx;
>
> [...snip...]
>
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Edgecombe, Rick P 3 days, 16 hours ago
On Thu, 2026-04-02 at 16:23 -0700, Ackerley Tng wrote:
> Yan Zhao <yan.y.zhao@intel.com> writes:
> 
> > 
> > [...snip...]
> > 
> > -u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2)
> > +u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, kvm_pfn_t pfn, struct page *source,
> > +		     u64 *ext_err1, u64 *ext_err2)
> >   {
> >   	struct tdx_module_args args = {
> >   		.rcx = gpa,
> >   		.rdx = tdx_tdr_pa(td),
> > -		.r8 = page_to_phys(page),
> > +		.r8 = PFN_PHYS(pfn),
> >   		.r9 = page_to_phys(source),
> 
> Perhaps in some future patch, are we considering also passing pfn
> instead of struct page for source? Would we also update
> kvm_tdx->page_add_src to be a kvm_pfn_t?

Can you remind me, with the new API we were going to do an in-place add right?
Then I'd wonder if we could maybe change tdh_mem_page_add() to only have a
single pfn arg. The passing of ->src_page is kind of awkward already.

Like Ira was playing around with here:
https://lore.kernel.org/kvm/20251105-tdx-init-in-place-v1-1-1196b67d0423@intel.com/
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Sean Christopherson 3 days, 16 hours ago
On Thu, Apr 02, 2026, Rick P Edgecombe wrote:
> On Thu, 2026-04-02 at 16:23 -0700, Ackerley Tng wrote:
> > Yan Zhao <yan.y.zhao@intel.com> writes:
> > 
> > > 
> > > [...snip...]
> > > 
> > > -u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2)
> > > +u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, kvm_pfn_t pfn, struct page *source,
> > > +		     u64 *ext_err1, u64 *ext_err2)
> > >   {
> > >   	struct tdx_module_args args = {
> > >   		.rcx = gpa,
> > >   		.rdx = tdx_tdr_pa(td),
> > > -		.r8 = page_to_phys(page),
> > > +		.r8 = PFN_PHYS(pfn),
> > >   		.r9 = page_to_phys(source),
> > 
> > Perhaps in some future patch, are we considering also passing pfn
> > instead of struct page for source? Would we also update
> > kvm_tdx->page_add_src to be a kvm_pfn_t?
> 
> Can you remind me, with the new API we were going to do an in-place add right?
> Then I'd wonder if we could maybe change tdh_mem_page_add() to only have a
> single pfn arg.

No.  In-place ADD will be supported, but it won't be mandatory.  Practically
speaking, we can't make it mandatory unless we're willing to completely rip out
support for per-VM attributes (or at least, per-VM PRIVATE tracking).  I suppose
we could require in-place ADD when using per-gmem attributes, but I don't see
the point given that TDH_MEM_PAGE_ADD itself takes a source and dest.
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Edgecombe, Rick P 3 days, 15 hours ago
On Thu, 2026-04-02 at 16:46 -0700, Sean Christopherson wrote:
> > Can you remind me, with the new API we were going to do an in-place add
> > right?
> > Then I'd wonder if we could maybe change tdh_mem_page_add() to only have a
> > single pfn arg.
> 
> No.  In-place ADD will be supported, but it won't be mandatory.  Practically
> speaking, we can't make it mandatory unless we're willing to completely rip
> out support for per-VM attributes (or at least, per-VM PRIVATE tracking).  I
> suppose we could require in-place ADD when using per-gmem attributes, but I
> don't see the point given that TDH_MEM_PAGE_ADD itself takes a source and
> dest.

Thanks. It might still be cleaner to copy clear text from the GUPed page to the
destination page and let the tdh_mem_page_add() call in the map path do it in-
place? Especially if we want to support in-place add as an option, it would make
the code more uniform.

But it sounds like we don't need to decide now.

Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Sean Christopherson 3 days, 16 hours ago
On Thu, Apr 02, 2026, Ackerley Tng wrote:
> Yan Zhao <yan.y.zhao@intel.com> writes:
> 
> >
> > [...snip...]
> >
> > -u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2)
> > +u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, kvm_pfn_t pfn, struct page *source,
> > +		     u64 *ext_err1, u64 *ext_err2)
> >  {
> >  	struct tdx_module_args args = {
> >  		.rcx = gpa,
> >  		.rdx = tdx_tdr_pa(td),
> > -		.r8 = page_to_phys(page),
> > +		.r8 = PFN_PHYS(pfn),
> >  		.r9 = page_to_phys(source),
> 
> Perhaps in some future patch, are we considering also passing pfn instead of
> struct page for source? Would we also update kvm_tdx->page_add_src to be a
> kvm_pfn_t?

Probably?

I assume you're asking in the context of in-place conversion, where KVM will
allow a single guest_memfd page to be both the source and the dest?

Right now, KVM requires the source page to be a GUP'able page, specifically so
that KVM can obtain a reference and ensure the page isn't freed until KVM is done
with it.  If/when the source and dest are one and the same, then I don't think
we'd want to GUP the page (and there would be no need to since this would all run
while holding gmem's filemap_invalidate_lock()), at which point, yeah, passing a
"struct page" doesn't make much sense, and passing kvm_pfn_t or u64 or whatever
seems like the obvious choice.
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Dave Hansen 2 weeks, 3 days ago
On 3/18/26 17:57, Yan Zhao wrote:
> Remove the completely unnecessary assumption that memory mapped into a TDX
> guest is backed by refcounted struct page memory. From KVM's point of view,
> TDH_MEM_PAGE_ADD and TDH_MEM_PAGE_AUG are glorified writes to PTEs, so they
> have no business placing requirements on how KVM and guest_memfd manage
> memory.

I think this goes a bit too far.

It's one thing to say that it's more convenient for KVM to stick with
pfns because it's what KVM uses now. Or, that the goals of using 'struct
page' can be accomplished other ways. It's quite another to say what
other bits of the codebase have "business" doing.

Sean, can we tone this down a _bit_ to help guide folks in the future?

> Rip out the misguided struct page assumptions/constraints and instead have

Could we maybe tone down the editorializing a bit, please? Folks can
have honest disagreements about this stuff while not being "misguided".

> the two SEAMCALL wrapper APIs take PFN directly. This ensures that for
> future huge page support in S-EPT, the kernel doesn't pick up even worse
> assumptions like "a hugepage must be contained in a single folio".

I don't really understand what this is saying.

Is the concern that KVM might want to set up page tables for memory that
differ from how it was allocated? I'm a bit worried that this assumes
something about folios that doesn't always hold.

I think the hugetlbfs gigantic support uses folios in at least a few
spots today.
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Sean Christopherson 3 days, 19 hours ago
On Thu, Mar 19, 2026, Dave Hansen wrote:
> On 3/18/26 17:57, Yan Zhao wrote:
> > Remove the completely unnecessary assumption that memory mapped into a TDX
> > guest is backed by refcounted struct page memory. From KVM's point of view,
> > TDH_MEM_PAGE_ADD and TDH_MEM_PAGE_AUG are glorified writes to PTEs, so they
> > have no business placing requirements on how KVM and guest_memfd manage
> > memory.
> 
> I think this goes a bit too far.
> 
> It's one thing to say that it's more convenient for KVM to stick with
> pfns because it's what KVM uses now. Or, that the goals of using 'struct
> page' can be accomplished other ways. It's quite another to say what
> other bits of the codebase have "business" doing.
> 
> Sean, can we tone this down a _bit_ to help guide folks in the future?

I strongly disagree on this one.  IMO, super low level APIs have no business
placing unnecessary requirements on callers.  Requiring that the target memory
be convertible?  A-ok because that's an actual requirement of the architecture.
Requiring or assuming anything about "struct page" or folios?  Not ok.

This isn't a convenience thing, it's a core tenet of KVM guest memory management.
KVM's MMUs work with PFNs, full stop.  A PFN might have been acquired via GUP and
thus a refcounted struct page, but there is a hard boundary in KVM between getting
the page via GUP and installing the PFN into KVM's MMU.

KVM didn't always have a hard boundary, and it took us literally years to undo
the resulting messes.  And the TDX hugepage support that was posted that pulled
information from "struct page" and/or its folio re-introduced the exact type of
flawed assumptions that we spent years purging from KVM.

So yeah, what I wrote was a strongly worded statement, but that was 100% intentional,
because I want to be crystal clear that requiring KVM to pass a struct page is a
complete non-starter for me.

> > Rip out the misguided struct page assumptions/constraints and instead have
> 
> Could we maybe tone down the editorializing a bit, please? Folks can
> have honest disagreements about this stuff while not being "misguided".

FWIW, I'm not trying to say the intent or people's viewpoints were misguided, I'm
saying the code itself is misguided.  AFAICT, the "struct page" stuff was added
to try to harden the TDX implementation, e.g. to guard against effective UAF of
memory that was assigned to a TD.  But my viewpoint is that requiring a struct
page made the overall implementation _less_ robust, and thus the code is misguided
because its justification/reasoning was flawed.

> > the two SEAMCALL wrapper APIs take PFN directly. This ensures that for
> > future huge page support in S-EPT, the kernel doesn't pick up even worse
> > assumptions like "a hugepage must be contained in a single folio".
> 
> I don't really understand what this is saying.
> 
> Is the concern that KVM might want to set up page tables for memory that
> differ from how it was allocated? I'm a bit worried that this assumes
> something about folios that doesn't always hold.

Heh, the concern is that taking a page/folio in the SEAMCALL wrappers will lead
to assumptions that don't always hold.  Specifically, the TDX hugepage support[*]
was building up assumptions that KVM would never attempt to install a hugepage
that didn't fit into a single folio:

+	if (start_idx + npages > folio_nr_pages(folio))
+		return TDX_OPERAND_INVALID;

[*] https://lore.kernel.org/all/20250807094132.4453-1-yan.y.zhao@intel.com

> I think the hugetlbfs gigantic support uses folios in at least a few
> spots today.

Yes, and the in-progress guest_memfd+HugeTLB work will also use folios.  The
potential hiccup with the above folio_nr_pages() assumption is that KVM may want
to shatter folios to 4KiB granularity for tracking purposes, but still map a
hugepage when memory is known to be physically contiguous.

That's where a lot of this is coming from.  Taking a "struct page" is a bad
enough assumption on its own (that all TDX private memory is backed by struct page),
but even worse it's a slippery slope to even more bad assumptions (e.g. about how
guest_memfd internally manages its folios).
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Dave Hansen 3 days, 18 hours ago
On 4/2/26 13:47, Sean Christopherson wrote:
> On Thu, Mar 19, 2026, Dave Hansen wrote:
>> On 3/18/26 17:57, Yan Zhao wrote:
>>> Remove the completely unnecessary assumption that memory mapped into a TDX
>>> guest is backed by refcounted struct page memory. From KVM's point of view,
>>> TDH_MEM_PAGE_ADD and TDH_MEM_PAGE_AUG are glorified writes to PTEs, so they
>>> have no business placing requirements on how KVM and guest_memfd manage
>>> memory.
>>
>> I think this goes a bit too far.
>>
>> It's one thing to say that it's more convenient for KVM to stick with
>> pfns because it's what KVM uses now. Or, that the goals of using 'struct
>> page' can be accomplished other ways. It's quite another to say what
>> other bits of the codebase have "business" doing.
>>
>> Sean, can we tone this down a _bit_ to help guide folks in the future?
> 
> I strongly disagree on this one.

I think I understand the motivation now. All I'm saying is that instead
of something like:

	Remove the completely unnecessary assumption that memory mapped
	into a TDX guest is backed by refcounted struct page memory.

I'd rather see something along the lines of

	KVM's MMUs work with PFNs. This is very much an intentional
	design choice. It ensures that the KVM MMUs remain flexible
	and are not too tied to the regular CPU MMUs and the kernel code
	around them.

	Using 'struct page' for TDX memory is not a good fit anywhere
	near the KVM MMU code.

Would you disagree strongly with that kind of rewording?
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Sean Christopherson 3 days, 17 hours ago
On Thu, Apr 02, 2026, Dave Hansen wrote:
> On 4/2/26 13:47, Sean Christopherson wrote:
> > On Thu, Mar 19, 2026, Dave Hansen wrote:
> >> On 3/18/26 17:57, Yan Zhao wrote:
> >>> Remove the completely unnecessary assumption that memory mapped into a TDX
> >>> guest is backed by refcounted struct page memory. From KVM's point of view,
> >>> TDH_MEM_PAGE_ADD and TDH_MEM_PAGE_AUG are glorified writes to PTEs, so they
> >>> have no business placing requirements on how KVM and guest_memfd manage
> >>> memory.
> >>
> >> I think this goes a bit too far.
> >>
> >> It's one thing to say that it's more convenient for KVM to stick with
> >> pfns because it's what KVM uses now. Or, that the goals of using 'struct
> >> page' can be accomplished other ways. It's quite another to say what
> >> other bits of the codebase have "business" doing.
> >>
> >> Sean, can we tone this down a _bit_ to help guide folks in the future?
> > 
> > I strongly disagree on this one.
> 
> I think I understand the motivation now. All I'm saying is that instead
> of something like:
> 
> 	Remove the completely unnecessary assumption that memory mapped
> 	into a TDX guest is backed by refcounted struct page memory.
> 
> I'd rather see something along the lines of
> 
> 	KVM's MMUs work with PFNs. This is very much an intentional
> 	design choice. It ensures that the KVM MMUs remain flexible
> 	and are not too tied to the regular CPU MMUs and the kernel code
> 	around them.
> 
> 	Using 'struct page' for TDX memory is not a good fit anywhere
> 	near the KVM MMU code.
> 
> Would you disagree strongly with that kind of rewording?

Not at all, works for me.
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Yan Zhao 1 week, 5 days ago
On Thu, Mar 19, 2026 at 11:05:09AM -0700, Dave Hansen wrote:
> On 3/18/26 17:57, Yan Zhao wrote:
> > Remove the completely unnecessary assumption that memory mapped into a TDX
> > guest is backed by refcounted struct page memory. From KVM's point of view,
> > TDH_MEM_PAGE_ADD and TDH_MEM_PAGE_AUG are glorified writes to PTEs, so they
> > have no business placing requirements on how KVM and guest_memfd manage
> > memory.
> 
> I think this goes a bit too far.
> 
> It's one thing to say that it's more convenient for KVM to stick with
> pfns because it's what KVM uses now. Or, that the goals of using 'struct
> page' can be accomplished other ways. It's quite another to say what
> other bits of the codebase have "business" doing.
I explained the background in the cover letter, thinking we could add the link
to the final patches when they are merged.

I can expand the patch logs by providing background explanation as well.

> Sean, can we tone this down a _bit_ to help guide folks in the future?
Sorry for being lazy and not expanding the patch logs from Sean's original
patch tagged "DO NOT MERGE".

> > Rip out the misguided struct page assumptions/constraints and instead have
> 
> Could we maybe tone down the editorializing a bit, please? Folks can
> have honest disagreements about this stuff while not being "misguided".
You are right. I need to make it clear.

> > the two SEAMCALL wrapper APIs take PFN directly. This ensures that for
> > future huge page support in S-EPT, the kernel doesn't pick up even worse
> > assumptions like "a hugepage must be contained in a single folio".
> 
> I don't really understand what this is saying.
> 
> Is the concern that KVM might want to set up page tables for memory that
> differ from how it was allocated? I'm a bit worried that this assumes
> something about folios that doesn't always hold.
> 
> I think the hugetlbfs gigantic support uses folios in at least a few
> spots today.
Below is the background of this problem. I'll try to include a short summary in
the next version's patch logs.

In TDX huge page v3, I added logic that assumes PFNs are contained in a single
folio in both TDX's map/unmap paths [1][2]:
	if (start_idx + npages > folio_nr_pages(folio))
		return TDX_OPERAND_INVALID;
This not only assumes the PFNs have corresponding struct page, but also assumes
they must be contained in a single folio, since with only base_page + npages,
it's not easy to get the ith page's pointer without first ensuring the pages are
contained in a single folio.

This should work since current KVM/guest_memfd only allocates memory with
struct page and maps them into S-EPT at a level lower than or equal to the
backend folio size. That is, a single S-EPT mapping cannot span multiple backend
folios.

However, Ackerley's 1G hugetlb-based gmem splits the backend folio [3] ahead of
splitting/unmapping them from S-EPT [4], due to implementation limitations
mentioned at [5]. It makes the warning in [1] hit upon invoking TDX's unmap
callback.

Moreover, Google's gmem may manage PFNs independently in the future, so
TDX's private memory may have no corresponding struct page, and KVM would map
them via VM_PFNMAP, similar to mapping pass-through MMIOs or other PFNs without
struct page or with non-refcounted struct page in normal VMs. Given that KVM has
suffered a lot from handling VM_PFNMAP memory for non-refcounted struct page [6]
in normal VMs, and TDX mapping/unmapping callbacks have no semantic reason to
dictate where and how KVM/guest_memfd should allocate and map memory, Sean
suggested dropping the unnecessary assumption that memory to be mapped/unmapped
to/from S-EPT must be contained in a single folio (though he didn't object to
reasonable sanity checks on whether the PFNs are TDX convertible).


[1] https://lore.kernel.org/kvm/20260106101929.24937-1-yan.y.zhao@intel.com
[2] https://lore.kernel.org/kvm/20260106101826.24870-1-yan.y.zhao@intel.com
[3] https://github.com/googleprodkernel/linux-cc/blob/wip-gmem-conversions-hugetlb-restructuring-12-08-25/virt/kvm/guest_memfd.c#L909
[4] https://github.com/googleprodkernel/linux-cc/blob/wip-gmem-conversions-hugetlb-restructuring-12-08-25/virt/kvm/guest_memfd.c#L918
[5] https://lore.kernel.org/kvm/diqzqzrzdfvh.fsf@google.com/
[6] https://lore.kernel.org/all/20241010182427.1434605-1-seanjc@google.com
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Edgecombe, Rick P 1 week, 4 days ago
On Wed, 2026-03-25 at 17:10 +0800, Yan Zhao wrote:
> > I don't really understand what this is saying.
> > 
> > Is the concern that KVM might want to set up page tables for memory
> > that differ from how it was allocated? I'm a bit worried that this
> > assumes something about folios that doesn't always hold.
> > 
> > I think the hugetlbfs gigantic support uses folios in at least a
> > few spots today.
> Below is the background of this problem. I'll try to include a short
> summary in the next version's patch logs.

While this patchset is kind of pre-work for TDX huge pages, the reason
to separate it out and push it earlier is that it has some value on
its own. So I'd think to focus mostly on the impact of the change
today.

How about this justification:
1. Because KVM handles guest memory as PFNs, and the SEAMCALLs under
discussion are only used there, PFN is more natural.

2. The struct page was partly making sure we didn't pass a wrong arg
(typical type safety) and partly ensuring that KVM doesn't pass non-
convertible memory, however the SEAMCALLs themselves can check this for
the kernel. So the case is already covered by warnings.

In conclusion, the PFN is more natural and the original purpose of
struct page is already covered.


Sean said somewhere IIRC that he would have NAKed the struct page thing
if he had seen it, for even the base support. And the two points above
don't actually require discussion of even huge pages. So does it
actually add any value to dive into the issues you list below?

> 
> In TDX huge page v3, I added logic that assumes PFNs are contained in
> a single folio in both TDX's map/unmap paths [1][2]:
> 	if (start_idx + npages > folio_nr_pages(folio))
> 		return TDX_OPERAND_INVALID;
> This not only assumes the PFNs have corresponding struct page, but
> also assumes they must be contained in a single folio, since with
> only base_page + npages, it's not easy to get the ith page's pointer
> without first ensuring the pages are contained in a single folio.
> 
> This should work since current KVM/guest_memfd only allocates memory
> with struct page and maps them into S-EPT at a level lower than or
> equal to the backend folio size. That is, a single S-EPT mapping
> cannot span multiple backend folios.
> 
> However, Ackerley's 1G hugetlb-based gmem splits the backend folio
> [3] ahead of splitting/unmapping them from S-EPT [4], due to
> implementation limitations mentioned at [5]. It makes the warning in
> [1] hit upon invoking TDX's unmap callback.
> 
> Moreover, Google's future gmem may manage PFNs independently in the
> future,

I think we can adapt to such changes when they eventually come up. It's
not just about not merging code to be used by out-of-tree code. It also
shrinks what we have to consider at each stage. So we can eventually
get there.

>  so TDX's private memory may have no corresponding struct page, and
> KVM would map them via VM_PFNMAP, similar to mapping pass-through
> MMIOs or other PFNs without struct page or with non-refcounted struct
> page in normal VMs. Given that KVM has
> suffered a lot from handling VM_PFNMAP memory for non-refcounted
> struct page [6] in normal VMs, and TDX mapping/unmapping callbacks
> have no semantic reason to dictate where and how KVM/guest_memfd
> should allocate and map memory, Sean suggested dropping the
> unnecessary assumption that memory to be mapped/unmapped to/from S-
> EPT must be contained in a single folio (though he didn't object
> reasonable sanity checks on if the PFNs are TDX convertible).
> 
> 
> [1]
> https://lore.kernel.org/kvm/20260106101929.24937-1-yan.y.zhao@intel.com
> [2]
> https://lore.kernel.org/kvm/20260106101826.24870-1-yan.y.zhao@intel.com
> [3]
> https://github.com/googleprodkernel/linux-cc/blob/wip-gmem-conversions-hugetlb-restructuring-12-08-25/virt/kvm/guest_memfd.c#L909
> [4]
> https://github.com/googleprodkernel/linux-cc/blob/wip-gmem-conversions-hugetlb-restructuring-12-08-25/virt/kvm/guest_memfd.c#L918
> [5] https://lore.kernel.org/kvm/diqzqzrzdfvh.fsf@google.com/
> [6]
> https://lore.kernel.org/all/20241010182427.1434605-1-seanjc@google.com

Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Yan Zhao 1 week, 3 days ago
On Thu, Mar 26, 2026 at 12:57:26AM +0800, Edgecombe, Rick P wrote:
> On Wed, 2026-03-25 at 17:10 +0800, Yan Zhao wrote:
> > > I don't really understand what this is saying.
> > > 
> > > Is the concern that KVM might want to set up page tables for memory
> > > that differ from how it was allocated? I'm a bit worried that this
> > > assumes something about folios that doesn't always hold.
> > > 
> > > I think the hugetlbfs gigantic support uses folios in at least a
> > > few spots today.
> > Below is the background of this problem. I'll try to include a short
> > summary in the next version's patch logs.
> 
> While this patchset is kind of pre-work for TDX huge pages, the reason
> to separate it out and push it earlier is because it has some value on
> it's own. So I'd think to focus mostly on the impact of the change
> today.
> 
> How about this justification:
> 1. Because KVM handles guest memory as PFNs, and the SEAMCALLs under
> discussion are only used there, PFN is more natural.
> 
> 2. The struct page was partly making sure we didn't pass a wrong arg
> (typical type safety) and partly ensuring that KVM doesn't pass non-
> convertible memory, however the SEAMCALLs themselves can check this for
> the kernel. So the case is already covered by warnings.
> 
> In conclusion, the PFN is more natural and the original purpose of
> struct page is already covered.
> 
> 
> Sean said somewhere IIRC that he would have NAKed the struct page thing
> if he had seen it, for even the base support. And the two points above
> don't actually require discussion of even huge pages. So does it
> actually add any value to dive into the issues you list below?
I wanted to mention the issues listed below because I'm not sure if anyone has
the same question as me: why do we have to convert struct page to PFN if they
can both achieve the same purpose, given that currently all private memory
allocated by gmem has struct page backing?

The background can also reduce confusion if the patch log mentions hugepage and
single folio.

So, maybe also avoid mentioning hugepage and single folio things if you think
it's better to just mention the above two points?

> > In TDX huge page v3, I added logic that assumes PFNs are contained in
> > a single folio in both TDX's map/unmap paths [1][2]:
> > 	if (start_idx + npages > folio_nr_pages(folio))
> > 		return TDX_OPERAND_INVALID;
> > This not only assumes the PFNs have corresponding struct page, but
> > also assumes they must be contained in a single folio, since with
> > only base_page + npages, it's not easy to get the ith page's pointer
> > without first ensuring the pages are contained in a single folio.
> > 
> > This should work since current KVM/guest_memfd only allocates memory
> > with struct page and maps them into S-EPT at a level lower than or
> > equal to the backend folio size. That is, a single S-EPT mapping
> > cannot span multiple backend folios.
> > 
> > However, Ackerley's 1G hugetlb-based gmem splits the backend folio
> > [3] ahead of splitting/unmapping them from S-EPT [4], due to
> > implementation limitations mentioned at [5]. It makes the warning in
> > [1] hit upon invoking TDX's unmap callback.
> > 
> > Moreover, Google's future gmem may manage PFNs independently in the
> > future,
> 
> I think we can adapt to such changes when they eventually come up. It's
> not just about not merging code to be used by out-of-tree code. It also
> shrinks what we have to consider at each stage. So we can eventually
> get there.
Ok.

> >  so TDX's private memory may have no corresponding struct page, and
> > KVM would map them via VM_PFNMAP, similar to mapping pass-through
> > MMIOs or other PFNs without struct page or with non-refcounted struct
> > page in normal VMs. Given that KVM has
> > suffered a lot from handling VM_PFNMAP memory for non-refcounted
> > struct page [6] in normal VMs, and TDX mapping/unmapping callbacks
> > have no semantic reason to dictate where and how KVM/guest_memfd
> > should allocate and map memory, Sean suggested dropping the
> > unnecessary assumption that memory to be mapped/unmapped to/from S-
> > EPT must be contained in a single folio (though he didn't object
> > reasonable sanity checks on if the PFNs are TDX convertible).
> > 
> > 
> > [1]
> > https://lore.kernel.org/kvm/20260106101929.24937-1-yan.y.zhao@intel.com
> > [2]
> > https://lore.kernel.org/kvm/20260106101826.24870-1-yan.y.zhao@intel.com
> > [3]
> > https://github.com/googleprodkernel/linux-cc/blob/wip-gmem-conversions-hugetlb-restructuring-12-08-25/virt/kvm/guest_memfd.c#L909
> > [4]
> > https://github.com/googleprodkernel/linux-cc/blob/wip-gmem-conversions-hugetlb-restructuring-12-08-25/virt/kvm/guest_memfd.c#L918
> > [5] https://lore.kernel.org/kvm/diqzqzrzdfvh.fsf@google.com/
> > [6]
> > https://lore.kernel.org/all/20241010182427.1434605-1-seanjc@google.com
>
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Sean Christopherson 5 days, 20 hours ago
On Fri, Mar 27, 2026, Yan Zhao wrote:
> On Thu, Mar 26, 2026 at 12:57:26AM +0800, Edgecombe, Rick P wrote:
> > On Wed, 2026-03-25 at 17:10 +0800, Yan Zhao wrote:
> > > > I don't really understand what this is saying.
> > > > 
> > > > Is the concern that KVM might want to set up page tables for memory
> > > > that differ from how it was allocated? I'm a bit worried that this
> > > > assumes something about folios that doesn't always hold.
> > > > 
> > > > I think the hugetlbfs gigantic support uses folios in at least a
> > > > few spots today.
> > > Below is the background of this problem. I'll try to include a short
> > > summary in the next version's patch logs.
> > 
> > While this patchset is kind of pre-work for TDX huge pages, the reason
> > to separate it out and push it earlier is because it has some value on
> > it's own. So I'd think to focus mostly on the impact of the change
> > today.
> > 
> > How about this justification:
> > 1. Because KVM handles guest memory as PFNs, and the SEAMCALLs under
> > discussion are only used there, PFN is more natural.
> > 
> > 2. The struct page was partly making sure we didn't pass a wrong arg
> > (typical type safety) and partly ensuring that KVM doesn't pass non-
> > convertible memory, however the SEAMCALLs themselves can check this for
> > the kernel. So the case is already covered by warnings.
> > 
> > In conclusion, the PFN is more natural and the original purpose of
> > struct page is already covered.

Most importantly, having core TDX make assumptions based on the struct page and/or
folio will create subtle dependencies that are easily avoided.

> > Sean said somewhere IIRC that he would have NAKed the struct page thing
> > if he had seen it, for even the base support.

Yes.

> > And the two points above don't actually require discussion of even huge
> > pages. So does it actually add any value to dive into the issues you list
> > below?
> I wanted to mention the issues listed below because I'm not sure if anyone has
> the same question as me: why do we have to convert struct page to PFN if they
> can both achieve the same purpose, given that currently all private memory
> allocated by gmem has struct page backing?

From https://lore.kernel.org/all/aWgyhmTJphGQqO0Y@google.com:

 : I'm not at all opposed to backing guest_memfd with "struct page", quite the
 : opposite.  What I don't want is to bake assumptions into KVM code that doesn't
 : _require_ struct page, because that has cause KVM immense pain in the past.
 : 
 : And I'm strongly opposed to KVM special-casing TDX or anything else, precisely
 : because we struggled through all that pain so that KVM would work better with
 : memory that isn't backed by "struct page", or more specifically, memory that has
 : an associated "struct page", but isn't managed by core MM, e.g. isn't refcounted.
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Kiryl Shutsemau 2 weeks, 4 days ago
On Thu, Mar 19, 2026 at 08:57:03AM +0800, Yan Zhao wrote:
> @@ -1639,16 +1644,17 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
>  }
>  EXPORT_SYMBOL_FOR_KVM(tdh_vp_addcx);
>  
> -u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
> +u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, kvm_pfn_t pfn,
> +		     u64 *ext_err1, u64 *ext_err2)
>  {
>  	struct tdx_module_args args = {
>  		.rcx = gpa | level,
>  		.rdx = tdx_tdr_pa(td),
> -		.r8 = page_to_phys(page),
> +		.r8 = PFN_PHYS(pfn),
>  	};
>  	u64 ret;
>  
> -	tdx_clflush_page(page);
> +	tdx_clflush_pfn(pfn);

This is pre-existing problem, but shouldn't we respect @level here?
Flush size need to take page size into account.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Yan Zhao 2 weeks, 4 days ago
On Thu, Mar 19, 2026 at 10:39:14AM +0000, Kiryl Shutsemau wrote:
> On Thu, Mar 19, 2026 at 08:57:03AM +0800, Yan Zhao wrote:
> > @@ -1639,16 +1644,17 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
> >  }
> >  EXPORT_SYMBOL_FOR_KVM(tdh_vp_addcx);
> >  
> > -u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
> > +u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, kvm_pfn_t pfn,
> > +		     u64 *ext_err1, u64 *ext_err2)
> >  {
> >  	struct tdx_module_args args = {
> >  		.rcx = gpa | level,
> >  		.rdx = tdx_tdr_pa(td),
> > -		.r8 = page_to_phys(page),
> > +		.r8 = PFN_PHYS(pfn),
> >  	};
> >  	u64 ret;
> >  
> > -	tdx_clflush_page(page);
> > +	tdx_clflush_pfn(pfn);
> 
> This is pre-existing problem, but shouldn't we respect @level here?
> Flush size need to take page size into account.
Hmm, flush size is fixed to PAGE_SIZE, because this series is based on the
upstream code where huge page is not supported, so there's 
"if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))" in KVM.

Though tdh_mem_page_aug() is an API, it is currently only exported to KVM and
uses type kvm_pfn_t. So, is it still acceptable to assume flush size to be
PAGE_SIZE? Honoring level will soon be introduced by huge page patches.

If you think it needs to be fixed before huge page series, what about fixing it
in a separate cleanup patch? IMO, it would be better placed after Sean's cleanup
patch [1], so we can use page_level_size() instead of inventing the wheel.

[1] https://lore.kernel.org/kvm/20260129011517.3545883-2-seanjc@google.com/
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Kiryl Shutsemau 2 weeks, 4 days ago
On Thu, Mar 19, 2026 at 07:59:53PM +0800, Yan Zhao wrote:
> On Thu, Mar 19, 2026 at 10:39:14AM +0000, Kiryl Shutsemau wrote:
> > On Thu, Mar 19, 2026 at 08:57:03AM +0800, Yan Zhao wrote:
> > > @@ -1639,16 +1644,17 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
> > >  }
> > >  EXPORT_SYMBOL_FOR_KVM(tdh_vp_addcx);
> > >  
> > > -u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
> > > +u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, kvm_pfn_t pfn,
> > > +		     u64 *ext_err1, u64 *ext_err2)
> > >  {
> > >  	struct tdx_module_args args = {
> > >  		.rcx = gpa | level,
> > >  		.rdx = tdx_tdr_pa(td),
> > > -		.r8 = page_to_phys(page),
> > > +		.r8 = PFN_PHYS(pfn),
> > >  	};
> > >  	u64 ret;
> > >  
> > > -	tdx_clflush_page(page);
> > > +	tdx_clflush_pfn(pfn);
> > 
> > This is pre-existing problem, but shouldn't we respect @level here?
> > Flush size need to take page size into account.
> Hmm, flush size is fixed to PAGE_SIZE, because this series is based on the
> upstream code where huge page is not supported, so there's 
> "if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))" in KVM.
> 
> Though tdh_mem_page_aug() is an API, it is currently only exported to KVM and
> uses type kvm_pfn_t. So, is it still acceptable to assume flush size to be
> PAGE_SIZE? Honoring level will soon be introduced by huge page patches.

It caught my eye because previously size to flush was passed down to
tdx_clflush_page() in the struct page (although never used there).
With switching to pfn, we give up this information and it has to be
passed separately. It would be easy to miss that in huge page patches,
if we don't pass down level here.

> 
> If you think it needs to be fixed before huge page series, what about fixing it
> in a separate cleanup patch? IMO, it would be better placed after Sean's cleanup
> patch [1], so we can use page_level_size() instead of inventing the wheel.

I am okay with a separate patch.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Edgecombe, Rick P 2 weeks, 3 days ago
On Thu, 2026-03-19 at 12:57 +0000, Kiryl Shutsemau wrote:
> > Though tdh_mem_page_aug() is an API, it is currently only exported to KVM
> > and
> > uses type kvm_pfn_t. So, is it still acceptable to assume flush size to be
> > PAGE_SIZE? Honoring level will soon be introduced by huge page patches.
> 
> It caught my eye because previously size to flush was passed down to
> tdx_clflush_page() in the struct page (although never used there).
> With switching to pfn, we give up this information and it has to be
> passed separately. It would be easy to miss that in huge page patches,
> if we don't pass down level here.
> 
> > 
> > If you think it needs to be fixed before huge page series, what about fixing
> > it
> > in a separate cleanup patch? IMO, it would be better placed after Sean's
> > cleanup
> > patch [1], so we can use page_level_size() instead of inventing the wheel.
> 
> I am okay with a separate patch.

I feel like we argued about this before. But it would be more correct to just
drop level and make it 4k only until huge pages? Otherwise we are tweaking dead
behavior.
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Kiryl Shutsemau 2 weeks, 3 days ago
On Thu, Mar 19, 2026 at 05:27:48PM +0000, Edgecombe, Rick P wrote:
> On Thu, 2026-03-19 at 12:57 +0000, Kiryl Shutsemau wrote:
> > > Though tdh_mem_page_aug() is an API, it is currently only exported to KVM
> > > and
> > > uses type kvm_pfn_t. So, is it still acceptable to assume flush size to be
> > > PAGE_SIZE? Honoring level will soon be introduced by huge page patches.
> > 
> > It caught my eye because previously size to flush was passed down to
> > tdx_clflush_page() in the struct page (although never used there).
> > With switching to pfn, we give up this information and it has to be
> > passed separately. It would be easy to miss that in huge page patches,
> > if we don't pass down level here.
> > 
> > > 
> > > If you think it needs to be fixed before huge page series, what about fixing
> > > it
> > > in a separate cleanup patch? IMO, it would be better placed after Sean's
> > > cleanup
> > > patch [1], so we can use page_level_size() instead of inventing the wheel.
> > 
> > I am okay with a separate patch.
> 
> I feel like we argued about this before. But it would be more correct to just
> drop level and make it 4k only until huge pages? Otherwise we are tweaking dead
> behavior.

I guess. But you add one more thing on the list to remember when adding
huge page support. This kind of stuff is easy to miss.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Edgecombe, Rick P 2 weeks, 2 days ago
On Fri, 2026-03-20 at 12:59 +0000, Kiryl Shutsemau wrote:
> > I feel like we argued about this before. But it would be more correct to
> > just drop level and make it 4k only until huge pages? Otherwise we are
> > tweaking dead behavior.
> 
> I guess. But you add one more thing on the list to remember when adding
> huge page support. This kind of stuff is easy to miss.

I guess I'm fine either way. I'm not sure Dave will like adding dead branches
though. I suppose huge pages is close enough that we are looking to merge prep
work anyway.
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Dave Hansen 2 weeks, 2 days ago
On 3/20/26 10:31, Edgecombe, Rick P wrote:
> On Fri, 2026-03-20 at 12:59 +0000, Kiryl Shutsemau wrote:
>>> I feel like we argued about this before. But it would be more correct to
>>> just drop level and make it 4k only until huge pages? Otherwise we are
>>> tweaking dead behavior.
>> I guess. But you add one more thing on the list to remember when adding
>> huge page support. This kind of stuff is easy to miss.
> I guess I'm fine either way. I'm not sure Dave will like adding dead branches
> though. I suppose huge pages is close enough that we are looking to merge prep
> work anyway.

Can we add something that will BUG_ON() or fail to compile when the huge
page support comes around?

I'd much rather have:

	BUG_ON(level != PG_LEVEL_4K);
	tdx_clflush_pfn(pfn);

Than go implementing a level argument to tdx_clflush_pfn() now. Then
nobody has to remember. The list to remember is in the code.
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Edgecombe, Rick P 2 weeks, 2 days ago
On Fri, 2026-03-20 at 10:38 -0700, Dave Hansen wrote:
> Can we add something that will BUG_ON() or fail to compile when the huge
> page support comes around?
> 
> I'd much rather have:
> 
> 	BUG_ON(level != PG_LEVEL_4K);
> 	tdx_clflush_pfn(pfn);
> 
> That go implementing a level argument to tdx_clflush_pfn() now. Then
> nobody has to remember. The list to remember is in the code.

I like it, but I'm afraid to add BUG_ON()s. Could we do a WARN instead?

Especially since... the ridiculous thing about this is that the clflush is only
needed if CLFLUSH_BEFORE_ALLOC is set in the tdx metadata, which we have yet to
see in any modules. The spec is apparently to give TDX module flexibility for
the future.

In the base series we debated just ignoring it, but at the time it was simpler
to just always flush. So if CLFLUSH_BEFORE_ALLOC never comes along, it is
possible the function will never do any useful work in its life. Tough case for
a BUG_ON().
Re: [PATCH 1/2] x86/virt/tdx: Use PFN directly for mapping guest private memory
Posted by Yan Zhao 2 weeks, 4 days ago
On Thu, Mar 19, 2026 at 07:59:53PM +0800, Yan Zhao wrote:
> On Thu, Mar 19, 2026 at 10:39:14AM +0000, Kiryl Shutsemau wrote:
> > On Thu, Mar 19, 2026 at 08:57:03AM +0800, Yan Zhao wrote:
> > > @@ -1639,16 +1644,17 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
> > >  }
> > >  EXPORT_SYMBOL_FOR_KVM(tdh_vp_addcx);
> > >  
> > > -u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
> > > +u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, kvm_pfn_t pfn,
> > > +		     u64 *ext_err1, u64 *ext_err2)
> > >  {
> > >  	struct tdx_module_args args = {
> > >  		.rcx = gpa | level,
> > >  		.rdx = tdx_tdr_pa(td),
> > > -		.r8 = page_to_phys(page),
> > > +		.r8 = PFN_PHYS(pfn),
> > >  	};
> > >  	u64 ret;
> > >  
> > > -	tdx_clflush_page(page);
> > > +	tdx_clflush_pfn(pfn);
> > 
> > This is pre-existing problem, but shouldn't we respect @level here?
> > Flush size need to take page size into account.
> Hmm, flush size is fixed to PAGE_SIZE, because this series is based on the
> upstream code where huge page is not supported, so there's 
> "if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))" in KVM.
> 
> Though tdh_mem_page_aug() is an API, it is currently only exported to KVM and
> uses type kvm_pfn_t. So, is it still acceptable to assume flush size to be
> PAGE_SIZE? Honoring level will soon be introduced by huge page patches.
> 
> If you think it needs to be fixed before huge page series, what about fixing it
> in a separate cleanup patch? IMO, it would be better placed after Sean's cleanup
> patch [1], so we can use page_level_size() instead of inventing the wheel.
BTW, the cleanup patch would then essentially look like the one in the huge page
series [2]...
[2] https://lore.kernel.org/kvm/20260129011517.3545883-27-seanjc@google.com/

So,  if a cleanup patch before the huge page series is required, maybe just
adding WARN_ON_ONCE(level != PG_LEVEL_4K) in that patch?

> [1] https://lore.kernel.org/kvm/20260129011517.3545883-2-seanjc@google.com/