[v1] KVM: x86/mmu: TDX post-populate cleanups

[RFC PATCH 02/12] KVM: x86/mmu: Add dedicated API to map guest_memfd pfn into TDP MMU

Posted by Sean Christopherson 1 month, 1 week ago

Add and use a new API for mapping a private pfn from guest_memfd into the
TDP MMU from TDX's post-populate hook instead of partially open-coding the
functionality into the TDX code.  Sharing code with the pre-fault path
sounded good on paper, but it's fatally flawed as simulating a fault loses
the pfn, and calling back into gmem to re-retrieve the pfn creates locking
problems, e.g. kvm_gmem_populate() already holds the gmem invalidation
lock.

Providing a dedicated API will also removing several MMU exports that
ideally would not be exposed outside of the MMU, let alone to vendor code.
On that topic, opportunistically drop the kvm_mmu_load() export.  Leave
kvm_tdp_mmu_gpa_is_mapped() alone for now; the entire commit that added
kvm_tdp_mmu_gpa_is_mapped() will be removed in the near future.

Cc: Michael Roth <michael.roth@amd.com>
Cc: Yan Zhao <yan.y.zhao@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Vishal Annapurve <vannapurve@google.com>
Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
Link: https://lore.kernel.org/all/20250709232103.zwmufocd3l7sqk7y@amd.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu.h     |  1 +
 arch/x86/kvm/mmu/mmu.c | 60 +++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/tdx.c | 10 +++----
 3 files changed, 63 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index b4b6860ab971..697b90a97f43 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -259,6 +259,7 @@ extern bool tdp_mmu_enabled;
 
 bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa);
 int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level);
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn);
 
 static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6e838cb6c9e1..d3625e00baf9 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4990,6 +4990,65 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
 	return min(range->size, end - range->gpa);
 }
 
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+{
+	struct kvm_page_fault fault = {
+		.addr = gfn_to_gpa(gfn),
+		.error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS,
+		.prefetch = true,
+		.is_tdp = true,
+		.nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm),
+
+		.max_level = KVM_MAX_HUGEPAGE_LEVEL,
+		.req_level = PG_LEVEL_4K,
+		.goal_level = PG_LEVEL_4K,
+		.is_private = true,
+
+		.gfn = gfn,
+		.slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
+		.pfn = pfn,
+		.map_writable = true,
+	};
+	struct kvm *kvm = vcpu->kvm;
+	int r;
+
+	lockdep_assert_held(&kvm->slots_lock);
+
+	if (KVM_BUG_ON(!tdp_mmu_enabled, kvm))
+		return -EIO;
+
+	if (kvm_gfn_is_write_tracked(kvm, fault.slot, fault.gfn))
+		return -EPERM;
+
+	r = kvm_mmu_reload(vcpu);
+	if (r)
+		return r;
+
+	r = mmu_topup_memory_caches(vcpu, false);
+	if (r)
+		return r;
+
+	do {
+		if (signal_pending(current))
+			return -EINTR;
+
+		if (kvm_test_request(KVM_REQ_VM_DEAD, vcpu))
+			return -EIO;
+
+		cond_resched();
+
+		guard(read_lock)(&kvm->mmu_lock);
+
+		r = kvm_tdp_mmu_map(vcpu, &fault);
+	} while (r == RET_PF_RETRY);
+
+	if (r != RET_PF_FIXED)
+		return -EIO;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_tdp_mmu_map_private_pfn);
+
 static void nonpaging_init_context(struct kvm_mmu *context)
 {
 	context->page_fault = nonpaging_page_fault;
@@ -5973,7 +6032,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 out:
 	return r;
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_load);
 
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index a6155f76cc6a..1724d82c8512 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -3151,15 +3151,12 @@ struct tdx_gmem_post_populate_arg {
 static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 				  void __user *src, int order, void *_arg)
 {
-	u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
-	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
 	struct tdx_gmem_post_populate_arg *arg = _arg;
-	struct kvm_vcpu *vcpu = arg->vcpu;
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	u64 err, entry, level_state;
 	gpa_t gpa = gfn_to_gpa(gfn);
-	u8 level = PG_LEVEL_4K;
 	struct page *src_page;
 	int ret, i;
-	u64 err, entry, level_state;
 
 	/*
 	 * Get the source page if it has been faulted in. Return failure if the
@@ -3171,7 +3168,7 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 	if (ret != 1)
 		return -ENOMEM;
 
-	ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
+	ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
 	if (ret < 0)
 		goto out;
 
@@ -3234,7 +3231,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
 	    !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
 		return -EINVAL;
 
-	kvm_mmu_reload(vcpu);
 	ret = 0;
 	while (region.nr_pages) {
 		if (signal_pending(current)) {
-- 
2.51.0.268.g9569e192d0-goog

Re: [RFC PATCH 02/12] KVM: x86/mmu: Add dedicated API to map guest_memfd pfn into TDP MMU

Posted by Ira Weiny 1 month ago

Sean Christopherson wrote:

[snip]

> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6e838cb6c9e1..d3625e00baf9 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -4990,6 +4990,65 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
>  	return min(range->size, end - range->gpa);
>  }
>  
> +int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
> +{
> +	struct kvm_page_fault fault = {
> +		.addr = gfn_to_gpa(gfn),
> +		.error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS,
> +		.prefetch = true,
> +		.is_tdp = true,
> +		.nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm),
> +
> +		.max_level = KVM_MAX_HUGEPAGE_LEVEL,
> +		.req_level = PG_LEVEL_4K,
> +		.goal_level = PG_LEVEL_4K,
> +		.is_private = true,
> +
> +		.gfn = gfn,
> +		.slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
> +		.pfn = pfn,
> +		.map_writable = true,

Why is map_writable set?  Doesn't this get translated into host_writable?

Ira

[snip]

Re: [RFC PATCH 02/12] KVM: x86/mmu: Add dedicated API to map guest_memfd pfn into TDP MMU

Posted by Edgecombe, Rick P 1 month ago

On Wed, 2025-08-27 at 19:40 -0500, Ira Weiny wrote:
> > +		.map_writable = true,
> 
> Why is map_writable set?  Doesn't this get translated into host_writable?

I guess it's normally set only if it's a !KVM_MEM_READONLY slot for private
faults memory. But that flag is invalid for gmem. So we should only have
map_writable=true cases for tdx.

Hypothetically this function should work for non-gmem. I guess since it's
exported, a comment could be nice to specify that the memslots are not
consulted. There are many MMU details that are not commented though, so it's
probably too much given that the struct is right there to look at what kind of
fault it is.

Re: [RFC PATCH 02/12] KVM: x86/mmu: Add dedicated API to map guest_memfd pfn into TDP MMU

Posted by Sean Christopherson 1 month ago

On Thu, Aug 28, 2025, Rick P Edgecombe wrote:
> On Wed, 2025-08-27 at 19:40 -0500, Ira Weiny wrote:
> > > +		.map_writable = true,
> > 
> > Why is map_writable set?  Doesn't this get translated into host_writable?
> 
> I guess it's normally set only if it's a !KVM_MEM_READONLY slot for private
> faults memory. 

map_writable can also be %false on read faults and the host userspace mapping
isn't writable. 

> But that flag is invalid for gmem. So we should only have
> map_writable=true cases for tdx.

Yep.  And not TDX specific, map_writable _must_ be true for write faults.  The
reason there's two separate flags is so that KVM can opportunistically create a
writable mapping on read faults.

Re: [RFC PATCH 02/12] KVM: x86/mmu: Add dedicated API to map guest_memfd pfn into TDP MMU

Posted by Yan Zhao 1 month, 1 week ago

On Tue, Aug 26, 2025 at 05:05:12PM -0700, Sean Christopherson wrote:
> Add and use a new API for mapping a private pfn from guest_memfd into the
> TDP MMU from TDX's post-populate hook instead of partially open-coding the
> functionality into the TDX code.  Sharing code with the pre-fault path
> sounded good on paper, but it's fatally flawed as simulating a fault loses
> the pfn, and calling back into gmem to re-retrieve the pfn creates locking
> problems, e.g. kvm_gmem_populate() already holds the gmem invalidation
> lock.
> 
> Providing a dedicated API will also removing several MMU exports that
> ideally would not be exposed outside of the MMU, let alone to vendor code.
> On that topic, opportunistically drop the kvm_mmu_load() export.  Leave
> kvm_tdp_mmu_gpa_is_mapped() alone for now; the entire commit that added
> kvm_tdp_mmu_gpa_is_mapped() will be removed in the near future.
> 
> Cc: Michael Roth <michael.roth@amd.com>
> Cc: Yan Zhao <yan.y.zhao@intel.com>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Cc: Vishal Annapurve <vannapurve@google.com>
> Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
> Link: https://lore.kernel.org/all/20250709232103.zwmufocd3l7sqk7y@amd.com
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/kvm/mmu.h     |  1 +
>  arch/x86/kvm/mmu/mmu.c | 60 +++++++++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/vmx/tdx.c | 10 +++----
>  3 files changed, 63 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index b4b6860ab971..697b90a97f43 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -259,6 +259,7 @@ extern bool tdp_mmu_enabled;
>  
>  bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa);
>  int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level);
> +int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn);
>  
>  static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
>  {
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6e838cb6c9e1..d3625e00baf9 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -4990,6 +4990,65 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
>  	return min(range->size, end - range->gpa);
>  }
>  
> +int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
As the function starts with kvm_tdp_mmu, can we move it to tdp_mmu.c?

> +{
> +	struct kvm_page_fault fault = {
> +		.addr = gfn_to_gpa(gfn),
> +		.error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS,
> +		.prefetch = true,
> +		.is_tdp = true,
> +		.nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm),
> +
> +		.max_level = KVM_MAX_HUGEPAGE_LEVEL,
Looks the kvm_tdp_mmu_map_private_pfn() is only for initial memory mapping,
given that ".prefetch = true" and RET_PF_SPURIOUS is not a valid return value.

Then, what about setting
                .max_level = PG_LEVEL_4K,
directly?

Otherwise, the "(KVM_BUG_ON(level != PG_LEVEL_4K, kvm)" would be triggered in
tdx_sept_set_private_spte().

> +		.req_level = PG_LEVEL_4K,
> +		.goal_level = PG_LEVEL_4K,
> +		.is_private = true,
> +
> +		.gfn = gfn,
> +		.slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
> +		.pfn = pfn,
> +		.map_writable = true,
> +	};
> +	struct kvm *kvm = vcpu->kvm;
> +	int r;
> +
> +	lockdep_assert_held(&kvm->slots_lock);
> +
> +	if (KVM_BUG_ON(!tdp_mmu_enabled, kvm))
> +		return -EIO;
> +
> +	if (kvm_gfn_is_write_tracked(kvm, fault.slot, fault.gfn))
> +		return -EPERM;
> +
> +	r = kvm_mmu_reload(vcpu);
> +	if (r)
> +		return r;
> +
> +	r = mmu_topup_memory_caches(vcpu, false);
> +	if (r)
> +		return r;
> +
> +	do {
> +		if (signal_pending(current))
> +			return -EINTR;
> +
> +		if (kvm_test_request(KVM_REQ_VM_DEAD, vcpu))
> +			return -EIO;
> +
> +		cond_resched();
> +
> +		guard(read_lock)(&kvm->mmu_lock);
> +
> +		r = kvm_tdp_mmu_map(vcpu, &fault);
> +	} while (r == RET_PF_RETRY);
> +
> +	if (r != RET_PF_FIXED)
> +		return -EIO;
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(kvm_tdp_mmu_map_private_pfn);
> +
>  static void nonpaging_init_context(struct kvm_mmu *context)
>  {
>  	context->page_fault = nonpaging_page_fault;
> @@ -5973,7 +6032,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
>  out:
>  	return r;
>  }
> -EXPORT_SYMBOL_GPL(kvm_mmu_load);
>  
>  void kvm_mmu_unload(struct kvm_vcpu *vcpu)
>  {
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index a6155f76cc6a..1724d82c8512 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -3151,15 +3151,12 @@ struct tdx_gmem_post_populate_arg {
>  static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
>  				  void __user *src, int order, void *_arg)
>  {
> -	u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
> -	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
>  	struct tdx_gmem_post_populate_arg *arg = _arg;
> -	struct kvm_vcpu *vcpu = arg->vcpu;
> +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +	u64 err, entry, level_state;
>  	gpa_t gpa = gfn_to_gpa(gfn);
> -	u8 level = PG_LEVEL_4K;
>  	struct page *src_page;
>  	int ret, i;
> -	u64 err, entry, level_state;
>  
>  	/*
>  	 * Get the source page if it has been faulted in. Return failure if the
> @@ -3171,7 +3168,7 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
>  	if (ret != 1)
>  		return -ENOMEM;
>  
> -	ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
> +	ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
>  	if (ret < 0)
>  		goto out;
>  
> @@ -3234,7 +3231,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
>  	    !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
>  		return -EINVAL;
>  
> -	kvm_mmu_reload(vcpu);
>  	ret = 0;
>  	while (region.nr_pages) {
>  		if (signal_pending(current)) {
> -- 
> 2.51.0.268.g9569e192d0-goog
>

Re: [RFC PATCH 02/12] KVM: x86/mmu: Add dedicated API to map guest_memfd pfn into TDP MMU

Posted by Edgecombe, Rick P 1 month ago

On Wed, 2025-08-27 at 16:25 +0800, Yan Zhao wrote:
> > +{
> > +	struct kvm_page_fault fault = {
> > +		.addr = gfn_to_gpa(gfn),
> > +		.error_code = PFERR_GUEST_FINAL_MASK |
> > PFERR_PRIVATE_ACCESS,
> > +		.prefetch = true,
> > +		.is_tdp = true,
> > +		.nx_huge_page_workaround_enabled =
> > is_nx_huge_page_enabled(vcpu->kvm),
> > +
> > +		.max_level = KVM_MAX_HUGEPAGE_LEVEL,
> Looks the kvm_tdp_mmu_map_private_pfn() is only for initial memory mapping,
> given that ".prefetch = true" and RET_PF_SPURIOUS is not a valid return value.

Hmm, what are you referring to regarding RET_PF_SPURIOUS?

> 
> Then, what about setting
>                 .max_level = PG_LEVEL_4K,
> directly?
> 
> Otherwise, the "(KVM_BUG_ON(level != PG_LEVEL_4K, kvm)" would be triggered in
> tdx_sept_set_private_spte().

Yes this fails to boot a TD. With max_level = PG_LEVEL_4K it passes the full
tests. I don't think it's ideal to encode PAGE.ADD details here though.

But I'm not immediately clear what is going wrong. The old struct kvm_page_fault
looks pretty similar. Did you root cause it?

Re: [RFC PATCH 02/12] KVM: x86/mmu: Add dedicated API to map guest_memfd pfn into TDP MMU

Posted by Yan Zhao 1 month ago

On Thu, Aug 28, 2025 at 08:54:48AM +0800, Edgecombe, Rick P wrote:
> On Wed, 2025-08-27 at 16:25 +0800, Yan Zhao wrote:
> > > +{
> > > +	struct kvm_page_fault fault = {
> > > +		.addr = gfn_to_gpa(gfn),
> > > +		.error_code = PFERR_GUEST_FINAL_MASK |
> > > PFERR_PRIVATE_ACCESS,
> > > +		.prefetch = true,
> > > +		.is_tdp = true,
> > > +		.nx_huge_page_workaround_enabled =
> > > is_nx_huge_page_enabled(vcpu->kvm),
> > > +
> > > +		.max_level = KVM_MAX_HUGEPAGE_LEVEL,
> > Looks the kvm_tdp_mmu_map_private_pfn() is only for initial memory mapping,
> > given that ".prefetch = true" and RET_PF_SPURIOUS is not a valid return value.
> 
> Hmm, what are you referring to regarding RET_PF_SPURIOUS?
If kvm_tdp_mmu_map_private_pfn() can also be invoked after initial memory
mapping stage, RET_PF_SPURIOUS is a valid return case.

But in this patch, only RET_PF_RETRY and RET_PF_FIXED are valid.
So, I think it's expected to be invoked only by initial memory mapping stage :)

Re: [RFC PATCH 02/12] KVM: x86/mmu: Add dedicated API to map guest_memfd pfn into TDP MMU

Posted by Edgecombe, Rick P 1 month ago

On Wed, 2025-08-27 at 17:54 -0700, Rick Edgecombe wrote:
> > 
> > Then, what about setting
> >                 .max_level = PG_LEVEL_4K,
> > directly?
> > 
> > Otherwise, the "(KVM_BUG_ON(level != PG_LEVEL_4K, kvm)" would be triggered
> > in
> > tdx_sept_set_private_spte().
> 
> Yes this fails to boot a TD. With max_level = PG_LEVEL_4K it passes the full
> tests. I don't think it's ideal to encode PAGE.ADD details here though.
> 
> But I'm not immediately clear what is going wrong. The old struct
> kvm_page_fault
> looks pretty similar. Did you root cause it?

Oh, duh. Because we are passing in the PFN now so it can't know the size. So
it's not about PAGE.ADD actually.

Sill, how about calling the function kvm_tdp_mmu_map_private_pfn_4k(), or
passing in the level?

Re: [RFC PATCH 02/12] KVM: x86/mmu: Add dedicated API to map guest_memfd pfn into TDP MMU

Posted by Yan Zhao 1 month ago

On Thu, Aug 28, 2025 at 09:26:50AM +0800, Edgecombe, Rick P wrote:
> On Wed, 2025-08-27 at 17:54 -0700, Rick Edgecombe wrote:
> > > 
> > > Then, what about setting
> > >                 .max_level = PG_LEVEL_4K,
> > > directly?
> > > 
> > > Otherwise, the "(KVM_BUG_ON(level != PG_LEVEL_4K, kvm)" would be triggered
> > > in
> > > tdx_sept_set_private_spte().
> > 
> > Yes this fails to boot a TD. With max_level = PG_LEVEL_4K it passes the full
> > tests. I don't think it's ideal to encode PAGE.ADD details here though.
> > 
> > But I'm not immediately clear what is going wrong. The old struct
> > kvm_page_fault
> > looks pretty similar. Did you root cause it?
>
> Oh, duh. Because we are passing in the PFN now so it can't know the size. So
> it's not about PAGE.ADD actually.
Right, it's because the previous kvm_tdp_map_page() updates fault->max_level in
kvm_mmu_faultin_pfn_private() by checking the private_max_mapping_level hook.

However, private_max_mapping_level() skips the faultin step and goes straight
to kvm_tdp_mmu_map().

> Sill, how about calling the function kvm_tdp_mmu_map_private_pfn_4k(), or
> passing in the level?
Looks [1] can also address this issue. Not sure which one Sean prefers.

[1] https://lore.kernel.org/all/20250729225455.670324-15-seanjc@google.com

Re: [RFC PATCH 02/12] KVM: x86/mmu: Add dedicated API to map guest_memfd pfn into TDP MMU

Posted by Sean Christopherson 1 month ago

On Thu, Aug 28, 2025, Yan Zhao wrote:
> On Thu, Aug 28, 2025 at 09:26:50AM +0800, Edgecombe, Rick P wrote:
> > On Wed, 2025-08-27 at 17:54 -0700, Rick Edgecombe wrote:
> > > > 
> > > > Then, what about setting
> > > >                 .max_level = PG_LEVEL_4K,
> > > > directly?
> > > > 
> > > > Otherwise, the "(KVM_BUG_ON(level != PG_LEVEL_4K, kvm)" would be triggered
> > > > in
> > > > tdx_sept_set_private_spte().
> > > 
> > > Yes this fails to boot a TD. With max_level = PG_LEVEL_4K it passes the full
> > > tests. I don't think it's ideal to encode PAGE.ADD details here though.
> > > 
> > > But I'm not immediately clear what is going wrong. The old struct
> > > kvm_page_fault
> > > looks pretty similar. Did you root cause it?
> >
> > Oh, duh. Because we are passing in the PFN now so it can't know the size. So
> > it's not about PAGE.ADD actually.
> Right, it's because the previous kvm_tdp_map_page() updates fault->max_level in
> kvm_mmu_faultin_pfn_private() by checking the private_max_mapping_level hook.
> 
> However, private_max_mapping_level() skips the faultin step and goes straight
> to kvm_tdp_mmu_map().
> 
> > Sill, how about calling the function kvm_tdp_mmu_map_private_pfn_4k(), or
> > passing in the level?
> Looks [1] can also address this issue. Not sure which one Sean prefers.
> 
> [1] https://lore.kernel.org/all/20250729225455.670324-15-seanjc@google.com

That won't fix this issue though, becuase @fault will be valid and so max_level
will still be KVM_MAX_HUGEPAGE_LEVEL.  Which is by design, the intent in that
flow is that KVM should have gotten the level when getting the pfn from gmem.

IIUC, this particular flow _must_ map at 4KiB, so I think forcing PG_LEVEL_4k is
the right solution.

Re: [RFC PATCH 02/12] KVM: x86/mmu: Add dedicated API to map guest_memfd pfn into TDP MMU

Posted by Yan Zhao 1 month ago

On Thu, Aug 28, 2025 at 12:40:20PM -0700, Sean Christopherson wrote:
> On Thu, Aug 28, 2025, Yan Zhao wrote:
> > On Thu, Aug 28, 2025 at 09:26:50AM +0800, Edgecombe, Rick P wrote:
> > > On Wed, 2025-08-27 at 17:54 -0700, Rick Edgecombe wrote:
> > > > > 
> > > > > Then, what about setting
> > > > >                 .max_level = PG_LEVEL_4K,
> > > > > directly?
> > > > > 
> > > > > Otherwise, the "(KVM_BUG_ON(level != PG_LEVEL_4K, kvm)" would be triggered
> > > > > in
> > > > > tdx_sept_set_private_spte().
> > > > 
> > > > Yes this fails to boot a TD. With max_level = PG_LEVEL_4K it passes the full
> > > > tests. I don't think it's ideal to encode PAGE.ADD details here though.
> > > > 
> > > > But I'm not immediately clear what is going wrong. The old struct
> > > > kvm_page_fault
> > > > looks pretty similar. Did you root cause it?
> > >
> > > Oh, duh. Because we are passing in the PFN now so it can't know the size. So
> > > it's not about PAGE.ADD actually.
> > Right, it's because the previous kvm_tdp_map_page() updates fault->max_level in
> > kvm_mmu_faultin_pfn_private() by checking the private_max_mapping_level hook.
> > 
> > However, private_max_mapping_level() skips the faultin step and goes straight
> > to kvm_tdp_mmu_map().
> > 
> > > Sill, how about calling the function kvm_tdp_mmu_map_private_pfn_4k(), or
> > > passing in the level?
> > Looks [1] can also address this issue. Not sure which one Sean prefers.
> > 
> > [1] https://lore.kernel.org/all/20250729225455.670324-15-seanjc@google.com
> 
> That won't fix this issue though, becuase @fault will be valid and so max_level
Ah, right, I missed that you composed a fault...

> will still be KVM_MAX_HUGEPAGE_LEVEL.  Which is by design, the intent in that
> flow is that KVM should have gotten the level when getting the pfn from gmem.
> 
> IIUC, this particular flow _must_ map at 4KiB, so I think forcing PG_LEVEL_4k is
> the right solution.
Forcing PG_LEVEL_4k looks good to me.
I was worried that SEV might want to use higher levels.

Re: [RFC PATCH 02/12] KVM: x86/mmu: Add dedicated API to map guest_memfd pfn into TDP MMU

Posted by Yan Zhao 1 month ago

On Fri, Aug 29, 2025 at 09:16:47AM +0800, Yan Zhao wrote:
> On Thu, Aug 28, 2025 at 12:40:20PM -0700, Sean Christopherson wrote:
> > On Thu, Aug 28, 2025, Yan Zhao wrote:
> > > On Thu, Aug 28, 2025 at 09:26:50AM +0800, Edgecombe, Rick P wrote:
> > > > On Wed, 2025-08-27 at 17:54 -0700, Rick Edgecombe wrote:
> > > > > > 
> > > > > > Then, what about setting
> > > > > >                 .max_level = PG_LEVEL_4K,
> > > > > > directly?
> > > > > > 
> > > > > > Otherwise, the "(KVM_BUG_ON(level != PG_LEVEL_4K, kvm)" would be triggered
> > > > > > in
> > > > > > tdx_sept_set_private_spte().
> > > > > 
> > > > > Yes this fails to boot a TD. With max_level = PG_LEVEL_4K it passes the full
> > > > > tests. I don't think it's ideal to encode PAGE.ADD details here though.
> > > > > 
> > > > > But I'm not immediately clear what is going wrong. The old struct
> > > > > kvm_page_fault
> > > > > looks pretty similar. Did you root cause it?
> > > >
> > > > Oh, duh. Because we are passing in the PFN now so it can't know the size. So
> > > > it's not about PAGE.ADD actually.
> > > Right, it's because the previous kvm_tdp_map_page() updates fault->max_level in
> > > kvm_mmu_faultin_pfn_private() by checking the private_max_mapping_level hook.
> > > 
> > > However, private_max_mapping_level() skips the faultin step and goes straight
> > > to kvm_tdp_mmu_map().
> > > 
> > > > Sill, how about calling the function kvm_tdp_mmu_map_private_pfn_4k(), or
> > > > passing in the level?
> > > Looks [1] can also address this issue. Not sure which one Sean prefers.
> > > 
> > > [1] https://lore.kernel.org/all/20250729225455.670324-15-seanjc@google.com
> > 
> > That won't fix this issue though, becuase @fault will be valid and so max_level
> Ah, right, I missed that you composed a fault...
FWIW: after reviewing it again, I think [1] is still able update the max_level
to 4KB.

The flow with a valid @fault:

kvm_mmu_hugepage_adjust
  kvm_mmu_max_mapping_level
    kvm_max_private_mapping_level
      kvm_x86_call(gmem_max_mapping_level)(kvm, pfn);
 

> > will still be KVM_MAX_HUGEPAGE_LEVEL.  Which is by design, the intent in that
> > flow is that KVM should have gotten the level when getting the pfn from gmem.
> > 
> > IIUC, this particular flow _must_ map at 4KiB, so I think forcing PG_LEVEL_4k is
> > the right solution.
> Forcing PG_LEVEL_4k looks good to me.
> I was worried that SEV might want to use higher levels.