[RFC PATCH 20/21] KVM: x86: Force a prefetch fault's max mapping level to 4KB for TDX

Posted by Yan Zhao 7 months, 3 weeks ago
Introduce a "prefetch" parameter to the private_max_mapping_level hook and
enforce the max mapping level of a prefetch fault for private memory to be
4KB. This is a preparation to enable the ignoring huge page splitting in
the fault path.

If a prefetch fault results in a 2MB huge leaf in the mirror page table,
there may not be a vCPU available to accept the corresponding 2MB huge leaf
in the S-EPT if the TD is not configured to receive #VE for page
acceptance. Consequently, if a vCPU accepts the page at 4KB level, it will
trigger an EPT violation to split the 2MB huge leaf generated by the
prefetch fault.

Since handling the BUSY error from SEAMCALLs for huge page splitting is
more involved in the fault path, which runs with kvm->mmu_lock held for
read, force the max mapping level of a prefetch fault of private memory to
4KB to prevent potential splitting.

Since prefetch faults for private memory are uncommon after the TD's build
time, enforcing a 4KB mapping level is unlikely to cause any performance
degradation. The max mapping level is already set to 4KB during the TD's
build phase.
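
For reference, a prefetch fault here is one with fault->prefetch set, which
is how KVM_PRE_FAULT_MEMORY populates mappings ahead of use. A simplified
sketch of that plumbing, assumed from the base kvm/mmu code (not part of
this patch; sketch_pre_fault is a hypothetical wrapper for illustration):

	/*
	 * Sketch (assumption, simplified): KVM_PRE_FAULT_MEMORY reaches
	 * the MMU via kvm_tdp_map_page(), which raises a synthetic fault
	 * with prefetch = true; that flag is what this patch plumbs
	 * through to the private_max_mapping_level hook.
	 */
	static int sketch_pre_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
				    u64 error_code, u8 *level)
	{
		return kvm_mmu_do_page_fault(vcpu, gpa, error_code,
					     true /* prefetch */,
					     NULL, level);
	}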

Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
 arch/x86/include/asm/kvm_host.h | 3 ++-
 arch/x86/kvm/mmu/mmu.c          | 7 ++++---
 arch/x86/kvm/svm/sev.c          | 3 ++-
 arch/x86/kvm/svm/svm.h          | 5 +++--
 arch/x86/kvm/vmx/main.c         | 5 +++--
 arch/x86/kvm/vmx/tdx.c          | 5 +++--
 arch/x86/kvm/vmx/x86_ops.h      | 4 ++--
 7 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6962a8a424ef..5167458742bf 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1900,7 +1900,8 @@ struct kvm_x86_ops {
 	void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
 	int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
 	void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end);
-	int (*private_max_mapping_level)(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, gfn_t gfn);
+	int (*private_max_mapping_level)(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, gfn_t gfn,
+					 bool prefetch);
 };
 
 struct kvm_x86_nested_ops {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 1a34e43bd349..94a557e010d3 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4488,7 +4488,7 @@ static inline u8 kvm_max_level_for_order(int order)
 }
 
 static u8 kvm_max_private_mapping_level(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, gfn_t gfn,
-					u8 max_level, int gmem_order)
+					u8 max_level, int gmem_order, bool prefetch)
 {
 	u8 req_max_level;
 
@@ -4499,7 +4499,7 @@ static u8 kvm_max_private_mapping_level(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, gf
 	if (max_level == PG_LEVEL_4K)
 		return PG_LEVEL_4K;
 
-	req_max_level = kvm_x86_call(private_max_mapping_level)(vcpu, pfn, gfn);
+	req_max_level = kvm_x86_call(private_max_mapping_level)(vcpu, pfn, gfn, prefetch);
 	if (req_max_level)
 		max_level = min(max_level, req_max_level);
 
@@ -4532,7 +4532,8 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
 
 	fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
 	fault->max_level = kvm_max_private_mapping_level(vcpu, fault->pfn, fault->gfn,
-							 fault->max_level, max_order);
+							 fault->max_level, max_order,
+							 fault->prefetch);
 
 	return RET_PF_CONTINUE;
 }
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index dc6cdf9fa1ba..7a9c44ad5b91 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -4910,7 +4910,8 @@ void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
 	}
 }
 
-int sev_private_max_mapping_level(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, gfn_t gfn)
+int sev_private_max_mapping_level(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, gfn_t gfn,
+				  bool prefetch)
 {
 	int level, rc;
 	bool assigned;
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 1a9738b6ae37..272a8404e1c0 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -782,7 +782,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code);
 void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu);
 int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
 void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
-int sev_private_max_mapping_level(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, gfn_t gfn);
+int sev_private_max_mapping_level(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, gfn_t gfn, bool prefetch);
 #else
 static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
 {
@@ -809,7 +809,8 @@ static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, in
 	return 0;
 }
 static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {}
-static inline int sev_private_max_mapping_level(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, gfn_t gfn)
+static inline int sev_private_max_mapping_level(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
+						gfn_t gfn, bool prefetch)
 {
 	return 0;
 }
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 16c0c31dd066..82689ad8bc18 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -881,10 +881,11 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
 	return tdx_vcpu_ioctl(vcpu, argp);
 }
 
-static int vt_gmem_private_max_mapping_level(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, gfn_t gfn)
+static int vt_gmem_private_max_mapping_level(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
+					     gfn_t gfn, bool prefetch)
 {
 	if (is_td(vcpu->kvm))
-		return tdx_gmem_private_max_mapping_level(vcpu, pfn, gfn);
+		return tdx_gmem_private_max_mapping_level(vcpu, pfn, gfn, prefetch);
 
 	return 0;
 }
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 4386e1a0323e..e24d1cbcc762 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -3332,11 +3332,12 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
 	return ret;
 }
 
-int tdx_gmem_private_max_mapping_level(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, gfn_t gfn)
+int tdx_gmem_private_max_mapping_level(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
+				       gfn_t gfn, bool prefetch)
 {
 	struct vcpu_tdx *tdx = to_tdx(vcpu);
 
-	if (unlikely(to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
+	if (unlikely((to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE) || prefetch))
 		return PG_LEVEL_4K;
 
 	if (gfn >= tdx->violation_gfn_start && gfn < tdx->violation_gfn_end)
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index df7d4cd1436c..0619e9390e5d 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -164,7 +164,7 @@ int tdx_sept_split_private_spt(struct kvm *kvm, gfn_t gfn, enum pg_level level,
 void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
 void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
 void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
-int tdx_gmem_private_max_mapping_level(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, gfn_t gfn);
+int tdx_gmem_private_max_mapping_level(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, gfn_t gfn, bool prefetch);
 #else
 static inline void tdx_disable_virtualization_cpu(void) {}
 static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; }
@@ -236,7 +236,7 @@ static inline int tdx_sept_split_private_spt(struct kvm *kvm, gfn_t gfn,
 static inline void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) {}
 static inline void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) {}
 static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {}
-static inline int tdx_gmem_private_max_mapping_level(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, gfn_t gfn) { return 0; }
+static inline int tdx_gmem_private_max_mapping_level(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, gfn_t gfn, bool prefetch) { return 0; }
 #endif
 
 #endif /* __KVM_X86_VMX_X86_OPS_H */
-- 
2.43.2
Re: [RFC PATCH 20/21] KVM: x86: Force a prefetch fault's max mapping level to 4KB for TDX
Posted by Binbin Wu 6 months, 4 weeks ago

On 4/24/2025 11:09 AM, Yan Zhao wrote:
> Introduce a "prefetch" parameter to the private_max_mapping_level hook and
> enforce the max mapping level of a prefetch fault for private memory to be
> 4KB. This is in preparation for ignoring huge page splitting in the fault
> path.
>
> If a prefetch fault results in a 2MB huge leaf in the mirror page table,
> there may not be a vCPU available to accept the corresponding 2MB huge leaf
> in the S-EPT if the TD is not configured to receive #VE for page
> acceptance. Consequently, if a vCPU accepts the page at 4KB level, it will
> trigger an EPT violation to split the 2MB huge leaf generated by the
> prefetch fault.
>
> Since handling the BUSY error from SEAMCALLs for huge page splitting is
> more involved in the fault path, which runs with kvm->mmu_lock held for
> read, force the max mapping level of a prefetch fault of private memory to
> 4KB to prevent potential splitting.
>
> Since prefetch faults for private memory are uncommon after the TD's build
> time, enforcing a 4KB mapping level is unlikely to cause any performance
> degradation.
I am wondering what the use cases for KVM_PRE_FAULT_MEMORY are.
Is there an API usage guide telling userspace not to use it to pre-fault a
large amount of memory? If not, and userspace uses it to pre-fault a lot of
memory, this "unlikely to cause any performance degradation" might not be
true.
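
(For reference, KVM_PRE_FAULT_MEMORY is a vCPU ioctl taking a GPA range; a
minimal userspace sketch of pre-faulting a range, with the retry semantics
assumed from Documentation/virt/kvm/api.rst:

	#include <errno.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Sketch: pre-fault [gpa, gpa + size) on a vCPU fd.  On EINTR/
	 * EAGAIN the kernel is assumed to have advanced gpa/size, so
	 * simply retrying continues where it left off. */
	static int pre_fault_range(int vcpu_fd, __u64 gpa, __u64 size)
	{
		struct kvm_pre_fault_memory range = {
			.gpa = gpa,
			.size = size,
		};

		for (;;) {
			if (!ioctl(vcpu_fd, KVM_PRE_FAULT_MEMORY, &range))
				return 0;
			if (errno != EINTR && errno != EAGAIN)
				return -1;
		}
	}

Pre-faulting a large range with a loop like this is exactly the case where
the forced 4KB level would be felt.)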
Re: [RFC PATCH 20/21] KVM: x86: Force a prefetch fault's max mapping level to 4KB for TDX
Posted by Yan Zhao 6 months, 4 weeks ago
On Wed, May 21, 2025 at 11:30:42AM +0800, Binbin Wu wrote:
> 
> 
> On 4/24/2025 11:09 AM, Yan Zhao wrote:
> > Introduce a "prefetch" parameter to the private_max_mapping_level hook and
> > enforce the max mapping level of a prefetch fault for private memory to be
> > 4KB. This is in preparation for ignoring huge page splitting in the fault
> > path.
> > 
> > If a prefetch fault results in a 2MB huge leaf in the mirror page table,
> > there may not be a vCPU available to accept the corresponding 2MB huge leaf
> > in the S-EPT if the TD is not configured to receive #VE for page
> > acceptance. Consequently, if a vCPU accepts the page at 4KB level, it will
> > trigger an EPT violation to split the 2MB huge leaf generated by the
> > prefetch fault.
> > 
> > Since handling the BUSY error from SEAMCALLs for huge page splitting is
> > more involved in the fault path, which runs with kvm->mmu_lock held for
> > read, force the max mapping level of a prefetch fault of private memory to
> > 4KB to prevent potential splitting.
> > 
> > Since prefetch faults for private memory are uncommon after the TD's build
> > time, enforcing a 4KB mapping level is unlikely to cause any performance
> > degradation.
> I am wondering what the use cases for KVM_PRE_FAULT_MEMORY are.
> Is there an API usage guide telling userspace not to use it to pre-fault a
> large amount of memory? If not, and userspace uses it to pre-fault a lot of
> memory, this "unlikely to cause any performance degradation" might not be
> true.
Currently, there are no known users of KVM_PRE_FAULT_MEMORY.
We can enable huge page support for prefetch faults (along with allowing
splitting in the fault path) in the future if performance considerations arise
for future users.
Re: [RFC PATCH 20/21] KVM: x86: Force a prefetch fault's max mapping level to 4KB for TDX
Posted by Edgecombe, Rick P 7 months ago
On Thu, 2025-04-24 at 11:09 +0800, Yan Zhao wrote:
> Introduce a "prefetch" parameter to the private_max_mapping_level hook and
> enforce the max mapping level of a prefetch fault for private memory to be
> 4KB. This is in preparation for ignoring huge page splitting in the fault
> path.
> 
> If a prefetch fault results in a 2MB huge leaf in the mirror page table,
> there may not be a vCPU available to accept the corresponding 2MB huge leaf
> in the S-EPT if the TD is not configured to receive #VE for page
> acceptance. 
> 

Can you elaborate on this case more? A vCPU may not be available? What does
that mean?

> Consequently, if a vCPU accepts the page at 4KB level, it will
> trigger an EPT violation to split the 2MB huge leaf generated by the
> prefetch fault.

The case is that KVM_PRE_FAULT_MEMORY faults in 2MB, then the guest accepts
at 4k (which it is not supposed to do)?

Then maybe the kvm_vm_dead() case I suggested in the other patch could handle
this case too, and this patch could be dropped?

> 
> Since handling the BUSY error from SEAMCALLs for huge page splitting is
> more involved in the fault path, which runs with kvm->mmu_lock held for
> read, force the max mapping level of a prefetch fault of private memory to
> 4KB to prevent potential splitting.

Re: [RFC PATCH 20/21] KVM: x86: Force a prefetch fault's max mapping level to 4KB for TDX
Posted by Yan Zhao 7 months ago
On Wed, May 14, 2025 at 07:20:18AM +0800, Edgecombe, Rick P wrote:
> On Thu, 2025-04-24 at 11:09 +0800, Yan Zhao wrote:
> > Introduce a "prefetch" parameter to the private_max_mapping_level hook and
> > enforce the max mapping level of a prefetch fault for private memory to be
> > 4KB. This is in preparation for ignoring huge page splitting in the fault
> > path.
> > 
> > If a prefetch fault results in a 2MB huge leaf in the mirror page table,
> > there may not be a vCPU available to accept the corresponding 2MB huge leaf
> > in the S-EPT if the TD is not configured to receive #VE for page
> > acceptance. 
> > 
> 
> Can you elaborate on this case more. A vCPU may not be available? What does that
> mean?
Sorry. I didn't express it clearly.

If a prefetch fault results in a 2MB mapping, as the guest is not aware of the
prefetched mapping, it may accept at 4KB later, triggering a demotion.
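
Condensed from this patch's tdx.c hunk, the clamp that avoids the above:
prefetch faults, like faults before the TD is runnable, never install a
huge leaf that a later 4KB accept would have to split (tail of the
function elided):

	int tdx_gmem_private_max_mapping_level(struct kvm_vcpu *vcpu,
					       kvm_pfn_t pfn, gfn_t gfn,
					       bool prefetch)
	{
		if (unlikely(to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE ||
			     prefetch))
			return PG_LEVEL_4K;
		/* ... existing per-GFN violation range handling ... */
	}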

> > Consequently, if a vCPU accepts the page at 4KB level, it will
> > trigger an EPT violation to split the 2MB huge leaf generated by the
> > prefetch fault.
> 
> The case is that KVM_PRE_FAULT_MEMORY faults in 2MB, then the guest accepts
> at 4k (which it is not supposed to do)?
Actually, the guest is not at fault for accepting at 4KB.

> Then maybe the kvm_vm_dead() case I suggested in the other patch could handle
> this case too, and this patch could be dropped?
 
This patch is not required if we decide to support demotion in the fault path.
 
> > Since handling the BUSY error from SEAMCALLs for huge page splitting is
> > more comprehensive in the fault path, which is with kvm->mmu_lock held for
> > reading, force the max mapping level of a prefetch fault of private memory
> > to be 4KB to prevent potential splitting.
>