[RFC PATCH 15/21] KVM: TDX: Support huge page splitting with exclusive kvm->mmu_lock

Yan Zhao posted 21 patches 7 months, 3 weeks ago
Only 20 patches received!
There is a newer version of this series
[RFC PATCH 15/21] KVM: TDX: Support huge page splitting with exclusive kvm->mmu_lock
Posted by Yan Zhao 7 months, 3 weeks ago
From: Xiaoyao Li <xiaoyao.li@intel.com>

Implement the split_external_spt hook to support huge page splitting for
TDX when kvm->mmu_lock is held for writing.

Invoke tdh_mem_range_block(), tdh_mem_track(), a kick-off of vCPUs, and
tdh_mem_page_demote() in sequence. Since kvm->mmu_lock is held for writing,
simply kick off vCPUs on tdx_operand_busy() to ensure the second SEAMCALL
invocation succeeds.

TDX module may return TDX_INTERRUPTED_RESTARTABLE when there is a pending
interrupt on the host side during tdh_mem_page_demote(). Retry indefinitely
on this error, as with exclusive kvm->mmu_lock the pending interrupt is for
host only.

[Yan: Split patch for exclusive mmu_lock only, handled busy error]

Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
 arch/x86/kvm/vmx/main.c      |  1 +
 arch/x86/kvm/vmx/tdx.c       | 45 ++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/tdx_errno.h |  1 +
 arch/x86/kvm/vmx/x86_ops.h   |  9 ++++++++
 4 files changed, 56 insertions(+)

diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index ae8540576821..16c0c31dd066 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -62,6 +62,7 @@ static __init int vt_hardware_setup(void)
 		vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
 		vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
 		vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
+		vt_x86_ops.split_external_spt = tdx_sept_split_private_spt;
 		vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
 	}
 
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index dd63a634e633..4386e1a0323e 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1806,6 +1806,51 @@ int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
 	return tdx_reclaim_page(virt_to_page(private_spt), PG_LEVEL_4K);
 }
 
+static int tdx_spte_demote_private_spte(struct kvm *kvm, gfn_t gfn,
+					enum pg_level level, struct page *page)
+{
+	int tdx_level = pg_level_to_tdx_sept_level(level);
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	gpa_t gpa = gfn_to_gpa(gfn);
+	u64 err, entry, level_state;
+
+	do {
+		err = tdh_mem_page_demote(&kvm_tdx->td, gpa, tdx_level, page,
+					  &entry, &level_state);
+	} while (err == TDX_INTERRUPTED_RESTARTABLE);
+
+	if (unlikely(tdx_operand_busy(err))) {
+		tdx_no_vcpus_enter_start(kvm);
+		err = tdh_mem_page_demote(&kvm_tdx->td, gpa, tdx_level, page,
+					  &entry, &level_state);
+		tdx_no_vcpus_enter_stop(kvm);
+	}
+
+	if (KVM_BUG_ON(err, kvm)) {
+		pr_tdx_error_2(TDH_MEM_PAGE_DEMOTE, err, entry, level_state);
+		return -EIO;
+	}
+	return 0;
+}
+
+int tdx_sept_split_private_spt(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+			       void *private_spt)
+{
+	struct page *page = virt_to_page(private_spt);
+	int ret;
+
+	if (KVM_BUG_ON(to_kvm_tdx(kvm)->state != TD_STATE_RUNNABLE || level != PG_LEVEL_2M, kvm))
+		return -EINVAL;
+
+	ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
+	if (ret <= 0)
+		return ret;
+
+	tdx_track(kvm);
+
+	return tdx_spte_demote_private_spte(kvm, gfn, level, page);
+}
+
 int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
 				 enum pg_level level, kvm_pfn_t pfn)
 {
diff --git a/arch/x86/kvm/vmx/tdx_errno.h b/arch/x86/kvm/vmx/tdx_errno.h
index 6ff4672c4181..33589e7fa1e1 100644
--- a/arch/x86/kvm/vmx/tdx_errno.h
+++ b/arch/x86/kvm/vmx/tdx_errno.h
@@ -14,6 +14,7 @@
 #define TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE	0x6000000500000000ULL
 #define TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE	0x6000000700000000ULL
 #define TDX_INTERRUPTED_RESUMABLE		0x8000000300000000ULL
+#define TDX_INTERRUPTED_RESTARTABLE		0x8000000400000000ULL
 #define TDX_OPERAND_INVALID			0xC000010000000000ULL
 #define TDX_OPERAND_BUSY			0x8000020000000000ULL
 #define TDX_PREVIOUS_TLB_EPOCH_BUSY		0x8000020100000000ULL
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 7c183da7c4d4..df7d4cd1436c 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -158,6 +158,8 @@ int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
 			      enum pg_level level, kvm_pfn_t pfn);
 int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
 				 enum pg_level level, kvm_pfn_t pfn);
+int tdx_sept_split_private_spt(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+			       void *private_spt);
 
 void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
 void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
@@ -224,6 +226,13 @@ static inline int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
 	return -EOPNOTSUPP;
 }
 
+static inline int tdx_sept_split_private_spt(struct kvm *kvm, gfn_t gfn,
+					     enum pg_level level,
+					     void *private_spt)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) {}
 static inline void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) {}
 static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {}
-- 
2.43.2
Re: [RFC PATCH 15/21] KVM: TDX: Support huge page splitting with exclusive kvm->mmu_lock
Posted by Edgecombe, Rick P 5 months, 2 weeks ago
On Thu, 2025-04-24 at 11:08 +0800, Yan Zhao wrote:
> +static int tdx_spte_demote_private_spte(struct kvm *kvm, gfn_t gfn,
> +					enum pg_level level, struct page *page)
> +{
> +	int tdx_level = pg_level_to_tdx_sept_level(level);
> +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +	gpa_t gpa = gfn_to_gpa(gfn);
> +	u64 err, entry, level_state;
> +
> +	do {
> +		err = tdh_mem_page_demote(&kvm_tdx->td, gpa, tdx_level, page,
> +					  &entry, &level_state);
> +	} while (err == TDX_INTERRUPTED_RESTARTABLE);
> +
> +	if (unlikely(tdx_operand_busy(err))) {
> +		tdx_no_vcpus_enter_start(kvm);
> +		err = tdh_mem_page_demote(&kvm_tdx->td, gpa, tdx_level, page,
> +					  &entry, &level_state);
> +		tdx_no_vcpus_enter_stop(kvm);
> +	}
> +
> +	if (KVM_BUG_ON(err, kvm)) {
> +		pr_tdx_error_2(TDH_MEM_PAGE_DEMOTE, err, entry, level_state);
> +		return -EIO;
> +	}
> +	return 0;
> +}
> +
> +int tdx_sept_split_private_spt(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> +			       void *private_spt)
> +{
> +	struct page *page = virt_to_page(private_spt);
> +	int ret;
> +
> +	if (KVM_BUG_ON(to_kvm_tdx(kvm)->state != TD_STATE_RUNNABLE || level != PG_LEVEL_2M, kvm))
> +		return -EINVAL;
> +
> +	ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
> +	if (ret <= 0)
> +		return ret;
> +
> +	tdx_track(kvm);
> +
> +	return tdx_spte_demote_private_spte(kvm, gfn, level, page);
> +}

The latest TDX docs talk about a feature called NON_BLOCKING_RESIZE. It allows
for demote without blocking. If we rely on this feature we could simplify this
code. Not having transitory blocked state would reduce the scenarios that have
to be accounted for. We could also make demote operation accommodate failures
(rollback on SEAMCALL BUSY issue), which means mmu write lock is no longer
needed. It would have helped the fault path demote issue, which we have now
worked around. But still, it seems more flexible as well as simpler.

What about relying on this feature for KVM TDX huge mappings?
Re: [RFC PATCH 15/21] KVM: TDX: Support huge page splitting with exclusive kvm->mmu_lock
Posted by Binbin Wu 6 months, 4 weeks ago

On 4/24/2025 11:08 AM, Yan Zhao wrote:
[...]
> +
> +int tdx_sept_split_private_spt(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> +			       void *private_spt)
> +{
> +	struct page *page = virt_to_page(private_spt);
> +	int ret;
> +
> +	if (KVM_BUG_ON(to_kvm_tdx(kvm)->state != TD_STATE_RUNNABLE || level != PG_LEVEL_2M, kvm))
> +		return -EINVAL;
> +
> +	ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
> +	if (ret <= 0)
> +		return ret;
> +
> +	tdx_track(kvm);

It may be worth a helper for the zap and track code.
It's the same code as in tdx_sept_remove_private_spte().
So that they can share the code, including the bug check for HKID and the
comments.


> +
> +	return tdx_spte_demote_private_spte(kvm, gfn, level, page);
> +}
> +
>
[...]
Re: [RFC PATCH 15/21] KVM: TDX: Support huge page splitting with exclusive kvm->mmu_lock
Posted by Yan Zhao 6 months, 4 weeks ago
On Tue, May 20, 2025 at 02:18:12PM +0800, Binbin Wu wrote:
> 
> 
> On 4/24/2025 11:08 AM, Yan Zhao wrote:
> [...]
> > +
> > +int tdx_sept_split_private_spt(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> > +			       void *private_spt)
> > +{
> > +	struct page *page = virt_to_page(private_spt);
> > +	int ret;
> > +
> > +	if (KVM_BUG_ON(to_kvm_tdx(kvm)->state != TD_STATE_RUNNABLE || level != PG_LEVEL_2M, kvm))
> > +		return -EINVAL;
> > +
> > +	ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
> > +	if (ret <= 0)
> > +		return ret;
> > +
> > +	tdx_track(kvm);
> 
> It may worth a helper for the zap and track code.
> It's the some code as what in tdx_sept_remove_private_spte().
> So that they can share the code, including the bug check for HKID and the
> comments.
Not sure if it's worthwhile.
But I'm open to it if others also agree.

> 
> > +
> > +	return tdx_spte_demote_private_spte(kvm, gfn, level, page);
> > +}
> > +
> > 
> [...]