The functions kvm_x86_ops::link_external_spt() and
kvm_x86_ops::set_external_spte() are used to assign new memory to a VM.
When using TDX with Dynamic PAMT enabled, the assigned memory must be
covered by PAMT.

The new function kvm_x86_ops::phys_prepare() is called before
link_external_spt() and set_external_spte() to ensure that the memory is
ready to be assigned to the virtual machine. In the case of TDX, it
makes sure that the memory is covered by PAMT.

kvm_x86_ops::phys_prepare() is called in a context where struct kvm_vcpu
is available, allowing the implementation to allocate memory from a
per-VCPU pool.

The function kvm_x86_ops::phys_cleanup() frees PAMT memory in case of
failure.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
arch/x86/include/asm/kvm-x86-ops.h | 2 ++
arch/x86/include/asm/kvm_host.h | 3 ++
arch/x86/kvm/mmu/tdp_mmu.c | 47 +++++++++++++++++++++++++++---
3 files changed, 48 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 79406bf07a1c..37081d04e82f 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -99,6 +99,8 @@ KVM_X86_OP_OPTIONAL(link_external_spt)
KVM_X86_OP_OPTIONAL(set_external_spte)
KVM_X86_OP_OPTIONAL(free_external_spt)
KVM_X86_OP_OPTIONAL(remove_external_spte)
+KVM_X86_OP_OPTIONAL(phys_prepare)
+KVM_X86_OP_OPTIONAL(phys_cleanup)
KVM_X86_OP(has_wbinvd_exit)
KVM_X86_OP(get_l2_tsc_offset)
KVM_X86_OP(get_l2_tsc_multiplier)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6c06f3d6e081..91958c55f918 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1813,6 +1813,9 @@ struct kvm_x86_ops {
int (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
kvm_pfn_t pfn_for_gfn);
+ int (*phys_prepare)(struct kvm_vcpu *vcpu, kvm_pfn_t pfn);
+ void (*phys_cleanup)(kvm_pfn_t pfn);
+
bool (*has_wbinvd_exit)(void);
u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 405874f4d088..f6c836b2e6fc 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1137,6 +1137,26 @@ void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
}
}
+static int tdp_mmu_install_spte(struct kvm_vcpu *vcpu,
+ struct tdp_iter *iter,
+ u64 spte)
+{
+ kvm_pfn_t pfn = 0;
+ int ret = 0;
+
+ if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(spte)) {
+ pfn = spte_to_pfn(spte);
+ ret = static_call(kvm_x86_phys_prepare)(vcpu, pfn);
+ }
+ if (ret)
+ return ret;
+ ret = tdp_mmu_set_spte_atomic(vcpu->kvm, iter, spte);
+ if (pfn && ret)
+ static_call(kvm_x86_phys_cleanup)(pfn);
+
+ return ret;
+}
+
/*
* Installs a last-level SPTE to handle a TDP page fault.
* (NPT/EPT violation/misconfiguration)
@@ -1170,7 +1190,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
- else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
+ else if (tdp_mmu_install_spte(vcpu, iter, new_spte))
return RET_PF_RETRY;
else if (is_shadow_present_pte(iter->old_spte) &&
(!is_last_spte(iter->old_spte, iter->level) ||
@@ -1211,7 +1231,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
* Returns: 0 if the new page table was installed. Non-0 if the page table
* could not be installed (e.g. the atomic compare-exchange failed).
*/
-static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
+static int __tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_mmu_page *sp, bool shared)
{
u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled);
@@ -1230,6 +1250,25 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
return 0;
}
+static int tdp_mmu_link_sp(struct kvm_vcpu *vcpu, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared)
+{
+ kvm_pfn_t pfn = 0;
+ int ret = 0;
+
+ if (sp->external_spt) {
+ pfn = __pa(sp->external_spt) >> PAGE_SHIFT;
+ ret = static_call(kvm_x86_phys_prepare)(vcpu, pfn);
+ if (ret)
+ return ret;
+ }
+ ret = __tdp_mmu_link_sp(vcpu->kvm, iter, sp, shared);
+ if (pfn && ret)
+ static_call(kvm_x86_phys_cleanup)(pfn);
+
+ return ret;
+}
+
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_mmu_page *sp, bool shared);
@@ -1288,7 +1327,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
} else {
- r = tdp_mmu_link_sp(kvm, &iter, sp, true);
+ r = tdp_mmu_link_sp(vcpu, &iter, sp, true);
}
/*
@@ -1514,7 +1553,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
* correctness standpoint since the translation will be the same either
* way.
*/
- ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
+ ret = __tdp_mmu_link_sp(kvm, iter, sp, shared);
if (ret)
goto out;
--
2.47.2
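
For context, a minimal illustrative sketch of what the TDX side of these hooks
could look like. The tdx_pamt_get()/tdx_pamt_put() helpers are the ones named
later in this thread; wiring them up as phys_prepare()/phys_cleanup() is an
assumption for illustration, not code from the posted series:

	/* Illustrative only: a possible TDX implementation of the new hooks. */
	static int tdx_phys_prepare(struct kvm_vcpu *vcpu, kvm_pfn_t pfn)
	{
		/*
		 * Make sure the page at @pfn is covered by PAMT before it is
		 * handed to the TDX module; the PAMT pages themselves would
		 * come from a per-vCPU pool tied to @vcpu (pool not shown).
		 */
		return tdx_pamt_get(pfn_to_page(pfn));
	}

	static void tdx_phys_cleanup(kvm_pfn_t pfn)
	{
		/* Drop the PAMT reference if installing the SPTE failed. */
		tdx_pamt_put(pfn_to_page(pfn));
	}

These would then be plugged into kvm_x86_ops as .phys_prepare and
.phys_cleanup by the TDX code.
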
>+static int tdp_mmu_install_spte(struct kvm_vcpu *vcpu,
>+ struct tdp_iter *iter,
>+ u64 spte)
>+{
>+ kvm_pfn_t pfn = 0;
>+ int ret = 0;
>+
>+ if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(spte)) {
>+ pfn = spte_to_pfn(spte);
>+ ret = static_call(kvm_x86_phys_prepare)(vcpu, pfn);
nit: KVM uses kvm_x86_call() in most cases, e.g.,

	ret = kvm_x86_call(phys_prepare)(vcpu, pfn);
>+ }
>+ if (ret)
>+ return ret;
fold this chunk into the if() statement above to align with tdp_mmu_link_sp()
below?
I'm concerned about handling phys_prepare() failures. Such failures may not be
recoverable. ...
>+ ret = tdp_mmu_set_spte_atomic(vcpu->kvm, iter, spte);
>+ if (pfn && ret)
>+ static_call(kvm_x86_phys_cleanup)(pfn);
>+
>+ return ret;
>+}
>+
> /*
> * Installs a last-level SPTE to handle a TDP page fault.
> * (NPT/EPT violation/misconfiguration)
>@@ -1170,7 +1190,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
>
> if (new_spte == iter->old_spte)
> ret = RET_PF_SPURIOUS;
>- else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
>+ else if (tdp_mmu_install_spte(vcpu, iter, new_spte))
> return RET_PF_RETRY;
If RET_PF_RETRY is returned here, it could potentially cause an infinite loop.
I think we need a KVM_BUG_ON() somewhere.
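
To make the concern concrete, one illustrative way to fold the check into the
if() and bug the VM on an unrecoverable phys_prepare() failure (instead of
looping on RET_PF_RETRY) could be the sketch below; it reflects the review
suggestions above and is not code from the series:

	static int tdp_mmu_install_spte(struct kvm_vcpu *vcpu,
					struct tdp_iter *iter,
					u64 spte)
	{
		kvm_pfn_t pfn = 0;
		int ret;

		if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(spte)) {
			pfn = spte_to_pfn(spte);
			ret = kvm_x86_call(phys_prepare)(vcpu, pfn);
			/*
			 * A phys_prepare() failure is an allocation failure,
			 * not a lost CMPXCHG race; bug the VM rather than
			 * returning RET_PF_RETRY forever.
			 */
			if (KVM_BUG_ON(ret, vcpu->kvm))
				return ret;
		}

		ret = tdp_mmu_set_spte_atomic(vcpu->kvm, iter, spte);
		if (pfn && ret)
			kvm_x86_call(phys_cleanup)(pfn);

		return ret;
	}
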
On Fri, May 02, 2025 at 04:08:24PM +0300, Kirill A. Shutemov wrote:
> The functions kvm_x86_ops::link_external_spt() and
> kvm_x86_ops::set_external_spte() are used to assign new memory to a VM.
> When using TDX with Dynamic PAMT enabled, the assigned memory must be
> covered by PAMT.
>
> The new function kvm_x86_ops::phys_prepare() is called before
> link_external_spt() and set_external_spte() to ensure that the memory is
> ready to be assigned to the virtual machine. In the case of TDX, it
> makes sure that the memory is covered by PAMT.
>
> kvm_x86_ops::phys_prepare() is called in a context where struct kvm_vcpu
> is available, allowing the implementation to allocate memory from a
> per-VCPU pool.
>
Why not invoke phys_prepare() and phys_cleanup() in set_external_spte_present()?
Or in tdx_sept_set_private_spte()/tdx_sept_link_private_spt()?
> The function kvm_x86_ops::phys_cleanup() frees PAMT memory in case of
> failure.
>
On Tue, May 06, 2025 at 07:55:17PM +0800, Yan Zhao wrote:
> Why not invoke phys_prepare() and phys_cleanup() in set_external_spte_present()?
> Or in tdx_sept_set_private_spte()/tdx_sept_link_private_spt()?

Because the memory pool we allocated from is per-vcpu and we lost access
to vcpu by then. And not all callers provide vcpu.

--
Kiryl Shutsemau / Kirill A. Shutemov
On Thu, May 08, 2025 at 04:23:56PM +0300, Kirill A. Shutemov wrote:
> Because the memory pool we allocated from is per-vcpu and we lost access
> to vcpu by then. And not all callers provide vcpu.

Maybe we can get vcpu via kvm_get_running_vcpu(), as in [1].
Then for callers not providing vcpu (where vcpu is NULL), we can use a
per-KVM cache?

[1] https://lore.kernel.org/all/20250424030926.554-1-yan.y.zhao@intel.com/
On Fri, May 09, 2025 at 09:25:58AM +0800, Yan Zhao wrote:
> Maybe we can get vcpu via kvm_get_running_vcpu(), as in [1].
> Then for callers not providing vcpu (where vcpu is NULL), we can use a
> per-KVM cache?

Hm. I was not aware of kvm_get_running_vcpu(). Will play with it, thanks.

--
Kiryl Shutsemau / Kirill A. Shutemov
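
For illustration, the direction being discussed could look roughly like the
sketch below. kvm_get_running_vcpu() is an existing helper that returns the
vCPU currently loaded on this CPU (or NULL); the cache field names and the
tdx_pamt_get_from_cache() helper are hypothetical placeholders, not real
kernel APIs:

	/*
	 * Illustrative sketch only: recover the vcpu inside TDX code instead
	 * of plumbing it through a new phys_prepare() hook.
	 */
	static int tdx_pamt_prepare(struct kvm *kvm, struct page *page)
	{
		struct kvm_vcpu *vcpu = kvm_get_running_vcpu();

		if (vcpu)
			/* Fault path: take PAMT pages from the per-vCPU pool. */
			return tdx_pamt_get_from_cache(page,
						       &vcpu->arch.pamt_page_cache);

		/* No running vcpu (e.g. a VM-scoped ioctl): use a per-VM pool. */
		return tdx_pamt_get_from_cache(page,
					       &to_kvm_tdx(kvm)->pamt_page_cache);
	}
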
On Mon, 2025-05-12 at 12:55 +0300, Kirill A. Shutemov wrote:
> On Fri, May 09, 2025 at 09:25:58AM +0800, Yan Zhao wrote:
> > On Thu, May 08, 2025 at 04:23:56PM +0300, Kirill A. Shutemov wrote:
> > > On Tue, May 06, 2025 at 07:55:17PM +0800, Yan Zhao wrote:
> > > > On Fri, May 02, 2025 at 04:08:24PM +0300, Kirill A. Shutemov wrote:
> > > > > The functions kvm_x86_ops::link_external_spt() and
> > > > > kvm_x86_ops::set_external_spte() are used to assign new memory to a VM.
> > > > > When using TDX with Dynamic PAMT enabled, the assigned memory must be
> > > > > covered by PAMT.
> > > > >
> > > > > The new function kvm_x86_ops::phys_prepare() is called before
> > > > > link_external_spt() and set_external_spte() to ensure that the memory is
> > > > > ready to be assigned to the virtual machine. In the case of TDX, it
> > > > > makes sure that the memory is covered by PAMT.
> > > > >
> > > > > kvm_x86_ops::phys_prepare() is called in a context where struct kvm_vcpu
> > > > > is available, allowing the implementation to allocate memory from a
> > > > > per-VCPU pool.
> > > > >
> > > > Why not invoke phys_prepare() and phys_cleanup() in set_external_spte_present()?
> > > > Or in tdx_sept_set_private_spte()/tdx_sept_link_private_spt()?
> > >
> > > Because the memory pool we allocated from is per-vcpu and we lost access
> > > to vcpu by then. And not all callers provide vcpu.
> > Maybe we can get vcpu via kvm_get_running_vcpu(), as in [1].
> > Then for callers not providing vcpu (where vcpu is NULL), we can use per-KVM
> > cache?
>
> Hm. I was not aware of kvm_get_running_vcpu(). Will play with it, thanks.
I am not sure why per-vcpu cache matters.
For non-leaf SEPT pages, AFAICT the "vcpu->arch.mmu_external_spt_cache" is just
an empty cache, and eventually __get_free_page() is used to allocate in:
sp->external_spt =
kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_external_spt_cache);
So why not we actually create a kmem_cache for it with an actual 'ctor', and we
can call tdx_alloc_page() in that. This makes sure when the "external_spt" is
allocated, the underneath PAMT entry is there.
For the last level guest memory page, similar to SEV, we can hook the
kvm_arch_gmem_prepare() to call tdx_alloc_page() to make PAMT entry ready.
On Wed, May 14, 2025 at 12:00:17AM +0000, Huang, Kai wrote:
> So why not we actually create a kmem_cache for it with an actual 'ctor', and we
> can call tdx_alloc_page() in that. This makes sure when the "external_spt" is
> allocated, the underneath PAMT entry is there.

I looked closer at this and, while it is a good idea, the ctor in a kmem_cache
cannot fail, which makes this approach not viable.

I guess we can add a constructor directly into struct kvm_mmu_memory_cache.
Let me play with this.

--
Kiryl Shutsemau / Kirill A. Shutemov
On Fri, May 23, 2025 at 03:00:56PM +0300, kirill.shutemov@linux.intel.com wrote:
> I guess we can add a constructor directly into struct kvm_mmu_memory_cache.
> Let me play with this.

I failed to make it work.

We would need a destructor paired with the constructor to do PAMT-aware
freeing, and all free paths would have to be redirected to it. It requires
substantial rework. I don't think it is worth the effort.

Will do manual PAMT management for SPT in TDX code.

--
Kiryl Shutsemau / Kirill A. Shutemov
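
For illustration, "manual PAMT management for SPT in TDX code" could look
roughly like the following sketch; tdx_sept_link_private_spt() and
tdx_pamt_get()/tdx_pamt_put() are names that appear in this thread, while the
body and the tdx_sept_link() helper are assumptions, not the actual
implementation (see the git link further below):

	/*
	 * Illustrative sketch only: take/drop the PAMT reference for the
	 * S-EPT page table page directly in the TDX hook, instead of via a
	 * generic phys_prepare() callback.
	 */
	static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
					     enum pg_level level, void *private_spt)
	{
		struct page *page = virt_to_page(private_spt);
		int ret;

		/* Ensure the S-EPT page is covered by PAMT before the SEAMCALL. */
		ret = tdx_pamt_get(page);
		if (ret)
			return ret;

		/* tdx_sept_link() stands in for the existing SEAMCALL path. */
		ret = tdx_sept_link(kvm, gfn, level, private_spt);
		if (ret)
			tdx_pamt_put(page);

		return ret;
	}
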
On Thu, 2025-06-05 at 16:01 +0300, kirill.shutemov@linux.intel.com wrote:
> On Fri, May 23, 2025 at 03:00:56PM +0300, kirill.shutemov@linux.intel.com wrote:
> > On Wed, May 14, 2025 at 12:00:17AM +0000, Huang, Kai wrote:
> > > On Mon, 2025-05-12 at 12:55 +0300, Kirill A. Shutemov wrote:
> > > > On Fri, May 09, 2025 at 09:25:58AM +0800, Yan Zhao wrote:
> > > > > On Thu, May 08, 2025 at 04:23:56PM +0300, Kirill A. Shutemov wrote:
> > > > > > On Tue, May 06, 2025 at 07:55:17PM +0800, Yan Zhao wrote:
> > > > > > > On Fri, May 02, 2025 at 04:08:24PM +0300, Kirill A. Shutemov wrote:
> > > > > > > > The functions kvm_x86_ops::link_external_spt() and
> > > > > > > > kvm_x86_ops::set_external_spte() are used to assign new memory to a VM.
> > > > > > > > When using TDX with Dynamic PAMT enabled, the assigned memory must be
> > > > > > > > covered by PAMT.
> > > > > > > >
> > > > > > > > The new function kvm_x86_ops::phys_prepare() is called before
> > > > > > > > link_external_spt() and set_external_spte() to ensure that the memory is
> > > > > > > > ready to be assigned to the virtual machine. In the case of TDX, it
> > > > > > > > makes sure that the memory is covered by PAMT.
> > > > > > > >
> > > > > > > > kvm_x86_ops::phys_prepare() is called in a context where struct kvm_vcpu
> > > > > > > > is available, allowing the implementation to allocate memory from a
> > > > > > > > per-VCPU pool.
> > > > > > > >
> > > > > > > Why not invoke phys_prepare() and phys_cleanup() in set_external_spte_present()?
> > > > > > > Or in tdx_sept_set_private_spte()/tdx_sept_link_private_spt()?
> > > > > >
> > > > > > Because the memory pool we allocated from is per-vcpu and we lost access
> > > > > > to vcpu by then. And not all callers provide vcpu.
> > > > > Maybe we can get vcpu via kvm_get_running_vcpu(), as in [1].
> > > > > Then for callers not providing vcpu (where vcpu is NULL), we can use per-KVM
> > > > > cache?
> > > >
> > > > Hm. I was not aware of kvm_get_running_vcpu(). Will play with it, thanks.
> > >
> > > I am not sure why per-vcpu cache matters.
> > >
> > > For non-leaf SEPT pages, AFAICT the "vcpu->arch.mmu_external_spt_cache" is just
> > > an empty cache, and eventually __get_free_page() is used to allocate in:
> > >
> > > sp->external_spt =
> > > kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_external_spt_cache);
> > >
> > > So why not we actually create a kmem_cache for it with an actual 'ctor', and we
> > > can call tdx_alloc_page() in that. This makes sure when the "external_spt" is
> > > allocated, the underneath PAMT entry is there.
> >
> > I looked closer to this and while it is good idea, but ctor in kmem_cache
> > cannot fail which makes this approach not viable.
> >
> > I guess we can a constructor directly into struct kvm_mmu_memory_cache.
> > Let me play with this.
>
> I failed to make it work.
>
> We need to have destructor paired with the constructor that would do
> PAMT-aware freeing. And redirect all free paths to it. It requires
> substantial rework. I don't think it worth the effort.
>
> Will do manual PAMT management for SPT in TDX code.
Thanks for the effort.
Maybe something below?
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index db8f33e4de62..48732270bff0 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -164,8 +164,10 @@ static inline bool is_mirror_sp(const struct kvm_mmu_page *sp)
 	return sp->role.is_mirror;
 }
 
-static inline void kvm_mmu_alloc_external_spt(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static inline int kvm_mmu_alloc_external_spt(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
+	int r;
+
 	/*
 	 * external_spt is allocated for TDX module to hold private EPT mappings,
 	 * TDX module will initialize the page by itself.
@@ -173,6 +175,12 @@ static inline void kvm_mmu_alloc_external_spt(struct kvm_vcpu *vcpu, struct kvm_
 	 * KVM only interacts with sp->spt for private EPT operations.
 	 */
 	sp->external_spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_external_spt_cache);
+
+	r = tdx_pamt_get(virt_to_page(sp->external_spt));
+	if (r)
+		free_page((unsigned long)sp->external_spt);
+
+	return r;
 }
 
 static inline gfn_t kvm_gfn_root_bits(const struct kvm *kvm, const struct kvm_mmu_page *root)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7f3d7229b2c1..2d3a716d9195 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -55,7 +55,10 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 
 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
 {
-	free_page((unsigned long)sp->external_spt);
+	if (sp->external_spt) {
+		free_page((unsigned long)sp->external_spt);
+		tdx_pamt_put(virt_to_page(sp->external_spt));
+	}
 	free_page((unsigned long)sp->spt);
 	kmem_cache_free(mmu_page_header_cache, sp);
 }
@@ -1277,8 +1280,13 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	 */
 	sp = tdp_mmu_alloc_sp(vcpu);
 	tdp_mmu_init_child_sp(sp, &iter);
-	if (is_mirror_sp(sp))
-		kvm_mmu_alloc_external_spt(vcpu, sp);
+	if (is_mirror_sp(sp)) {
+		r = kvm_mmu_alloc_external_spt(vcpu, sp);
+		if (r) {
+			tdp_mmu_free_sp(sp);
+			goto retry;
+		}
+	}
 
sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
On Thu, Jun 05, 2025 at 10:21:46PM +0000, Huang, Kai wrote:
> Thanks for the effort.
>
> Maybe something below?

With help of kvm_get_running_vcpu(), I localized these manipulations to the
internals of TDX code. No need to leak this to TDP. phys_prepare()/phys_cleanup()
is gone now:
https://git.kernel.org/pub/scm/linux/kernel/git/kas/linux.git/commit/?h=tdx/dpamt-huge&id=72394699b5454aac6c027accab6d94a52d88819b

--
Kiryl Shutsemau / Kirill A. Shutemov
On Wed, May 14, 2025 at 12:00:17AM +0000, Huang, Kai wrote:
> So why not we actually create a kmem_cache for it with an actual 'ctor', and we
> can call tdx_alloc_page() in that. This makes sure when the "external_spt" is
> allocated, the underneath PAMT entry is there.

This would make it hard to debug PAMT memory leaks. external_spt pages in the
pool would have PAMT memory tied to them, so we would see non-zero PAMT memory
usage with zero TDs running.

> For the last level guest memory page, similar to SEV, we can hook the
> kvm_arch_gmem_prepare() to call tdx_alloc_page() to make PAMT entry ready.

I don't think kvm_arch_gmem_prepare() is the right place to allocate PAMT
memory. THPs are dynamic and the page order can change due to split or collapse
between the time the page is allocated and when it gets mapped into the EPT.
I am not sure if the SEV code is correct in this regard.

--
Kiryl Shutsemau / Kirill A. Shutemov
On Wed, 2025-05-14 at 09:43 +0300, kirill.shutemov@linux.intel.com wrote:
> This would make it hard to debug PAMT memory leaks. external_spt pages in the
> pool would have PAMT memory tied to them, so we would see non-zero PAMT memory
> usage with zero TDs running.

Why is that? AFAICT all 'external_spt' pages are freed when the TD is gone.

> I don't think kvm_arch_gmem_prepare() is the right place to allocate PAMT
> memory. THPs are dynamic and the page order can change due to split or collapse
> between the time the page is allocated and when it gets mapped into the EPT.
> I am not sure if the SEV code is correct in this regard.

Yeah, agreed. Not sure how SEV-SNP handles large page split/merge either.