Don't explicitly pin pages when mapping pages into the S-EPT; guest_memfd
doesn't support page migration in any capacity, i.e. there are no migrate
callbacks because guest_memfd pages *can't* be migrated. See the WARN in
kvm_gmem_migrate_folio().

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kvm/vmx/tdx.c | 28 ++++------------------------
1 file changed, 4 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 1724d82c8512..9fb6e5f02cc9 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1586,29 +1586,22 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
}

-static void tdx_unpin(struct kvm *kvm, struct page *page)
-{
- put_page(page);
-}
-
static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, struct page *page)
+ enum pg_level level, kvm_pfn_t pfn)
{
int tdx_level = pg_level_to_tdx_sept_level(level);
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct page *page = pfn_to_page(pfn);
gpa_t gpa = gfn_to_gpa(gfn);
u64 entry, level_state;
u64 err;

err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
- if (unlikely(tdx_operand_busy(err))) {
- tdx_unpin(kvm, page);
+ if (unlikely(tdx_operand_busy(err)))
return -EBUSY;
- }

if (KVM_BUG_ON(err, kvm)) {
pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
- tdx_unpin(kvm, page);
return -EIO;
}

@@ -1642,29 +1635,18 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
enum pg_level level, kvm_pfn_t pfn)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- struct page *page = pfn_to_page(pfn);

/* TODO: handle large pages. */
if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
return -EINVAL;

- /*
- * Because guest_memfd doesn't support page migration with
- * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
- * migration. Until guest_memfd supports page migration, prevent page
- * migration.
- * TODO: Once guest_memfd introduces callback on page migration,
- * implement it and remove get_page/put_page().
- */
- get_page(page);
-
/*
* Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
* barrier in tdx_td_finalize().
*/
smp_rmb();
if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
- return tdx_mem_page_aug(kvm, gfn, level, page);
+ return tdx_mem_page_aug(kvm, gfn, level, pfn);

return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
}
@@ -1715,7 +1697,6 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
return -EIO;
}
tdx_clear_page(page);
- tdx_unpin(kvm, page);
return 0;
}

@@ -1795,7 +1776,6 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
atomic64_dec(&kvm_tdx->nr_premapped);
- tdx_unpin(kvm, page);
return 0;
}

--
2.51.0.268.g9569e192d0-goog
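
For reference, the WARN cited in the changelog is guest_memfd's migrate_folio
stub; a minimal sketch, assuming it still has the shape it had in
virt/kvm/guest_memfd.c when migration was first rejected (details may differ
by tree):

#ifdef CONFIG_MIGRATION
/*
 * guest_memfd flatly refuses migration; reaching this callback at all
 * is a kernel bug, hence the WARN.
 */
static int kvm_gmem_migrate_folio(struct address_space *mapping,
				  struct folio *dst, struct folio *src,
				  enum migrate_mode mode)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}
#endif

Since no folio can ever be migrated out from under the S-EPT, the
get_page()/put_page() pair removed by this patch never guarded against
anything.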
On Tue, 2025-08-26 at 17:05 -0700, Sean Christopherson wrote:
> Don't explicitly pin pages when mapping pages into the S-EPT; guest_memfd
> doesn't support page migration in any capacity, i.e. there are no migrate
> callbacks because guest_memfd pages *can't* be migrated. See the WARN in
> kvm_gmem_migrate_folio().
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>

Reviewed-by: Kai Huang <kai.huang@intel.com>
Sean Christopherson wrote:
> Don't explicitly pin pages when mapping pages into the S-EPT; guest_memfd
> doesn't support page migration in any capacity, i.e. there are no migrate
> callbacks because guest_memfd pages *can't* be migrated. See the WARN in
> kvm_gmem_migrate_folio().

I like the fact that this removes the poorly named function tdx_unpin() as
well.

That said, concerning gmem's tracking of page references, I have some
questions. The TDX.PAGE.AUG path [via kvm_gmem_get_pfn()] takes a folio
reference, whereas the TDX.PAGE.ADD path [via kvm_gmem_populate()] does not.

Why are these paths different?

For this patch,

Reviewed-by: Ira Weiny <ira.weiny@intel.com>

> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> [snip]
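
For context on the two paths contrasted above, their entry points look
roughly like this; the signatures are recalled from include/linux/kvm_host.h
and may vary across kernel versions:

/*
 * Fault (TDX.PAGE.AUG) path: returns a pfn with a folio reference held;
 * the fault handler is responsible for releasing it once the mapping is
 * installed.
 */
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
		     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
		     int *max_order);

/*
 * Populate (TDX.PAGE.ADD) path: invokes post_populate() on each page
 * while holding the folio internally; no reference escapes to the caller.
 */
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src,
		       long npages, kvm_gmem_populate_cb post_populate,
		       void *opaque);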
On Wed, Aug 27, 2025 at 07:36:46PM -0500, Ira Weiny wrote:
> Sean Christopherson wrote:
> > Don't explicitly pin pages when mapping pages into the S-EPT; guest_memfd
> > doesn't support page migration in any capacity, i.e. there are no migrate
> > callbacks because guest_memfd pages *can't* be migrated. See the WARN in
> > kvm_gmem_migrate_folio().
>
> I like the fact that this removes the poorly named function tdx_unpin() as
> well.
>
> That said, concerning gmem's tracking of page references, I have some
> questions. The TDX.PAGE.AUG path [via kvm_gmem_get_pfn()] takes a folio
> reference, whereas the TDX.PAGE.ADD path [via kvm_gmem_populate()] does not.

kvm_mmu_finish_page_fault() will decrease the folio refcount.

> Why are these paths different?
>
> For this patch,
>
> Reviewed-by: Ira Weiny <ira.weiny@intel.com>

[snip]
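
Sketched out, the balance Yan describes looks like this;
kvm_mmu_finish_page_fault() is quoted from the reply above, while the
surrounding sequence is an assumption about the fault path, not literal code:

/*
 * AUG (fault) path, simplified:
 *
 *   kvm_gmem_get_pfn(...)            // folio refcount +1
 *   tdx_sept_set_private_spte(...)   // installs the S-EPT entry; no
 *                                    //   extra get_page() needed
 *   kvm_mmu_finish_page_fault(...)   // folio refcount -1
 *
 * The ADD path never hands a referenced folio to the TDX code at all,
 * so both paths end balanced without TDX taking its own pin.
 */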
Yan Zhao wrote:
> On Wed, Aug 27, 2025 at 07:36:46PM -0500, Ira Weiny wrote:
> > That said, concerning gmem's tracking of page references, I have some
> > questions. The TDX.PAGE.AUG path [via kvm_gmem_get_pfn()] takes a folio
> > reference, whereas the TDX.PAGE.ADD path [via kvm_gmem_populate()] does
> > not.
>
> kvm_mmu_finish_page_fault() will decrease the folio refcount.

Thanks,
Ira

> > Why are these paths different?

[snip]
On Tue, Aug 26, 2025 at 05:05:15PM -0700, Sean Christopherson wrote:
> Don't explicitly pin pages when mapping pages into the S-EPT; guest_memfd
> doesn't support page migration in any capacity, i.e. there are no migrate
> callbacks because guest_memfd pages *can't* be migrated. See the WARN in
> kvm_gmem_migrate_folio().

Hmm, we implemented exactly the same patch at [1], where we explained the
potential problems of not holding a page refcount, explored various
approaches, and discussed the related considerations.

[1] https://lore.kernel.org/all/20250807094241.4523-1-yan.y.zhao@intel.com/

> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> [snip]
On Wed, 2025-08-27 at 16:33 +0800, Yan Zhao wrote:
> On Tue, Aug 26, 2025 at 05:05:15PM -0700, Sean Christopherson wrote:
> > Don't explicitly pin pages when mapping pages into the S-EPT; guest_memfd
> > doesn't support page migration in any capacity, i.e. there are no migrate
> > callbacks because guest_memfd pages *can't* be migrated. See the WARN in
> > kvm_gmem_migrate_folio().
>
> Hmm, we implemented exactly the same patch at [1], where we explained the
> potential problems of not holding a page refcount, explored various
> approaches, and discussed the related considerations.
>
> [1] https://lore.kernel.org/all/20250807094241.4523-1-yan.y.zhao@intel.com/

Yea, the outcome of the huge page related discussion was that we should look
at some sort of emergency page reclaim feature for the TDX module to use in
the case of bugs, but in the meantime, to move forward without it, use a
solution like the one in this patch.
On Thu, Aug 28, 2025, Rick P Edgecombe wrote:
> On Wed, 2025-08-27 at 16:33 +0800, Yan Zhao wrote:
> > Hmm, we implemented exactly the same patch at [1], where we explained the
> > potential problems of not holding a page refcount, explored various
> > approaches, and discussed the related considerations.
> >
> > [1] https://lore.kernel.org/all/20250807094241.4523-1-yan.y.zhao@intel.com/

Oh, nice! I'll grab that and massage the changelog to break the hard
dependencies on the rest of the hugepage support.

> Yea, the outcome of the huge page related discussion was that we should
> look at some sort of emergency page reclaim feature for the TDX module to
> use in the case of bugs, but in the meantime, to move forward without it,
> use a solution like the one in this patch.