From: Yan Zhao <yan.y.zhao@intel.com>
Don't explicitly pin pages when mapping them into the S-EPT; guest_memfd
doesn't support page migration in any capacity, i.e. there are no migrate
callbacks because guest_memfd pages *can't* be migrated. See the WARN in
kvm_gmem_migrate_folio().
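
For reference, the guest_memfd callback in question is essentially the
following (paraphrased from virt/kvm/guest_memfd.c; the exact context may
differ across kernel versions):

static int kvm_gmem_migrate_folio(struct address_space *mapping,
				  struct folio *dst, struct folio *src,
				  enum migrate_mode mode)
{
	/* guest_memfd folios must never be migrated. */
	WARN_ON_ONCE(1);
	return -EINVAL;
}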
Eliminating TDX's explicit pinning will also enable guest_memfd to support
in-place conversion between shared and private memory[1][2]. Because KVM
cannot distinguish between speculative/transient refcounts and an
intentional refcount taken by TDX on private pages[3], failing to release
a private page's refcount in TDX could cause guest_memfd to wait
indefinitely for the refcount to drop before splitting the folio.
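
To illustrate the problem, a hypothetical sketch (not guest_memfd's actual
splitting code; gmem_wait_for_split() and expected_refs are invented for
illustration): any split path that must wait for transient references to
drain can never make progress if TDX holds a long-term reference on every
mapped private page.

/* Hypothetical: wait for transient refs to drain before splitting. */
static int gmem_wait_for_split(struct folio *folio, int expected_refs)
{
	while (folio_ref_count(folio) > expected_refs) {
		if (fatal_signal_pending(current))
			return -EINTR;
		cond_resched();	/* speculative/transient refs will drop... */
	}
	return 0;		/* ...but a per-page TDX pin never would */
}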
Under normal conditions, not holding an extra page refcount in TDX is safe
because guest_memfd ensures pages are retained until its invalidation
notification to the KVM MMU has completed. However, if there are bugs in
KVM or the TDX module, not holding an extra refcount when a page is mapped
in the S-EPT could result in a page being released from guest_memfd while
still mapped in the S-EPT. But doing work to make a fatal error slightly
less fatal is a net negative when that extra work adds complexity and
confusion.
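
Sketching the ordering that makes this safe absent bugs (call names taken
from virt/kvm/guest_memfd.c's punch-hole path, abbreviated; the exact chain
varies by kernel version):

/*
 * guest_memfd truncate / punch-hole:
 *   kvm_gmem_invalidate_begin()
 *     -> notifies the KVM MMU, which zaps the S-EPT mappings, i.e.
 *        reaches TDX's S-EPT removal hook for private pages
 *   truncate_inode_pages_range()
 *     -> folios are freed only after the zap above has completed
 *   kvm_gmem_invalidate_end()
 */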
Several approaches were considered to address the refcount issue:
 - Modifying the KVM unmap operation to return a failure, which was deemed
   too complex and potentially incorrect[4].
 - Increasing the folio reference count only upon S-EPT zapping failure[5].
 - Using page flags or page_ext to indicate a page is still in use by
   TDX[6], which does not work with HVO (HugeTLB Vmemmap Optimization).
 - Setting the HWPOISON bit or leveraging folio_set_hugetlb_hwpoison()[7].
Due to the complexity or inappropriateness of those approaches, and because
S-EPT zapping can currently fail only when there are bugs in KVM or the TDX
module, which should be exceedingly rare in a production kernel, the
straightforward approach of simply not holding a page reference count in
TDX was chosen[8].
When an S-EPT zapping error occurs, KVM_BUG_ON() is invoked to kick all
vCPUs out of the guest and mark the VM as dead. Although there is a
potential window during which a private page still mapped in the S-EPT
could be reallocated and used outside the VM, the loud warning from
KVM_BUG_ON() should provide sufficient debug information. To be robust
against bugs, the user can enable panic_on_warn as usual.
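
For context, KVM_BUG_ON() is roughly the following (paraphrased from
include/linux/kvm_host.h; details vary by kernel version). The WARN is what
panic_on_warn keys off of, and kvm_vm_bugged() marks the VM dead and sends
KVM_REQ_VM_DEAD to force every vCPU out of the guest:

#define KVM_BUG_ON(cond, kvm)					\
({								\
	bool __ret = !!(cond);					\
								\
	if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged))		\
		kvm_vm_bugged(kvm);				\
	__ret;							\
})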
Link: https://lore.kernel.org/all/cover.1747264138.git.ackerleytng@google.com [1]
Link: https://youtu.be/UnBKahkAon4 [2]
Link: https://lore.kernel.org/all/CAGtprH_ypohFy9TOJ8Emm_roT4XbQUtLKZNFcM6Fr+fhTFkE0Q@mail.gmail.com [3]
Link: https://lore.kernel.org/all/aEEEJbTzlncbRaRA@yzhao56-desk.sh.intel.com [4]
Link: https://lore.kernel.org/all/aE%2Fq9VKkmaCcuwpU@yzhao56-desk.sh.intel.com [5]
Link: https://lore.kernel.org/all/aFkeBtuNBN1RrDAJ@yzhao56-desk.sh.intel.com [6]
Link: https://lore.kernel.org/all/diqzy0tikran.fsf@ackerleytng-ctop.c.googlers.com [7]
Link: https://lore.kernel.org/all/53ea5239f8ef9d8df9af593647243c10435fd219.camel@intel.com [8]
Suggested-by: Vishal Annapurve <vannapurve@google.com>
Suggested-by: Ackerley Tng <ackerleytng@google.com>
Suggested-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Kai Huang <kai.huang@intel.com>
[sean: extract out of hugepage series, massage changelog accordingly]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kvm/vmx/tdx.c | 28 ++++------------------------
1 file changed, 4 insertions(+), 24 deletions(-)
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index c83e1ff02827..f24f8635b433 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1586,29 +1586,22 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
}
-static void tdx_unpin(struct kvm *kvm, struct page *page)
-{
- put_page(page);
-}
-
static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, struct page *page)
+ enum pg_level level, kvm_pfn_t pfn)
{
int tdx_level = pg_level_to_tdx_sept_level(level);
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct page *page = pfn_to_page(pfn);
gpa_t gpa = gfn_to_gpa(gfn);
u64 entry, level_state;
u64 err;
err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
- if (unlikely(tdx_operand_busy(err))) {
- tdx_unpin(kvm, page);
+ if (unlikely(tdx_operand_busy(err)))
return -EBUSY;
- }
if (KVM_BUG_ON(err, kvm)) {
pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
- tdx_unpin(kvm, page);
return -EIO;
}
@@ -1642,29 +1635,18 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
enum pg_level level, kvm_pfn_t pfn)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- struct page *page = pfn_to_page(pfn);
/* TODO: handle large pages. */
if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
return -EINVAL;
- /*
- * Because guest_memfd doesn't support page migration with
- * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
- * migration. Until guest_memfd supports page migration, prevent page
- * migration.
- * TODO: Once guest_memfd introduces callback on page migration,
- * implement it and remove get_page/put_page().
- */
- get_page(page);
-
/*
* Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
* barrier in tdx_td_finalize().
*/
smp_rmb();
if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
- return tdx_mem_page_aug(kvm, gfn, level, page);
+ return tdx_mem_page_aug(kvm, gfn, level, pfn);
return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
}
@@ -1715,7 +1697,6 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
return -EIO;
}
tdx_clear_page(page);
- tdx_unpin(kvm, page);
return 0;
}
@@ -1795,7 +1776,6 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
atomic64_dec(&kvm_tdx->nr_premapped);
- tdx_unpin(kvm, page);
return 0;
}
--
2.51.0.318.gd7df087d1a-goog
On Thu, 2025-08-28 at 17:06 -0700, Sean Christopherson wrote:
> From: Yan Zhao <yan.y.zhao@intel.com>
>
> [...]
>
> When S-EPT zapping errors occur, KVM_BUG_ON() is invoked to kick off all
> vCPUs and mark the VM as dead. Although there is a potential window that a
> private page mapped in the S-EPT could be reallocated and used outside the
> VM, the loud warning from KVM_BUG_ON() should provide sufficient debug
> information.

Yea, in the case of a bug, there could be a use-after-free. This logic
applies to all code that has allocations, including the entire KVM MMU. But
in this case we can actually catch the use-after-free scenario under
scrutiny and not have it happen silently, which does not apply to all code.
The special case here, though, is that the use-after-free depends on TDX
module logic, which is not part of the kernel.

Yan, can you clarify what you mean by "there could be a small window"? I'm
thinking this is a hypothetical window around vm_dead races? Or more
concrete? I *don't* want to re-open the debate on whether to go with this
approach, but I think this is a good teaching edge case to settle on how we
want to treat similar issues. So I just want to make sure we have the
justification right.

> To be robust against bugs, the user can enable panic_on_warn as normal.
>
> [...]

Discussion aside,

Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
On Sat, Aug 30, 2025 at 03:53:24AM +0800, Edgecombe, Rick P wrote:
> On Thu, 2025-08-28 at 17:06 -0700, Sean Christopherson wrote:
> > From: Yan Zhao <yan.y.zhao@intel.com>
> > When S-EPT zapping errors occur, KVM_BUG_ON() is invoked to kick off all
> > vCPUs and mark the VM as dead. Although there is a potential window that a
> > private page mapped in the S-EPT could be reallocated and used outside the
> > VM, the loud warning from KVM_BUG_ON() should provide sufficient debug
> > information.
...
> Yan, can you clarify what you mean by "there could be a small window"? I'm
> thinking this is a hypothetical window around vm_dead races? Or more
> concrete? I *don't* want to re-open the debate on whether to go with this
> approach, but I think this is a good teaching edge case to settle on how we
> want to treat similar issues. So I just want to make sure we have the
> justification right.

I think this window isn't hypothetical.

1. SEAMCALL failure in tdx_sept_remove_private_spte().
   KVM_BUG_ON() sets vm_dead and kicks off all vCPUs.
2. guest_memfd invalidation completes. Memory is freed.
3. VM gets killed.

After 2, the page is still mapped in the S-EPT, but it could potentially be
reallocated and used outside the VM.

From the TDX module and hardware's perspective, the mapping in the S-EPT
for this page remains valid. So, I'm uncertain if the TDX module might do
something creative to access the guest page after 2.

Besides, a cache flush after 2 can essentially cause a memory write to the
page. Though we could invoke tdh_phymem_page_wbinvd_hkid() after the
KVM_BUG_ON(), the SEAMCALL itself can fail.
On Mon, Sep 01, 2025, Yan Zhao wrote:
> On Sat, Aug 30, 2025 at 03:53:24AM +0800, Edgecombe, Rick P wrote:
> > Yan, can you clarify what you mean by "there could be a small window"?
> > [...]
>
> I think this window isn't hypothetical.
>
> 1. SEAMCALL failure in tdx_sept_remove_private_spte().

But tdx_sept_remove_private_spte() failing is already a hypothetical
scenario.

>    KVM_BUG_ON() sets vm_dead and kicks off all vCPUs.
> 2. guest_memfd invalidation completes. Memory is freed.
> 3. VM gets killed.
>
> After 2, the page is still mapped in the S-EPT, but it could potentially
> be reallocated and used outside the VM.
>
> From the TDX module and hardware's perspective, the mapping in the S-EPT
> for this page remains valid. So, I'm uncertain if the TDX module might do
> something creative to access the guest page after 2.
>
> Besides, a cache flush after 2 can essentially cause a memory write to
> the page. Though we could invoke tdh_phymem_page_wbinvd_hkid() after the
> KVM_BUG_ON(), the SEAMCALL itself can fail.

I think this falls into the category of "don't screw up" flows. Failure to
remove a private SPTE is a near-catastrophic error. Going out of our way to
reduce the impact of such errors increases complexity without providing
much in the way of value.

E.g. if VMCLEAR fails, KVM WARNs but continues on and hopes for the best,
even though there's a decent chance that failure to purge the VMCS cache
entry could lead to UAF-like problems. To me, this is largely the same.

If anything, we should try to prevent #2, e.g. by marking the entire
guest_memfd as broken or something, and then deliberately leaking _all_
pages.
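
A hypothetical shape for that suggestion (no such flag or API exists today;
gmem_free_folio() and gmem_is_broken() are invented for illustration):

/*
 * Hypothetical: on guest_memfd's free path, leak folios belonging to a
 * "broken" instance instead of returning them to the allocator while
 * they may still be mapped in the S-EPT.
 */
static void gmem_free_folio(struct address_space *mapping, struct folio *folio)
{
	if (unlikely(gmem_is_broken(mapping)))	/* invented helper */
		return;				/* a leak beats a UAF */

	/* ...normal freeing otherwise... */
}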
On Tue, 2025-09-02 at 10:33 -0700, Sean Christopherson wrote:
> I think this falls into the category of "don't screw up" flows. Failure
> to remove a private SPTE is a near-catastrophic error. Going out of our
> way to reduce the impact of such errors increases complexity without
> providing much in the way of value.
>
> [...]
>
> If anything, we should try to prevent #2, e.g. by marking the entire
> guest_memfd as broken or something, and then deliberately leaking _all_
> pages.

There was a marathon thread on this subject. We did discuss this option
(link to the most relevant part I could find):
https://lore.kernel.org/kvm/a9affa03c7cdc8109d0ed6b5ca30ec69269e2f34.camel@intel.com/

The high level summary is that pinning the pages wrinkles guest_memfd's
plans to use the refcount for other tracking purposes. Dropping refcounts
interferes with the error handling safety.

I strongly agree that we should not optimize for the error path at all. If
we could bug the guest_memfd (kind of what we were discussing in that link)
I think it would be appropriate to use in these cases. I guess the question
is: are we OK dropping the safety before we have a solution like that? In
that thread I was advocating for yes, partly to close it because the
conversation was getting stuck. But there is probably a long tail of
potential issues, or ways of looking at it, that could put it in the grey
area.
On Tue, Sep 02, 2025, Rick P Edgecombe wrote:
> There was a marathon thread on this subject.

Holy moly, you weren't kidding.

> We did discuss this option (link to the most relevant part I could find):
> https://lore.kernel.org/kvm/a9affa03c7cdc8109d0ed6b5ca30ec69269e2f34.camel@intel.com/
>
> The high level summary is that pinning the pages wrinkles guest_memfd's
> plans to use the refcount for other tracking purposes. Dropping refcounts
> interferes with the error handling safety.

It also bakes even more assumptions into TDX about guest_memfd being backed
with "struct page", which I would like to avoid doing whenever possible.

> I strongly agree that we should not optimize for the error path at all.
> If we could bug the guest_memfd (kind of what we were discussing in that
> link) I think it would be appropriate to use in these cases. I guess the
> question is: are we OK dropping the safety before we have a solution like
> that?

Definitely a "yes" from me. For this to actually cause real world problems,
we'd need a critical KVM, hardware, or TDX-Module bug, and several unlikely
events to all line up.

If someone encounters any of these KVM_BUG_ON()s _and_ has observed that
the probability of data corruption is meaningful, then we can always
convert one or more of these to full BUG_ON() conditions, but I don't see
any reason to do that without strong evidence that it's necessary.

> In that thread I was advocating for yes, partly to close it because the
> conversation was getting stuck. But there is probably a long tail of
> potential issues, or ways of looking at it, that could put it in the grey
> area.
On Fri, Aug 29, 2025, Rick P Edgecombe wrote:
> On Thu, 2025-08-28 at 17:06 -0700, Sean Christopherson wrote:
> > From: Yan Zhao <yan.y.zhao@intel.com>
> >
> > [...]
>
> Yea, in the case of a bug, there could be a use-after-free. This logic
> applies to all code that has allocations, including the entire KVM MMU.
> [...]
>
> Yan, can you clarify what you mean by "there could be a small window"?
> I'm thinking this is a hypothetical window around vm_dead races? Or more
> concrete? I *don't* want to re-open the debate on whether to go with this
> approach, but I think this is a good teaching edge case to settle on how
> we want to treat similar issues. So I just want to make sure we have the
> justification right.

The first paragraph is all the justification we need. Seriously. Bad
things will happen if you have UAF bugs, news at 11!

I'm all for defensive programming, but pinning pages goes too far, because
that itself can be dangerous, e.g. see commit 2bcb52a3602b ("KVM: Pin (as
in FOLL_PIN) pages during kvm_vcpu_map()") and the many messes KVM created
with respect to struct page refcounts.

I'm happy to include more context in the changelog, but I really don't
want anyone to walk away from this thinking that pinning pages in random
KVM code is at all encouraged.
On Fri, 2025-08-29 at 13:19 -0700, Sean Christopherson wrote:
> > Yan, can you clarify what you mean by "there could be a small window"?
> > [...]
>
> The first paragraph is all the justification we need. Seriously. Bad
> things will happen if you have UAF bugs, news at 11!

Totally.

> I'm all for defensive programming, but pinning pages goes too far,
> because that itself can be dangerous, e.g. see commit 2bcb52a3602b ("KVM:
> Pin (as in FOLL_PIN) pages during kvm_vcpu_map()") and the many messes
> KVM created with respect to struct page refcounts.
>
> I'm happy to include more context in the changelog, but I really don't
> want anyone to walk away from this thinking that pinning pages in random
> KVM code is at all encouraged.

Sorry for going on a tangent. Defensive programming inside the kernel is a
little more settled. But for defensive programming against the TDX module,
there are various schools of thought internally. Currently we rely on some
undocumented behavior of the TDX module (as in, not in the spec) for
correctness. But I don't think we do for security.

Speaking for Yan here, I think she was a little more worried about this
scenario than me, so I read this verbiage and thought to try to close it
out.
On Fri, Aug 29, 2025, Rick P Edgecombe wrote:
> Sorry for going on a tangent. Defensive programming inside the kernel is
> a little more settled. But for defensive programming against the TDX
> module, there are various schools of thought internally. Currently we
> rely on some undocumented behavior of the TDX module (as in, not in the
> spec) for correctness.

Examples?

> But I don't think we do for security.
>
> Speaking for Yan here, I think she was a little more worried about this
> scenario than me, so I read this verbiage and thought to try to close it
> out.
On Fri, 2025-08-29 at 15:02 -0700, Sean Christopherson wrote:
> On Fri, Aug 29, 2025, Rick P Edgecombe wrote:
> > Currently we rely on some undocumented behavior of the TDX module (as
> > in, not in the spec) for correctness.
>
> Examples?

I was thinking about the BUSY error code avoidance logic that is now called
tdh_do_no_vcpus(). We assume no new conditions will appear that cause a
TDX_OPERAND_BUSY, like a guest opt-in or something.

It's on our todo list to transition those assumptions to promises. We just
need to formalize them.

> > But I don't think we do for security.

But, actually, they are some of the same paths. So, same pattern.
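
The pattern Rick refers to looks roughly like this (paraphrased from the
S-EPT zap path in arch/x86/kvm/vmx/tdx.c; helper names have changed across
revisions): on a BUSY error, force all vCPUs out of the guest so the retry
cannot contend with them.

if (unlikely(tdx_operand_busy(err))) {
	/* Kick vCPUs out of the guest and block re-entry, then retry. */
	tdx_no_vcpus_enter_start(kvm);
	err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level,
				  &entry, &level_state);
	tdx_no_vcpus_enter_stop(kvm);
}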
On Fri, Aug 29, 2025, Rick P Edgecombe wrote:
> I was thinking about the BUSY error code avoidance logic that is now
> called tdh_do_no_vcpus(). We assume no new conditions will appear that
> cause a TDX_OPERAND_BUSY, like a guest opt-in or something.

Ah, gotcha. If that happens, that's a TDX-Module ABI break. Probably a good
idea to drill it into the TDX-Module authors/designers that ABI is
established when behavior is visible to the user, regardless of whether or
not that behavior is formally defined.

Note, breaking ABI _can_ be fine, e.g. if the behavior of some SEAMCALL
changes but KVM doesn't care. But if the TDX-Module suddenly starts failing
a SEAMCALL that previously succeeded, then we're going to have a problem.

> It's on our todo list to transition those assumptions to promises. We
> just need to formalize them.
On Fri, 2025-08-29 at 15:58 -0700, Sean Christopherson wrote:
> Ah, gotcha. If that happens, that's a TDX-Module ABI break. Probably a
> good idea to drill it into the TDX-Module authors/designers that ABI is
> established when behavior is visible to the user, regardless of whether
> or not that behavior is formally defined.
>
> Note, breaking ABI _can_ be fine, e.g. if the behavior of some SEAMCALL
> changes but KVM doesn't care. But if the TDX-Module suddenly starts
> failing a SEAMCALL that previously succeeded, then we're going to have a
> problem.

Thanks! I'll use this quote.
On 8/29/2025 8:06 AM, Sean Christopherson wrote:
> From: Yan Zhao <yan.y.zhao@intel.com>
>
> [...]
>
> Several approaches were considered to address the refcount issue, including
> - Attempting to modify the KVM unmap operation to return a failure,
>   which was deemed too complex and potentially incorrect[4].
> - Increasing the folio reference count only upon S-EPT zapping failure[5].
> - Use page flags or page_ext to indicate a page is still used by TDX[6],
>   which does not work for HVO (HugeTLB Vmemmap Optimization).
> - Setting HWPOISON bit or leveraging folio_set_hugetlb_hwpoison()[7].

Nit: alignment issue with the bullets.

Otherwise,

Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>

> [...]