Add and use a new API for mapping a private pfn from guest_memfd into the
TDP MMU from TDX's post-populate hook instead of partially open-coding the
functionality into the TDX code. Sharing code with the pre-fault path
sounded good on paper, but it's fatally flawed as simulating a fault loses
the pfn, and calling back into gmem to re-retrieve the pfn creates locking
problems, e.g. kvm_gmem_populate() already holds the gmem invalidation
lock.
Providing a dedicated API will also remove several MMU exports that
ideally would not be exposed outside of the MMU, let alone to vendor code.
On that topic, opportunistically drop the kvm_mmu_load() export. Leave
kvm_tdp_mmu_gpa_is_mapped() alone for now; the entire commit that added
kvm_tdp_mmu_gpa_is_mapped() will be removed in the near future.
Cc: Michael Roth <michael.roth@amd.com>
Cc: Yan Zhao <yan.y.zhao@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Vishal Annapurve <vannapurve@google.com>
Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
Link: https://lore.kernel.org/all/20250709232103.zwmufocd3l7sqk7y@amd.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kvm/mmu.h | 1 +
arch/x86/kvm/mmu/mmu.c | 60 +++++++++++++++++++++++++++++++++++++++++-
arch/x86/kvm/vmx/tdx.c | 10 +++----
3 files changed, 63 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index f63074048ec6..2f108e381959 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -259,6 +259,7 @@ extern bool tdp_mmu_enabled;
bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa);
int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level);
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn);
static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 18d69d48bc55..ba5cca825a7f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5014,6 +5014,65 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
return min(range->size, end - range->gpa);
}
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+{
+ struct kvm_page_fault fault = {
+ .addr = gfn_to_gpa(gfn),
+ .error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS,
+ .prefetch = true,
+ .is_tdp = true,
+ .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm),
+
+ .max_level = PG_LEVEL_4K,
+ .req_level = PG_LEVEL_4K,
+ .goal_level = PG_LEVEL_4K,
+ .is_private = true,
+
+ .gfn = gfn,
+ .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
+ .pfn = pfn,
+ .map_writable = true,
+ };
+ struct kvm *kvm = vcpu->kvm;
+ int r;
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ if (KVM_BUG_ON(!tdp_mmu_enabled, kvm))
+ return -EIO;
+
+ if (kvm_gfn_is_write_tracked(kvm, fault.slot, fault.gfn))
+ return -EPERM;
+
+ r = kvm_mmu_reload(vcpu);
+ if (r)
+ return r;
+
+ r = mmu_topup_memory_caches(vcpu, false);
+ if (r)
+ return r;
+
+ do {
+ if (signal_pending(current))
+ return -EINTR;
+
+ if (kvm_test_request(KVM_REQ_VM_DEAD, vcpu))
+ return -EIO;
+
+ cond_resched();
+
+ guard(read_lock)(&kvm->mmu_lock);
+
+ r = kvm_tdp_mmu_map(vcpu, &fault);
+ } while (r == RET_PF_RETRY);
+
+ if (r != RET_PF_FIXED)
+ return -EIO;
+
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_map_private_pfn);
+
static void nonpaging_init_context(struct kvm_mmu *context)
{
context->page_fault = nonpaging_page_fault;
@@ -5997,7 +6056,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
out:
return r;
}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 4c3014befe9f..29f344af4cc2 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -3157,15 +3157,12 @@ struct tdx_gmem_post_populate_arg {
static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
void __user *src, int order, void *_arg)
{
- u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
struct tdx_gmem_post_populate_arg *arg = _arg;
- struct kvm_vcpu *vcpu = arg->vcpu;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ u64 err, entry, level_state;
gpa_t gpa = gfn_to_gpa(gfn);
- u8 level = PG_LEVEL_4K;
struct page *src_page;
int ret, i;
- u64 err, entry, level_state;
/*
* Get the source page if it has been faulted in. Return failure if the
@@ -3177,7 +3174,7 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
if (ret != 1)
return -ENOMEM;
- ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
+ ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
if (ret < 0)
goto out;
@@ -3240,7 +3237,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
!vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
return -EINVAL;
- kvm_mmu_reload(vcpu);
ret = 0;
while (region.nr_pages) {
if (signal_pending(current)) {
--
2.51.0.858.gf9c4a03a3a-goog
On Thu, 2025-10-16 at 17:32 -0700, Sean Christopherson wrote:
> Add and use a new API for mapping a private pfn from guest_memfd into the
> TDP MMU from TDX's post-populate hook instead of partially open-coding the
> functionality into the TDX code. Sharing code with the pre-fault path
> sounded good on paper, but it's fatally flawed as simulating a fault loses
> the pfn, and calling back into gmem to re-retrieve the pfn creates locking
> problems, e.g. kvm_gmem_populate() already holds the gmem invalidation
> lock.
>
> Providing a dedicated API will also removing several MMU exports that
                                      ^ remove
> ideally would not be exposed outside of the MMU, let alone to vendor code.
> On that topic, opportunistically drop the kvm_mmu_load() export. Leave
> kvm_tdp_mmu_gpa_is_mapped() alone for now; the entire commit that added
> kvm_tdp_mmu_gpa_is_mapped() will be removed in the near future.
>
> Cc: Michael Roth <michael.roth@amd.com>
> Cc: Yan Zhao <yan.y.zhao@intel.com>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Cc: Vishal Annapurve <vannapurve@google.com>
> Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
> Link: https://lore.kernel.org/all/20250709232103.zwmufocd3l7sqk7y@amd.com
> Signed-off-by: Sean Christopherson <seanjc@google.com>

Reviewed-by: Kai Huang <kai.huang@intel.com>
On Thu, Oct 16, 2025 at 05:32:22PM -0700, Sean Christopherson wrote:
> Link: https://lore.kernel.org/all/20250709232103.zwmufocd3l7sqk7y@amd.com
Hi Sean,
Will you post [1] to fix the AB-BA deadlock issue for huge page in-place
conversion as well?
Without it, the "WARNING: possible circular locking dependency detected" would
still appear due to
- lock(mapping.invalidate_lock#4) --> lock(&mm->mmap_lock)
for init mem on non-in-place-conversion guest_memfd
- rlock(&mm->mmap_lock) --> rlock(mapping.invalidate_lock#4)
for faulting shared pages on in-place-conversion guest_memfd
[1] https://lore.kernel.org/all/aHEwT4X0RcfZzHlt@google.com/
Thanks
Yan
On Wed, Oct 22, 2025 at 12:53:53PM +0800, Yan Zhao wrote:
> On Thu, Oct 16, 2025 at 05:32:22PM -0700, Sean Christopherson wrote:
> > Link: https://lore.kernel.org/all/20250709232103.zwmufocd3l7sqk7y@amd.com
>
> Hi Sean,
>
> Will you post [1] to fix the AB-BA deadlock issue for huge page in-place
> conversion as well?
>
> Without it, the "WARNING: possible circular locking dependency detected" would
> still appear due to
>
> - lock(mapping.invalidate_lock#4) --> lock(&mm->mmap_lock)
> for init mem on non-in-place-conversion guest_memfd
> - rlock(&mm->mmap_lock) --> rlock(mapping.invalidate_lock#4)
> for faulting shared pages on in-place-conversion guest_memfd
>
> [1] https://lore.kernel.org/all/aHEwT4X0RcfZzHlt@google.com/
[2] https://lore.kernel.org/all/cover.1760731772.git.ackerleytng@google.com/
Note: [1] is still required even with [2].
Consider the following scenario (assuming vm_memory_attributes=Y):
1. Create a TDX VM with non-in-place-conversion guest_memfd.
In the init mem path, the lock sequence is
lock(mapping.invalidate_lock#4) --> lock(&mm->mmap_lock)
2. Create a normal VM with in-place-conversion guest_memfd, with guest_memfd
memory defaulting to shared by specifying flags
GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED.
(Since kvm_arch_supports_gmem_init_shared() returns true for normal VMs due
to kvm->arch.has_private_mem == false, GUEST_MEMFD_FLAG_INIT_SHARED is a
valid flag).
Accessing the mmap'ed VA of this guest_memfd invokes
kvm_gmem_fault_user_mapping().
The lock sequence in this path is
rlock(&mm->mmap_lock) --> rlock(mapping.invalidate_lock#4)
Running 1 & 2 in the same process would trigger a circular locking warning:
[ 297.090165][ T3469] ======================================================
[ 297.099976][ T3469] WARNING: possible circular locking dependency detected
[ 297.109830][ T3469] 6.17.0-rc7-upstream+ #109 Tainted: G S
[ 297.119825][ T3469] ------------------------------------------------------
[ 297.129795][ T3469] tdx_vm_huge_pag/3469 is trying to acquire lock:
[ 297.139032][ T3469] ff110004a0625c70 (mapping.invalidate_lock#4){++++}-{4:4}, at: kvm_gmem_fault_user_mapping+0xfc/0x4c0 [kvm]
[ 297.156463][ T3469]
[ 297.156463][ T3469] but task is already holding lock:
[ 297.169168][ T3469] ff110004db628d80 (&mm->mmap_lock){++++}-{4:4}, at: lock_mm_and_find_vma+0x2d/0x520
[ 297.184330][ T3469]
[ 297.184330][ T3469] which lock already depends on the new lock.
[ 297.184330][ T3469]
[ 297.202954][ T3469]
[ 297.202954][ T3469] the existing dependency chain (in reverse order) is:
[ 297.217582][ T3469]
[ 297.217582][ T3469] -> #1 (&mm->mmap_lock){++++}-{4:4}:
[ 297.230618][ T3469] __lock_acquire+0x5ba/0xa20
[ 297.238730][ T3469] lock_acquire.part.0+0xb4/0x240
[ 297.247200][ T3469] lock_acquire+0x60/0x130
[ 297.254942][ T3469] gup_fast_fallback+0x1fb/0x390
[ 297.263269][ T3469] get_user_pages_fast+0x8f/0xd0
[ 297.271610][ T3469] tdx_gmem_post_populate+0x163/0x640 [kvm_intel]
[ 297.281603][ T3469] kvm_gmem_populate+0x53b/0x960 [kvm]
[ 297.290663][ T3469] tdx_vcpu_init_mem_region+0x33b/0x530 [kvm_intel]
[ 297.300978][ T3469] tdx_vcpu_unlocked_ioctl+0x16f/0x250 [kvm_intel]
[ 297.311245][ T3469] vt_vcpu_mem_enc_unlocked_ioctl+0x6b/0xa0 [kvm_intel]
[ 297.322045][ T3469] kvm_arch_vcpu_unlocked_ioctl+0x50/0x80 [kvm]
[ 297.332167][ T3469] kvm_vcpu_ioctl+0x27b/0xf30 [kvm]
[ 297.341084][ T3469] __x64_sys_ioctl+0x13c/0x1d0
[ 297.349416][ T3469] x64_sys_call+0x10ee/0x20d0
[ 297.357566][ T3469] do_syscall_64+0xc9/0x400
[ 297.365507][ T3469] entry_SYSCALL_64_after_hwframe+0x77/0x7f
[ 297.375053][ T3469]
[ 297.375053][ T3469] -> #0 (mapping.invalidate_lock#4){++++}-{4:4}:
[ 297.389364][ T3469] check_prev_add+0x8b/0x4c0
[ 297.397442][ T3469] validate_chain+0x367/0x440
[ 297.405580][ T3469] __lock_acquire+0x5ba/0xa20
[ 297.413664][ T3469] lock_acquire.part.0+0xb4/0x240
[ 297.422123][ T3469] lock_acquire+0x60/0x130
[ 297.429836][ T3469] down_read+0x9f/0x540
[ 297.437187][ T3469] kvm_gmem_fault_user_mapping+0xfc/0x4c0 [kvm]
[ 297.446895][ T3469] __do_fault+0xf8/0x690
[ 297.454304][ T3469] do_shared_fault+0x8a/0x3b0
[ 297.462205][ T3469] do_fault+0xf0/0xb80
[ 297.469355][ T3469] handle_pte_fault+0x499/0x9a0
[ 297.477294][ T3469] __handle_mm_fault+0x98d/0x1100
[ 297.485449][ T3469] handle_mm_fault+0x1e2/0x500
[ 297.493288][ T3469] do_user_addr_fault+0x4f3/0xf20
[ 297.501419][ T3469] exc_page_fault+0x5d/0xc0
[ 297.509027][ T3469] asm_exc_page_fault+0x27/0x30
[ 297.517003][ T3469]
[ 297.517003][ T3469] other info that might help us debug this:
[ 297.517003][ T3469]
[ 297.534317][ T3469] Possible unsafe locking scenario:
[ 297.534317][ T3469]
[ 297.546565][ T3469] CPU0 CPU1
[ 297.554486][ T3469] ---- ----
[ 297.562385][ T3469] rlock(&mm->mmap_lock);
[ 297.569203][ T3469] lock(mapping.invalidate_lock#4);
[ 297.579871][ T3469] lock(&mm->mmap_lock);
[ 297.589429][ T3469] rlock(mapping.invalidate_lock#4);
[ 297.597345][ T3469]
[ 297.597345][ T3469] *** DEADLOCK ***
[ 297.597345][ T3469]
[ 297.611988][ T3469] 1 lock held by tdx_vm_huge_pag/3469:
[ 297.619863][ T3469] #0: ff110004db628d80 (&mm->mmap_lock){++++}-{4:4}, at: lock_mm_and_find_vma+0x2d/0x520
[ 297.634775][ T3469]
[ 297.634775][ T3469] stack backtrace:
[ 297.645161][ T3469] CPU: 7 UID: 0 PID: 3469 Comm: tdx_vm_huge_pag Tainted: G S 6.17.0-rc7-upstream+ #109 PREEMPT(voluntary) cdf4eff053c68cc34a4de47b373cdf3e020105d7
[ 297.645166][ T3469] Tainted: [S]=CPU_OUT_OF_SPEC
[ 297.645167][ T3469] Hardware name: Intel Corporation ArcherCity/ArcherCity, BIOS EGSDCRB1.SYS.0101.D29.2303301937 03/30/2023
[ 297.645168][ T3469] Call Trace:
[ 297.645170][ T3469] <TASK>
[ 297.645171][ T3469] dump_stack_lvl+0x81/0xe0
[ 297.645176][ T3469] dump_stack+0x10/0x20
[ 297.645178][ T3469] print_circular_bug+0xf3/0x120
[ 297.645181][ T3469] check_noncircular+0x135/0x150
[ 297.645186][ T3469] check_prev_add+0x8b/0x4c0
[ 297.645189][ T3469] validate_chain+0x367/0x440
[ 297.645192][ T3469] __lock_acquire+0x5ba/0xa20
[ 297.645196][ T3469] lock_acquire.part.0+0xb4/0x240
[ 297.645198][ T3469] ? kvm_gmem_fault_user_mapping+0xfc/0x4c0 [kvm 92b56a1aeace799385454e64f4d853f860f01956]
[ 297.645279][ T3469] lock_acquire+0x60/0x130
[ 297.645281][ T3469] ? kvm_gmem_fault_user_mapping+0xfc/0x4c0 [kvm 92b56a1aeace799385454e64f4d853f860f01956]
[ 297.645360][ T3469] down_read+0x9f/0x540
[ 297.645363][ T3469] ? kvm_gmem_fault_user_mapping+0xfc/0x4c0 [kvm 92b56a1aeace799385454e64f4d853f860f01956]
[ 297.645441][ T3469] ? __pfx_down_read+0x10/0x10
[ 297.645444][ T3469] ? __this_cpu_preempt_check+0x13/0x20
[ 297.645447][ T3469] kvm_gmem_fault_user_mapping+0xfc/0x4c0 [kvm 92b56a1aeace799385454e64f4d853f860f01956]
[ 297.645527][ T3469] __do_fault+0xf8/0x690
[ 297.645530][ T3469] do_shared_fault+0x8a/0x3b0
[ 297.645532][ T3469] do_fault+0xf0/0xb80
[ 297.645534][ T3469] ? __this_cpu_preempt_check+0x13/0x20
[ 297.645537][ T3469] handle_pte_fault+0x499/0x9a0
[ 297.645541][ T3469] ? __pfx_handle_pte_fault+0x10/0x10
[ 297.645545][ T3469] __handle_mm_fault+0x98d/0x1100
[ 297.645547][ T3469] ? mt_find+0x3e3/0x5d0
[ 297.645552][ T3469] ? __pfx___handle_mm_fault+0x10/0x10
[ 297.645557][ T3469] ? __this_cpu_preempt_check+0x13/0x20
[ 297.645560][ T3469] handle_mm_fault+0x1e2/0x500
[ 297.645563][ T3469] ? __pfx_handle_mm_fault+0x10/0x10
[ 297.645566][ T3469] ? down_read_trylock+0x49/0x60
[ 297.645571][ T3469] do_user_addr_fault+0x4f3/0xf20
[ 297.645575][ T3469] exc_page_fault+0x5d/0xc0
[ 297.645577][ T3469] asm_exc_page_fault+0x27/0x30
[ 297.645579][ T3469] RIP: 0033:0x41fba0
[ 297.645581][ T3469] Code: f8 41 89 f0 48 8d 3c 17 48 89 c1 48 85 d2 74 2a 48 89 fa 48 29 c2 83 e2 01 74 0f 48 8d 48 01 40 88 71 ff 48 39 cf 74 13 66 90 <44> 88 01 48 83 c1 02 44 88 41 ff 48 39 cf 75 f0 c3 c3 66 66 2e 0f
[ 297.645583][ T3469] RSP: 002b:00007ffc8037f1c8 EFLAGS: 00010246
[ 297.645585][ T3469] RAX: 00007f604ee9d000 RBX: 00007f604ee906a8 RCX: 00007f604ee9d000
[ 297.645587][ T3469] RDX: 0000000000000000 RSI: 00000000000000aa RDI: 00007f604ee9e000
[ 297.645588][ T3469] RBP: 00007f604ee9d000 R08: 00000000000000aa R09: 0000000000426886
[ 297.645589][ T3469] R10: 0000000000000001 R11: 0000000000000246 R12: 000000003b5502a0
[ 297.645591][ T3469] R13: 0000000000001000 R14: 0000000000000200 R15: 00007f604eee4000
[ 297.645595][ T3469] </TASK>
On Thu, Oct 30, 2025, Yan Zhao wrote:
> On Wed, Oct 22, 2025 at 12:53:53PM +0800, Yan Zhao wrote:
> > On Thu, Oct 16, 2025 at 05:32:22PM -0700, Sean Christopherson wrote:
> > > Link: https://lore.kernel.org/all/20250709232103.zwmufocd3l7sqk7y@amd.com
> >
> > Hi Sean,
> >
> > Will you post [1] to fix the AB-BA deadlock issue for huge page in-place
> > conversion as well?

If you (or anyone) has the bandwidth, please pick it up. I won't have cycles to
look at that for many weeks (potentially not even this calendar year).
On Tue, Nov 04, 2025 at 09:57:26AM -0800, Sean Christopherson wrote:
> On Thu, Oct 30, 2025, Yan Zhao wrote:
> > On Wed, Oct 22, 2025 at 12:53:53PM +0800, Yan Zhao wrote:
> > > On Thu, Oct 16, 2025 at 05:32:22PM -0700, Sean Christopherson wrote:
> > > > Link: https://lore.kernel.org/all/20250709232103.zwmufocd3l7sqk7y@amd.com
> > >
> > > Hi Sean,
> > >
> > > Will you post [1] to fix the AB-BA deadlock issue for huge page in-place
> > > conversion as well?
>
> If you (or anyone) has the bandwidth, please pick it up. I won't have cycles to
> look at that for many weeks (potentially not even this calendar year).
Got it!
On the other hand, do you think we can address the warning as below?
The code is based on [2].
@@ -853,6 +859,10 @@ static int kvm_gmem_init_inode(struct inode *inode, loff_t size, u64 flags)
inode->i_size = size;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_inaccessible(inode->i_mapping);
+ if (flags & GUEST_MEMFD_FLAG_MMAP)
+ lockdep_set_class_and_subclass(&inode->i_mapping->invalidate_lock,
+ &inode->i_sb->s_type->invalidate_lock_key, 1);
+
/* Unmovable mappings are supposed to be marked unevictable as well. */
WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
As noted in [3], the only scenario that can trigger the warning after [2] is when a
process creates a TDX VM with non-in-place-conversion guest_memfd and a normal
VM with in-place-conversion guest_memfd. The two invalidate_lock's don't contend
with each other theoretically.
[2] https://lore.kernel.org/all/cover.1760731772.git.ackerleytng@google.com/
[3] https://lore.kernel.org/all/aQMi%2Fn9DVyeaWsVH@yzhao56-desk.sh.intel.com/
On Wed, Nov 05, 2025 at 03:32:29PM +0800, Yan Zhao wrote:
> On Tue, Nov 04, 2025 at 09:57:26AM -0800, Sean Christopherson wrote:
> > On Thu, Oct 30, 2025, Yan Zhao wrote:
> > > On Wed, Oct 22, 2025 at 12:53:53PM +0800, Yan Zhao wrote:
> > > > On Thu, Oct 16, 2025 at 05:32:22PM -0700, Sean Christopherson wrote:
> > > > > Link: https://lore.kernel.org/all/20250709232103.zwmufocd3l7sqk7y@amd.com
> > > >
> > > > Hi Sean,
> > > >
> > > > Will you post [1] to fix the AB-BA deadlock issue for huge page in-place
> > > > conversion as well?
> >
> > If you (or anyone) has the bandwidth, please pick it up. I won't have cycles to
> > look at that for many weeks (potentially not even this calendar year).
> Got it!
> On the other hand, do you think we can address the warning as below?
> The code is based on [2].
Hmm, updated the diff.
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 7b4a4474d468..543e1eb9db65 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -853,6 +853,9 @@ static int kvm_gmem_init_inode(struct inode *inode, loff_t size, u64 flags)
inode->i_size = size;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_inaccessible(inode->i_mapping);
+ if (flags & GUEST_MEMFD_FLAG_MMAP)
+ lockdep_set_subclass(&inode->i_mapping->invalidate_lock, 1);
+
/* Unmovable mappings are supposed to be marked unevictable as well. */
WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
> As noted in [3], the only scenario that can trigger the warning after [2] is when a
> process creates a TDX VM with non-in-place-conversion guest_memfd and a normal
> VM with in-place-conversion guest_memfd. The two invalidate_lock's don't contend
> with each other theoretically.
>
>
> [2] https://lore.kernel.org/all/cover.1760731772.git.ackerleytng@google.com/
> [3] https://lore.kernel.org/all/aQMi%2Fn9DVyeaWsVH@yzhao56-desk.sh.intel.com/
On Wed, Nov 05, 2025, Yan Zhao wrote:
> On Wed, Nov 05, 2025 at 03:32:29PM +0800, Yan Zhao wrote:
> > On Tue, Nov 04, 2025 at 09:57:26AM -0800, Sean Christopherson wrote:
> > > On Thu, Oct 30, 2025, Yan Zhao wrote:
> > > > On Wed, Oct 22, 2025 at 12:53:53PM +0800, Yan Zhao wrote:
> > > > > On Thu, Oct 16, 2025 at 05:32:22PM -0700, Sean Christopherson wrote:
> > > > > > Link: https://lore.kernel.org/all/20250709232103.zwmufocd3l7sqk7y@amd.com
> > > > >
> > > > > Hi Sean,
> > > > >
> > > > > Will you post [1] to fix the AB-BA deadlock issue for huge page in-place
> > > > > conversion as well?
> > >
> > > If you (or anyone) has the bandwidth, please pick it up. I won't have cycles to
> > > look at that for many weeks (potentially not even this calendar year).
> > Got it!
> > On the other hand, do you think we can address the warning as below?
> > The code is based on [2].
> Hmm, updated the diff.
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 7b4a4474d468..543e1eb9db65 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -853,6 +853,9 @@ static int kvm_gmem_init_inode(struct inode *inode, loff_t size, u64 flags)
> inode->i_size = size;
> mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
> mapping_set_inaccessible(inode->i_mapping);
> + if (flags & GUEST_MEMFD_FLAG_MMAP)
> + lockdep_set_subclass(&inode->i_mapping->invalidate_lock, 1);
> +
> /* Unmovable mappings are supposed to be marked unevictable as well. */
> WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
>
> > As noted in [3], the only scenario that can trigger the warning after [2] is when a
> > process creates a TDX VM with non-in-place-conversion guest_memfd and a normal
> > VM with in-place-conversion guest_memfd. The two invalidate_lock's don't contend
> > with each other theoretically.

Hmm, no, I think we need to hoist the gup() call outside of filemap_invalidate_lock(),
because I don't think this is strictly limited to TDX VMs without in-place conversion.
Even with in-place conversion, I think KVM should allow the source page to be shared
memory, at which point I believe this becomes a legitimate AB-BA issue.

In general, playing lockdep games with so many subsystems involved terrifies me.

> > [2] https://lore.kernel.org/all/cover.1760731772.git.ackerleytng@google.com/
> > [3] https://lore.kernel.org/all/aQMi%2Fn9DVyeaWsVH@yzhao56-desk.sh.intel.com/
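To make the "hoist gup()" idea concrete, a rough sketch (not a posted patch; the
src_page plumbing and the helper name below are invented for illustration) would
be to pin the source page in the TDX caller before kvm_gmem_populate() takes
filemap_invalidate_lock(), and pass the pinned page to the post-populate hook
via the opaque arg, so that get_user_pages_fast() never runs under the
invalidate lock:

struct tdx_gmem_post_populate_arg {
	/* ... existing fields elided ... */
	struct page *src_page;	/* invented: pinned by the caller, outside the lock */
};

static long tdx_populate_one_page(struct kvm *kvm, gpa_t gpa, void __user *src,
				  struct tdx_gmem_post_populate_arg *arg)
{
	long ret;

	/* Pin the source page with only mmap_lock rules in play. */
	ret = get_user_pages_fast((unsigned long)src, 1, 0, &arg->src_page);
	if (ret != 1)
		return ret < 0 ? ret : -ENOMEM;

	/* kvm_gmem_populate() acquires filemap_invalidate_lock() internally. */
	ret = kvm_gmem_populate(kvm, gpa_to_gfn(gpa), src, 1,
				tdx_gmem_post_populate, arg);

	put_page(arg->src_page);
	return ret;
}

The post-populate hook would then consume arg->src_page instead of calling
get_user_pages_fast() itself, and the mmap_lock vs. invalidate_lock ordering
problem goes away regardless of whether the guest_memfd supports in-place
conversion.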
On Thu, Oct 16, 2025 at 05:32:22PM -0700, Sean Christopherson wrote:
> Add and use a new API for mapping a private pfn from guest_memfd into the
> TDP MMU from TDX's post-populate hook instead of partially open-coding the
> functionality into the TDX code. Sharing code with the pre-fault path
> sounded good on paper, but it's fatally flawed as simulating a fault loses
> the pfn, and calling back into gmem to re-retrieve the pfn creates locking
> problems, e.g. kvm_gmem_populate() already holds the gmem invalidation
> lock.
>
> Providing a dedicated API will also removing several MMU exports that
> ideally would not be exposed outside of the MMU, let alone to vendor code.
> On that topic, opportunistically drop the kvm_mmu_load() export. Leave
> kvm_tdp_mmu_gpa_is_mapped() alone for now; the entire commit that added
> kvm_tdp_mmu_gpa_is_mapped() will be removed in the near future.
>
> Cc: Michael Roth <michael.roth@amd.com>
> Cc: Yan Zhao <yan.y.zhao@intel.com>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Cc: Vishal Annapurve <vannapurve@google.com>
> Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
> Link: https://lore.kernel.org/all/20250709232103.zwmufocd3l7sqk7y@amd.com
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> arch/x86/kvm/mmu.h | 1 +
> arch/x86/kvm/mmu/mmu.c | 60 +++++++++++++++++++++++++++++++++++++++++-
> arch/x86/kvm/vmx/tdx.c | 10 +++----
> 3 files changed, 63 insertions(+), 8 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index f63074048ec6..2f108e381959 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -259,6 +259,7 @@ extern bool tdp_mmu_enabled;
>
> bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa);
> int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level);
> +int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn);
>
> static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
> {
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 18d69d48bc55..ba5cca825a7f 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5014,6 +5014,65 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
> return min(range->size, end - range->gpa);
> }
>
> +int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
> +{
> + struct kvm_page_fault fault = {
> + .addr = gfn_to_gpa(gfn),
> + .error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS,
> + .prefetch = true,
> + .is_tdp = true,
> + .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm),
> +
> + .max_level = PG_LEVEL_4K,
> + .req_level = PG_LEVEL_4K,
> + .goal_level = PG_LEVEL_4K,
> + .is_private = true,
> +
> + .gfn = gfn,
> + .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
> + .pfn = pfn,
> + .map_writable = true,
> + };
> + struct kvm *kvm = vcpu->kvm;
> + int r;
> +
> + lockdep_assert_held(&kvm->slots_lock);
Do we need to assert that filemap_invalidate_lock() is held as well?
Otherwise, a concurrent kvm_gmem_punch_hole(), which does not hold slots_lock,
could make the pfn stale.
Or check for stale mapping?
> +
> + if (KVM_BUG_ON(!tdp_mmu_enabled, kvm))
> + return -EIO;
> +
> + if (kvm_gfn_is_write_tracked(kvm, fault.slot, fault.gfn))
> + return -EPERM;
> +
> + r = kvm_mmu_reload(vcpu);
> + if (r)
> + return r;
> +
> + r = mmu_topup_memory_caches(vcpu, false);
> + if (r)
> + return r;
> +
> + do {
> + if (signal_pending(current))
> + return -EINTR;
> +
> + if (kvm_test_request(KVM_REQ_VM_DEAD, vcpu))
> + return -EIO;
> +
> + cond_resched();
> +
> + guard(read_lock)(&kvm->mmu_lock);
> +
> + r = kvm_tdp_mmu_map(vcpu, &fault);
> + } while (r == RET_PF_RETRY);
> +
> + if (r != RET_PF_FIXED)
> + return -EIO;
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_map_private_pfn);
> +
> static void nonpaging_init_context(struct kvm_mmu *context)
> {
> context->page_fault = nonpaging_page_fault;
> @@ -5997,7 +6056,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
> out:
> return r;
> }
> -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_load);
>
> void kvm_mmu_unload(struct kvm_vcpu *vcpu)
> {
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 4c3014befe9f..29f344af4cc2 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -3157,15 +3157,12 @@ struct tdx_gmem_post_populate_arg {
> static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> void __user *src, int order, void *_arg)
> {
> - u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
> - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> struct tdx_gmem_post_populate_arg *arg = _arg;
> - struct kvm_vcpu *vcpu = arg->vcpu;
> + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> + u64 err, entry, level_state;
> gpa_t gpa = gfn_to_gpa(gfn);
> - u8 level = PG_LEVEL_4K;
> struct page *src_page;
> int ret, i;
> - u64 err, entry, level_state;
>
> /*
> * Get the source page if it has been faulted in. Return failure if the
> @@ -3177,7 +3174,7 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> if (ret != 1)
> return -ENOMEM;
>
> - ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
> + ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
> if (ret < 0)
> goto out;
>
> @@ -3240,7 +3237,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
> !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
> return -EINVAL;
>
> - kvm_mmu_reload(vcpu);
> ret = 0;
> while (region.nr_pages) {
> if (signal_pending(current)) {
> --
> 2.51.0.858.gf9c4a03a3a-goog
>
On Tue, Oct 21, 2025, Yan Zhao wrote:
> On Thu, Oct 16, 2025 at 05:32:22PM -0700, Sean Christopherson wrote:
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 18d69d48bc55..ba5cca825a7f 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -5014,6 +5014,65 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
> > return min(range->size, end - range->gpa);
> > }
> >
> > +int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
> > +{
> > + struct kvm_page_fault fault = {
> > + .addr = gfn_to_gpa(gfn),
> > + .error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS,
> > + .prefetch = true,
> > + .is_tdp = true,
> > + .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm),
> > +
> > + .max_level = PG_LEVEL_4K,
> > + .req_level = PG_LEVEL_4K,
> > + .goal_level = PG_LEVEL_4K,
> > + .is_private = true,
> > +
> > + .gfn = gfn,
> > + .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
> > + .pfn = pfn,
> > + .map_writable = true,
> > + };
> > + struct kvm *kvm = vcpu->kvm;
> > + int r;
> > +
> > + lockdep_assert_held(&kvm->slots_lock);
> Do we need to assert that filemap_invalidate_lock() is held as well?
Hrm, a lockdep assertion would be nice to have, but it's obviously not strictly
necessary, and I'm not sure it's worth the cost. To safely assert, KVM would need
to first assert that the file refcount is elevated, e.g. to guard against
guest_memfd _really_ screwing up and not grabbing a reference to the underlying
file.
E.g. it'd have to be something like this:
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 94d7f32a03b6..5d46b2ac0292 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5014,6 +5014,18 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
return min(range->size, end - range->gpa);
}
+static void kvm_assert_gmem_invalidate_lock_held(struct kvm_memory_slot *slot)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ if (WARN_ON_ONCE(!kvm_slot_has_gmem(slot)) ||
+ WARN_ON_ONCE(!slot->gmem.file) ||
+ WARN_ON_ONCE(!file_count(slot->gmem.file)))
+ return;
+
+ lockdep_assert_held(file_inode(&slot->gmem.file)->i_mapping->invalidate_lock));
+#endif
+}
+
int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
{
struct kvm_page_fault fault = {
@@ -5038,6 +5050,8 @@ int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
lockdep_assert_held(&kvm->slots_lock);
+ kvm_assert_gmem_invalidate_lock_held(fault.slot);
+
if (KVM_BUG_ON(!tdp_mmu_enabled, kvm))
return -EIO;
--
Which I suppose isn't that terrible?
On Tue, Oct 21, 2025 at 09:36:52AM -0700, Sean Christopherson wrote:
> On Tue, Oct 21, 2025, Yan Zhao wrote:
> > On Thu, Oct 16, 2025 at 05:32:22PM -0700, Sean Christopherson wrote:
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index 18d69d48bc55..ba5cca825a7f 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -5014,6 +5014,65 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
> > > return min(range->size, end - range->gpa);
> > > }
> > >
> > > +int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
> > > +{
> > > + struct kvm_page_fault fault = {
> > > + .addr = gfn_to_gpa(gfn),
> > > + .error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS,
> > > + .prefetch = true,
> > > + .is_tdp = true,
> > > + .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm),
> > > +
> > > + .max_level = PG_LEVEL_4K,
> > > + .req_level = PG_LEVEL_4K,
> > > + .goal_level = PG_LEVEL_4K,
> > > + .is_private = true,
> > > +
> > > + .gfn = gfn,
> > > + .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
> > > + .pfn = pfn,
> > > + .map_writable = true,
> > > + };
> > > + struct kvm *kvm = vcpu->kvm;
> > > + int r;
> > > +
> > > + lockdep_assert_held(&kvm->slots_lock);
> > Do we need to assert that filemap_invalidate_lock() is held as well?
>
> Hrm, a lockdep assertion would be nice to have, but it's obviously not strictly
> necessary, and I'm not sure it's worth the cost. To safely assert, KVM would need
Not sure. Maybe just add a comment?
But even with kvm_assert_gmem_invalidate_lock_held() and
lockdep_assert_held(&kvm->slots_lock), it seems that
kvm_tdp_mmu_map_private_pfn() still can't guarantee that the pfn is not stale.
e.g., if hypothetically those locks were released and re-acquired after getting
the pfn.
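To illustrate the "just add a comment" option, something along these lines above
the assertion might suffice (wording is only a sketch):

	/*
	 * The caller, i.e. TDX's post-populate hook invoked from
	 * kvm_gmem_populate(), holds both slots_lock and the guest_memfd
	 * invalidate_lock, which prevent the memslot and the gmem mapping
	 * from being invalidated and so keep @pfn from going stale while
	 * it is being mapped.
	 */
	lockdep_assert_held(&kvm->slots_lock);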
> to first assert that the file refcount is elevated, e.g. to guard against
> guest_memfd _really_ screwing up and not grabbing a reference to the underlying
> file.
>
> E.g. it'd have to be something like this:
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 94d7f32a03b6..5d46b2ac0292 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5014,6 +5014,18 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
> return min(range->size, end - range->gpa);
> }
>
> +static void kvm_assert_gmem_invalidate_lock_held(struct kvm_memory_slot *slot)
> +{
> +#ifdef CONFIG_PROVE_LOCKING
> + if (WARN_ON_ONCE(!kvm_slot_has_gmem(slot)) ||
> + WARN_ON_ONCE(!slot->gmem.file) ||
> + WARN_ON_ONCE(!file_count(slot->gmem.file)))
> + return;
> +
> + lockdep_assert_held(file_inode(&slot->gmem.file)->i_mapping->invalidate_lock));
lockdep_assert_held(&file_inode(slot->gmem.file)->i_mapping->invalidate_lock);
> +#endif
> +}
> +
> int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
> {
> struct kvm_page_fault fault = {
> @@ -5038,6 +5050,8 @@ int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
>
> lockdep_assert_held(&kvm->slots_lock);
>
> + kvm_assert_gmem_invalidate_lock_held(fault.slot);
> +
> if (KVM_BUG_ON(!tdp_mmu_enabled, kvm))
> return -EIO;
> --
>
> Which I suppose isn't that terrible?
Is it good if we test is_page_fault_stale()? e.g.,
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 9e5045a60d8b..b2cf754f6f92 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -257,7 +257,8 @@ extern bool tdp_mmu_enabled;
#define tdp_mmu_enabled false
#endif
-int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn);
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn,
+ unsigned long mmu_seq);
static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 94d7f32a03b6..0dc9ff1bc63e 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5014,7 +5014,8 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
return min(range->size, end - range->gpa);
}
-int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn,
+ unsigned long mmu_seq)
{
struct kvm_page_fault fault = {
.addr = gfn_to_gpa(gfn),
@@ -5032,12 +5033,12 @@ int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
.slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
.pfn = pfn,
.map_writable = true,
+
+ .mmu_seq = mmu_seq,
};
struct kvm *kvm = vcpu->kvm;
int r;
- lockdep_assert_held(&kvm->slots_lock);
-
if (KVM_BUG_ON(!tdp_mmu_enabled, kvm))
return -EIO;
@@ -5063,6 +5064,9 @@ int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
guard(read_lock)(&kvm->mmu_lock);
+ if (is_page_fault_stale(vcpu, &fault))
+ return -EIO;
+
r = kvm_tdp_mmu_map(vcpu, &fault);
} while (r == RET_PF_RETRY);
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index cae694d3ff33..4bb3e68a12b3 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -3113,7 +3113,8 @@ struct tdx_gmem_post_populate_arg {
};
static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
- void __user *src, int order, void *_arg)
+ unsigned long mmu_seq, void __user *src,
+ int order, void *_arg)
{
struct tdx_gmem_post_populate_arg *arg = _arg;
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
@@ -3136,7 +3137,7 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
return -ENOMEM;
kvm_tdx->page_add_src = src_page;
- ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
+ ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn, mmu_seq);
kvm_tdx->page_add_src = NULL;
put_page(src_page);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d93f75b05ae2..406472f60e63 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2581,7 +2581,8 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord
* Returns the number of pages that were populated.
*/
typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
- void __user *src, int order, void *opaque);
+ unsigned long mmu_seq, void __user *src,
+ int order, void *opaque);
long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
kvm_gmem_populate_cb post_populate, void *opaque);
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 427c0acee9d7..c9a87197412e 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -836,6 +836,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
pgoff_t index = kvm_gmem_get_index(slot, gfn);
bool is_prepared = false;
kvm_pfn_t pfn;
+ unsigned long mmu_seq = kvm->mmu_invalidate_seq;
if (signal_pending(current)) {
ret = -EINTR;
@@ -869,7 +870,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
}
p = src ? src + i * PAGE_SIZE : NULL;
- ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
+ ret = post_populate(kvm, gfn, pfn, mmu_seq, p, max_order, opaque);
if (!ret)
kvm_gmem_mark_prepared(folio);
On Wed, Oct 22, 2025, Yan Zhao wrote:
> On Tue, Oct 21, 2025 at 09:36:52AM -0700, Sean Christopherson wrote:
> > On Tue, Oct 21, 2025, Yan Zhao wrote:
> > > On Thu, Oct 16, 2025 at 05:32:22PM -0700, Sean Christopherson wrote:
> > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > > index 18d69d48bc55..ba5cca825a7f 100644
> > > > --- a/arch/x86/kvm/mmu/mmu.c
> > > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > > @@ -5014,6 +5014,65 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
> > > > return min(range->size, end - range->gpa);
> > > > }
> > > >
> > > > +int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
> > > > +{
> > > > + struct kvm_page_fault fault = {
> > > > + .addr = gfn_to_gpa(gfn),
> > > > + .error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS,
> > > > + .prefetch = true,
> > > > + .is_tdp = true,
> > > > + .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm),
> > > > +
> > > > + .max_level = PG_LEVEL_4K,
> > > > + .req_level = PG_LEVEL_4K,
> > > > + .goal_level = PG_LEVEL_4K,
> > > > + .is_private = true,
> > > > +
> > > > + .gfn = gfn,
> > > > + .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
> > > > + .pfn = pfn,
> > > > + .map_writable = true,
> > > > + };
> > > > + struct kvm *kvm = vcpu->kvm;
> > > > + int r;
> > > > +
> > > > + lockdep_assert_held(&kvm->slots_lock);
> > > Do we need to assert that filemap_invalidate_lock() is held as well?
> >
> > Hrm, a lockdep assertion would be nice to have, but it's obviously not strictly
> > necessary, and I'm not sure it's worth the cost. To safely assert, KVM would need
> Not sure. Maybe just add a comment?
> But even with kvm_assert_gmem_invalidate_lock_held() and
> lockdep_assert_held(&kvm->slots_lock), it seems that
> kvm_tdp_mmu_map_private_pfn() still can't guarantee that the pfn is not stale.
At some point we have to assume correctness. E.g. one could also argue that
holding every lock in the universe still doesn't ensure the pfn is fresh,
because theoretically guest_memfd could violate the locking scheme.
Aha! And to further harden and document this code, this API can be gated on
CONFIG_KVM_GUEST_MEMFD=y, as pointed out by the amazing-as-always test bot:
https://lore.kernel.org/all/202510221928.ikBXHGCf-lkp@intel.com
We could go a step further and gate it on CONFIG_KVM_INTEL_TDX=y, but I don't
like that idea as I think it would be a net negative in terms of documentation,
compared to checking CONFIG_KVM_GUEST_MEMFD. And in general I don't want to set
a precedent of ifdef-ing common x86 based on what vendor code _currently_ needs
an API.
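For reference, a minimal sketch of that gating in arch/x86/kvm/mmu.h (assuming
no stub is needed because every caller is itself guest_memfd-only) would be:

#ifdef CONFIG_KVM_GUEST_MEMFD
int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn);
#endif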
> e.g., if hypothetically those locks were released and re-acquired after getting
> the pfn.
>
> > to first assert that the file refcount is elevated, e.g. to guard against
> > guest_memfd _really_ screwing up and not grabbing a reference to the underlying
> > file.
> >
> > E.g. it'd have to be something like this:
> >
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 94d7f32a03b6..5d46b2ac0292 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -5014,6 +5014,18 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
> > return min(range->size, end - range->gpa);
> > }
> >
> > +static void kvm_assert_gmem_invalidate_lock_held(struct kvm_memory_slot *slot)
> > +{
> > +#ifdef CONFIG_PROVE_LOCKING
> > + if (WARN_ON_ONCE(!kvm_slot_has_gmem(slot)) ||
> > + WARN_ON_ONCE(!slot->gmem.file) ||
> > + WARN_ON_ONCE(!file_count(slot->gmem.file)))
> > + return;
> > +
> > + lockdep_assert_held(file_inode(&slot->gmem.file)->i_mapping->invalidate_lock));
> lockdep_assert_held(&file_inode(slot->gmem.file)->i_mapping->invalidate_lock);
> > +#endif
> > +}
> > +
> > int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
> > {
> > struct kvm_page_fault fault = {
> > @@ -5038,6 +5050,8 @@ int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
> >
> > lockdep_assert_held(&kvm->slots_lock);
> >
> > + kvm_assert_gmem_invalidate_lock_held(fault.slot);
> > +
> > if (KVM_BUG_ON(!tdp_mmu_enabled, kvm))
> > return -EIO;
> > --
> >
> > Which I suppose isn't that terrible?
> Is it good if we test is_page_fault_stale()? e.g.,
No, because it can only get false positives, e.g. due to an mmu_notifier invalidation
on shared, non-guest_memfd memory. Though a sanity check would be nice to have;
I believe we can simply do:
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c5734ca5c17d..440fd8f80397 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1273,6 +1273,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
struct kvm_mmu_page *sp;
int ret = RET_PF_RETRY;
+ KVM_MMU_WARN_ON(!root || root->role.invalid);
+
kvm_mmu_hugepage_adjust(vcpu, fault);
trace_kvm_mmu_spte_requested(fault);
On Wed, Oct 22, 2025 at 11:12:47AM -0700, Sean Christopherson wrote:
> On Wed, Oct 22, 2025, Yan Zhao wrote:
> > On Tue, Oct 21, 2025 at 09:36:52AM -0700, Sean Christopherson wrote:
> > > On Tue, Oct 21, 2025, Yan Zhao wrote:
> > > > On Thu, Oct 16, 2025 at 05:32:22PM -0700, Sean Christopherson wrote:
> > Is it good if we test is_page_fault_stale()? e.g.,
> No, because it can only get false positives, e.g. due to an mmu_notifier invalidation
> on shared, non-guest_memfd memory. Though a sanity check would be nice to have;
Right. The false positive is annoying.

> I believe we can simply do:
>
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index c5734ca5c17d..440fd8f80397 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1273,6 +1273,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> struct kvm_mmu_page *sp;
> int ret = RET_PF_RETRY;
>
> + KVM_MMU_WARN_ON(!root || root->role.invalid);
> +
> kvm_mmu_hugepage_adjust(vcpu, fault);
>
> trace_kvm_mmu_spte_requested(fault);
Ok.
On Thu, 2025-10-16 at 17:32 -0700, Sean Christopherson wrote:
> Add and use a new API for mapping a private pfn from guest_memfd into the
> TDP MMU from TDX's post-populate hook instead of partially open-coding the
> functionality into the TDX code. Sharing code with the pre-fault path
> sounded good on paper, but it's fatally flawed as simulating a fault loses
> the pfn, and calling back into gmem to re-retrieve the pfn creates locking
> problems, e.g. kvm_gmem_populate() already holds the gmem invalidation
> lock.
>
> Providing a dedicated API will also removing several MMU exports that
> ideally would not be exposed outside of the MMU, let alone to vendor code.
> On that topic, opportunistically drop the kvm_mmu_load() export. Leave
> kvm_tdp_mmu_gpa_is_mapped() alone for now; the entire commit that added
> kvm_tdp_mmu_gpa_is_mapped() will be removed in the near future.
>
> Cc: Michael Roth <michael.roth@amd.com>
> Cc: Yan Zhao <yan.y.zhao@intel.com>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Cc: Vishal Annapurve <vannapurve@google.com>
> Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
> Link: https://lore.kernel.org/all/20250709232103.zwmufocd3l7sqk7y@amd.com
> Signed-off-by: Sean Christopherson <seanjc@google.com>

Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>