Rework kvm_mmu_max_mapping_level() to provide the plumbing to consult
guest_memfd (and relevant vendor code) when recovering hugepages, e.g.
after disabling live migration. The flaw has existed since guest_memfd was
originally added, but has gone unnoticed due to lack of guest_memfd support
for hugepages or dirty logging.

Don't actually call into guest_memfd at this time, as it's unclear what the
API should be. Ideally, KVM would simply use kvm_gmem_get_pfn(), but invoking
kvm_gmem_get_pfn() would lead to sleeping in atomic context if guest_memfd
needed to allocate memory (mmu_lock is held). Luckily, the path isn't
actually reachable, so just add a TODO and WARN to ensure the functionality
is added alongside guest_memfd hugepage support, and punt the guest_memfd
API design question to the future.
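
For illustration only, a rough sketch of the shape the eventual non-fault
lookup could take once guest_memfd supports hugepages; the
kvm_gmem_get_pfn_nosleep() helper below is purely hypothetical and is not
introduced by this patch:

	/*
	 * Hypothetical sketch, not part of this patch: a non-sleeping
	 * guest_memfd lookup for the recovery (!fault) path, replacing
	 * the TODO + WARN added below.
	 */
	if (!fault) {
		int order;

		if (kvm_gmem_get_pfn_nosleep(kvm, slot, gfn, &pfn, &order))
			return PG_LEVEL_4K;

		max_level = kvm_max_level_for_order(order);
	}
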
Note, calling kvm_mem_is_private() in the non-fault path is safe, so long
as mmu_lock is held, as hugepage recovery operates on shadow-present SPTEs,
i.e. calling kvm_mmu_max_mapping_level() with @fault=NULL is mutually
exclusive with kvm_vm_set_mem_attributes() changing the PRIVATE attribute
of the gfn.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kvm/mmu/mmu.c | 82 +++++++++++++++++++--------------
arch/x86/kvm/mmu/mmu_internal.h | 2 +-
arch/x86/kvm/mmu/tdp_mmu.c | 2 +-
3 files changed, 49 insertions(+), 37 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 20dd9f64156e..61eb9f723675 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3302,31 +3302,54 @@ static u8 kvm_max_level_for_order(int order)
return PG_LEVEL_4K;
}
-static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
- u8 max_level, int gmem_order)
+static u8 kvm_max_private_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
+ const struct kvm_memory_slot *slot, gfn_t gfn)
{
- u8 req_max_level;
+ u8 max_level, coco_level;
+ kvm_pfn_t pfn;
- if (max_level == PG_LEVEL_4K)
- return PG_LEVEL_4K;
+ /* For faults, use the gmem information that was resolved earlier. */
+ if (fault) {
+ pfn = fault->pfn;
+ max_level = fault->max_level;
+ } else {
+ /* TODO: Call into guest_memfd once hugepages are supported. */
+ WARN_ONCE(1, "Get pfn+order from guest_memfd");
+ pfn = KVM_PFN_ERR_FAULT;
+ max_level = PG_LEVEL_4K;
+ }
- max_level = min(kvm_max_level_for_order(gmem_order), max_level);
if (max_level == PG_LEVEL_4K)
- return PG_LEVEL_4K;
+ return max_level;
- req_max_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn);
- if (req_max_level)
- max_level = min(max_level, req_max_level);
+ /*
+ * CoCo may influence the max mapping level, e.g. due to RMP or S-EPT
+ * restrictions. A return of '0' means "no additional restrictions", to
+ * allow for using an optional "ret0" static call.
+ */
+ coco_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn);
+ if (coco_level)
+ max_level = min(max_level, coco_level);
return max_level;
}
-static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- gfn_t gfn, int max_level, bool is_private)
+int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
+ const struct kvm_memory_slot *slot, gfn_t gfn)
{
struct kvm_lpage_info *linfo;
- int host_level;
+ int host_level, max_level;
+ bool is_private;
+
+ lockdep_assert_held(&kvm->mmu_lock);
+
+ if (fault) {
+ max_level = fault->max_level;
+ is_private = fault->is_private;
+ } else {
+ max_level = PG_LEVEL_NUM;
+ is_private = kvm_mem_is_private(kvm, gfn);
+ }
max_level = min(max_level, max_huge_page_level);
for ( ; max_level > PG_LEVEL_4K; max_level--) {
@@ -3335,25 +3358,16 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
break;
}
+ if (max_level == PG_LEVEL_4K)
+ return PG_LEVEL_4K;
+
if (is_private)
- return max_level;
-
- if (max_level == PG_LEVEL_4K)
- return PG_LEVEL_4K;
-
- host_level = host_pfn_mapping_level(kvm, gfn, slot);
+ host_level = kvm_max_private_mapping_level(kvm, fault, slot, gfn);
+ else
+ host_level = host_pfn_mapping_level(kvm, gfn, slot);
return min(host_level, max_level);
}
-int kvm_mmu_max_mapping_level(struct kvm *kvm,
- const struct kvm_memory_slot *slot, gfn_t gfn)
-{
- bool is_private = kvm_slot_has_gmem(slot) &&
- kvm_mem_is_private(kvm, gfn);
-
- return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private);
-}
-
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_memory_slot *slot = fault->slot;
@@ -3374,9 +3388,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* Enforce the iTLB multihit workaround after capturing the requested
* level, which will be used to do precise, accurate accounting.
*/
- fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot,
- fault->gfn, fault->max_level,
- fault->is_private);
+ fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, fault,
+ fault->slot, fault->gfn);
if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
return;
@@ -4564,8 +4577,7 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
}
fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
- fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn,
- fault->max_level, max_order);
+ fault->max_level = kvm_max_level_for_order(max_order);
return RET_PF_CONTINUE;
}
@@ -7165,7 +7177,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
* mapping if the indirect sp has level = 1.
*/
if (sp->role.direct &&
- sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) {
+ sp->role.level < kvm_mmu_max_mapping_level(kvm, NULL, slot, sp->gfn)) {
kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
if (kvm_available_flush_remote_tlbs_range())
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 65f3c89d7c5d..b776be783a2f 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -411,7 +411,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
return r;
}
-int kvm_mmu_max_mapping_level(struct kvm *kvm,
+int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
const struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7f3d7229b2c1..740cb06accdb 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1813,7 +1813,7 @@ static void recover_huge_pages_range(struct kvm *kvm,
if (iter.gfn < start || iter.gfn >= end)
continue;
- max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn);
+ max_mapping_level = kvm_mmu_max_mapping_level(kvm, NULL, slot, iter.gfn);
if (max_mapping_level < iter.level)
continue;
--
2.50.1.552.g942d659e1b-goog
On Tue, 29 Jul 2025 at 23:55, Sean Christopherson <seanjc@google.com> wrote:
>
> Don't actually call into guest_memfd at this time, as it's unclear as to
> what the API should be. Ideally, KVM would simply use kvm_gmem_get_pfn(),
> but invoking kvm_gmem_get_pfn() would lead to sleeping in atomic context
> if guest_memfd needed to allocate memory (mmu_lock is held). Luckily,
> the path isn't actually reachable, so just add a TODO and WARN to ensure
> the functionality is added alongisde guest_memfd hugepage support, and
> punt the guest_memfd API design question to the future.
nit: *alongside
Reviewed-by: Fuad Tabba <tabba@google.com>
Cheers,
/fuad
On 7/30/2025 6:54 AM, Sean Christopherson wrote:
> -static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
> - u8 max_level, int gmem_order)
> +static u8 kvm_max_private_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
> + const struct kvm_memory_slot *slot, gfn_t gfn)
I don't see why slot and gfn are needed here. Just to keep consistent
with host_pfn_mapping_level()?
On 30.07.25 09:33, Xiaoyao Li wrote:
> On 7/30/2025 6:54 AM, Sean Christopherson wrote:
>> -static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
>> -					u8 max_level, int gmem_order)
>> +static u8 kvm_max_private_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
>> +					const struct kvm_memory_slot *slot, gfn_t gfn)
>
> I don't see why slot and gfn are needed here. Just to keep consistent
> with host_pfn_mapping_level()?

I assume as a preparation to implement the TODO.

Reviewed-by: David Hildenbrand <david@redhat.com>

--
Cheers,

David / dhildenb