Dynamically allocate the (massive) array of hashed lists used to track
shadow pages, as the array itself is 32KiB, i.e. is an order-3 allocation
all on its own, and is *exactly* an order-3 allocation. Dynamically
allocating the array will allow allocating "struct kvm" using regular
kmalloc(), and will also allow deferring allocation of the array until
it's actually needed, i.e. until the first shadow root is allocated.
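
For context, the "exactly an order-3 allocation" math, assuming KVM_NUM_MMU_PAGES
is 1 << 12 and that struct hlist_head is a single 8-byte pointer on a 64-bit
kernel (a standalone, illustrative sketch, not part of the patch):

        /* Illustrative only; mirrors the KVM_NUM_MMU_PAGES sizing math. */
        #include <stdio.h>

        int main(void)
        {
                const unsigned long buckets = 1UL << 12;          /* KVM_NUM_MMU_PAGES */
                const unsigned long bucket_size = sizeof(void *); /* hlist_head is one pointer */
                const unsigned long bytes = buckets * bucket_size;
                const unsigned long pages = bytes / 4096;         /* 4KiB pages */

                /* 4096 * 8 = 32768 bytes = 32KiB = 8 pages = 2^3 pages, i.e. order-3 */
                printf("%lu bytes = %lu pages = order-%d\n", bytes, pages,
                       __builtin_ctzl(pages));
                return 0;
        }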
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/kvm_host.h | 4 ++--
arch/x86/kvm/mmu/mmu.c | 21 ++++++++++++++++++++-
arch/x86/kvm/x86.c | 5 ++++-
3 files changed, 26 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d881e7d276b1..6ead9e57446a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1332,7 +1332,7 @@ struct kvm_arch {
bool has_private_mem;
bool has_protected_state;
bool pre_fault_allowed;
- struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
+ struct hlist_head *mmu_page_hash;
struct list_head active_mmu_pages;
/*
* A list of kvm_mmu_page structs that, if zapped, could possibly be
@@ -1984,7 +1984,7 @@ void kvm_mmu_vendor_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
-void kvm_mmu_init_vm(struct kvm *kvm);
+int kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 63bb77ee1bb1..b878f2e89dec 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3880,6 +3880,18 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
return r;
}
+static int kvm_mmu_alloc_page_hash(struct kvm *kvm)
+{
+ typeof(kvm->arch.mmu_page_hash) h;
+
+ h = kcalloc(KVM_NUM_MMU_PAGES, sizeof(*h), GFP_KERNEL_ACCOUNT);
+ if (!h)
+ return -ENOMEM;
+
+ kvm->arch.mmu_page_hash = h;
+ return 0;
+}
+
static int mmu_first_shadow_root_alloc(struct kvm *kvm)
{
struct kvm_memslots *slots;
@@ -6673,13 +6685,19 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
kvm_tdp_mmu_zap_invalidated_roots(kvm, true);
}
-void kvm_mmu_init_vm(struct kvm *kvm)
+int kvm_mmu_init_vm(struct kvm *kvm)
{
+ int r;
+
kvm->arch.shadow_mmio_value = shadow_mmio_value;
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
+ r = kvm_mmu_alloc_page_hash(kvm);
+ if (r)
+ return r;
+
if (tdp_mmu_enabled)
kvm_mmu_init_tdp_mmu(kvm);
@@ -6690,6 +6708,7 @@ void kvm_mmu_init_vm(struct kvm *kvm)
kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
+ return 0;
}
static void mmu_free_vm_memory_caches(struct kvm *kvm)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 69c20a68a3f0..a1d85740d6e7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12704,7 +12704,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (ret)
goto out;
- kvm_mmu_init_vm(kvm);
+ ret = kvm_mmu_init_vm(kvm);
+ if (ret)
+ goto out_cleanup_page_track;
ret = kvm_x86_call(vm_init)(kvm);
if (ret)
@@ -12757,6 +12759,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
out_uninit_mmu:
kvm_mmu_uninit_vm(kvm);
+out_cleanup_page_track:
kvm_page_track_cleanup(kvm);
out:
return ret;
--
2.49.0.rc1.451.g8f38331e32-goog
On 2025-03-14 19:40:08, Sean Christopherson wrote:
> Dynamically allocate the (massive) array of hashed lists used to track
> shadow pages, as the array itself is 32KiB, i.e. is an order-3 allocation
> all on its own, and is *exactly* an order-3 allocation. Dynamically
> allocating the array will allow allocating "struct kvm" using regular
> kmalloc(), and will also allow deferring allocation of the array until
> it's actually needed, i.e. until the first shadow root is allocated.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> arch/x86/include/asm/kvm_host.h | 4 ++--
> arch/x86/kvm/mmu/mmu.c | 21 ++++++++++++++++++++-
> arch/x86/kvm/x86.c | 5 ++++-
> 3 files changed, 26 insertions(+), 4 deletions(-)
>
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -6673,13 +6685,19 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
> kvm_tdp_mmu_zap_invalidated_roots(kvm, true);
> }
>
> -void kvm_mmu_init_vm(struct kvm *kvm)
> +int kvm_mmu_init_vm(struct kvm *kvm)
> {
> + int r;
> +
> kvm->arch.shadow_mmio_value = shadow_mmio_value;
> INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
> INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
> spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
>
> + r = kvm_mmu_alloc_page_hash(kvm);
> + if (r)
> + return r;
> +
In patch 3, shouldn't this be moved to the else part of the
'if (tdp_mmu_enabled)' check below? Otherwise, the hash array will always get
allocated, even when the TDP MMU is in use.
> if (tdp_mmu_enabled)
> kvm_mmu_init_tdp_mmu(kvm);
>
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -12704,7 +12704,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
> if (ret)
> goto out;
>
> - kvm_mmu_init_vm(kvm);
> + ret = kvm_mmu_init_vm(kvm);
> + if (ret)
> + goto out_cleanup_page_track;
>
> ret = kvm_x86_call(vm_init)(kvm);
> if (ret)
> @@ -12757,6 +12759,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>
> out_uninit_mmu:
> kvm_mmu_uninit_vm(kvm);
> +out_cleanup_page_track:
I think there is a memory leak in this series.
1. kvm_mmu_uninit_vm() does not free kvm->arch.mmu_page_hash, so in the
   error path, out_uninit_mmu will not reclaim the memory allocated by
   kvm_mmu_alloc_page_hash().
2. The same thing happens when the VM terminates or is killed: nobody
   reclaims the memory.
> kvm_page_track_cleanup(kvm);
> out:
> return ret;
On Mon, Mar 17, 2025, Vipin Sharma wrote:
> On 2025-03-14 19:40:08, Sean Christopherson wrote:
> > Dynamically allocate the (massive) array of hashed lists used to track
> > shadow pages, as the array itself is 32KiB, i.e. is an order-3 allocation
> > all on its own, and is *exactly* an order-3 allocation. Dynamically
> > allocating the array will allow allocating "struct kvm" using regular
> > kmalloc(), and will also allow deferring allocation of the array until
> > it's actually needed, i.e. until the first shadow root is allocated.
> >
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > ---
> > arch/x86/include/asm/kvm_host.h | 4 ++--
> > arch/x86/kvm/mmu/mmu.c | 21 ++++++++++++++++++++-
> > arch/x86/kvm/x86.c | 5 ++++-
> > 3 files changed, 26 insertions(+), 4 deletions(-)
> >
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -6673,13 +6685,19 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
> > kvm_tdp_mmu_zap_invalidated_roots(kvm, true);
> > }
> >
> > -void kvm_mmu_init_vm(struct kvm *kvm)
> > +int kvm_mmu_init_vm(struct kvm *kvm)
> > {
> > + int r;
> > +
> > kvm->arch.shadow_mmio_value = shadow_mmio_value;
> > INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
> > INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
> > spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
> >
> > + r = kvm_mmu_alloc_page_hash(kvm);
> > + if (r)
> > + return r;
> > +
>
> In patch 3, shouldn't this be moved to the else part of the
> 'if (tdp_mmu_enabled)' check below? Otherwise, the hash array will always get
> allocated, even when the TDP MMU is in use.
Ugh, I botched the rebase and didn't verify that the allocations actually
went away.
Before commit 0df9dab891ff ("KVM: x86/mmu: Stop zapping invalidated TDP MMU roots
asynchronously"), kvm_mmu_init_tdp_mmu() returned a value and so the code was:
        if (tdp_mmu_enabled)
                r = kvm_mmu_init_tdp_mmu(kvm);
        else
                r = kvm_mmu_alloc_page_hash(kvm);
        if (r < 0)
                return r;
I suppose the least ugly approach is:
        if (tdp_mmu_enabled) {
                kvm_mmu_init_tdp_mmu(kvm);
        } else {
                r = kvm_mmu_alloc_page_hash(kvm);
                if (r)
                        return r;
        }
> > if (tdp_mmu_enabled)
> > kvm_mmu_init_tdp_mmu(kvm);
> >
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -12704,7 +12704,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
> > if (ret)
> > goto out;
> >
> > - kvm_mmu_init_vm(kvm);
> > + ret = kvm_mmu_init_vm(kvm);
> > + if (ret)
> > + goto out_cleanup_page_track;
> >
> > ret = kvm_x86_call(vm_init)(kvm);
> > if (ret)
> > @@ -12757,6 +12759,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
> >
> > out_uninit_mmu:
> > kvm_mmu_uninit_vm(kvm);
> > +out_cleanup_page_track:
>
> I think there is a memory leak in this series.
/facepalm
Good job, me.
Thanks for the review!
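
For completeness, something like the below would plug the leak by freeing the
hash in kvm_mmu_uninit_vm(), covering both the kvm_arch_init_vm() error path
(out_uninit_mmu) and normal VM destruction. The surrounding function body is
assumed from the existing code; only the kfree() is new, and the exact
placement is illustrative, not final:

        void kvm_mmu_uninit_vm(struct kvm *kvm)
        {
                if (tdp_mmu_enabled)
                        kvm_mmu_uninit_tdp_mmu(kvm);

                mmu_free_vm_memory_caches(kvm);

                /*
                 * Pairs with kvm_mmu_alloc_page_hash(); harmless if the hash
                 * was never allocated, as kfree(NULL) is a nop.
                 */
                kfree(kvm->arch.mmu_page_hash);
                kvm->arch.mmu_page_hash = NULL;
        }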