When a large VM, specifically one that holds a significant number of PTEs,
is abruptly destroyed, the following warning is seen during the
stage-2 page-table walk:
sched: CPU 0 need_resched set for > 100018840 ns (100 ticks) without schedule
CPU: 0 UID: 0 PID: 9617 Comm: kvm_page_table_ Tainted: G O 6.16.0-smp-DEV #3 NONE
Tainted: [O]=OOT_MODULE
Call trace:
show_stack+0x20/0x38 (C)
dump_stack_lvl+0x3c/0xb8
dump_stack+0x18/0x30
resched_latency_warn+0x7c/0x88
sched_tick+0x1c4/0x268
update_process_times+0xa8/0xd8
tick_nohz_handler+0xc8/0x168
__hrtimer_run_queues+0x11c/0x338
hrtimer_interrupt+0x104/0x308
arch_timer_handler_phys+0x40/0x58
handle_percpu_devid_irq+0x8c/0x1b0
generic_handle_domain_irq+0x48/0x78
gic_handle_irq+0x1b8/0x408
call_on_irq_stack+0x24/0x30
do_interrupt_handler+0x54/0x78
el1_interrupt+0x44/0x88
el1h_64_irq_handler+0x18/0x28
el1h_64_irq+0x84/0x88
stage2_free_walker+0x30/0xa0 (P)
__kvm_pgtable_walk+0x11c/0x258
__kvm_pgtable_walk+0x180/0x258
__kvm_pgtable_walk+0x180/0x258
__kvm_pgtable_walk+0x180/0x258
kvm_pgtable_walk+0xc4/0x140
kvm_pgtable_stage2_destroy+0x5c/0xf0
kvm_free_stage2_pgd+0x6c/0xe8
kvm_uninit_stage2_mmu+0x24/0x48
kvm_arch_flush_shadow_all+0x80/0xa0
kvm_mmu_notifier_release+0x38/0x78
__mmu_notifier_release+0x15c/0x250
exit_mmap+0x68/0x400
__mmput+0x38/0x1c8
mmput+0x30/0x68
exit_mm+0xd4/0x198
do_exit+0x1a4/0xb00
do_group_exit+0x8c/0x120
get_signal+0x6d4/0x778
do_signal+0x90/0x718
do_notify_resume+0x70/0x170
el0_svc+0x74/0xd8
el0t_64_sync_handler+0x60/0xc8
el0t_64_sync+0x1b0/0x1b8
The warning is mostly seen on host kernels that are configured without
forced preemption, such as CONFIG_PREEMPT_NONE=y. To avoid it, instead of
walking the entire page-table in one go, split the walk into smaller
ranges and call cond_resched() between them. Since this path runs during
VM destruction, after the page-table structure has been unlinked from the
KVM MMU, relying on cond_resched_rwlock_write() isn't necessary.
Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
---
arch/arm64/kvm/mmu.c | 38 ++++++++++++++++++++++++++++++++++++--
1 file changed, 36 insertions(+), 2 deletions(-)
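Note: the per-iteration boundary comes from stage2_range_addr_end(), which
mmu.c already uses for other ranged walks. A minimal sketch of that kind of
helper, assuming the usual <walker>_addr_end pattern with a block-granule
chunk size (illustrative only, not the exact in-tree definition):

/* Illustrative helper: cap each chunk at one block-granule boundary. */
static phys_addr_t stage2_range_addr_end_sketch(phys_addr_t addr, phys_addr_t end)
{
        phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
        phys_addr_t boundary = ALIGN_DOWN(addr + size, size);

        /* Clamp to @end; the "- 1" form also copes with end wrapping to 0. */
        return (boundary - 1 < end - 1) ? boundary : end;
}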
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 2942ec92c5a4..6c4b9fb1211b 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -387,6 +387,40 @@ static void stage2_flush_vm(struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, idx);
}
+/*
+ * Assume that @pgt is valid and unlinked from the KVM MMU to free the
+ * page-table without taking the kvm_mmu_lock and without performing any
+ * TLB invalidations.
+ *
+ * Also, the range of addresses can be large enough to cause need_resched
+ * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke
+ * cond_resched() periodically to prevent hogging the CPU for a long time
+ * and schedule something else, if required.
+ */
+static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr,
+ phys_addr_t end)
+{
+ u64 next;
+
+ do {
+ next = stage2_range_addr_end(addr, end);
+ kvm_pgtable_stage2_destroy_range(pgt, addr, next - addr);
+
+ if (next != end)
+ cond_resched();
+ } while (addr = next, addr != end);
+}
+
+static void kvm_destroy_stage2_pgt(struct kvm_pgtable *pgt)
+{
+ if (!is_protected_kvm_enabled()) {
+ stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits));
+ kvm_pgtable_stage2_destroy_pgd(pgt);
+ } else {
+ pkvm_pgtable_stage2_destroy(pgt);
+ }
+}
+
/**
* free_hyp_pgds - free Hyp-mode page tables
*/
@@ -984,7 +1018,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
return 0;
out_destroy_pgtable:
- KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
+ kvm_destroy_stage2_pgt(pgt);
out_free_pgtable:
kfree(pgt);
return err;
@@ -1081,7 +1115,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
write_unlock(&kvm->mmu_lock);
if (pgt) {
- KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
+ kvm_destroy_stage2_pgt(pgt);
kfree(pgt);
}
}
--
2.50.1.470.g6ba607880d-goog
On Thu, Jul 24, 2025 at 11:51:44PM +0000, Raghavendra Rao Ananta wrote:
> +/*
> + * Assume that @pgt is valid and unlinked from the KVM MMU to free the
> + * page-table without taking the kvm_mmu_lock and without performing any
> + * TLB invalidations.
> + *
> + * Also, the range of addresses can be large enough to cause need_resched
> + * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke
> + * cond_resched() periodically to prevent hogging the CPU for a long time
> + * and schedule something else, if required.
> + */
> +static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr,
> + phys_addr_t end)
> +{
> + u64 next;
> +
> + do {
> + next = stage2_range_addr_end(addr, end);
> + kvm_pgtable_stage2_destroy_range(pgt, addr, next - addr);
> +
> + if (next != end)
> + cond_resched();
> + } while (addr = next, addr != end);
> +}
> +
> +static void kvm_destroy_stage2_pgt(struct kvm_pgtable *pgt)
> +{
> + if (!is_protected_kvm_enabled()) {
> + stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits));
> + kvm_pgtable_stage2_destroy_pgd(pgt);
> + } else {
> + pkvm_pgtable_stage2_destroy(pgt);
> + }
> +}
> +
Protected mode is affected by the same problem, potentially even worse
due to the overheads of calling into EL2. Both protected and
non-protected flows should use stage2_destroy_range().
Thanks,
Oliver
Hi Oliver,
>
> Protected mode is affected by the same problem, potentially even worse
> due to the overheads of calling into EL2. Both protected and
> non-protected flows should use stage2_destroy_range().
>
I experimented with this (see diff below), and it looks like it takes
significantly longer to finish the destruction even for a very small
VM. For instance, it takes ~140 seconds on an Ampere Altra machine.
This is probably because we call cond_resched() for every chunk across
the entire sweep of the possible address range, 0 to ~(0ULL), even
though most of it has no actual mappings, so we end up context-switching
out more often.
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
+ static void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+ {
+ u64 end = is_protected_kvm_enabled() ? ~(0ULL) : BIT(pgt->ia_bits);
+ u64 next, addr = 0;
+
+ do {
+ next = stage2_range_addr_end(addr, end);
+ KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr,
+ next - addr);
+
+ if (next != end)
+ cond_resched();
+ } while (addr = next, addr != end);
+
+
+ KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt);
+ }
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -316,9 +316,13 @@ static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 e
return 0;
}
-void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
+}
+
+void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
+{
+}
Without cond_resched() in place, it takes about half the time.
I also tried moving cond_resched() into __pkvm_pgtable_stage2_unmap(),
as per the diff below, and calling pkvm_pgtable_stage2_destroy_range()
for the entire 0 to ~(0ULL) range (instead of breaking it up). Even
for a fully 4K-mapped 128G VM, I see it taking ~65 seconds, which is
close to the baseline (no cond_resched()).
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -311,8 +311,11 @@ static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 e
return ret;
pkvm_mapping_remove(mapping, &pgt->pkvm_mappings);
kfree(mapping);
+ cond_resched();
}
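For clarity, the corresponding caller side under this experiment might look
roughly like the following sketch (illustrative, not taken from the actual
diff):

/*
 * Rough sketch of the caller for this experiment: protected mode passes
 * the full range in one call and relies on the cond_resched() inside
 * __pkvm_pgtable_stage2_unmap(), while the non-protected path keeps the
 * chunked walk.
 */
static void kvm_stage2_destroy_sketch(struct kvm_pgtable *pgt)
{
        if (is_protected_kvm_enabled()) {
                pkvm_pgtable_stage2_destroy_range(pgt, 0, ~(0ULL));
                pkvm_pgtable_stage2_destroy_pgd(pgt);
        } else {
                stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits));
                kvm_pgtable_stage2_destroy_pgd(pgt);
        }
}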
Does it make sense to call cond_resched() only when we actually unmap pages?
Thank you.
Raghavendra
On Thu, Aug 07, 2025 at 11:58:01AM -0700, Raghavendra Rao Ananta wrote:
> Hi Oliver,
>
> >
> > Protected mode is affected by the same problem, potentially even worse
> > due to the overheads of calling into EL2. Both protected and
> > non-protected flows should use stage2_destroy_range().
> >
> I experimented with this (see diff below), and it looks like it takes
> significantly longer to finish the destruction even for a very small
> VM. For instance, it takes ~140 seconds on an Ampere Altra machine.
> This is probably because we run cond_resched() for every breakup in
> the entire sweep of the possible address range, 0 to ~(0ULL), even
> though there are no actual mappings there, and we context switch out
> more often.
This seems more like an issue with the upper bound on a pKVM walk rather
than a problem with the suggestion. The information in pgt->ia_bits is
actually derived from the VTCR value of the owning MMU.
Even though we never use the VTCR value in hardware, pKVM MMUs have a
valid VTCR value that encodes the size of the IPA space and we use that
in the common stage-2 abort path.
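Concretely, the destroy path can derive its upper bound from the saved VTCR
rather than sweeping the whole 64-bit space; a small sketch, assuming the
existing VTCR_EL2_IPA() definition (64 - T0SZ) and the pgt->mmu back-pointer:

/*
 * Sketch: recover the IPA limit from the MMU's VTCR, which pKVM MMUs
 * also populate even though the value never reaches hardware.
 */
static u64 stage2_ipa_limit(struct kvm_pgtable *pgt)
{
        unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr);

        return BIT(ia_bits);    /* upper bound for the destroy walk */
}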
I'm attaching some fixups that I have on top of your series that'd allow
the resched logic to remain common, like it is in other MMU flows.
From 421468dcaa4692208c3f708682b058cfc072a984 Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton@linux.dev>
Date: Fri, 8 Aug 2025 11:43:12 -0700
Subject: [PATCH 4/4] fixup! KVM: arm64: Destroy the stage-2 page-table
periodically
---
arch/arm64/kvm/mmu.c | 60 ++++++++++++++++++--------------------------
1 file changed, 25 insertions(+), 35 deletions(-)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index b82412323054..fc93cc256bd8 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -383,40 +383,6 @@ static void stage2_flush_vm(struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, idx);
}
-/*
- * Assume that @pgt is valid and unlinked from the KVM MMU to free the
- * page-table without taking the kvm_mmu_lock and without performing any
- * TLB invalidations.
- *
- * Also, the range of addresses can be large enough to cause need_resched
- * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke
- * cond_resched() periodically to prevent hogging the CPU for a long time
- * and schedule something else, if required.
- */
-static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr,
- phys_addr_t end)
-{
- u64 next;
-
- do {
- next = stage2_range_addr_end(addr, end);
- kvm_pgtable_stage2_destroy_range(pgt, addr, next - addr);
-
- if (next != end)
- cond_resched();
- } while (addr = next, addr != end);
-}
-
-static void kvm_destroy_stage2_pgt(struct kvm_pgtable *pgt)
-{
- if (!is_protected_kvm_enabled()) {
- stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits));
- kvm_pgtable_stage2_destroy_pgd(pgt);
- } else {
- pkvm_pgtable_stage2_destroy(pgt);
- }
-}
-
/**
* free_hyp_pgds - free Hyp-mode page tables
*/
@@ -938,11 +904,35 @@ static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
return 0;
}
+/*
+ * Assume that @pgt is valid and unlinked from the KVM MMU to free the
+ * page-table without taking the kvm_mmu_lock and without performing any
+ * TLB invalidations.
+ *
+ * Also, the range of addresses can be large enough to cause need_resched
+ * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke
+ * cond_resched() periodically to prevent hogging the CPU for a long time
+ * and schedule something else, if required.
+ */
+static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr,
+ phys_addr_t end)
+{
+ u64 next;
+
+ do {
+ next = stage2_range_addr_end(addr, end);
+ KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr, next - addr);
+
+ if (next != end)
+ cond_resched();
+ } while (addr = next, addr != end);
+}
+
static void kvm_stage2_destroy(struct kvm_pgtable *pgt)
{
unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr);
- KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, 0, BIT(ia_bits));
+ stage2_destroy_range(pgt, 0, BIT(ia_bits));
KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt);
}
--
2.39.5
Heh, without full context, the shortlog reads like "destroy stage-2 page tables
from time to time". Something like this would be more appropriate:

  KVM: arm64: Reschedule as needed when destroying stage-2 page-tables
On Fri, Jul 25, 2025 at 7:59 AM Sean Christopherson <seanjc@google.com> wrote:
>
> Heh, without full context, the shortlog reads like "destroy stage-2 page tables
> from time to time". Something like this would be more appropriate:
>
>   KVM: arm64: Reschedule as needed when destroying stage-2 page-tables

This definitely sounds better :)