When a large VM, specifically one that holds a significant number of PTEs,
gets abruptly destroyed, the following warning is seen during the
page-table walk:
sched: CPU 0 need_resched set for > 100018840 ns (100 ticks) without schedule
CPU: 0 UID: 0 PID: 9617 Comm: kvm_page_table_ Tainted: G O 6.16.0-smp-DEV #3 NONE
Tainted: [O]=OOT_MODULE
Call trace:
show_stack+0x20/0x38 (C)
dump_stack_lvl+0x3c/0xb8
dump_stack+0x18/0x30
resched_latency_warn+0x7c/0x88
sched_tick+0x1c4/0x268
update_process_times+0xa8/0xd8
tick_nohz_handler+0xc8/0x168
__hrtimer_run_queues+0x11c/0x338
hrtimer_interrupt+0x104/0x308
arch_timer_handler_phys+0x40/0x58
handle_percpu_devid_irq+0x8c/0x1b0
generic_handle_domain_irq+0x48/0x78
gic_handle_irq+0x1b8/0x408
call_on_irq_stack+0x24/0x30
do_interrupt_handler+0x54/0x78
el1_interrupt+0x44/0x88
el1h_64_irq_handler+0x18/0x28
el1h_64_irq+0x84/0x88
stage2_free_walker+0x30/0xa0 (P)
__kvm_pgtable_walk+0x11c/0x258
__kvm_pgtable_walk+0x180/0x258
__kvm_pgtable_walk+0x180/0x258
__kvm_pgtable_walk+0x180/0x258
kvm_pgtable_walk+0xc4/0x140
kvm_pgtable_stage2_destroy+0x5c/0xf0
kvm_free_stage2_pgd+0x6c/0xe8
kvm_uninit_stage2_mmu+0x24/0x48
kvm_arch_flush_shadow_all+0x80/0xa0
kvm_mmu_notifier_release+0x38/0x78
__mmu_notifier_release+0x15c/0x250
exit_mmap+0x68/0x400
__mmput+0x38/0x1c8
mmput+0x30/0x68
exit_mm+0xd4/0x198
do_exit+0x1a4/0xb00
do_group_exit+0x8c/0x120
get_signal+0x6d4/0x778
do_signal+0x90/0x718
do_notify_resume+0x70/0x170
el0_svc+0x74/0xd8
el0t_64_sync_handler+0x60/0xc8
el0t_64_sync+0x1b0/0x1b8
The warning is mostly seen on host kernels that are configured without
forced preemption, such as CONFIG_PREEMPT_NONE=y. To avoid this, instead
of walking the entire page-table in one go, split the walk into smaller
ranges and call cond_resched() between each range. Since this path runs
during VM destruction, after the page-table structure is unlinked from
the KVM MMU, relying on cond_resched_rwlock_write() isn't necessary.
Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
---
arch/arm64/kvm/mmu.c | 38 ++++++++++++++++++++++++++++++++++++--
1 file changed, 36 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 2942ec92c5a4..6c4b9fb1211b 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -387,6 +387,40 @@ static void stage2_flush_vm(struct kvm *kvm)
 	srcu_read_unlock(&kvm->srcu, idx);
 }
 
+/*
+ * Assume that @pgt is valid and unlinked from the KVM MMU to free the
+ * page-table without taking the kvm_mmu_lock and without performing any
+ * TLB invalidations.
+ *
+ * Also, the range of addresses can be large enough to cause need_resched
+ * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke
+ * cond_resched() periodically to prevent hogging the CPU for a long time
+ * and schedule something else, if required.
+ */
+static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr,
+				 phys_addr_t end)
+{
+	u64 next;
+
+	do {
+		next = stage2_range_addr_end(addr, end);
+		kvm_pgtable_stage2_destroy_range(pgt, addr, next - addr);
+
+		if (next != end)
+			cond_resched();
+	} while (addr = next, addr != end);
+}
+
+static void kvm_destroy_stage2_pgt(struct kvm_pgtable *pgt)
+{
+	if (!is_protected_kvm_enabled()) {
+		stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits));
+		kvm_pgtable_stage2_destroy_pgd(pgt);
+	} else {
+		pkvm_pgtable_stage2_destroy(pgt);
+	}
+}
+
 /**
  * free_hyp_pgds - free Hyp-mode page tables
  */
@@ -984,7 +1018,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
 	return 0;
 
 out_destroy_pgtable:
-	KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
+	kvm_destroy_stage2_pgt(pgt);
 out_free_pgtable:
 	kfree(pgt);
 	return err;
@@ -1081,7 +1115,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
 	write_unlock(&kvm->mmu_lock);
 
 	if (pgt) {
-		KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
+		kvm_destroy_stage2_pgt(pgt);
 		kfree(pgt);
 	}
 }
--
2.50.1.470.g6ba607880d-goog
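
For readers skimming the diff above: the splitting relies on stage2_range_addr_end() to cap each iteration at the next block-aligned boundary, so the walker gives cond_resched() a chance at a bounded interval. The standalone sketch below only illustrates that chunk-and-yield pattern; the 1 GiB stride, helper names, and callback shapes are assumptions made for the example, not the kernel's exact definitions.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stride only: assume the walk yields at 1 GiB boundaries. */
#define EXAMPLE_CHUNK_SIZE	(1ULL << 30)

/*
 * Return the end of the current chunk: the next chunk-aligned boundary
 * above addr, or end if that comes first.  Compare on (x - 1) so a
 * boundary that wraps to 0 at the top of the 64-bit space still counts
 * as being past end.
 */
static uint64_t example_range_addr_end(uint64_t addr, uint64_t end)
{
	uint64_t boundary = (addr + EXAMPLE_CHUNK_SIZE) & ~(EXAMPLE_CHUNK_SIZE - 1);

	return (boundary - 1 < end - 1) ? boundary : end;
}

static void example_destroy_cb(uint64_t addr, uint64_t size)
{
	/* Stand-in for tearing down the page-table entries in [addr, addr + size). */
	printf("destroy [%#llx, %#llx)\n",
	       (unsigned long long)addr, (unsigned long long)(addr + size));
}

static void example_yield_cb(void)
{
	/* Stands in for cond_resched() in the kernel. */
}

/* Walk [0, end) in chunks, yielding between chunks but never inside one. */
static void example_destroy_all(uint64_t end)
{
	uint64_t addr = 0, next;

	do {
		next = example_range_addr_end(addr, end);
		example_destroy_cb(addr, next - addr);

		if (next != end)
			example_yield_cb();
	} while (addr = next, addr != end);
}

int main(void)
{
	/* A 4 GiB space split into 1 GiB chunks: four iterations. */
	example_destroy_all(1ULL << 32);
	return 0;
}

The key property is that the yield happens between chunks, never in the middle of one, so no locks or walker state are held across the reschedule point.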
On Thu, Jul 24, 2025 at 11:51:44PM +0000, Raghavendra Rao Ananta wrote:
> +/*
> + * Assume that @pgt is valid and unlinked from the KVM MMU to free the
> + * page-table without taking the kvm_mmu_lock and without performing any
> + * TLB invalidations.
> + *
> + * Also, the range of addresses can be large enough to cause need_resched
> + * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke
> + * cond_resched() periodically to prevent hogging the CPU for a long time
> + * and schedule something else, if required.
> + */
> +static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr,
> +				 phys_addr_t end)
> +{
> +	u64 next;
> +
> +	do {
> +		next = stage2_range_addr_end(addr, end);
> +		kvm_pgtable_stage2_destroy_range(pgt, addr, next - addr);
> +
> +		if (next != end)
> +			cond_resched();
> +	} while (addr = next, addr != end);
> +}
> +
> +static void kvm_destroy_stage2_pgt(struct kvm_pgtable *pgt)
> +{
> +	if (!is_protected_kvm_enabled()) {
> +		stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits));
> +		kvm_pgtable_stage2_destroy_pgd(pgt);
> +	} else {
> +		pkvm_pgtable_stage2_destroy(pgt);
> +	}
> +}
> +

Protected mode is affected by the same problem, potentially even worse
due to the overheads of calling into EL2. Both protected and
non-protected flows should use stage2_destroy_range().

Thanks,
Oliver
Hi Oliver,

>
> Protected mode is affected by the same problem, potentially even worse
> due to the overheads of calling into EL2. Both protected and
> non-protected flows should use stage2_destroy_range().
>
I experimented with this (see diff below), and it looks like it takes
significantly longer to finish the destruction even for a very small
VM. For instance, it takes ~140 seconds on an Ampere Altra machine.
This is probably because we run cond_resched() for every breakup in
the entire sweep of the possible address range, 0 to ~(0ULL), even
though there are no actual mappings there, and we context switch out
more often.

--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
+static void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+{
+	u64 end = is_protected_kvm_enabled() ? ~(0ULL) : BIT(pgt->ia_bits);
+	u64 next, addr = 0;
+
+	do {
+		next = stage2_range_addr_end(addr, end);
+		KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr,
+							     next - addr);
+
+		if (next != end)
+			cond_resched();
+	} while (addr = next, addr != end);
+
+
+	KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt);
+}

--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -316,9 +316,13 @@ static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 e
 	return 0;
 }
 
-void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+	__pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
+}
+
+void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
+{
+}

Without cond_resched() in place, it takes about half the time.

I also tried moving cond_resched() to __pkvm_pgtable_stage2_unmap(), as
per the below diff, and calling pkvm_pgtable_stage2_destroy_range() for
the entire 0 to ~(0ULL) range (instead of breaking it up). Even for a
fully 4K mapped 128G VM, I see it taking ~65 seconds, which is close to
the baseline (no cond_resched()).

--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -311,8 +311,11 @@ static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 e
 			return ret;
 		pkvm_mapping_remove(mapping, &pgt->pkvm_mappings);
 		kfree(mapping);
+		cond_resched();
 	}

Does it make sense to call cond_resched() only when we actually unmap
pages?

Thank you.
Raghavendra
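
A quick back-of-the-envelope, with assumed numbers, illustrates why the full sweep is slow even with nothing mapped: the number of chunk iterations depends only on the upper bound and the stride, not on how much of the space is populated. Taking a 1 GiB stride purely for illustration (the real stride and per-chunk cost vary):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Assumed, illustrative values only. */
	const unsigned int chunk_shift = 30;				/* 1 GiB chunks  */
	const uint64_t full_sweep_chunks = 1ULL << (64 - chunk_shift);	/* 0 to ~(0ULL)  */
	const uint64_t ipa40_chunks = 1ULL << (40 - chunk_shift);	/* 40-bit IPA    */

	printf("full 64-bit sweep: %llu chunks\n",
	       (unsigned long long)full_sweep_chunks);
	printf("40-bit IPA sweep:  %llu chunks\n",
	       (unsigned long long)ipa40_chunks);
	return 0;
}

At roughly 17 billion iterations for the full sweep, even a few nanoseconds of loop and cond_resched() overhead per empty chunk adds up to minutes, which is in the same ballpark as the ~140 s observed; bounding the walk by the IPA size, as suggested in the reply below, removes almost all of those iterations.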
On Thu, Aug 07, 2025 at 11:58:01AM -0700, Raghavendra Rao Ananta wrote:
> Hi Oliver,
>
> >
> > Protected mode is affected by the same problem, potentially even worse
> > due to the overheads of calling into EL2. Both protected and
> > non-protected flows should use stage2_destroy_range().
> >
> I experimented with this (see diff below), and it looks like it takes
> significantly longer to finish the destruction even for a very small
> VM. For instance, it takes ~140 seconds on an Ampere Altra machine.
> This is probably because we run cond_resched() for every breakup in
> the entire sweep of the possible address range, 0 to ~(0ULL), even
> though there are no actual mappings there, and we context switch out
> more often.

This seems more like an issue with the upper bound on a pKVM walk rather
than a problem with the suggestion.

The information in pgt->ia_bits is actually derived from the VTCR value
of the owning MMU. Even though we never use the VTCR value in hardware,
pKVM MMUs have a valid VTCR value that encodes the size of the IPA space
and we use that in the common stage-2 abort path.

I'm attaching some fixups that I have on top of your series that'd allow
the resched logic to remain common, like it is in other MMU flows.

From 421468dcaa4692208c3f708682b058cfc072a984 Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton@linux.dev>
Date: Fri, 8 Aug 2025 11:43:12 -0700
Subject: [PATCH 4/4] fixup! KVM: arm64: Destroy the stage-2 page-table periodically

---
 arch/arm64/kvm/mmu.c | 60 ++++++++++++++++++--------------------------
 1 file changed, 25 insertions(+), 35 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index b82412323054..fc93cc256bd8 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -383,40 +383,6 @@ static void stage2_flush_vm(struct kvm *kvm)
 	srcu_read_unlock(&kvm->srcu, idx);
 }
 
-/*
- * Assume that @pgt is valid and unlinked from the KVM MMU to free the
- * page-table without taking the kvm_mmu_lock and without performing any
- * TLB invalidations.
- *
- * Also, the range of addresses can be large enough to cause need_resched
- * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke
- * cond_resched() periodically to prevent hogging the CPU for a long time
- * and schedule something else, if required.
- */
-static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr,
-				 phys_addr_t end)
-{
-	u64 next;
-
-	do {
-		next = stage2_range_addr_end(addr, end);
-		kvm_pgtable_stage2_destroy_range(pgt, addr, next - addr);
-
-		if (next != end)
-			cond_resched();
-	} while (addr = next, addr != end);
-}
-
-static void kvm_destroy_stage2_pgt(struct kvm_pgtable *pgt)
-{
-	if (!is_protected_kvm_enabled()) {
-		stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits));
-		kvm_pgtable_stage2_destroy_pgd(pgt);
-	} else {
-		pkvm_pgtable_stage2_destroy(pgt);
-	}
-}
-
 /**
  * free_hyp_pgds - free Hyp-mode page tables
  */
@@ -938,11 +904,35 @@ static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
 	return 0;
 }
 
+/*
+ * Assume that @pgt is valid and unlinked from the KVM MMU to free the
+ * page-table without taking the kvm_mmu_lock and without performing any
+ * TLB invalidations.
+ *
+ * Also, the range of addresses can be large enough to cause need_resched
+ * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke
+ * cond_resched() periodically to prevent hogging the CPU for a long time
+ * and schedule something else, if required.
+ */
+static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr,
+				 phys_addr_t end)
+{
+	u64 next;
+
+	do {
+		next = stage2_range_addr_end(addr, end);
+		KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr, next - addr);
+
+		if (next != end)
+			cond_resched();
+	} while (addr = next, addr != end);
+}
+
 static void kvm_stage2_destroy(struct kvm_pgtable *pgt)
 {
 	unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr);
 
-	KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, 0, BIT(ia_bits));
+	stage2_destroy_range(pgt, 0, BIT(ia_bits));
 	KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt);
 }
 
-- 
2.39.5
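
For context on the ia_bits derivation above: VTCR_EL2.T0SZ (bits [5:0]) encodes the stage-2 input address range as 2^(64 - T0SZ), which is where the upper bound for the walk comes from. The minimal illustration below only shows that arithmetic; the helper and constant names are assumptions for the example (the kernel uses its own VTCR_EL2_IPA() macro and mask definitions).

#include <stdint.h>
#include <stdio.h>

/* VTCR_EL2.T0SZ occupies bits [5:0] of the register. */
#define EXAMPLE_VTCR_T0SZ_MASK	0x3fULL

/* Stage-2 input address size in bits: the IPA space spans 2^(64 - T0SZ) bytes. */
static unsigned int example_vtcr_to_ipa_bits(uint64_t vtcr)
{
	return 64 - (unsigned int)(vtcr & EXAMPLE_VTCR_T0SZ_MASK);
}

int main(void)
{
	/* A VTCR value whose T0SZ field is 24 describes a 40-bit (1 TiB) IPA space. */
	uint64_t vtcr = 24;
	unsigned int ia_bits = example_vtcr_to_ipa_bits(vtcr);

	printf("ia_bits = %u, walk upper bound = %#llx\n",
	       ia_bits, (unsigned long long)1 << ia_bits);
	return 0;
}

Bounding the destroy walk at BIT(ia_bits) instead of ~(0ULL) is what keeps the pKVM path from sweeping, and repeatedly rescheduling across, address space that can never hold mappings.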
Heh, without full context, the shortlog reads like "destroy stage-2 page tables
from time to time". Something like this would be more appropriate:

  KVM: arm64: Reschedule as needed when destroying stage-2 page-tables
On Fri, Jul 25, 2025 at 7:59 AM Sean Christopherson <seanjc@google.com> wrote:
>
> Heh, without full context, the shortlog reads like "destroy stage-2 page tables
> from time to time". Something like this would be more appropriate:
>
>   KVM: arm64: Reschedule as needed when destroying stage-2 page-tables

This definitely sounds better :)