Greetings,
I've noticed size XL cyclictest -Smp99 wakeup latency spikes for some
time now, but due to bandwidth etc (ie laziness), never got around to
taking a peek until today.
Seems isolate_lru_folios() in v6.10+ holds IRQs off far longer, roughly
an order of magnitude longer than v6.9 did, on my old i7-4790 desktop box.
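For reference, shrink_inactive_list() takes lruvec->lru_lock with
spin_lock_irq() around the entire isolation pass, so however long
isolate_lru_folios() runs is pure IRQ holdoff. Roughly (simplified
sketch of the call site, details elided):

	spin_lock_irq(&lruvec->lru_lock);
	/* walks the LRU, moving up to nr_to_scan folios onto folio_list */
	nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list,
				      &nr_scanned, sc, lru);
	/* stat updates, still with IRQs off */
	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
	spin_unlock_irq(&lruvec->lru_lock);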
-Mike
test log - scroll fast
cyclictest -Smp99 vs HDD (spinning rust) IO: an osc kernel build/packaging
run on one HDD plus bonnie on another makes the spikes nicely repeatable.
6.12.0-rc5
T: 0 ( 4541) P:99 I:1000 C: 773124 Min: 1 Act: 6 Avg: 3 Max: 53381
T: 1 ( 4542) P:99 I:1500 C: 515381 Min: 1 Act: 7 Avg: 3 Max: 60119
T: 2 ( 4543) P:99 I:2000 C: 386465 Min: 1 Act: 5 Avg: 4 Max: 67469
T: 3 ( 4544) P:99 I:2500 C: 309182 Min: 1 Act: 7 Avg: 4 Max: 65534
T: 4 ( 4545) P:99 I:3000 C: 257682 Min: 1 Act: 6 Avg: 4 Max: 57776
T: 5 ( 4546) P:99 I:3500 C: 220874 Min: 1 Act: 4 Avg: 4 Max: 64737
T: 6 ( 4547) P:99 I:4000 C: 193289 Min: 1 Act: 6 Avg: 4 Max: 52567
T: 7 ( 4548) P:99 I:4500 C: 171799 Min: 1 Act: 6 Avg: 5 Max: 52606
6.6.58-stable
T: 0 ( 4928) P:99 I:1000 C: 824721 Min: 1 Act: 5 Avg: 3 Max: 4619
T: 1 ( 4929) P:99 I:1500 C: 549816 Min: 1 Act: 7 Avg: 3 Max: 5308
T: 2 ( 4930) P:99 I:2000 C: 412364 Min: 1 Act: 5 Avg: 3 Max: 5305
T: 3 ( 4931) P:99 I:2500 C: 329892 Min: 1 Act: 5 Avg: 3 Max: 3160
T: 4 ( 4932) P:99 I:3000 C: 274909 Min: 1 Act: 6 Avg: 3 Max: 4044
T: 5 ( 4933) P:99 I:3500 C: 235637 Min: 1 Act: 6 Avg: 3 Max: 3922
T: 6 ( 4934) P:99 I:4000 C: 206183 Min: 1 Act: 5 Avg: 3 Max: 2811
T: 7 ( 4935) P:99 I:4500 C: 183274 Min: 1 Act: 6 Avg: 3 Max: 3678
6.11.5-stable
T: 0 ( 8318) P:99 I:1000 C: 734458 Min: 0 Act: 4 Avg: 4 Max: 61030
T: 1 ( 8319) P:99 I:1500 C: 489879 Min: 1 Act: 5 Avg: 4 Max: 51595
T: 2 ( 8320) P:99 I:2000 C: 367286 Min: 1 Act: 5 Avg: 5 Max: 62361
T: 3 ( 8321) P:99 I:2500 C: 293937 Min: 1 Act: 6 Avg: 5 Max: 60837
T: 4 ( 8322) P:99 I:3000 C: 244935 Min: 1 Act: 6 Avg: 6 Max: 59312
T: 5 ( 8323) P:99 I:3500 C: 209981 Min: 1 Act: 9 Avg: 5 Max: 62318
T: 6 ( 8324) P:99 I:4000 C: 183701 Min: 1 Act: 7 Avg: 7 Max: 78961
T: 7 ( 8325) P:99 I:4500 C: 163241 Min: 1 Act: 7 Avg: 9 Max: 59300
6.10.14-stable
T: 0 ( 4318) P:99 I:1000 C: 788179 Min: 0 Act: 6 Avg: 3 Max: 63290
T: 1 ( 4319) P:99 I:1500 C: 525458 Min: 1 Act: 6 Avg: 3 Max: 64128
T: 2 ( 4320) P:99 I:2000 C: 394070 Min: 1 Act: 6 Avg: 4 Max: 58836
T: 3 ( 4321) P:99 I:2500 C: 315317 Min: 0 Act: 8 Avg: 3 Max: 55249
T: 4 ( 4322) P:99 I:3000 C: 262773 Min: 1 Act: 7 Avg: 3 Max: 50339
T: 5 ( 4323) P:99 I:3500 C: 225239 Min: 0 Act: 6 Avg: 3 Max: 52687
T: 6 ( 4324) P:99 I:4000 C: 197054 Min: 1 Act: 6 Avg: 4 Max: 64231
T: 7 ( 4325) P:99 I:4500 C: 175170 Min: 1 Act: 7 Avg: 4 Max: 54302
6.9.12-stable
T: 0 (24683) P:99 I:1000 C: 785036 Min: 1 Act: 6 Avg: 2 Max: 1771
T: 1 (24684) P:99 I:1500 C: 523355 Min: 1 Act: 5 Avg: 3 Max: 1958
T: 2 (24685) P:99 I:2000 C: 392516 Min: 1 Act: 4 Avg: 3 Max: 2317
T: 3 (24686) P:99 I:2500 C: 314013 Min: 1 Act: 7 Avg: 3 Max: 786
T: 4 (24687) P:99 I:3000 C: 261677 Min: 1 Act: 5 Avg: 3 Max: 840
T: 5 (24688) P:99 I:3500 C: 224294 Min: 1 Act: 6 Avg: 3 Max: 960
T: 6 (24689) P:99 I:4000 C: 196257 Min: 1 Act: 6 Avg: 3 Max: 627
T: 7 (24690) P:99 I:4500 C: 174449 Min: 1 Act: 6 Avg: 3 Max: 6327
Ok, size mondo spikes begin at v6.10
# tracer: wakeup_rt
#
# wakeup_rt latency trace v1.1.5 on 6.10.14-stable
# --------------------------------------------------------------------
# latency: 55060 us, #343/343, CPU#4 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:8)
# -----------------
# | task: cyclictest-8238 (uid:0 nice:0 policy:1 rt_prio:99)
# -----------------
#
# _------=> CPU#
# / _-----=> irqs-off/BH-disabled
# | / _----=> need-resched
# || / _---=> hardirq/softirq
# ||| / _--=> preempt-depth
# |||| / _-=> migrate-disable
# ||||| / delay
# cmd pid |||||| time | caller
# \ / |||||| \ | /
<...>-88 4dNh.. 0us : 88:120:R + [004] 8238: 0:R cyclictest
<...>-88 4dNh.. 8us : <stack trace>
=> __ftrace_trace_stack+0x115/0x140
=> probe_wakeup+0x2ac/0x330
=> ttwu_do_activate+0xda/0x1c0
=> try_to_wake_up+0x390/0x470
=> hrtimer_wakeup+0x1e/0x30
=> __hrtimer_run_queues+0xfc/0x200
=> hrtimer_interrupt+0xea/0x250
=> __sysvec_apic_timer_interrupt+0x44/0xe0
=> sysvec_apic_timer_interrupt+0x4b/0x90
=> asm_sysvec_apic_timer_interrupt+0x16/0x20
=> shrink_inactive_list+0x0/0x3e0
=> shrink_lruvec+0x2fc/0x470
=> shrink_node+0x1db/0x4a0
=> balance_pgdat+0x278/0x5b0
=> kswapd+0x10e/0x310
=> kthread+0xc7/0xf0
=> ret_from_fork+0x2d/0x50
=> ret_from_fork_asm+0x11/0x20
<...>-88 4dNh.. 8us : 0
<...>-88 4dNh.. 8us : task_woken_rt+0x0/0x60 <-ttwu_do_activate+0x87/0x1c0
<...>-88 4dNh.. 9us : _raw_spin_unlock_irqrestore+0x0/0x20 <-try_to_wake_up+0x3b4/0x470
<...>-88 4dNh.. 9us : _raw_spin_lock_irq+0x0/0x20 <-__hrtimer_run_queues+0x109/0x200
<...>-88 4dNh.. 9us : hrtimer_update_next_event+0x0/0x50 <-hrtimer_interrupt+0xf2/0x250
<...>-88 4dNh.. 9us : __hrtimer_get_next_event+0x0/0x70 <-hrtimer_update_next_event+0x24/0x50
<...>-88 4dNh.. 9us : __hrtimer_next_event_base+0x0/0xb0 <-__hrtimer_get_next_event+0x60/0x70
<...>-88 4dNh.. 9us : __hrtimer_get_next_event+0x0/0x70 <-hrtimer_update_next_event+0x38/0x50
<...>-88 4dNh.. 9us : __hrtimer_next_event_base+0x0/0xb0 <-hrtimer_update_next_event+0x38/0x50
<...>-88 4dNh.. 9us : _raw_spin_unlock_irqrestore+0x0/0x20 <-hrtimer_interrupt+0x10a/0x250
<...>-88 4dNh.. 10us : tick_program_event+0x0/0x70 <-hrtimer_interrupt+0x118/0x250
<...>-88 4dNh.. 10us : clockevents_program_event+0x0/0xe0 <-hrtimer_interrupt+0x118/0x250
<...>-88 4dNh.. 10us : ktime_get+0x0/0xc0 <-clockevents_program_event+0x78/0xe0
<...>-88 4dNh.. 10us : lapic_next_deadline+0x0/0x40 <-clockevents_program_event+0xad/0xe0
<...>-88 4dNh.. 10us : irq_exit_rcu+0x0/0xa0 <-sysvec_apic_timer_interrupt+0x50/0x90
<...>-88 4dN... 10us : idle_cpu+0x0/0x40 <-irq_exit_rcu+0x3b/0xa0
<...>-88 4.N... 11us : shrink_inactive_list+0x0/0x3e0 <-shrink_lruvec+0x2fc/0x470
<...>-88 4.N... 11us : lru_add_drain+0x0/0x20 <-shrink_inactive_list+0xee/0x3e0
<...>-88 4.N... 11us : lru_add_drain_cpu+0x0/0x100 <-lru_add_drain+0x11/0x20
<...>-88 4.N... 11us : mlock_drain_local+0x0/0x30 <-shrink_inactive_list+0xee/0x3e0
<...>-88 4.N... 11us : _raw_spin_lock_irq+0x0/0x20 <-shrink_inactive_list+0xf6/0x3e0
<...>-88 4dN... 11us*: isolate_lru_folios+0x0/0x390 <-shrink_inactive_list+0x110/0x3e0 <== hmmm
<...>-88 4dN... 54984us : __mod_node_page_state+0x0/0x70 <-isolate_lru_folios+0x289/0x390 <== artifact.. but repeats
<...>-88 4dN... 54985us : __mod_zone_page_state+0x0/0x60 <-isolate_lru_folios+0x297/0x390
<...>-88 4dN... 54985us : __mod_node_page_state+0x0/0x70 <-shrink_inactive_list+0x12b/0x3e0
<...>-88 4dN... 54986us : irq_enter_rcu+0x0/0x60 <-sysvec_apic_timer_interrupt+0x43/0x90
<...>-88 4dNh.. 54986us : __sysvec_apic_timer_interrupt+0x0/0xe0 <-sysvec_apic_timer_interrupt+0x4b/0x90
<...>-88 4dNh.. 54986us : hrtimer_interrupt+0x0/0x250 <-__sysvec_apic_timer_interrupt+0x44/0xe0
trace_printk time.
patches/xx
---
mm/vmscan.c | 10 ++++++++++
1 file changed, 10 insertions(+)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -56,6 +56,7 @@
#include <linux/khugepaged.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
+#include <linux/sched/clock.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -1620,6 +1621,9 @@ static unsigned long isolate_lru_folios(
unsigned long skipped = 0;
unsigned long scan, total_scan, nr_pages;
LIST_HEAD(folios_skipped);
+ int cpu = raw_smp_processor_id();
+ u64 then = cpu_clock(cpu), this;
+ static u64 max = 0;
total_scan = 0;
scan = 0;
@@ -1697,6 +1701,12 @@ static unsigned long isolate_lru_folios(
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
total_scan, skipped, nr_taken, lru);
update_lru_sizes(lruvec, lru, nr_zone_taken);
+ this = cpu_clock(cpu) - then;
+ if (this > max) {
+ max = this;
+ if (max > 1000)
+ trace_printk("new record: %Lu us\n", max/1000UL);
+ }
return nr_taken;
}
6.10.14-stable +xx
T: 0 ( 6009) P:99 I:1000 C: 392693 Min: 0 Act: 6 Avg: 3 Max: 58348
T: 1 ( 6010) P:99 I:1500 C: 261247 Min: 0 Act: 12 Avg: 7 Max: 72563
T: 2 ( 6011) P:99 I:2000 C: 196226 Min: 0 Act: 7 Avg: 5 Max: 72367
T: 3 ( 6012) P:99 I:2500 C: 156939 Min: 0 Act: 8 Avg: 7 Max: 66426
T: 4 ( 6013) P:99 I:3000 C: 130563 Min: 1 Act: 8 Avg: 13 Max: 72005
T: 5 ( 6014) P:99 I:3500 C: 112231 Min: 1 Act: 6 Avg: 5 Max: 62329
T: 6 ( 6015) P:99 I:4000 C: 98159 Min: 1 Act: 4 Avg: 6 Max: 62539
T: 7 ( 6016) P:99 I:4500 C: 87314 Min: 1 Act: 8 Avg: 4 Max: 51182
homer:..debug/tracing # tail -5 trace
kswapd0-91 [004] d.... 518.224557: isolate_lru_folios: new record: 52169 us
kswapd0-91 [001] d.... 520.001208: isolate_lru_folios: new record: 61611 us
kswapd0-91 [001] d.... 520.390120: isolate_lru_folios: new record: 72501 us
kswapd0-91 [001] d.... 520.463310: isolate_lru_folios: new record: 72639 us
kswapd0-91 [001] d.... 520.607071: isolate_lru_folios: new record: 72707 us
6.12.0.g81983758-master
# tracer: irqsoff
#
# irqsoff latency trace v1.1.5 on 6.12.0.g81983758-master
# --------------------------------------------------------------------
# latency: 49040 us, #8/8, CPU#1 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:8)
# -----------------
# | task: kswapd0-91 (uid:0 nice:0 policy:0 rt_prio:0)
# -----------------
# => started at: _raw_spin_lock_irq+0x1a/0x40
# => ended at: shrink_inactive_list+0x165/0x420
#
#
# _------=> CPU#
# / _-----=> irqs-off/BH-disabled
# | / _----=> need-resched
# || / _---=> hardirq/softirq
# ||| / _--=> preempt-depth
# |||| / _-=> migrate-disable
# ||||| / delay
# cmd pid |||||| time | caller
# \ / |||||| \ | /
kswapd0-91 1d.... 0us : _raw_spin_lock_irq+0x1a/0x40
kswapd0-91 1d.... 0us*: isolate_lru_folios+0x0/0x380 <-shrink_inactive_list+0x106/0x420
kswapd0-91 1d.... 49039us : __mod_node_page_state+0x0/0x70 <-isolate_lru_folios+0x288/0x380 <== yup
kswapd0-91 1d.... 49039us : __mod_zone_page_state+0x0/0x60 <-isolate_lru_folios+0x296/0x380
kswapd0-91 1d.... 49040us : __mod_node_page_state+0x0/0x70 <-shrink_inactive_list+0x11e/0x420
kswapd0-91 1d.... 49040us : shrink_inactive_list+0x165/0x420
kswapd0-91 1d.... 49040us : tracer_hardirqs_on+0xbb/0xd0 <-shrink_inactive_list+0x165/0x420
kswapd0-91 1d.... 49048us : <stack trace>
=> shrink_lruvec+0x2fc/0x470
=> shrink_node+0x1dd/0x4b0
=> balance_pgdat+0x280/0x5c0
=> kswapd+0x10f/0x320
=> kthread+0xb7/0xe0
=> ret_from_fork+0x2d/0x50
=> ret_from_fork_asm+0x11/0x20
gitk...
hm, 5ed890ce5147 mm: vmscan: avoid split during shrink_folio_list()
v6.9-rc4-254-g845982eb264b (without)
homer:..debug/tracing # tail -5 trace
kswapd0-88 [005] d.... 254.583681: isolate_lru_folios: new record: 1545 us
kswapd0-88 [003] dn... 318.694640: isolate_lru_folios: new record: 36025 us
kswapd0-88 [003] dn... 560.700168: isolate_lru_folios: new record: 48922 us
kswapd0-88 [002] d.... 652.526922: isolate_lru_folios: new record: 56143 us
kswapd0-88 [002] d.... 652.588941: isolate_lru_folios: new record: 56638 us
meh, still way too wide.. not as wide though.
v6.9-rc4-255-g5ed890ce5147 (with)
homer:..debug/tracing # tail -5 trace
kswapd0-88 [006] d.... 301.719781: isolate_lru_folios: new record: 19 us
kswapd0-88 [002] d.... 305.202140: isolate_lru_folios: new record: 36446 us
kswapd0-88 [005] dn... 339.858760: isolate_lru_folios: new record: 59762 us
kswapd0-88 [007] d.... 583.350733: isolate_lru_folios: new record: 73866 us
kswapd0-88 [005] d.... 604.445868: isolate_lru_folios: new record: 75636 us
6.9.12-stable +xx
homer:..debug/tracing # tail -5 trace
kswapd0-91 [003] d.... 791.301680: isolate_lru_folios: new record: 378 us
kswapd0-91 [006] d.... 1592.777235: isolate_lru_folios: new record: 2358 us
kswapd0-91 [004] d.... 1879.108874: isolate_lru_folios: new record: 2528 us
kswapd0-91 [001] d.... 1883.635762: isolate_lru_folios: new record: 3535 us
kswapd0-91 [002] d.... 1911.251376: isolate_lru_folios: new record: 5486 us
pass it on, poke later.. maybe.. lazy sod
On Tue, 2024-10-29 at 13:03 +0100, Mike Galbraith wrote:
> Greetings,
>
> I've noticed size XL cyclictest -Smp99 wakeup latency spikes for some
> time now, but due to bandwidth etc (ie laziness), never got around to
> taking a peek until today.
isolate_lru_folios() dropped off the irqsoff radar with the below.
mm: vmscan: Limit isolate_lru_folios() IRQ holdoff latency impact.
While examining an RT latency regression beginning at v6.10, it was
discovered that isolate_lru_folios() runtimes could cause IRQ holdoff
to easily span several ticks.
Given it's called with IRQs disabled, cap its runtime to around a ms.
Signed-off-by: Mike Galbraith <efault@gmx.de>
---
mm/vmscan.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -63,6 +63,7 @@
#include <linux/swapops.h>
#include <linux/balloon_compaction.h>
#include <linux/sched/sysctl.h>
+#include <linux/sched/clock.h>
#include "internal.h"
#include "swap.h"
@@ -1656,6 +1657,8 @@ static unsigned long isolate_lru_folios(
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
unsigned long skipped = 0;
unsigned long scan, total_scan, nr_pages;
+ int cpu = raw_smp_processor_id(), iter = 0;
+ u64 then = cpu_clock(cpu);
LIST_HEAD(folios_skipped);
total_scan = 0;
@@ -1709,6 +1712,15 @@ static unsigned long isolate_lru_folios(
move_to = dst;
move:
list_move(&folio->lru, move_to);
+
+ /*
+ * IRQs are disabled, cap holdoff at a millisecond or so.
+ */
+ if (iter++ < 1000)
+ continue;
+ if (cpu_clock(cpu) - then > NSEC_PER_MSEC)
+ break;
+ iter = 0;
}
/*
(was regression: mm: vmscan: - size XL irqoff time increase v6.10+)
Break off queueing of IO after we've been at it for a ms or so and a
preemption is due, to keep writeback latency impact at least reasonable.
The IO we're queueing under spinlock still has to be started under that
same lock.
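For context, this all happens under wb->list_lock: queue_io() calls
move_expired_inodes() with the lock held, and writeback of the freshly
queued b_io inodes is entered under that same lock. Roughly (simplified
sketch of the wb_writeback() loop body, details elided):

	spin_lock(&wb->list_lock);
	if (list_empty(&wb->b_io))
		queue_io(wb, work, dirtied_before);	/* -> move_expired_inodes() */
	if (work->sb)
		progress = writeback_sb_inodes(work->sb, wb, work);
	else
		progress = __writeback_inodes_wb(wb, work);
	/* lock dropped/retaken further down, details elided */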
wakeup_rt tracing caught move_expired_inodes() spanning 66ms on an i7-4790 box.
With this patch applied on top of one mitigating even worse IRQ-holdoff
induced hits (78ms) from isolate_lru_folios(), the same trivial load that
leads to this and worse (osc kernel package build + bonnie):
T: 1 ( 6211) P:99 I:1500 C: 639971 Min: 1 Act: 7 Avg: 12 Max: 66696
resulted in this perfectly reasonable max:
T: 0 ( 6078) P:99 I:1000 C:1031230 Min: 1 Act: 7 Avg: 4 Max: 4449
Note: cyclictest -Smp99 is only the messenger. This is not an RT issue;
RT is merely fingering bad generic behavior.
Signed-off-by: Mike Galbraith <efault@gmx.de>
---
fs/fs-writeback.c | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -29,6 +29,7 @@
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
+#include <linux/sched/clock.h>
#include "internal.h"
/*
@@ -1424,6 +1425,10 @@ static int move_expired_inodes(struct li
struct inode *inode;
int do_sb_sort = 0;
int moved = 0;
+#ifndef CONFIG_PREEMPT_RT
+ u64 then = local_clock();
+ int iter = 0;
+#endif
while (!list_empty(delaying_queue)) {
inode = wb_inode(delaying_queue->prev);
@@ -1439,6 +1444,19 @@ static int move_expired_inodes(struct li
if (sb && sb != inode->i_sb)
do_sb_sort = 1;
sb = inode->i_sb;
+#ifndef CONFIG_PREEMPT_RT
+ /*
+ * We're under ->list_lock here, and the IO being queued
+ * still has to be started. Stop queueing when we've been
+ * at it for a ms or so and a preemption is due, to keep
+ * latency impact reasonable.
+ */
+ if (iter++ < 100 || !need_resched())
+ continue;
+ if (local_clock() - then > NSEC_PER_MSEC)
+ break;
+ iter = 0;
+#endif
}
/* just one sb in list, splice to dispatch_queue and we're done */
(grr.. CC was supposed to be kvack)
(was regression: mm: vmscan: - size XL irqoff time increase v6.10+)
While examining a service latency regression beginning at v6.10, it was
discovered that isolate_lru_folios() runtimes could cause IRQ holdoff
spanning several ticks. The worst measured was 78ms.
Given it's called with IRQs disabled, cap its runtime to around a ms.
With this patch, plus one mitigating move_expired_inodes() induced
wakeup latency hits of up to 66ms, the load that repeatably produced this:
(6.12.0-rc5 running osc kernel package build + RAM-sized bonnie)
T: 0 ( 4541) P:99 I:1000 C: 773124 Min: 1 Act: 6 Avg: 3 Max: 53381
T: 1 ( 4542) P:99 I:1500 C: 515381 Min: 1 Act: 7 Avg: 3 Max: 60119
T: 2 ( 4543) P:99 I:2000 C: 386465 Min: 1 Act: 5 Avg: 4 Max: 67469
T: 3 ( 4544) P:99 I:2500 C: 309182 Min: 1 Act: 7 Avg: 4 Max: 65534
T: 4 ( 4545) P:99 I:3000 C: 257682 Min: 1 Act: 6 Avg: 4 Max: 57776
T: 5 ( 4546) P:99 I:3500 C: 220874 Min: 1 Act: 4 Avg: 4 Max: 64737
T: 6 ( 4547) P:99 I:4000 C: 193289 Min: 1 Act: 6 Avg: 4 Max: 52567
T: 7 ( 4548) P:99 I:4500 C: 171799 Min: 1 Act: 6 Avg: 5 Max: 52606
latency went to this:
T: 0 ( 6078) P:99 I:1000 C:1031230 Min: 1 Act: 7 Avg: 4 Max: 4449
T: 1 ( 6079) P:99 I:1500 C: 687505 Min: 1 Act: 5 Avg: 4 Max: 2075
T: 2 ( 6080) P:99 I:2000 C: 515627 Min: 1 Act: 334 Avg: 5 Max: 2987
T: 3 ( 6081) P:99 I:2500 C: 412503 Min: 1 Act: 8 Avg: 5 Max: 2425
T: 4 ( 6082) P:99 I:3000 C: 343752 Min: 1 Act: 11 Avg: 5 Max: 1466
T: 5 ( 6083) P:99 I:3500 C: 294643 Min: 1 Act: 282 Avg: 5 Max: 3844
T: 6 ( 6084) P:99 I:4000 C: 257813 Min: 1 Act: 7 Avg: 5 Max: 3291
T: 7 ( 6085) P:99 I:4500 C: 229167 Min: 1 Act: 9 Avg: 5 Max: 3516
Signed-off-by: Mike Galbraith <efault@gmx.de>
---
mm/vmscan.c | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -63,6 +63,9 @@
#include <linux/swapops.h>
#include <linux/balloon_compaction.h>
#include <linux/sched/sysctl.h>
+#ifndef CONFIG_PREEMPT_RT
+#include <linux/sched/clock.h>
+#endif
#include "internal.h"
#include "swap.h"
@@ -1657,6 +1660,10 @@ static unsigned long isolate_lru_folios(
unsigned long skipped = 0;
unsigned long scan, total_scan, nr_pages;
LIST_HEAD(folios_skipped);
+#ifndef CONFIG_PREEMPT_RT
+ u64 then = local_clock();
+ int batch = 0;
+#endif
total_scan = 0;
scan = 0;
@@ -1709,6 +1716,18 @@ static unsigned long isolate_lru_folios(
move_to = dst;
move:
list_move(&folio->lru, move_to);
+
+#ifndef CONFIG_PREEMPT_RT
+ /*
+ * IRQs are disabled, cap holdoff at a millisecond or so.
+ */
+ batch += nr_pages;
+ if (batch < 1024)
+ continue;
+ if (local_clock() - then >= NSEC_PER_MSEC)
+ break;
+ batch = 0;
+#endif
}
/*
# tracer: irqsoff
#
# irqsoff latency trace v1.1.5 on 6.12.0.g4236f913-master
# --------------------------------------------------------------------
# latency: 77766 us, #12/12, CPU#5 | (M:unknown VP:0, KP:0, SP:0 HP:0 #P:8)
# -----------------
# | task: kswapd0-90 (uid:0 nice:0 policy:0 rt_prio:0)
# -----------------
# => started at: _raw_spin_lock_irq+0x16/0x40
# => ended at: _raw_spin_unlock_irq+0x10/0x30
#
#
# _------=> CPU#
# / _-----=> irqs-off/BH-disabled
# | / _----=> need-resched
# || / _---=> hardirq/softirq
# ||| / _--=> preempt-depth
# |||| / _-=> migrate-disable
# ||||| / delay
# cmd pid |||||| time | caller
# \ / |||||| \ | /
kswapd0-90 5d.... 0us : _raw_spin_lock_irq+0x16/0x40
kswapd0-90 5d.... 0us : preempt_count_add+0x0/0x50 <-_raw_spin_lock_irq+0x20/0x40
kswapd0-90 5d..1. 0us*: isolate_lru_folios+0x0/0x3e0 <-shrink_inactive_list+0x10f/0x440
kswapd0-90 5d..1. 77765us : __rcu_read_lock+0x0/0x20 <-isolate_lru_folios+0x160/0x3e0
kswapd0-90 5d..1. 77765us : __rcu_read_unlock+0x0/0x180 <-isolate_lru_folios+0x199/0x3e0
kswapd0-90 5d..1. 77766us : __mod_node_page_state+0x0/0x70 <-isolate_lru_folios+0x2ee/0x3e0
kswapd0-90 5d..1. 77766us : __mod_zone_page_state+0x0/0x60 <-isolate_lru_folios+0x2fc/0x3e0
kswapd0-90 5d..1. 77766us : __mod_node_page_state+0x0/0x70 <-shrink_inactive_list+0x12a/0x440
kswapd0-90 5d..1. 77766us : _raw_spin_unlock_irq+0x0/0x30 <-shrink_inactive_list+0x16a/0x440
kswapd0-90 5d..1. 77766us : _raw_spin_unlock_irq+0x10/0x30
kswapd0-90 5d..1. 77767us : tracer_hardirqs_on+0xcb/0xe0 <-_raw_spin_unlock_irq+0x10/0x30
kswapd0-90 5d..1. 77774us : <stack trace>
=> shrink_inactive_list+0x16a/0x440
=> shrink_lruvec+0x2fc/0x470
=> shrink_node+0x1d6/0x4a0
=> balance_pgdat+0x280/0x5c0
=> kswapd+0x10f/0x350
=> kthread+0xfa/0x130
=> ret_from_fork+0x2d/0x50
=> ret_from_fork_asm+0x11/0x20