1. Problem Background
In Android systems, a significant challenge arises during application
startup when a large number of private application files are read.
Approximately 90% of these file pages are loaded into memory via readahead.
However, about 85% of these pre-read pages are reclaimed without ever being
accessed, which means only around 15% of the pre-read pages are effectively
utilized. This results in wasted memory, as unaccessed file pages consume
valuable memory space, leading to memory thrashing and unnecessary I/O
reads.
2. Solution Proposal
Introduce a Readahead LRU to track pages brought in via readahead. During
memory reclamation, prioritize scanning this LRU to reclaim pages that
have not been accessed recently. For pages in the Readahead LRU that are
accessed, move them back to the inactive_file LRU to await subsequent
reclamation.
3. Benefits Data
In tests involving the cold start of 30 applications:
Memory Reclamation Efficiency: The slowpath process saw a reduction of
over 30%.
4. Current Issues
The refault metric for file pages has significantly degraded, increasing
by about 100%. This is primarily because pages are reclaimed too quickly,
without sufficient aging.
5. Next Steps
When calculating reclamation propensity, adjust the intensity of
reclamation from the Readahead LRU. This ensures aging and reclamation
efficiency while allowing adequate aging time.
Signed-off-by: Lei Liu <liulei.rjpt@vivo.com>
---
fs/proc/meminfo.c | 1 +
include/linux/mm_inline.h | 3 +
include/linux/mmzone.h | 3 +
include/linux/page-flags.h | 5 ++
include/linux/vm_event_item.h | 2 +
include/trace/events/mmflags.h | 4 +-
include/trace/events/vmscan.h | 35 +++++++++
mm/migrate.c | 2 +
mm/readahead.c | 9 +++
mm/show_mem.c | 3 +-
mm/vmscan.c | 132 +++++++++++++++++++++++++++++++++
mm/vmstat.c | 4 +
12 files changed, 201 insertions(+), 2 deletions(-)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a458f1e112fd..4f3f031134fd 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -71,6 +71,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]);
show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]);
show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
+ show_val_kb(m, "ReadAhead(file):", pages[LRU_READ_AHEAD_FILE]);
show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]);
show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK));
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 89b518ff097e..dcfd5cd5350b 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -93,6 +93,9 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio)
if (folio_test_unevictable(folio))
return LRU_UNEVICTABLE;
+ if (folio_test_readahead_lru(folio))
+ return LRU_READ_AHEAD_FILE;
+
lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
if (folio_test_active(folio))
lru += LRU_ACTIVE;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0c5da9141983..69c336465b0c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -165,6 +165,7 @@ enum zone_stat_item {
NR_ZONE_ACTIVE_ANON,
NR_ZONE_INACTIVE_FILE,
NR_ZONE_ACTIVE_FILE,
+ NR_ZONE_READAHEAD_FILE,
NR_ZONE_UNEVICTABLE,
NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */
NR_MLOCK, /* mlock()ed pages found and moved off LRU */
@@ -184,6 +185,7 @@ enum node_stat_item {
NR_ACTIVE_ANON, /* " " " " " */
NR_INACTIVE_FILE, /* " " " " " */
NR_ACTIVE_FILE, /* " " " " " */
+ NR_READAHEAD_FILE, /* " " " " " */
NR_UNEVICTABLE, /* " " " " " */
NR_SLAB_RECLAIMABLE_B,
NR_SLAB_UNRECLAIMABLE_B,
@@ -303,6 +305,7 @@ enum lru_list {
LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
+ LRU_READ_AHEAD_FILE,
LRU_UNEVICTABLE,
NR_LRU_LISTS
};
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 8d3fa3a91ce4..57dac828aa4f 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -127,6 +127,7 @@ enum pageflags {
#ifdef CONFIG_ARCH_USES_PG_ARCH_3
PG_arch_3,
#endif
+ PG_readahead_lru,
__NR_PAGEFLAGS,
PG_readahead = PG_reclaim,
@@ -564,6 +565,10 @@ PAGEFLAG(Workingset, workingset, PF_HEAD)
TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
+PAGEFLAG(Readahead_lru, readahead_lru, PF_HEAD)
+ __CLEARPAGEFLAG(Readahead_lru, readahead_lru, PF_HEAD)
+ TESTCLEARFLAG(Readahead_lru, readahead_lru, PF_HEAD)
+
/* Xen */
PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND)
TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 9e15a088ba38..7fc1b83e0aeb 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -49,8 +49,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
PGSCAN_DIRECT_THROTTLE,
PGSCAN_ANON,
PGSCAN_FILE,
+ PGSCAN_READAHEAD_FILE,
PGSTEAL_ANON,
PGSTEAL_FILE,
+ PGSTEAL_READAHEAD_FILE,
#ifdef CONFIG_NUMA
PGSCAN_ZONE_RECLAIM_SUCCESS,
PGSCAN_ZONE_RECLAIM_FAILED,
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index aa441f593e9a..2dbc1701e838 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -159,7 +159,8 @@ TRACE_DEFINE_ENUM(___GFP_LAST_BIT);
DEF_PAGEFLAG_NAME(reclaim), \
DEF_PAGEFLAG_NAME(swapbacked), \
DEF_PAGEFLAG_NAME(unevictable), \
- DEF_PAGEFLAG_NAME(dropbehind) \
+ DEF_PAGEFLAG_NAME(dropbehind), \
+ DEF_PAGEFLAG_NAME(readahead_lru) \
IF_HAVE_PG_MLOCK(mlocked) \
IF_HAVE_PG_HWPOISON(hwpoison) \
IF_HAVE_PG_IDLE(idle) \
@@ -309,6 +310,7 @@ IF_HAVE_VM_DROPPABLE(VM_DROPPABLE, "droppable" ) \
EM (LRU_ACTIVE_ANON, "active_anon") \
EM (LRU_INACTIVE_FILE, "inactive_file") \
EM (LRU_ACTIVE_FILE, "active_file") \
+ EM(LRU_READ_AHEAD_FILE, "readahead_file") \
EMe(LRU_UNEVICTABLE, "unevictable")
/*
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 490958fa10de..ef1ff37ae64d 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -391,6 +391,41 @@ TRACE_EVENT(mm_vmscan_reclaim_pages,
__entry->nr_ref_keep, __entry->nr_unmap_fail)
);
+TRACE_EVENT(mm_vmscan_lru_shrink_readahead,
+
+ TP_PROTO(int nid, unsigned long nr_to_scan,
+ unsigned long nr_to_reclaim, unsigned long nr_scanned,
+ unsigned long nr_taken, unsigned long nr_reclaimed),
+
+ TP_ARGS(nid, nr_to_scan, nr_to_reclaim, nr_scanned, nr_taken, nr_reclaimed),
+
+ TP_STRUCT__entry(
+ __field(int, nid)
+ __field(unsigned long, nr_to_scan)
+ __field(unsigned long, nr_to_reclaim)
+ __field(unsigned long, nr_scanned)
+ __field(unsigned long, nr_taken)
+ __field(unsigned long, nr_reclaimed)
+ ),
+
+ TP_fast_assign(
+ __entry->nid = nid;
+ __entry->nr_to_scan = nr_to_scan;
+ __entry->nr_to_reclaim = nr_to_reclaim;
+ __entry->nr_scanned = nr_scanned;
+ __entry->nr_taken = nr_taken;
+ __entry->nr_reclaimed = nr_reclaimed;
+ ),
+
+ TP_printk("nid=%d nr_to_scan=%ld nr_to_reclaim=%ld nr_scanned=%ld nr_taken=%ld nr_reclaimed=%ld",
+ __entry->nid,
+ __entry->nr_to_scan,
+ __entry->nr_to_reclaim,
+ __entry->nr_scanned,
+ __entry->nr_taken,
+ __entry->nr_reclaimed)
+);
+
TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
TP_PROTO(int nid,
diff --git a/mm/migrate.c b/mm/migrate.c
index 9e5ef39ce73a..0feab4d89d47 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -760,6 +760,8 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
folio_set_workingset(newfolio);
if (folio_test_checked(folio))
folio_set_checked(newfolio);
+ if (folio_test_readahead_lru(folio))
+ folio_set_readahead_lru(folio);
/*
* PG_anon_exclusive (-> PG_mappedtodisk) is always migrated via
* migration entries. We can still have PG_anon_exclusive set on an
diff --git a/mm/readahead.c b/mm/readahead.c
index 406756d34309..b428dcbed27c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -272,6 +272,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
if (!folio)
break;
+ folio_set_readahead_lru(folio);
+
ret = filemap_add_folio(mapping, folio, index + i, gfp_mask);
if (ret < 0) {
folio_put(folio);
@@ -445,6 +447,9 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
mark = round_down(mark, 1UL << order);
if (index == mark)
folio_set_readahead(folio);
+
+ folio_set_readahead_lru(folio);
+
err = filemap_add_folio(ractl->mapping, folio, index, gfp);
if (err) {
folio_put(folio);
@@ -781,6 +786,8 @@ void readahead_expand(struct readahead_control *ractl,
if (!folio)
return;
+ folio_set_readahead_lru(folio);
+
index = mapping_align_index(mapping, index);
if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) {
folio_put(folio);
@@ -810,6 +817,8 @@ void readahead_expand(struct readahead_control *ractl,
if (!folio)
return;
+ folio_set_readahead_lru(folio);
+
index = mapping_align_index(mapping, index);
if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) {
folio_put(folio);
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 41999e94a56d..f0df7531d12a 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -52,7 +52,8 @@ long si_mem_available(void)
* cache, or the low watermark worth of cache, needs to stay.
*/
pagecache = global_node_page_state(NR_ACTIVE_FILE) +
- global_node_page_state(NR_INACTIVE_FILE);
+ global_node_page_state(NR_INACTIVE_FILE) +
+ global_node_page_state(NR_READAHEAD_FILE);
pagecache -= min(pagecache / 2, wmark_low);
available += pagecache;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a48aec8bfd92..be547166d6dc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -201,6 +201,9 @@ struct scan_control {
*/
int vm_swappiness = 60;
+static const unsigned long read_ahead_age_threshold = 240 << (20 - PAGE_SHIFT); // Example threshold
+static const unsigned long read_ahead_weight = 5; // Lower weight for read ahead
+
#ifdef CONFIG_MEMCG
/* Returns true for reclaim through cgroup limits or cgroup interfaces. */
@@ -2666,6 +2669,40 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
nr[lru] = scan;
}
+
+ unsigned long read_ahead_size =
+ lruvec_lru_size(lruvec, LRU_READ_AHEAD_FILE, sc->reclaim_idx);
+ unsigned long nr_inactive_file = nr[LRU_INACTIVE_FILE];
+
+ if (scan_balance == SCAN_FILE) {
+ if (read_ahead_size > read_ahead_age_threshold ||
+ nr_inactive_file < read_ahead_size) {
+ nr[LRU_READ_AHEAD_FILE] =
+ (unsigned long)(read_ahead_size *
+ read_ahead_weight / 100);
+ } else {
+ nr[LRU_READ_AHEAD_FILE] = 0;
+ }
+ } else if (scan_balance == SCAN_FRACT) {
+ if (read_ahead_size > read_ahead_age_threshold ||
+ nr_inactive_file < read_ahead_size) {
+ read_ahead_size =
+ mem_cgroup_online(memcg) ?
+ div64_u64(read_ahead_size * fraction[1],
+ denominator) :
+ DIV64_U64_ROUND_UP(read_ahead_size *
+ fraction[1],
+ denominator);
+ nr[LRU_READ_AHEAD_FILE] =
+ (unsigned long)(read_ahead_size *
+ read_ahead_weight / 100);
+ } else {
+ nr[LRU_READ_AHEAD_FILE] = 0;
+ }
+
+ } else {
+ nr[LRU_READ_AHEAD_FILE] = 0;
+ }
}
/*
@@ -5800,6 +5837,87 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *
#endif /* CONFIG_LRU_GEN */
+static unsigned long shrink_read_ahead_list(unsigned long nr_to_scan,
+ unsigned long nr_to_reclaim,
+ struct lruvec *lruvec,
+ struct scan_control *sc)
+{
+ LIST_HEAD(l_hold);
+ LIST_HEAD(l_reclaim);
+ LIST_HEAD(l_inactive);
+ unsigned long nr_scanned = 0;
+ unsigned long nr_taken = 0;
+ unsigned long nr_reclaimed = 0;
+ unsigned long vm_flags;
+ enum vm_event_item item;
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ struct reclaim_stat stat = { 0 };
+
+ lru_add_drain();
+
+ spin_lock_irq(&lruvec->lru_lock);
+ nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, &nr_scanned,
+ sc, LRU_READ_AHEAD_FILE);
+
+ __count_vm_events(PGSCAN_READAHEAD_FILE, nr_scanned);
+ __mod_node_page_state(pgdat, NR_ISOLATED_FILE, nr_taken);
+ item = PGSCAN_KSWAPD + reclaimer_offset(sc);
+ if (!cgroup_reclaim(sc))
+ __count_vm_events(item, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
+ __count_vm_events(PGSCAN_FILE, nr_scanned);
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ if (nr_taken == 0)
+ return 0;
+
+ while (!list_empty(&l_hold)) {
+ struct folio *folio;
+
+ cond_resched();
+ folio = lru_to_folio(&l_hold);
+ list_del(&folio->lru);
+ folio_clear_readahead_lru(folio);
+
+ if (folio_referenced(folio, 0, sc->target_mem_cgroup, &vm_flags)) {
+ list_add(&folio->lru, &l_inactive);
+ continue;
+ }
+ folio_clear_active(folio);
+ list_add(&folio->lru, &l_reclaim);
+ }
+
+ nr_reclaimed = shrink_folio_list(&l_reclaim, pgdat, sc, &stat, true,
+ lruvec_memcg(lruvec));
+
+ list_splice(&l_reclaim, &l_inactive);
+
+ spin_lock_irq(&lruvec->lru_lock);
+ move_folios_to_lru(lruvec, &l_inactive);
+ __mod_node_page_state(pgdat, NR_ISOLATED_FILE, -nr_taken);
+
+ __count_vm_events(PGSTEAL_READAHEAD_FILE, nr_reclaimed);
+ item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
+ if (!cgroup_reclaim(sc))
+ __count_vm_events(item, nr_reclaimed);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
+ __count_vm_events(PGSTEAL_FILE, nr_reclaimed);
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ sc->nr.dirty += stat.nr_dirty;
+ sc->nr.congested += stat.nr_congested;
+ sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
+ sc->nr.writeback += stat.nr_writeback;
+ sc->nr.immediate += stat.nr_immediate;
+ sc->nr.taken += nr_taken;
+ sc->nr.file_taken += nr_taken;
+
+ trace_mm_vmscan_lru_shrink_readahead(pgdat->node_id, nr_to_scan,
+ nr_to_reclaim, nr_scanned,
+ nr_taken, nr_reclaimed);
+ return nr_reclaimed;
+}
+
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
@@ -5836,6 +5954,19 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
sc->priority == DEF_PRIORITY);
blk_start_plug(&plug);
+
+ while (nr[LRU_READ_AHEAD_FILE] > 0) {
+ nr_to_scan = min(nr[LRU_READ_AHEAD_FILE], SWAP_CLUSTER_MAX);
+ nr[LRU_READ_AHEAD_FILE] -= nr_to_scan;
+
+ nr_reclaimed += shrink_read_ahead_list(nr_to_scan,
+ nr_to_reclaim,
+ lruvec, sc);
+
+ if (nr_reclaimed >= nr_to_reclaim)
+ goto out;
+ }
+
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
unsigned long nr_anon, nr_file, percentage;
@@ -5905,6 +6036,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
nr[lru] = targets[lru] * (100 - percentage) / 100;
nr[lru] -= min(nr[lru], nr_scanned);
}
+out:
blk_finish_plug(&plug);
sc->nr_reclaimed += nr_reclaimed;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 71cd1ceba191..fda968e489e5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1203,6 +1203,7 @@ const char * const vmstat_text[] = {
[I(NR_ZONE_ACTIVE_ANON)] = "nr_zone_active_anon",
[I(NR_ZONE_INACTIVE_FILE)] = "nr_zone_inactive_file",
[I(NR_ZONE_ACTIVE_FILE)] = "nr_zone_active_file",
+ [I(NR_ZONE_READAHEAD_FILE)] = "nr_zone_readahead_file",
[I(NR_ZONE_UNEVICTABLE)] = "nr_zone_unevictable",
[I(NR_ZONE_WRITE_PENDING)] = "nr_zone_write_pending",
[I(NR_MLOCK)] = "nr_mlock",
@@ -1233,6 +1234,7 @@ const char * const vmstat_text[] = {
[I(NR_ACTIVE_ANON)] = "nr_active_anon",
[I(NR_INACTIVE_FILE)] = "nr_inactive_file",
[I(NR_ACTIVE_FILE)] = "nr_active_file",
+ [I(NR_READAHEAD_FILE)] = "nr_readahead_file",
[I(NR_UNEVICTABLE)] = "nr_unevictable",
[I(NR_SLAB_RECLAIMABLE_B)] = "nr_slab_reclaimable",
[I(NR_SLAB_UNRECLAIMABLE_B)] = "nr_slab_unreclaimable",
@@ -1339,8 +1341,10 @@ const char * const vmstat_text[] = {
[I(PGSCAN_DIRECT_THROTTLE)] = "pgscan_direct_throttle",
[I(PGSCAN_ANON)] = "pgscan_anon",
[I(PGSCAN_FILE)] = "pgscan_file",
+ [I(PGSCAN_READAHEAD_FILE)] = "pgscan_readahead_file",
[I(PGSTEAL_ANON)] = "pgsteal_anon",
[I(PGSTEAL_FILE)] = "pgsteal_file",
+ [I(PGSTEAL_READAHEAD_FILE)] = "pgsteal_readahead_file",
#ifdef CONFIG_NUMA
[I(PGSCAN_ZONE_RECLAIM_SUCCESS)] = "zone_reclaim_success",
--
2.34.1
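
For clarity, the scan-target heuristic that the get_scan_count() hunk above
adds for the readahead list boils down to the sketch below. This is only an
editorial restatement, not kernel API: the macro and function names are made
up here, PAGE_SHIFT is assumed to be 12, and the SCAN_FRACT case additionally
scales read_ahead_size by fraction[1]/denominator before the weight is applied.

/* Illustrative restatement of the SCAN_FILE branch added to get_scan_count(). */
#define READAHEAD_AGE_THRESHOLD_PAGES	(240UL << (20 - 12))	/* 240 MiB of 4 KiB pages */
#define READAHEAD_WEIGHT_PERCENT	5UL

/*
 * readahead_pages:     lruvec_lru_size() of LRU_READ_AHEAD_FILE
 * nr_inactive_to_scan: the scan target already computed for LRU_INACTIVE_FILE
 */
static unsigned long readahead_scan_target(unsigned long readahead_pages,
					   unsigned long nr_inactive_to_scan)
{
	/* Grace period: leave the readahead LRU alone while it is still small. */
	if (readahead_pages <= READAHEAD_AGE_THRESHOLD_PAGES &&
	    nr_inactive_to_scan >= readahead_pages)
		return 0;

	/* Otherwise feed a small fixed percentage of it to the shrinker. */
	return readahead_pages * READAHEAD_WEIGHT_PERCENT / 100;
}

With the posted constants, the readahead list is left to age until it exceeds
240 MiB (or outgrows the inactive-file scan target); after that, about 5% of
it is targeted per reclaim pass (roughly 51 MiB for a 1 GiB list). That
percentage is the dial the "Next Steps" item proposes to tune.
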
On Tue, Sep 16, 2025 at 2:22 AM Lei Liu <liulei.rjpt@vivo.com> wrote:
> ...
>
> 2. Solution Proposal
> Introduce a Readahead LRU to track pages brought in via readahead. During
> memory reclamation, prioritize scanning this LRU to reclaim pages that
> have not been accessed recently. For pages in the Readahead LRU that are
> accessed, move them back to the inactive_file LRU to await subsequent
> reclamation.

I'm unsure this is the right solution though, given all users would
have this readahead LRU on and we don't have performance numbers
besides application startup here.
My impression is that readahead behavior is highly dependent on the
hardware, the workload, and the desired behavior, so making the
readahead{-adjacent} behavior more amenable to tuning seems like the
right direction.

Maybe relevant discussions: https://lwn.net/Articles/897786/

I only skimmed the code but noticed a few things:

> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> index a458f1e112fd..4f3f031134fd 100644
> --- a/fs/proc/meminfo.c
> +++ b/fs/proc/meminfo.c
> @@ -71,6 +71,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
> show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]);
> show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]);
> show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
> + show_val_kb(m, "ReadAhead(file):",

I notice both readahead and read ahead in this patch. Stick to the
conventional one (readahead).

> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index 8d3fa3a91ce4..57dac828aa4f 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -127,6 +127,7 @@ enum pageflags {
> #ifdef CONFIG_ARCH_USES_PG_ARCH_3
> PG_arch_3,
> #endif
> + PG_readahead_lru,

More pageflags...

> diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
> index aa441f593e9a..2dbc1701e838 100644
> --- a/include/trace/events/mmflags.h
> +++ b/include/trace/events/mmflags.h
> @@ -159,7 +159,8 @@ TRACE_DEFINE_ENUM(___GFP_LAST_BIT);
> DEF_PAGEFLAG_NAME(reclaim), \
> DEF_PAGEFLAG_NAME(swapbacked), \
> DEF_PAGEFLAG_NAME(unevictable), \
> - DEF_PAGEFLAG_NAME(dropbehind) \
> + DEF_PAGEFLAG_NAME(dropbehind), \
> + DEF_PAGEFLAG_NAME(readahead_lru) \
> IF_HAVE_PG_MLOCK(mlocked) \
> IF_HAVE_PG_HWPOISON(hwpoison) \
> IF_HAVE_PG_IDLE(idle) \
> @@ -309,6 +310,7 @@ IF_HAVE_VM_DROPPABLE(VM_DROPPABLE, "droppable" ) \
> EM (LRU_ACTIVE_ANON, "active_anon") \
> EM (LRU_INACTIVE_FILE, "inactive_file") \
> EM (LRU_ACTIVE_FILE, "active_file") \
> + EM(LRU_READ_AHEAD_FILE, "readahead_file") \

Likewise, inconsistent naming.

> diff --git a/mm/migrate.c b/mm/migrate.c
> index 9e5ef39ce73a..0feab4d89d47 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -760,6 +760,8 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
> folio_set_workingset(newfolio);
> if (folio_test_checked(folio))
> folio_set_checked(newfolio);
> + if (folio_test_readahead_lru(folio))
> + folio_set_readahead_lru(folio);

newfolio

> /*
> @@ -5800,6 +5837,87 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *
>
> #endif /* CONFIG_LRU_GEN */
>
> +static unsigned long shrink_read_ahead_list(unsigned long nr_to_scan,
> + unsigned long nr_to_reclaim,
> + struct lruvec *lruvec,
> + struct scan_control *sc)
> +{
> + LIST_HEAD(l_hold);
> + LIST_HEAD(l_reclaim);
> + LIST_HEAD(l_inactive);
> + unsigned long nr_scanned = 0;
> + unsigned long nr_taken = 0;
> + unsigned long nr_reclaimed = 0;
> + unsigned long vm_flags;
> + enum vm_event_item item;
> + struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> + struct reclaim_stat stat = { 0 };
> +
> + lru_add_drain();
> +
> + spin_lock_irq(&lruvec->lru_lock);
> + nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, &nr_scanned,
> + sc, LRU_READ_AHEAD_FILE);
> +
> + __count_vm_events(PGSCAN_READAHEAD_FILE, nr_scanned);
> + __mod_node_page_state(pgdat, NR_ISOLATED_FILE, nr_taken);
> + item = PGSCAN_KSWAPD + reclaimer_offset(sc);
> + if (!cgroup_reclaim(sc))
> + __count_vm_events(item, nr_scanned);
> + count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
> + __count_vm_events(PGSCAN_FILE, nr_scanned);
> + spin_unlock_irq(&lruvec->lru_lock);
> +
> + if (nr_taken == 0)
> + return 0;
> +
> + while (!list_empty(&l_hold)) {
> + struct folio *folio;
> +
> + cond_resched();
> + folio = lru_to_folio(&l_hold);
> + list_del(&folio->lru);
> + folio_clear_readahead_lru(folio);
> +
> + if (folio_referenced(folio, 0, sc->target_mem_cgroup, &vm_flags)) {
> + list_add(&folio->lru, &l_inactive);
> + continue;
> + }
> + folio_clear_active(folio);
> + list_add(&folio->lru, &l_reclaim);
> + }
> +
> + nr_reclaimed = shrink_folio_list(&l_reclaim, pgdat, sc, &stat, true,
> + lruvec_memcg(lruvec));
> +
> + list_splice(&l_reclaim, &l_inactive);
> +
> + spin_lock_irq(&lruvec->lru_lock);
> + move_folios_to_lru(lruvec, &l_inactive);
> + __mod_node_page_state(pgdat, NR_ISOLATED_FILE, -nr_taken);
> +
> + __count_vm_events(PGSTEAL_READAHEAD_FILE, nr_reclaimed);
> + item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
> + if (!cgroup_reclaim(sc))
> + __count_vm_events(item, nr_reclaimed);
> + count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
> + __count_vm_events(PGSTEAL_FILE, nr_reclaimed);
> + spin_unlock_irq(&lruvec->lru_lock);

I see the idea is that readahead pages should be scanned before the
rest of inactive file. I wonder if this is achievable without adding
another LRU.


Thanks,
Yuanchu

> 1. Problem Background
> In Android systems, a significant challenge arises during application
> startup when a large number of private application files are read.
> Approximately 90% of these file pages are loaded into memory via readahead.
> However, about 85% of these pre-read pages are reclaimed without ever being
> accessed, which means only around 15% of the pre-read pages are effectively
> utilized. This results in wasted memory, as unaccessed file pages consume
> valuable memory space, leading to memory thrashing and unnecessary I/O
> reads.
>
> 2. Solution Proposal
> Introduce a Readahead LRU to track pages brought in via readahead. During
> memory reclamation, prioritize scanning this LRU to reclaim pages that
> have not been accessed recently. For pages in the Readahead LRU that are
> accessed, move them back to the inactive_file LRU to await subsequent
> reclamation.
>
> 3. Benefits Data
> In tests involving the cold start of 30 applications:
> Memory Reclamation Efficiency: The slowpath process saw a reduction of
> over 30%.

Did you enable MGLRU? If you did, I guess "do not activate the page" and a
separate LRU would have the same effect, but I didn't find any benefits.

diff --git a/mm/swap.c b/mm/swap.c
index 3632dd061beb..9e87996abbc9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -504,7 +504,8 @@ void folio_add_lru(struct folio *folio)
 /* see the comment in lru_gen_folio_seq() */
 if (lru_gen_enabled() && !folio_test_unevictable(folio) &&
- lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
+ lru_gen_in_fault() && !(current->flags & PF_MEMALLOC) &&
+ !folio_test_readahead_lru(folio))
 folio_set_active(folio);
 folio_batch_add_and_move(folio, lru_add, false);

> 4. Current Issues
> The refault metric for file pages has significantly degraded, increasing
> by about 100%. This is primarily because pages are reclaimed too quickly,
> without sufficient aging.
>
> 5. Next Steps
> When calculating reclamation propensity, adjust the intensity of
> reclamation from the Readahead LRU. This ensures aging and reclamation
> efficiency while allowing adequate aging time.
>
> Signed-off-by: Lei Liu <liulei.rjpt@vivo.com>

On 2025/9/17 0:33, Yuanchu Xie wrote:
> On Tue, Sep 16, 2025 at 2:22 AM Lei Liu <liulei.rjpt@vivo.com> wrote:
>> ...
>>
>> 2. Solution Proposal
>> Introduce a Readahead LRU to track pages brought in via readahead. During
>> memory reclamation, prioritize scanning this LRU to reclaim pages that
>> have not been accessed recently. For pages in the Readahead LRU that are
>> accessed, move them back to the inactive_file LRU to await subsequent
>> reclamation.
> I'm unsure this is the right solution though, given all users would
> have this readahead LRU on and we don't have performance numbers
> besides application startup here.
> My impression is that readahead behavior is highly dependent on the
> hardware, the workload, and the desired behavior, so making the
> readahead{-adjacent} behavior more amenable to tuning seems like the
> right direction.
>
> Maybe relevant discussions: https://lwn.net/Articles/897786/
>
> I only skimmed the code but noticed a few things:
>
>> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
>> index a458f1e112fd..4f3f031134fd 100644
>> --- a/fs/proc/meminfo.c
>> +++ b/fs/proc/meminfo.c
>> @@ -71,6 +71,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
>> show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]);
>> show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]);
>> show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
>> + show_val_kb(m, "ReadAhead(file):",
> I notice both readahead and read ahead in this patch. Stick to the
> conventional one (readahead).
>
>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
>> index 8d3fa3a91ce4..57dac828aa4f 100644
>> --- a/include/linux/page-flags.h
>> +++ b/include/linux/page-flags.h
>> @@ -127,6 +127,7 @@ enum pageflags {
>> #ifdef CONFIG_ARCH_USES_PG_ARCH_3
>> PG_arch_3,
>> #endif
>> + PG_readahead_lru,
> More pageflags...
>
>> diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
>> index aa441f593e9a..2dbc1701e838 100644
>> --- a/include/trace/events/mmflags.h
>> +++ b/include/trace/events/mmflags.h
>> @@ -159,7 +159,8 @@ TRACE_DEFINE_ENUM(___GFP_LAST_BIT);
>> DEF_PAGEFLAG_NAME(reclaim), \
>> DEF_PAGEFLAG_NAME(swapbacked), \
>> DEF_PAGEFLAG_NAME(unevictable), \
>> - DEF_PAGEFLAG_NAME(dropbehind) \
>> + DEF_PAGEFLAG_NAME(dropbehind), \
>> + DEF_PAGEFLAG_NAME(readahead_lru) \
>> IF_HAVE_PG_MLOCK(mlocked) \
>> IF_HAVE_PG_HWPOISON(hwpoison) \
>> IF_HAVE_PG_IDLE(idle) \
>> @@ -309,6 +310,7 @@ IF_HAVE_VM_DROPPABLE(VM_DROPPABLE, "droppable" ) \
>> EM (LRU_ACTIVE_ANON, "active_anon") \
>> EM (LRU_INACTIVE_FILE, "inactive_file") \
>> EM (LRU_ACTIVE_FILE, "active_file") \
>> + EM(LRU_READ_AHEAD_FILE, "readahead_file") \
> Likewise, inconsistent naming.
>
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 9e5ef39ce73a..0feab4d89d47 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -760,6 +760,8 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
>> folio_set_workingset(newfolio);
>> if (folio_test_checked(folio))
>> folio_set_checked(newfolio);
>> + if (folio_test_readahead_lru(folio))
>> + folio_set_readahead_lru(folio);
> newfolio

Understood. I'll revise accordingly.

>
>> /*
>> @@ -5800,6 +5837,87 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *
>>
>> #endif /* CONFIG_LRU_GEN */
>>
>> +static unsigned long shrink_read_ahead_list(unsigned long nr_to_scan,
>> + unsigned long nr_to_reclaim,
>> + struct lruvec *lruvec,
>> + struct scan_control *sc)
>> +{
>> + LIST_HEAD(l_hold);
>> + LIST_HEAD(l_reclaim);
>> + LIST_HEAD(l_inactive);
>> + unsigned long nr_scanned = 0;
>> + unsigned long nr_taken = 0;
>> + unsigned long nr_reclaimed = 0;
>> + unsigned long vm_flags;
>> + enum vm_event_item item;
>> + struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>> + struct reclaim_stat stat = { 0 };
>> +
>> + lru_add_drain();
>> +
>> + spin_lock_irq(&lruvec->lru_lock);
>> + nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, &nr_scanned,
>> + sc, LRU_READ_AHEAD_FILE);
>> +
>> + __count_vm_events(PGSCAN_READAHEAD_FILE, nr_scanned);
>> + __mod_node_page_state(pgdat, NR_ISOLATED_FILE, nr_taken);
>> + item = PGSCAN_KSWAPD + reclaimer_offset(sc);
>> + if (!cgroup_reclaim(sc))
>> + __count_vm_events(item, nr_scanned);
>> + count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
>> + __count_vm_events(PGSCAN_FILE, nr_scanned);
>> + spin_unlock_irq(&lruvec->lru_lock);
>> +
>> + if (nr_taken == 0)
>> + return 0;
>> +
>> + while (!list_empty(&l_hold)) {
>> + struct folio *folio;
>> +
>> + cond_resched();
>> + folio = lru_to_folio(&l_hold);
>> + list_del(&folio->lru);
>> + folio_clear_readahead_lru(folio);
>> +
>> + if (folio_referenced(folio, 0, sc->target_mem_cgroup, &vm_flags)) {
>> + list_add(&folio->lru, &l_inactive);
>> + continue;
>> + }
>> + folio_clear_active(folio);
>> + list_add(&folio->lru, &l_reclaim);
>> + }
>> +
>> + nr_reclaimed = shrink_folio_list(&l_reclaim, pgdat, sc, &stat, true,
>> + lruvec_memcg(lruvec));
>> +
>> + list_splice(&l_reclaim, &l_inactive);
>> +
>> + spin_lock_irq(&lruvec->lru_lock);
>> + move_folios_to_lru(lruvec, &l_inactive);
>> + __mod_node_page_state(pgdat, NR_ISOLATED_FILE, -nr_taken);
>> +
>> + __count_vm_events(PGSTEAL_READAHEAD_FILE, nr_reclaimed);
>> + item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
>> + if (!cgroup_reclaim(sc))
>> + __count_vm_events(item, nr_reclaimed);
>> + count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
>> + __count_vm_events(PGSTEAL_FILE, nr_reclaimed);
>> + spin_unlock_irq(&lruvec->lru_lock);
> I see the idea is that readahead pages should be scanned before the
> rest of inactive file. I wonder if this is achievable without adding
> another LRU.
>
>
> Thanks,
> Yuanchu

Hi, Yuanchu

Thank you for your valuable feedback!

1. We initially considered keeping readahead pages in the system's
existing inactive/active LRUs without adding a dedicated LRU. However,
this approach may lead to inefficient reclamation of readahead pages.

Reason: when scanning the inactive LRU, processing readahead pages can
be frequently interrupted by non-readahead pages (e.g., shared/accessed
pages). The reference checks for these non-readahead pages incur
significant overhead, slowing down the scanning and reclamation of
readahead pages. Thus, isolating readahead pages in a readahead LRU
allows more targeted reclamation, significantly accelerating scanning
and recycling efficiency.

2. That said, this solution does raise valid concerns. As you rightly
pointed out, enabling this LRU globally may not align with all users'
needs, since not every scenario requires it.

3. For now, this remains a preliminary solution. The primary goal of
this RFC is to highlight the issue of excessive readahead overhead and
gather community insights for better alternatives. We are actively
exploring approaches that avoid adding a new LRU for future iterations.

Best regards,
Lei Liu
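
For anyone reproducing the numbers above, the counters this series adds can be
watched from userspace. A minimal sketch (not part of the series; only the
field names are taken from the meminfo.c and vmstat.c hunks, the rest is
illustrative):

#include <stdio.h>
#include <string.h>

/* Print the lines of @path whose field names match one of @keys. */
static void dump_keys(const char *path, const char *const keys[], int nkeys)
{
	char line[256];
	FILE *fp = fopen(path, "r");
	int i;

	if (!fp) {
		perror(path);
		return;
	}
	while (fgets(line, sizeof(line), fp)) {
		for (i = 0; i < nkeys; i++) {
			if (!strncmp(line, keys[i], strlen(keys[i])))
				fputs(line, stdout);
		}
	}
	fclose(fp);
}

int main(void)
{
	static const char *const meminfo_keys[] = { "ReadAhead(file)" };
	static const char *const vmstat_keys[] = {
		"nr_readahead_file",
		"pgscan_readahead_file",
		"pgsteal_readahead_file",
	};

	dump_keys("/proc/meminfo", meminfo_keys, 1);
	dump_keys("/proc/vmstat", vmstat_keys, 3);
	return 0;
}

Sampling these before and after an application cold start shows how much of
the page cache is sitting on the readahead list and how much of the scan and
steal traffic it absorbs.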