include/linux/swap.h | 13 +++++- mm/page_alloc.c | 101 +++++++++++++++++++++++++++++++++++-------- mm/vmscan.c | 72 ++++++++++++++++++++++-------- 3 files changed, 146 insertions(+), 40 deletions(-)
From: Matt Fleming <mfleming@cloudflare.com>
should_reclaim_retry() uses zone_reclaimable_pages() to estimate whether
retrying reclaim could eventually satisfy an allocation. It's possible
for reclaim to make minimal or no progress on an LRU type despite having
ample reclaimable pages, e.g. anonymous pages when the only swap is
RAM-backed (zram). This can cause the reclaim path to loop indefinitely.
Track LRU reclaim progress (anon vs file) through a new struct
reclaim_progress passed out of try_to_free_pages(), and only count a
type's reclaimable pages in a zone if the pages reclaimed from that type
in the last cycle amount to at least reclaim_progress_pct% of them.
The threshold is exposed as /proc/sys/vm/reclaim_progress_pct (default
1, range 0-100). Setting 0 disables the gate and restores the previous
behaviour. Environments with only RAM-backed swap (zram) and small
memory may need a higher value to prevent futile anon LRU churn from
keeping the allocator spinning.
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Matt Fleming <mfleming@cloudflare.com>
---
include/linux/swap.h | 13 +++++-
mm/page_alloc.c | 101 +++++++++++++++++++++++++++++++++++--------
mm/vmscan.c | 72 ++++++++++++++++++++++--------
3 files changed, 146 insertions(+), 40 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 62fc7499b408..d46477365cd9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -368,9 +368,18 @@ void folio_mark_lazyfree(struct folio *folio);
extern void swap_setup(void);
/* linux/mm/vmscan.c */
+struct reclaim_progress { /* filled in by try_to_free_pages() */
+ unsigned long nr_reclaimed; /* total reclaimed; set to 1 on the throttle bail-out */
+ unsigned long nr_anon; /* from sc.nr_reclaimed_anon; not written on the throttle bail-out */
+ unsigned long nr_file; /* from sc.nr_reclaimed_file; not written on the throttle bail-out */
+};
+
extern unsigned long zone_reclaimable_pages(struct zone *zone);
-extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
- gfp_t gfp_mask, nodemask_t *mask);
+extern unsigned long zone_reclaimable_file_pages(struct zone *zone);
+extern unsigned long zone_reclaimable_anon_pages(struct zone *zone);
+extern void try_to_free_pages(struct zonelist *zonelist, int order,
+ gfp_t gfp_mask, nodemask_t *mask,
+ struct reclaim_progress *progress);
#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
#define MEMCG_RECLAIM_PROACTIVE (1 << 2)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2d4b6f1a554e..0f2597542ace 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4407,12 +4407,11 @@ static unsigned int check_retry_zonelist(unsigned int seq)
}
/* Perform direct synchronous page reclaim */
-static unsigned long
-__perform_reclaim(gfp_t gfp_mask, unsigned int order,
- const struct alloc_context *ac)
+static void __perform_reclaim(gfp_t gfp_mask, unsigned int order,
+ const struct alloc_context *ac,
+ struct reclaim_progress *progress)
{
unsigned int noreclaim_flag;
- unsigned long progress;
cond_resched();
@@ -4421,30 +4420,27 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
fs_reclaim_acquire(gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
- progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
- ac->nodemask);
+ try_to_free_pages(ac->zonelist, order, gfp_mask, ac->nodemask, progress);
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(gfp_mask);
cond_resched();
-
- return progress;
}
/* The really slow allocator path where we enter direct reclaim */
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- unsigned long *did_some_progress)
+ struct reclaim_progress *progress)
{
struct page *page = NULL;
unsigned long pflags;
bool drained = false;
psi_memstall_enter(&pflags);
- *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
- if (unlikely(!(*did_some_progress)))
+ __perform_reclaim(gfp_mask, order, ac, progress);
+ if (unlikely(!progress->nr_reclaimed))
goto out;
retry:
@@ -4586,6 +4582,41 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
return !!__gfp_pfmemalloc_flags(gfp_mask);
}
+/*
+ * Minimum percentage of LRU reclaimable pages that must have been
+ * reclaimed in the last cycle for that type to be counted towards the
+ * "can we satisfy this allocation?" watermark check in
+ * should_reclaim_retry().
+ *
+ * This prevents systems with only RAM-backed swap (zram) from
+ * endlessly retrying reclaim for anon pages when minimal progress is
+ * made despite seemingly having lots of reclaimable pages.
+ *
+ * Setting this to 0 disables the per-LRU progress check: all
+ * reclaimable pages are always counted towards watermark.
+ */
+static int reclaim_progress_pct __read_mostly = 1;
+
+/*
+ * Return true if reclaim for this LRU type made at least
+ * reclaim_progress_pct% progress in the last cycle or the LRU progress
+ * check is disabled.
+ *
+ * @reclaimed: pages reclaimed from this LRU type in the last cycle
+ * @reclaimable: pages of this LRU type currently counted as reclaimable
+ *
+ * The threshold is reclaim_progress_pct% of @reclaimable, rounded up, so
+ * with a non-zero percentage and a non-zero @reclaimable at least one
+ * page must have been reclaimed for this to return true.
+ */
+static inline bool reclaim_progress_sufficient(unsigned long reclaimed,
+ unsigned long reclaimable)
+{
+ unsigned long threshold;
+
+ if (!reclaim_progress_pct) /* 0 disables the per-LRU gate */
+ return true;
+
+ if (!reclaimable) /* nothing of this type to count */
+ return false;
+
+ threshold = DIV_ROUND_UP(reclaimable * reclaim_progress_pct, 100);
+ return reclaimed >= threshold;
+}
+
/*
* Checks whether it makes sense to retry the reclaim to make a forward progress
* for the given allocation request.
@@ -4599,11 +4630,13 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
static inline bool
should_reclaim_retry(gfp_t gfp_mask, unsigned order,
struct alloc_context *ac, int alloc_flags,
- bool did_some_progress, int *no_progress_loops)
+ struct reclaim_progress *progress,
+ int *no_progress_loops)
{
struct zone *zone;
struct zoneref *z;
bool ret = false;
+ bool did_some_progress = progress->nr_reclaimed > 0;
/*
* Costly allocations might have made a progress but this doesn't mean
@@ -4629,6 +4662,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
ac->highest_zoneidx, ac->nodemask) {
unsigned long available;
unsigned long reclaimable;
+ unsigned long reclaimable_anon;
+ unsigned long reclaimable_file;
unsigned long min_wmark = min_wmark_pages(zone);
bool wmark;
@@ -4637,7 +4672,24 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
!__cpuset_zone_allowed(zone, gfp_mask))
continue;
- available = reclaimable = zone_reclaimable_pages(zone);
+ /*
+ * Only count reclaimable pages from an LRU type if reclaim
+ * actually made headway on that type in the last cycle.
+ * This prevents the allocator from looping endlessly on
+ * account of a large pool of pages that reclaim cannot make
+ * progress on, e.g. anonymous pages when the only swap is
+ * RAM-backed (zram).
+ */
+ reclaimable = 0;
+ reclaimable_file = zone_reclaimable_file_pages(zone);
+ reclaimable_anon = zone_reclaimable_anon_pages(zone);
+
+ if (reclaim_progress_sufficient(progress->nr_file, reclaimable_file))
+ reclaimable += reclaimable_file;
+ if (reclaim_progress_sufficient(progress->nr_anon, reclaimable_anon))
+ reclaimable += reclaimable_anon;
+
+ available = reclaimable;
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
/*
@@ -4716,7 +4768,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
struct page *page = NULL;
unsigned int alloc_flags;
- unsigned long did_some_progress;
+ struct reclaim_progress reclaim_progress = {};
+ unsigned long oom_progress;
enum compact_priority compact_priority;
enum compact_result compact_result;
int compaction_retries;
@@ -4727,6 +4780,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
bool compact_first = false;
bool can_retry_reserves = true;
+
if (unlikely(nofail)) {
/*
* Also we don't support __GFP_NOFAIL without __GFP_DIRECT_RECLAIM,
@@ -4844,7 +4898,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
/* Try direct reclaim and then allocating */
if (!compact_first) {
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags,
- ac, &did_some_progress);
+ ac, &reclaim_progress);
if (page)
goto got_pg;
}
@@ -4904,7 +4958,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
goto restart;
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
- did_some_progress > 0, &no_progress_loops))
+ &reclaim_progress, &no_progress_loops))
goto retry;
/*
@@ -4913,7 +4967,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
* implementation of the compaction depends on the sufficient amount
* of free memory (see __compaction_suitable)
*/
- if (did_some_progress > 0 && can_compact &&
+ if (reclaim_progress.nr_reclaimed > 0 && can_compact &&
should_compact_retry(ac, order, alloc_flags,
compact_result, &compact_priority,
&compaction_retries))
@@ -4934,7 +4988,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
goto restart;
/* Reclaim has failed us, start killing things */
- page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
+ page = __alloc_pages_may_oom(gfp_mask, order, ac, &oom_progress);
if (page)
goto got_pg;
@@ -4945,7 +4999,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
goto nopage;
/* Retry as long as the OOM killer is making progress */
- if (did_some_progress) {
+ if (oom_progress) {
no_progress_loops = 0;
goto retry;
}
@@ -6775,6 +6829,15 @@ static const struct ctl_table page_alloc_sysctl_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
+ {
+ .procname = "reclaim_progress_pct",
+ .data = &reclaim_progress_pct,
+ .maxlen = sizeof(reclaim_progress_pct),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
{
.procname = "percpu_pagelist_high_fraction",
.data = &percpu_pagelist_high_fraction,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0fc9373e8251..9087b4e0a704 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -167,6 +167,10 @@ struct scan_control {
/* Number of pages freed so far during a call to shrink_zones() */
unsigned long nr_reclaimed;
+ /* Anon/file LRU contributions to nr_reclaimed */
+ unsigned long nr_reclaimed_anon;
+ unsigned long nr_reclaimed_file;
+
struct {
unsigned int dirty;
unsigned int unqueued_dirty;
@@ -385,6 +389,21 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
return can_demote(nid, sc, memcg);
}
+unsigned long zone_reclaimable_file_pages(struct zone *zone)
+{
+ return zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
+ zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
+}
+
+unsigned long zone_reclaimable_anon_pages(struct zone *zone)
+{
+ if (!can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
+ return 0;
+
+ return zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
+ zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
+}
+
/*
* This misses isolated folios which are not accounted for to save counters.
* As the data only determines if reclaim or compaction continues, it is
@@ -392,15 +411,8 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
*/
unsigned long zone_reclaimable_pages(struct zone *zone)
{
- unsigned long nr;
-
- nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
- zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
- if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
- nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
- zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
-
- return nr;
+ return zone_reclaimable_file_pages(zone) +
+ zone_reclaimable_anon_pages(zone);
}
/**
@@ -4718,6 +4730,10 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg);
sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
sc->nr_reclaimed += reclaimed;
+ if (type)
+ sc->nr_reclaimed_file += reclaimed;
+ else
+ sc->nr_reclaimed_anon += reclaimed;
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
scanned, reclaimed, &stat, sc->priority,
type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
@@ -5776,6 +5792,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
unsigned long nr_to_scan;
enum lru_list lru;
unsigned long nr_reclaimed = 0;
+ unsigned long nr_reclaimed_anon = 0;
+ unsigned long nr_reclaimed_file = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
bool proportional_reclaim;
struct blk_plug plug;
@@ -5812,11 +5830,18 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
for_each_evictable_lru(lru) {
if (nr[lru]) {
+ unsigned long reclaimed;
+
nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
nr[lru] -= nr_to_scan;
- nr_reclaimed += shrink_list(lru, nr_to_scan,
- lruvec, sc);
+ reclaimed = shrink_list(lru, nr_to_scan,
+ lruvec, sc);
+ nr_reclaimed += reclaimed;
+ if (is_file_lru(lru))
+ nr_reclaimed_file += reclaimed;
+ else
+ nr_reclaimed_anon += reclaimed;
}
}
@@ -5876,6 +5901,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
}
blk_finish_plug(&plug);
sc->nr_reclaimed += nr_reclaimed;
+ sc->nr_reclaimed_anon += nr_reclaimed_anon;
+ sc->nr_reclaimed_file += nr_reclaimed_file;
/*
* Even if we did not try to evict anon pages at all, we want to
@@ -6563,8 +6590,9 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
return false;
}
-unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
- gfp_t gfp_mask, nodemask_t *nodemask)
+void try_to_free_pages(struct zonelist *zonelist, int order,
+ gfp_t gfp_mask, nodemask_t *nodemask,
+ struct reclaim_progress *progress)
{
unsigned long nr_reclaimed;
struct scan_control sc = {
@@ -6588,12 +6616,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
/*
- * Do not enter reclaim if fatal signal was delivered while throttled.
- * 1 is returned so that the page allocator does not OOM kill at this
- * point.
+ * Do not enter reclaim if fatal signal was delivered while
+ * throttled. nr_reclaimed is set to 1 so that the page
+ * allocator does not OOM kill at this point.
*/
- if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
- return 1;
+ if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) {
+ nr_reclaimed = 1;
+ goto out;
+ }
set_task_reclaim_state(current, &sc.reclaim_state);
trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
@@ -6603,7 +6633,11 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
set_task_reclaim_state(current, NULL);
- return nr_reclaimed;
+ progress->nr_anon = sc.nr_reclaimed_anon;
+ progress->nr_file = sc.nr_reclaimed_file;
+
+out:
+ progress->nr_reclaimed = nr_reclaimed;
}
#ifdef CONFIG_MEMCG
--
2.43.0
On Fri, Apr 10, 2026 at 6:16 PM Matt Fleming <matt@readmodwrite.com> wrote: > > From: Matt Fleming <mfleming@cloudflare.com> > > should_reclaim_retry() uses zone_reclaimable_pages() to estimate whether > retrying reclaim could eventually satisfy an allocation. It's possible > for reclaim to make minimal or no progress on an LRU type despite having > ample reclaimable pages, e.g. anonymous pages when the only swap is > RAM-backed (zram). This can cause the reclaim path to loop indefinitely. I am still struggling to understand when zram-backed reclamation cannot make progress. Is it because zram is full, or because folio_alloc_swap() fails? Or does zs_malloc() fail, causing pageout() to fail? Even incompressible pages are still written as ZRAM_HUGE pages and reclaimed successfully. > > Track LRU reclaim progress (anon vs file) through a new struct > reclaim_progress passed out of try_to_free_pages(), and only count a > type's reclaimable pages if at least reclaim_progress_pct% was actually > reclaimed in the last cycle. I would rather detect what causes the lack of progress and implement a better fallback. > > The threshold is exposed as /proc/sys/vm/reclaim_progress_pct (default > 1, range 0-100). Setting 0 disables the gate and restores the previous > behaviour. Environments with only RAM-backed swap (zram) and small > memory may need a higher value to prevent futile anon LRU churn from > keeping the allocator spinning. > > Suggested-by: Johannes Weiner <hannes@cmpxchg.org> > Signed-off-by: Matt Fleming <mfleming@cloudflare.com> Thanks Barry
On Thu, Apr 16, 2026 at 09:44:55AM +0800, Barry Song wrote: > On Fri, Apr 10, 2026 at 6:16 PM Matt Fleming <matt@readmodwrite.com> wrote: > > > > From: Matt Fleming <mfleming@cloudflare.com> > > > > should_reclaim_retry() uses zone_reclaimable_pages() to estimate whether > > retrying reclaim could eventually satisfy an allocation. It's possible > > for reclaim to make minimal or no progress on an LRU type despite having > > ample reclaimable pages, e.g. anonymous pages when the only swap is > > RAM-backed (zram). This can cause the reclaim path to loop indefinitely. > > I am still struggling to understand when zram-backed > reclamation cannot make progress. Is it because zram is > full, or because folio_alloc_swap() fails? > > Or does zs_malloc() fail, causing pageout() to fail? > Even incompressible pages are still written as > ZRAM_HUGE pages and reclaimed successfully. We should have counters for these, right? > > > > > Track LRU reclaim progress (anon vs file) through a new struct > > reclaim_progress passed out of try_to_free_pages(), and only count a > > type's reclaimable pages if at least reclaim_progress_pct% was actually > > reclaimed in the last cycle. > > I would rather detect what causes the lack of progress > and implement a better fallback. This is a good question. I think we have appropriate counters in /proc/vmstat for cases where pages keep getting recycled in the LRUs instead of reclaim. Matt, do you see anything unexpected in /proc/vmstat?
On Thu, Apr 16, 2026 at 02:58:30PM -0700, Shakeel Butt wrote: > On Thu, Apr 16, 2026 at 09:44:55AM +0800, Barry Song wrote: > > > > I am still struggling to understand when zram-backed > > reclamation cannot make progress. Is it because zram is > > full, or because folio_alloc_swap() fails? > > > > Or does zs_malloc() fail, causing pageout() to fail? > > Even incompressible pages are still written as > > ZRAM_HUGE pages and reclaimed successfully. > > We should have counters for these, right? Let me try and provide some more data for this. It's hard to replicate on our production systems so I've resorted to creating a minimal Qemu repro that has 1GiB RAM and zram disk = 1GiB. The workload is a simple anon memory mapper that allocs 900MiB of memory and touches all pages for 60s. zs_malloc --------- None of the zs_malloc() calls failed and we made ~1.2M of them during the test. Here's a breakdown of allocation sizes: @hist_zs_malloc_size: [32, 64) 4831015 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [64, 128) 409 | | [128, 256) 1090 | | [256, 512) 2334 | | [512, 1K) 5069 | | [1K, 2K) 11174 | | [2K, 4K) 2395 | | [4K, 8K) 237 | | During direct reclaim only: @hist_zs_malloc_size_in_dr: [32, 64) 1268042 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [64, 128) 52 | | [128, 256) 149 | | [256, 512) 292 | | [512, 1K) 1234 | | [1K, 2K) 3539 | | [2K, 4K) 1156 | | [4K, 8K) 135 | | /sys/block/zram0/mm_stat -------------------------- before: 4096 74 12288 0 12288 0 0 0 0 after: 42622976 9412667 10985472 0 34131968 0 1962 0 237 trace_mm_vmscan_lru_shrink_inactive ----------------------------------- Anon LRU shrink events: 397,949 sum(args->nr_scanned): 11,837,216 sum(args->nr_reclaimed): 4,871,775 sum(args->nr_dirty): 0 sum(args->nr_writeback): 0 sum(args->nr_congested): 0 sum(args->nr_immediate): 0 sum(args->nr_ref_keep): 5,200,896 sum(args->nr_unmap_fail): 0 File LRU shrink events: 2,632 sum(args->nr_scanned): 26,048 sum(args->nr_reclaimed): 12,681 
sum(args->nr_dirty): 0 sum(args->nr_writeback): 0 sum(args->nr_congested): 0 sum(args->nr_immediate): 0 sum(args->nr_ref_keep): 476 sum(args->nr_unmap_fail): 0 > > I would rather detect what causes the lack of progress > > and implement a better fallback. > > This is a good question. I think we have appropriate counters in /proc/vmstat > for cases where pages keep getting recycled in the LRUs instead of reclaim. Here's the output of /proc/vmstat before and after the test runs. nr_free_pages 210,825 -> 206,742 (delta=-4,083) nr_free_pages_blocks 209,920 -> 65,536 (delta=-144,384) nr_zone_inactive_anon 1,685 -> 136 (delta=-1,549) nr_zone_active_anon 15 -> 3,774 (delta=3,759) nr_zone_inactive_file 329 -> 591 (delta=262) nr_zone_active_file 673 -> 504 (delta=-169) nr_zspages 3 -> 2,716 (delta=2,713) nr_inactive_anon 1,685 -> 136 (delta=-1,549) nr_active_anon 15 -> 3,774 (delta=3,759) nr_inactive_file 329 -> 591 (delta=262) nr_active_file 673 -> 504 (delta=-169) nr_slab_reclaimable 1,352 -> 2,037 (delta=685) nr_slab_unreclaimable 9,581 -> 11,689 (delta=2,108) nr_anon_pages 1,526 -> 262 (delta=-1,264) nr_mapped 912 -> 442 (delta=-470) nr_file_pages 1,132 -> 4,760 (delta=3,628) nr_shmem 162 -> 3,608 (delta=3,446) nr_swapcached 0 -> 19 (delta=19) nr_vmscan_write 0 -> 4,872,846 (delta=4,872,846) nr_written 1 -> 4,853,727 (delta=4,853,726) pgpgin 1,200 -> 19,035,312 (delta=19,034,112) pgpgout 4 -> 19,414,908 (delta=19,414,904) pswpin 0 -> 4,758,528 (delta=4,758,528) pswpout 0 -> 4,853,726 (delta=4,853,726) pgalloc_dma 32 -> 84,262 (delta=84,230) pgalloc_dma32 45,989 -> 5,095,307 (delta=5,049,318) pgfree 269,896 -> 5,415,629 (delta=5,145,733) pgactivate 2,820 -> 14,490 (delta=11,670) pgdeactivate 10 -> 10,924 (delta=10,914) pgfault 29,321 -> 5,088,427 (delta=5,059,106) pgmajfault 3,750 -> 4,794,781 (delta=4,791,031) pgrefill 0 -> 13,733 (delta=13,733) pgreuse 3,333 -> 5,852 (delta=2,519) pgsteal_kswapd 0 -> 3,605,552 (delta=3,605,552) pgsteal_direct 0 -> 1,280,091 
(delta=1,280,091) pgscan_kswapd 0 -> 6,579,240 (delta=6,579,240) pgscan_direct 0 -> 5,290,778 (delta=5,290,778) pgscan_anon 0 -> 11,843,970 (delta=11,843,970) pgscan_file 0 -> 26,048 (delta=26,048) pgsteal_anon 0 -> 4,872,962 (delta=4,872,962) pgsteal_file 0 -> 12,681 (delta=12,681) allocstall_normal 0 -> 110 (delta=110) allocstall_movable 0 -> 32,088 (delta=32,088) oom_kill 0 -> 0 (delta=0) workingset_nodes 0 -> 302 (delta=302) workingset_refault_anon 0 -> 4,777,591 (delta=4,777,591) workingset_refault_file 0 -> 870 (delta=870) workingset_activate_anon 0 -> 487 (delta=487) kswapd_low_wmark_hit_quickly 0 -> 35 (delta=35) kswapd_high_wmark_hit_quickly 0 -> 99 (delta=99) pageoutrun 0 -> 135 (delta=135) pgmigrate_success 0 -> 21,317 (delta=21,317) compact_migrate_scanned 0 -> 98,848 (delta=98,848) compact_free_scanned 0 -> 136,667 (delta=136,667) swpin_zero 0 -> 19,069 (delta=19,069) swpout_zero 0 -> 19,120 (delta=19,120) swap_ra 0 -> 63 (delta=63) swap_ra_hit 0 -> 26 (delta=26) Happy to do any other tests or pull any other data for you to help. Thanks, Matt
On Fri, Apr 10, 2026 at 11:15:49AM +0100, Matt Fleming wrote: > From: Matt Fleming <mfleming@cloudflare.com> > > should_reclaim_retry() uses zone_reclaimable_pages() to estimate whether > retrying reclaim could eventually satisfy an allocation. It's possible > for reclaim to make minimal or no progress on an LRU type despite having > ample reclaimable pages, e.g. anonymous pages when the only swap is > RAM-backed (zram). Or incompressible memory on zswap with writeback disabled or overcommitted memory.min. > This can cause the reclaim path to loop indefinitely. > > Track LRU reclaim progress (anon vs file) through a new struct > reclaim_progress passed out of try_to_free_pages(), and only count a > type's reclaimable pages if at least reclaim_progress_pct% was actually > reclaimed in the last cycle. > > The threshold is exposed as /proc/sys/vm/reclaim_progress_pct (default > 1, range 0-100). Let's not expose any sysctl or user visible API for this heuristic. It will evolve and then this interface would be awkward and hard to remove. > Setting 0 disables the gate and restores the previous > behaviour. Environments with only RAM-backed swap (zram) and small > memory may need a higher value to prevent futile anon LRU churn from > keeping the allocator spinning. > > Suggested-by: Johannes Weiner <hannes@cmpxchg.org> > Signed-off-by: Matt Fleming <mfleming@cloudflare.com> > --- [...] > > @@ -4637,7 +4672,24 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, > !__cpuset_zone_allowed(zone, gfp_mask)) > continue; > > - available = reclaimable = zone_reclaimable_pages(zone); > + /* > + * Only count reclaimable pages from an LRU type if reclaim > + * actually made headway on that type in the last cycle. > + * This prevents the allocator from looping endlessly on > + * account of a large pool of pages that reclaim cannot make > + * progress on, e.g. anonymous pages when the only swap is > + * RAM-backed (zram). 
> + */ > + reclaimable = 0; > + reclaimable_file = zone_reclaimable_file_pages(zone); > + reclaimable_anon = zone_reclaimable_anon_pages(zone); Here we are getting the current reclaimable pages. > + > + if (reclaim_progress_sufficient(progress->nr_file, reclaimable_file)) > + reclaimable += reclaimable_file; > + if (reclaim_progress_sufficient(progress->nr_anon, reclaimable_anon)) > + reclaimable += reclaimable_anon; And here we are comparing the current reclaimable pages with the last iteration. Is this intentional to keep things simple? > + > + available = reclaimable; > available += zone_page_state_snapshot(zone, NR_FREE_PAGES); > Another heuristic we can play with is to also pass through the vmscan scan count. If, for a couple of consecutive iterations, we continue to see low reclaim efficiency, go for OOM. Also maybe compare the scan count with the watermark, as I expect we don't see much difference in scan count across consecutive reclaim iterations, so it is a good representation of reclaimable memory. The reclaim efficiency heuristic should handle the swap-on-zram or incomp-zswap-with-no-writeback cases. Treating scan count as a proxy for reclaimable memory should handle the overcommitted memory.min case.
On Wed, Apr 15, 2026 at 06:01:54PM -0700, Shakeel Butt wrote: > On Fri, Apr 10, 2026 at 11:15:49AM +0100, Matt Fleming wrote: > > > > @@ -4637,7 +4672,24 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, > > !__cpuset_zone_allowed(zone, gfp_mask)) > > continue; > > > > - available = reclaimable = zone_reclaimable_pages(zone); > > + /* > > + * Only count reclaimable pages from an LRU type if reclaim > > + * actually made headway on that type in the last cycle. > > + * This prevents the allocator from looping endlessly on > > + * account of a large pool of pages that reclaim cannot make > > + * progress on, e.g. anonymous pages when the only swap is > > + * RAM-backed (zram). > > + */ > > + reclaimable = 0; > > + reclaimable_file = zone_reclaimable_file_pages(zone); > > + reclaimable_anon = zone_reclaimable_anon_pages(zone); > > Here we are getting the current reclaimable pages. > > > + > > + if (reclaim_progress_sufficient(progress->nr_file, reclaimable_file)) > > + reclaimable += reclaimable_file; > > + if (reclaim_progress_sufficient(progress->nr_anon, reclaimable_anon)) > > + reclaimable += reclaimable_anon; > > And here we are comparing the current reclaimable pages with last iteration. Is > this intentional to keep things simple? Yep, that was the intent. > > + > > + available = reclaimable; > > available += zone_page_state_snapshot(zone, NR_FREE_PAGES); > > > > Another heuristic we can play with is to also pass through the vmscan scan > count. If for couple of consecutive iterations, we continue to see low reclaim > efficiency, go for OOM. Also maybe compare the scan count with the watermark as > I expect we don't see much difference scan count for consecutive reclaim > iteration, so, it is a good representative of reclaimable memory. > > The reclaim efficiency heuristic should handle the swap-on-zram or > incomp-zswap-with-no-writeback. Treating scan count as proxy for reclaimable > memory should handle the overcommitted memory.min case. Nice. 
I'll take a look at this.
On Fri, Apr 10, 2026 at 11:15:49AM +0100, Matt Fleming wrote: > From: Matt Fleming <mfleming@cloudflare.com> > > should_reclaim_retry() uses zone_reclaimable_pages() to estimate whether > retrying reclaim could eventually satisfy an allocation. It's possible > for reclaim to make minimal or no progress on an LRU type despite having > ample reclaimable pages, e.g. anonymous pages when the only swap is > RAM-backed (zram). This can cause the reclaim path to loop indefinitely. > > Track LRU reclaim progress (anon vs file) through a new struct > reclaim_progress passed out of try_to_free_pages(), and only count a > type's reclaimable pages if at least reclaim_progress_pct% was actually > reclaimed in the last cycle. I think there is at least one problem with this heuristic: you are counting everything that hasn't made progress as "we cannot reclaim it". When in reality you can simply fail to make progress on any given folio as e.g it's referenced and we want to give it another spin in the LRU. My theory (from merely reading the patch, maybe I missed something) is that a pathological case for this is a lot of folios added to the LRU in a row, that are set referenced (or dirty). Say SWAP_CLUSTER_MAX * MAX_RECLAIM_RETRIES - it will simply OOM too early. The other question is whether this effectively solves reclaim problems - some hard numbers would be great. -- Pedro
On Wed, Apr 15, 2026 at 03:57:25PM +0100, Pedro Falcato wrote: > On Fri, Apr 10, 2026 at 11:15:49AM +0100, Matt Fleming wrote: > > From: Matt Fleming <mfleming@cloudflare.com> > > > > should_reclaim_retry() uses zone_reclaimable_pages() to estimate whether > > retrying reclaim could eventually satisfy an allocation. It's possible > > for reclaim to make minimal or no progress on an LRU type despite having > > ample reclaimable pages, e.g. anonymous pages when the only swap is > > RAM-backed (zram). This can cause the reclaim path to loop indefinitely. > > > > Track LRU reclaim progress (anon vs file) through a new struct > > reclaim_progress passed out of try_to_free_pages(), and only count a > > type's reclaimable pages if at least reclaim_progress_pct% was actually > > reclaimed in the last cycle. > > I think there is at least one problem with this heuristic: you are counting > everything that hasn't made progress as "we cannot reclaim it". When in reality > you can simply fail to make progress on any given folio as e.g it's referenced > and we want to give it another spin in the LRU. The intention was that the percentage threshold would avoid giving up on reclaim as long as "sufficient" progress was made. This should allow for some folios to need another trip through the LRU but... > My theory (from merely reading the patch, maybe I missed something) is that > a pathological case for this is a lot of folios added to the LRU in a row, > that are set referenced (or dirty). Say SWAP_CLUSTER_MAX * MAX_RECLAIM_RETRIES > - it will simply OOM too early. OK yeah I think I see the problem now: this heuristic applies the threshold against all reclaimable pages but that falls apart when doing SWAP_CLUSTER_MAX chunks of reclaim. > The other question is whether this effectively solves reclaim problems - some > hard numbers would be great. I shared some numbers in my reply to Vlastimil, but if there are other cases you'd like measured I'm happy to run them.
On Thu, Apr 16, 2026 at 03:51:04PM +0100, Matt Fleming wrote: > On Wed, Apr 15, 2026 at 03:57:25PM +0100, Pedro Falcato wrote: [...] > > > My theory (from merely reading the patch, maybe I missed something) is that > > a pathological case for this is a lot of folios added to the LRU in a row, > > that are set referenced (or dirty). Say SWAP_CLUSTER_MAX * MAX_RECLAIM_RETRIES > > - it will simply OOM too early. > > OK yeah I think I see the problem now: this heuristic applies the > threshold against all reclaimable pages but that falls apart when doing > SWAP_CLUSTER_MAX chunks of reclaim. I am not sure I understand the pathological case. Yes, SWAP_CLUSTER_MAX is the requested number of pages to reclaim, but the kernel can potentially scan all of memory twice to reclaim that amount. Those reclaimed pages can still get stolen, but that can already happen today, before this patch.
On Thu, Apr 16, 2026 at 02:49:28PM -0700, Shakeel Butt wrote: > On Thu, Apr 16, 2026 at 03:51:04PM +0100, Matt Fleming wrote: > > On Wed, Apr 15, 2026 at 03:57:25PM +0100, Pedro Falcato wrote: > [...] > > > > > My theory (from merely reading the patch, maybe I missed something) is that > > > a pathological case for this is a lot of folios added to the LRU in a row, > > > that are set referenced (or dirty). Say SWAP_CLUSTER_MAX * MAX_RECLAIM_RETRIES > > > - it will simply OOM too early. > > > > OK yeah I think I see the problem now: this heuristic applies the > > threshold against all reclaimable pages but that falls apart when doing > > SWAP_CLUSTER_MAX chunks of reclaim. > > I am not sure I understand the pathological case. Yes SWAP_CLUSTER_MAX is > requested amount of pages to reclaim but the kernel can potentially scan full > memory twice to reclaim that much amount. Though those reclaimed pages can get > stolen but that can still happen today before this patch. I see, yes, you are totally correct. Had a look at the vmscan code again and just realized I had missed some details. Matt, please disregard :) -- Pedro
On 4/10/26 12:15, Matt Fleming wrote: > From: Matt Fleming <mfleming@cloudflare.com> > > should_reclaim_retry() uses zone_reclaimable_pages() to estimate whether > retrying reclaim could eventually satisfy an allocation. It's possible > for reclaim to make minimal or no progress on an LRU type despite having > ample reclaimable pages, e.g. anonymous pages when the only swap is > RAM-backed (zram). This can cause the reclaim path to loop indefinitely. > > Track LRU reclaim progress (anon vs file) through a new struct > reclaim_progress passed out of try_to_free_pages(), and only count a > type's reclaimable pages if at least reclaim_progress_pct% was actually > reclaimed in the last cycle. > > The threshold is exposed as /proc/sys/vm/reclaim_progress_pct (default > 1, range 0-100). Setting 0 disables the gate and restores the previous > behaviour. Environments with only RAM-backed swap (zram) and small > memory may need a higher value to prevent futile anon LRU churn from > keeping the allocator spinning. > > Suggested-by: Johannes Weiner <hannes@cmpxchg.org> > Signed-off-by: Matt Fleming <mfleming@cloudflare.com> Hi Matt, so have you tested it for your usecase with zram and have any observations how it helped, what values did you set etc? Vlastimil
On Mon, Apr 13, 2026 at 05:38:19PM +0200, Vlastimil Babka (SUSE) wrote:
>
> Hi Matt,
>
> so have you tested it for your usecase with zram and have any observations
> how it helped, what values did you set etc?
Hey Vlastimil,
Yeah I've tested this out. So far, results have been positive -- I see
system-wide OOM kills when memory is low and direct reclaim occurs, but
not so many OOM kills that the SRE folks have started screaming at me.
I've only run with the proposed 1% value so far. I also ran a bunch of
benchmarks alongside a memory hogging app that periodically touches
anonymous memory.
Workload rpp=0 rpp=1 Notes
----------------------------------------------------------------------------------------------
Kernel compile + anon hog Completed, no OOM Completed, Global OOM confirmed from
Global OOM fired __alloc_pages_slowpath
Memcached + anon hog 282k / 2.30M ops/s 562k / 3.53M ops/s Global OOM killed hog,
No OOM Global OOM fired then benchmark ran faster
Pure fio (5 reruns each) median 3710 MiB/s median 3702 MiB/s No reproducible regression
Mixed fio + anon hog 2747 MiB/s 2915 MiB/s Global OOM killed
unrelated services
reclaim_progress_pct=1 seems to help in these memory exhausted
situations, and doesn't appear to cause a regression for the pure file
workload case.
If you have any suggestions for other tests or benchmarks to run I'd be
happy to do that.
Thanks,
Matt
On 4/15/26 11:11, Matt Fleming wrote: > On Mon, Apr 13, 2026 at 05:38:19PM +0200, Vlastimil Babka (SUSE) wrote: >> >> Hi Matt, >> >> so have you tested it for your usecase with zram and have any observations >> how it helped, what values did you set etc? > > Hey Vlastimil, > > Yeah I've tested this out. So far, results have been positive -- I see > system-wide OOM kills when memory is low and direct reclaim occurs, but > not so many OOM kills that the SRE folks have started screaming at me. Hmm... > I've only run with the proposed 1% value so far. I also ran a bunch of > benchmarks alongside a memory hogging app that periodically touches > anonymous memory. > > Workload rpp=0 rpp=1 Notes > ---------------------------------------------------------------------------------------------- > Kernel compile + anon hog Completed, no OOM Completed, Global OOM confirmed from > Global OOM fired __alloc_pages_slowpath Completed in both cases... but was it faster? Also what got OOM killed, the hog? > > Memcached + anon hog 282k / 2.30M ops/s 562k / 3.53M ops/s Global OOM killed hog, > No OOM Global OOM fired then benchmark ran faster The improvement is nice. However even in the rpp=0 case there didn't seem to have been a thrashing so bad the system wouldn't recover. I think this is minimally an argument against having it enabled by default, as by default we don't want to cause premature OOMs if the system is still working (And yes, we do have problems to recognize when it's not working, and actually doing OOM). But these tradeoffs for killing something to get better throughput on something else are good for certain kinds of servers/workloads but not as a default. And once you go that way then you might be better off looking at the PSI metrics that would be more holistic than this heuristic?
> Pure fio (5 reruns each) median 3710 MiB/s median 3702 MiB/s No reproducible regression > Mixed fio + anon hog 2747 MiB/s 2915 MiB/s Global OOM killed > unrelated services > > reclaim_progress_pct=1 seems to help in these memory exhausted > situations, and doesn't appear to cause a regression for the pure file > workload case. > > If you have any suggestions for other tests or benchmarks to run I'd be > happy to do that. > > Thanks, > Matt
© 2016 - 2026 Red Hat, Inc.