[v1] mm: Require LRU reclaim progress before retrying direct reclaim

[PATCH] mm: Require LRU reclaim progress before retrying direct reclaim

Posted by Matt Fleming 2 months ago

From: Matt Fleming <mfleming@cloudflare.com>

should_reclaim_retry() uses zone_reclaimable_pages() to estimate whether
retrying reclaim could eventually satisfy an allocation. It's possible
for reclaim to make minimal or no progress on an LRU type despite having
ample reclaimable pages, e.g. anonymous pages when the only swap is
RAM-backed (zram). This can cause the reclaim path to loop indefinitely.

Track LRU reclaim progress (anon vs file) through a new struct
reclaim_progress passed out of try_to_free_pages(), and only count a
type's reclaimable pages if at least reclaim_progress_pct% was actually
reclaimed in the last cycle.

The threshold is exposed as /proc/sys/vm/reclaim_progress_pct (default
1, range 0-100). Setting 0 disables the gate and restores the previous
behaviour. Environments with only RAM-backed swap (zram) and small
memory may need a higher value to prevent futile anon LRU churn from
keeping the allocator spinning.

Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Matt Fleming <mfleming@cloudflare.com>
---
 include/linux/swap.h |  13 +++++-
 mm/page_alloc.c      | 101 +++++++++++++++++++++++++++++++++++--------
 mm/vmscan.c          |  72 ++++++++++++++++++++++--------
 3 files changed, 146 insertions(+), 40 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 62fc7499b408..d46477365cd9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -368,9 +368,18 @@ void folio_mark_lazyfree(struct folio *folio);
 extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
+struct reclaim_progress {
+	unsigned long nr_reclaimed;
+	unsigned long nr_anon;
+	unsigned long nr_file;
+};
+
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
-extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
-					gfp_t gfp_mask, nodemask_t *mask);
+extern unsigned long zone_reclaimable_file_pages(struct zone *zone);
+extern unsigned long zone_reclaimable_anon_pages(struct zone *zone);
+extern void try_to_free_pages(struct zonelist *zonelist, int order,
+			      gfp_t gfp_mask, nodemask_t *mask,
+			      struct reclaim_progress *progress);
 
 #define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
 #define MEMCG_RECLAIM_PROACTIVE (1 << 2)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2d4b6f1a554e..0f2597542ace 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4407,12 +4407,11 @@ static unsigned int check_retry_zonelist(unsigned int seq)
 }
 
 /* Perform direct synchronous page reclaim */
-static unsigned long
-__perform_reclaim(gfp_t gfp_mask, unsigned int order,
-					const struct alloc_context *ac)
+static void __perform_reclaim(gfp_t gfp_mask, unsigned int order,
+			      const struct alloc_context *ac,
+			      struct reclaim_progress *progress)
 {
 	unsigned int noreclaim_flag;
-	unsigned long progress;
 
 	cond_resched();
 
@@ -4421,30 +4420,27 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 	fs_reclaim_acquire(gfp_mask);
 	noreclaim_flag = memalloc_noreclaim_save();
 
-	progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
-								ac->nodemask);
+	try_to_free_pages(ac->zonelist, order, gfp_mask, ac->nodemask, progress);
 
 	memalloc_noreclaim_restore(noreclaim_flag);
 	fs_reclaim_release(gfp_mask);
 
 	cond_resched();
-
-	return progress;
 }
 
 /* The really slow allocator path where we enter direct reclaim */
 static inline struct page *
 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 		unsigned int alloc_flags, const struct alloc_context *ac,
-		unsigned long *did_some_progress)
+		struct reclaim_progress *progress)
 {
 	struct page *page = NULL;
 	unsigned long pflags;
 	bool drained = false;
 
 	psi_memstall_enter(&pflags);
-	*did_some_progress = __perform_reclaim(gfp_mask, order, ac);
-	if (unlikely(!(*did_some_progress)))
+	__perform_reclaim(gfp_mask, order, ac, progress);
+	if (unlikely(!progress->nr_reclaimed))
 		goto out;
 
 retry:
@@ -4586,6 +4582,41 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 	return !!__gfp_pfmemalloc_flags(gfp_mask);
 }
 
+/*
+ * Minimum percentage of LRU reclaimable pages that must have been
+ * reclaimed in the last cycle for that type to be counted towards the
+ * "can we satisfy this allocation?" watermark check in
+ * should_reclaim_retry().
+ *
+ * This prevents systems with only RAM-backed swap (zram) from
+ * endlessly retrying reclaim for anon pages when minimal progress is
+ * made despite seemingly having lots of reclaimable pages.
+ *
+ * Setting this to 0 disables the per-LRU progress check: all
+ * reclaimable pages are always counted towards watermark.
+ */
+static int reclaim_progress_pct __read_mostly = 1;
+
+/*
+ * Return true if reclaim for this LRU type made at least
+ * reclaim_progress_pct% progress in the last cycle or the LRU progress
+ * check is disabled.
+ */
+static inline bool reclaim_progress_sufficient(unsigned long reclaimed,
+					       unsigned long reclaimable)
+{
+	unsigned long threshold;
+
+	if (!reclaim_progress_pct)
+		return true;
+
+	if (!reclaimable)
+		return false;
+
+	threshold = DIV_ROUND_UP(reclaimable * reclaim_progress_pct, 100);
+	return reclaimed >= threshold;
+}
+
 /*
  * Checks whether it makes sense to retry the reclaim to make a forward progress
  * for the given allocation request.
@@ -4599,11 +4630,13 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 static inline bool
 should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 		     struct alloc_context *ac, int alloc_flags,
-		     bool did_some_progress, int *no_progress_loops)
+		     struct reclaim_progress *progress,
+		     int *no_progress_loops)
 {
 	struct zone *zone;
 	struct zoneref *z;
 	bool ret = false;
+	bool did_some_progress = progress->nr_reclaimed > 0;
 
 	/*
 	 * Costly allocations might have made a progress but this doesn't mean
@@ -4629,6 +4662,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 				ac->highest_zoneidx, ac->nodemask) {
 		unsigned long available;
 		unsigned long reclaimable;
+		unsigned long reclaimable_anon;
+		unsigned long reclaimable_file;
 		unsigned long min_wmark = min_wmark_pages(zone);
 		bool wmark;
 
@@ -4637,7 +4672,24 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 			!__cpuset_zone_allowed(zone, gfp_mask))
 				continue;
 
-		available = reclaimable = zone_reclaimable_pages(zone);
+		/*
+		 * Only count reclaimable pages from an LRU type if reclaim
+		 * actually made headway on that type in the last cycle.
+		 * This prevents the allocator from looping endlessly on
+		 * account of a large pool of pages that reclaim cannot make
+		 * progress on, e.g. anonymous pages when the only swap is
+		 * RAM-backed (zram).
+		 */
+		reclaimable = 0;
+		reclaimable_file = zone_reclaimable_file_pages(zone);
+		reclaimable_anon = zone_reclaimable_anon_pages(zone);
+
+		if (reclaim_progress_sufficient(progress->nr_file, reclaimable_file))
+			reclaimable += reclaimable_file;
+		if (reclaim_progress_sufficient(progress->nr_anon, reclaimable_anon))
+			reclaimable += reclaimable_anon;
+
+		available = reclaimable;
 		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
 
 		/*
@@ -4716,7 +4768,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
 	struct page *page = NULL;
 	unsigned int alloc_flags;
-	unsigned long did_some_progress;
+	struct reclaim_progress reclaim_progress = {};
+	unsigned long oom_progress;
 	enum compact_priority compact_priority;
 	enum compact_result compact_result;
 	int compaction_retries;
@@ -4727,6 +4780,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	bool compact_first = false;
 	bool can_retry_reserves = true;
 
+
 	if (unlikely(nofail)) {
 		/*
 		 * Also we don't support __GFP_NOFAIL without __GFP_DIRECT_RECLAIM,
@@ -4844,7 +4898,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	/* Try direct reclaim and then allocating */
 	if (!compact_first) {
 		page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags,
-							ac, &did_some_progress);
+						ac, &reclaim_progress);
 		if (page)
 			goto got_pg;
 	}
@@ -4904,7 +4958,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto restart;
 
 	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
-				 did_some_progress > 0, &no_progress_loops))
+				 &reclaim_progress, &no_progress_loops))
 		goto retry;
 
 	/*
@@ -4913,7 +4967,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 * implementation of the compaction depends on the sufficient amount
 	 * of free memory (see __compaction_suitable)
 	 */
-	if (did_some_progress > 0 && can_compact &&
+	if (reclaim_progress.nr_reclaimed > 0 && can_compact &&
 			should_compact_retry(ac, order, alloc_flags,
 				compact_result, &compact_priority,
 				&compaction_retries))
@@ -4934,7 +4988,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto restart;
 
 	/* Reclaim has failed us, start killing things */
-	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
+	page = __alloc_pages_may_oom(gfp_mask, order, ac, &oom_progress);
 	if (page)
 		goto got_pg;
 
@@ -4945,7 +4999,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto nopage;
 
 	/* Retry as long as the OOM killer is making progress */
-	if (did_some_progress) {
+	if (oom_progress) {
 		no_progress_loops = 0;
 		goto retry;
 	}
@@ -6775,6 +6829,15 @@ static const struct ctl_table page_alloc_sysctl_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_ONE,
 	},
+	{
+		.procname	= "reclaim_progress_pct",
+		.data		= &reclaim_progress_pct,
+		.maxlen		= sizeof(reclaim_progress_pct),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE_HUNDRED,
+	},
 	{
 		.procname	= "percpu_pagelist_high_fraction",
 		.data		= &percpu_pagelist_high_fraction,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0fc9373e8251..9087b4e0a704 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -167,6 +167,10 @@ struct scan_control {
 	/* Number of pages freed so far during a call to shrink_zones() */
 	unsigned long nr_reclaimed;
 
+	/* Anon/file LRU contributions to nr_reclaimed */
+	unsigned long nr_reclaimed_anon;
+	unsigned long nr_reclaimed_file;
+
 	struct {
 		unsigned int dirty;
 		unsigned int unqueued_dirty;
@@ -385,6 +389,21 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
 	return can_demote(nid, sc, memcg);
 }
 
+unsigned long zone_reclaimable_file_pages(struct zone *zone)
+{
+	return zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
+		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
+}
+
+unsigned long zone_reclaimable_anon_pages(struct zone *zone)
+{
+	if (!can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
+		return 0;
+
+	return zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
+		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
+}
+
 /*
  * This misses isolated folios which are not accounted for to save counters.
  * As the data only determines if reclaim or compaction continues, it is
@@ -392,15 +411,8 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
  */
 unsigned long zone_reclaimable_pages(struct zone *zone)
 {
-	unsigned long nr;
-
-	nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
-		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
-	if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
-		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
-			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
-
-	return nr;
+	return zone_reclaimable_file_pages(zone) +
+		zone_reclaimable_anon_pages(zone);
 }
 
 /**
@@ -4718,6 +4730,10 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
 	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg);
 	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
 	sc->nr_reclaimed += reclaimed;
+	if (type)
+		sc->nr_reclaimed_file += reclaimed;
+	else
+		sc->nr_reclaimed_anon += reclaimed;
 	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
 			scanned, reclaimed, &stat, sc->priority,
 			type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
@@ -5776,6 +5792,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	unsigned long nr_to_scan;
 	enum lru_list lru;
 	unsigned long nr_reclaimed = 0;
+	unsigned long nr_reclaimed_anon = 0;
+	unsigned long nr_reclaimed_file = 0;
 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
 	bool proportional_reclaim;
 	struct blk_plug plug;
@@ -5812,11 +5830,18 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 
 		for_each_evictable_lru(lru) {
 			if (nr[lru]) {
+				unsigned long reclaimed;
+
 				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
 				nr[lru] -= nr_to_scan;
 
-				nr_reclaimed += shrink_list(lru, nr_to_scan,
-							    lruvec, sc);
+				reclaimed = shrink_list(lru, nr_to_scan,
+							lruvec, sc);
+				nr_reclaimed += reclaimed;
+				if (is_file_lru(lru))
+					nr_reclaimed_file += reclaimed;
+				else
+					nr_reclaimed_anon += reclaimed;
 			}
 		}
 
@@ -5876,6 +5901,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	}
 	blk_finish_plug(&plug);
 	sc->nr_reclaimed += nr_reclaimed;
+	sc->nr_reclaimed_anon += nr_reclaimed_anon;
+	sc->nr_reclaimed_file += nr_reclaimed_file;
 
 	/*
 	 * Even if we did not try to evict anon pages at all, we want to
@@ -6563,8 +6590,9 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 	return false;
 }
 
-unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
-				gfp_t gfp_mask, nodemask_t *nodemask)
+void try_to_free_pages(struct zonelist *zonelist, int order,
+		       gfp_t gfp_mask, nodemask_t *nodemask,
+		       struct reclaim_progress *progress)
 {
 	unsigned long nr_reclaimed;
 	struct scan_control sc = {
@@ -6588,12 +6616,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 	BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
 
 	/*
-	 * Do not enter reclaim if fatal signal was delivered while throttled.
-	 * 1 is returned so that the page allocator does not OOM kill at this
-	 * point.
+	 * Do not enter reclaim if fatal signal was delivered while
+	 * throttled. nr_reclaimed is set to 1 so that the page
+	 * allocator does not OOM kill at this point.
 	 */
-	if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
-		return 1;
+	if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) {
+		nr_reclaimed = 1;
+		goto out;
+	}
 
 	set_task_reclaim_state(current, &sc.reclaim_state);
 	trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
@@ -6603,7 +6633,11 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
 	set_task_reclaim_state(current, NULL);
 
-	return nr_reclaimed;
+	progress->nr_anon = sc.nr_reclaimed_anon;
+	progress->nr_file = sc.nr_reclaimed_file;
+
+out:
+	progress->nr_reclaimed = nr_reclaimed;
 }
 
 #ifdef CONFIG_MEMCG
-- 
2.43.0

Re: [PATCH] mm: Require LRU reclaim progress before retrying direct reclaim

Posted by Barry Song 2 months ago

On Fri, Apr 10, 2026 at 6:16 PM Matt Fleming <matt@readmodwrite.com> wrote:
>
> From: Matt Fleming <mfleming@cloudflare.com>
>
> should_reclaim_retry() uses zone_reclaimable_pages() to estimate whether
> retrying reclaim could eventually satisfy an allocation. It's possible
> for reclaim to make minimal or no progress on an LRU type despite having
> ample reclaimable pages, e.g. anonymous pages when the only swap is
> RAM-backed (zram). This can cause the reclaim path to loop indefinitely.

I am still struggling to understand when zram-backed
reclamation cannot make progress. Is it because zram is
full, or because folio_alloc_swap() fails?

Or does zs_malloc() fail, causing pageout() to fail?
Even incompressible pages are still written as
ZRAM_HUGE pages and reclaimed successfully.

>
> Track LRU reclaim progress (anon vs file) through a new struct
> reclaim_progress passed out of try_to_free_pages(), and only count a
> type's reclaimable pages if at least reclaim_progress_pct% was actually
> reclaimed in the last cycle.

I would rather detect what causes the lack of progress
and implement a better fallback.

>
> The threshold is exposed as /proc/sys/vm/reclaim_progress_pct (default
> 1, range 0-100). Setting 0 disables the gate and restores the previous
> behaviour. Environments with only RAM-backed swap (zram) and small
> memory may need a higher value to prevent futile anon LRU churn from
> keeping the allocator spinning.
>
> Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Matt Fleming <mfleming@cloudflare.com>

Thanks
Barry

Re: [PATCH] mm: Require LRU reclaim progress before retrying direct reclaim

Posted by Shakeel Butt 1 month, 4 weeks ago

On Thu, Apr 16, 2026 at 09:44:55AM +0800, Barry Song wrote:
> On Fri, Apr 10, 2026 at 6:16 PM Matt Fleming <matt@readmodwrite.com> wrote:
> >
> > From: Matt Fleming <mfleming@cloudflare.com>
> >
> > should_reclaim_retry() uses zone_reclaimable_pages() to estimate whether
> > retrying reclaim could eventually satisfy an allocation. It's possible
> > for reclaim to make minimal or no progress on an LRU type despite having
> > ample reclaimable pages, e.g. anonymous pages when the only swap is
> > RAM-backed (zram). This can cause the reclaim path to loop indefinitely.
> 
> I am still struggling to understand when zram-backed
> reclamation cannot make progress. Is it because zram is
> full, or because folio_alloc_swap() fails?
> 
> Or does zs_malloc() fail, causing pageout() to fail?
> Even incompressible pages are still written as
> ZRAM_HUGE pages and reclaimed successfully.

We should have counters for these, right?

> 
> >
> > Track LRU reclaim progress (anon vs file) through a new struct
> > reclaim_progress passed out of try_to_free_pages(), and only count a
> > type's reclaimable pages if at least reclaim_progress_pct% was actually
> > reclaimed in the last cycle.
> 
> I would rather detect what causes the lack of progress
> and implement a better fallback.

This is a good question. I think we have appropriate counters in /proc/vmstat
for cases where pages keep getting recycled in the LRUs instead of being
reclaimed.

Matt, do you see anything unexpected in /proc/vmstat?

Re: [PATCH] mm: Require LRU reclaim progress before retrying direct reclaim

Posted by Matt Fleming 1 month, 3 weeks ago

On Thu, Apr 16, 2026 at 02:58:30PM -0700, Shakeel Butt wrote:
> On Thu, Apr 16, 2026 at 09:44:55AM +0800, Barry Song wrote:
> > 
> > I am still struggling to understand when zram-backed
> > reclamation cannot make progress. Is it because zram is
> > full, or because folio_alloc_swap() fails?
> > 
> > Or does zs_malloc() fail, causing pageout() to fail?
> > Even incompressible pages are still written as
> > ZRAM_HUGE pages and reclaimed successfully.
> 
> We should have counters for these, right?
 
Let me try to provide some more data for this. It's hard to replicate
on our production systems, so I've resorted to creating a minimal QEMU
repro that has 1GiB of RAM and a 1GiB zram disk. The workload is a simple
anon memory mapper that allocates 900MiB of memory and touches all pages
for 60s.

zs_malloc
---------
None of the zs_malloc() calls failed and we made ~1.2M of them during
the test. Here's a breakdown of allocation sizes:

@hist_zs_malloc_size: 
[32, 64)         4831015 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[64, 128)            409 |                                                    |
[128, 256)          1090 |                                                    |
[256, 512)          2334 |                                                    |
[512, 1K)           5069 |                                                    |
[1K, 2K)           11174 |                                                    |
[2K, 4K)            2395 |                                                    |
[4K, 8K)             237 |                                                    |

During direct reclaim only:
@hist_zs_malloc_size_in_dr: 
[32, 64)         1268042 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[64, 128)             52 |                                                    |
[128, 256)           149 |                                                    |
[256, 512)           292 |                                                    |
[512, 1K)           1234 |                                                    |
[1K, 2K)            3539 |                                                    |
[2K, 4K)            1156 |                                                    |
[4K, 8K)             135 |                                                    |


/sys/block/zram0/mm_stat
--------------------------
before: 4096       74    12288        0    12288        0        0        0        0
after:  42622976  9412667 10985472        0 34131968        0     1962        0      237


trace_mm_vmscan_lru_shrink_inactive
-----------------------------------
Anon LRU shrink events:                 397,949
  sum(args->nr_scanned):                11,837,216
  sum(args->nr_reclaimed):              4,871,775
  sum(args->nr_dirty):                  0
  sum(args->nr_writeback):              0
  sum(args->nr_congested):              0
  sum(args->nr_immediate):              0
  sum(args->nr_ref_keep):               5,200,896
  sum(args->nr_unmap_fail):             0

File LRU shrink events:                 2,632
  sum(args->nr_scanned):                26,048
  sum(args->nr_reclaimed):              12,681
  sum(args->nr_dirty):                  0
  sum(args->nr_writeback):              0
  sum(args->nr_congested):              0
  sum(args->nr_immediate):              0
  sum(args->nr_ref_keep):               476
  sum(args->nr_unmap_fail):             0


> > I would rather detect what causes the lack of progress
> > and implement a better fallback.
> 
> This is a good question. I think we have appropriate counters in /proc/vmstat
> for cases where pages keep getting recycled in the LRUs instead of reclaim.

Here's the output of /proc/vmstat before and after the test runs.

nr_free_pages                             210,825 ->       206,742  (delta=-4,083)
nr_free_pages_blocks                      209,920 ->        65,536  (delta=-144,384)
nr_zone_inactive_anon                       1,685 ->           136  (delta=-1,549)
nr_zone_active_anon                            15 ->         3,774  (delta=3,759)
nr_zone_inactive_file                         329 ->           591  (delta=262)
nr_zone_active_file                           673 ->           504  (delta=-169)
nr_zspages                                      3 ->         2,716  (delta=2,713)
nr_inactive_anon                            1,685 ->           136  (delta=-1,549)
nr_active_anon                                 15 ->         3,774  (delta=3,759)
nr_inactive_file                              329 ->           591  (delta=262)
nr_active_file                                673 ->           504  (delta=-169)
nr_slab_reclaimable                         1,352 ->         2,037  (delta=685)
nr_slab_unreclaimable                       9,581 ->        11,689  (delta=2,108)
nr_anon_pages                               1,526 ->           262  (delta=-1,264)
nr_mapped                                     912 ->           442  (delta=-470)
nr_file_pages                               1,132 ->         4,760  (delta=3,628)
nr_shmem                                      162 ->         3,608  (delta=3,446)
nr_swapcached                                   0 ->            19  (delta=19)
nr_vmscan_write                                 0 ->     4,872,846  (delta=4,872,846)
nr_written                                      1 ->     4,853,727  (delta=4,853,726)

pgpgin                                      1,200 ->    19,035,312  (delta=19,034,112)
pgpgout                                         4 ->    19,414,908  (delta=19,414,904)
pswpin                                          0 ->     4,758,528  (delta=4,758,528)
pswpout                                         0 ->     4,853,726  (delta=4,853,726)

pgalloc_dma                                    32 ->        84,262  (delta=84,230)
pgalloc_dma32                              45,989 ->     5,095,307  (delta=5,049,318)
pgfree                                    269,896 ->     5,415,629  (delta=5,145,733)
pgactivate                                  2,820 ->        14,490  (delta=11,670)
pgdeactivate                                   10 ->        10,924  (delta=10,914)
pgfault                                    29,321 ->     5,088,427  (delta=5,059,106)
pgmajfault                                  3,750 ->     4,794,781  (delta=4,791,031)
pgrefill                                        0 ->        13,733  (delta=13,733)
pgreuse                                     3,333 ->         5,852  (delta=2,519)

pgsteal_kswapd                                  0 ->     3,605,552  (delta=3,605,552)
pgsteal_direct                                  0 ->     1,280,091  (delta=1,280,091)
pgscan_kswapd                                   0 ->     6,579,240  (delta=6,579,240)
pgscan_direct                                   0 ->     5,290,778  (delta=5,290,778)
pgscan_anon                                     0 ->    11,843,970  (delta=11,843,970)
pgscan_file                                     0 ->        26,048  (delta=26,048)
pgsteal_anon                                    0 ->     4,872,962  (delta=4,872,962)
pgsteal_file                                    0 ->        12,681  (delta=12,681)

allocstall_normal                               0 ->           110  (delta=110)
allocstall_movable                              0 ->        32,088  (delta=32,088)
oom_kill                                        0 ->             0  (delta=0)

workingset_nodes                                0 ->           302  (delta=302)
workingset_refault_anon                         0 ->     4,777,591  (delta=4,777,591)
workingset_refault_file                         0 ->           870  (delta=870)
workingset_activate_anon                        0 ->           487  (delta=487)

kswapd_low_wmark_hit_quickly                    0 ->            35  (delta=35)
kswapd_high_wmark_hit_quickly                   0 ->            99  (delta=99)
pageoutrun                                      0 ->           135  (delta=135)

pgmigrate_success                               0 ->        21,317  (delta=21,317)
compact_migrate_scanned                         0 ->        98,848  (delta=98,848)
compact_free_scanned                            0 ->       136,667  (delta=136,667)

swpin_zero                                      0 ->        19,069  (delta=19,069)
swpout_zero                                     0 ->        19,120  (delta=19,120)
swap_ra                                         0 ->            63  (delta=63)
swap_ra_hit                                     0 ->            26  (delta=26)

Happy to do any other tests or pull any other data for you to help.

Thanks,
Matt

Re: [PATCH] mm: Require LRU reclaim progress before retrying direct reclaim

Posted by Shakeel Butt 2 months ago

On Fri, Apr 10, 2026 at 11:15:49AM +0100, Matt Fleming wrote:
> From: Matt Fleming <mfleming@cloudflare.com>
> 
> should_reclaim_retry() uses zone_reclaimable_pages() to estimate whether
> retrying reclaim could eventually satisfy an allocation. It's possible
> for reclaim to make minimal or no progress on an LRU type despite having
> ample reclaimable pages, e.g. anonymous pages when the only swap is
> RAM-backed (zram). 

Or incompressible memory on zswap with writeback disabled, or overcommitted
memory.min.

> This can cause the reclaim path to loop indefinitely.
> 
> Track LRU reclaim progress (anon vs file) through a new struct
> reclaim_progress passed out of try_to_free_pages(), and only count a
> type's reclaimable pages if at least reclaim_progress_pct% was actually
> reclaimed in the last cycle.
> 
> The threshold is exposed as /proc/sys/vm/reclaim_progress_pct (default
> 1, range 0-100). 

Let's not expose any sysctl or user visible API for this heuristic. It will
evolve and then this interface would be awkward and hard to remove.

> Setting 0 disables the gate and restores the previous
> behaviour. Environments with only RAM-backed swap (zram) and small
> memory may need a higher value to prevent futile anon LRU churn from
> keeping the allocator spinning.
> 
> Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Matt Fleming <mfleming@cloudflare.com>
> ---

[...]

>  
> @@ -4637,7 +4672,24 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
>  			!__cpuset_zone_allowed(zone, gfp_mask))
>  				continue;
>  
> -		available = reclaimable = zone_reclaimable_pages(zone);
> +		/*
> +		 * Only count reclaimable pages from an LRU type if reclaim
> +		 * actually made headway on that type in the last cycle.
> +		 * This prevents the allocator from looping endlessly on
> +		 * account of a large pool of pages that reclaim cannot make
> +		 * progress on, e.g. anonymous pages when the only swap is
> +		 * RAM-backed (zram).
> +		 */
> +		reclaimable = 0;
> +		reclaimable_file = zone_reclaimable_file_pages(zone);
> +		reclaimable_anon = zone_reclaimable_anon_pages(zone);

Here we are getting the current reclaimable pages.

> +
> +		if (reclaim_progress_sufficient(progress->nr_file, reclaimable_file))
> +			reclaimable += reclaimable_file;
> +		if (reclaim_progress_sufficient(progress->nr_anon, reclaimable_anon))
> +			reclaimable += reclaimable_anon;

And here we are comparing the current reclaimable pages with the reclaim
progress from the last iteration. Is this intentional, to keep things simple?

> +
> +		available = reclaimable;
>  		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
>  

Another heuristic we can play with is to also pass through the vmscan scan
count. If, for a couple of consecutive iterations, we continue to see low
reclaim efficiency, go for OOM. Also, maybe compare the scan count with the
watermark: I expect we won't see much difference in scan count between
consecutive reclaim iterations, so it is a good proxy for reclaimable memory.

The reclaim efficiency heuristic should handle the swap-on-zram and
incompressible-zswap-with-no-writeback cases. Treating the scan count as a
proxy for reclaimable memory should handle the overcommitted memory.min case.

Re: [PATCH] mm: Require LRU reclaim progress before retrying direct reclaim

Posted by Matt Fleming 1 month, 4 weeks ago

On Wed, Apr 15, 2026 at 06:01:54PM -0700, Shakeel Butt wrote:
> On Fri, Apr 10, 2026 at 11:15:49AM +0100, Matt Fleming wrote:
> >  
> > @@ -4637,7 +4672,24 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
> >  			!__cpuset_zone_allowed(zone, gfp_mask))
> >  				continue;
> >  
> > -		available = reclaimable = zone_reclaimable_pages(zone);
> > +		/*
> > +		 * Only count reclaimable pages from an LRU type if reclaim
> > +		 * actually made headway on that type in the last cycle.
> > +		 * This prevents the allocator from looping endlessly on
> > +		 * account of a large pool of pages that reclaim cannot make
> > +		 * progress on, e.g. anonymous pages when the only swap is
> > +		 * RAM-backed (zram).
> > +		 */
> > +		reclaimable = 0;
> > +		reclaimable_file = zone_reclaimable_file_pages(zone);
> > +		reclaimable_anon = zone_reclaimable_anon_pages(zone);
> 
> Here we are getting the current reclaimable pages.
> 
> > +
> > +		if (reclaim_progress_sufficient(progress->nr_file, reclaimable_file))
> > +			reclaimable += reclaimable_file;
> > +		if (reclaim_progress_sufficient(progress->nr_anon, reclaimable_anon))
> > +			reclaimable += reclaimable_anon;
> 
> And here we are comparing the current reclaimable pages with last iteration. Is
> this intentional to keep things simple?
 
Yep, that was the intent.

> > +
> > +		available = reclaimable;
> >  		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
> >  
> 
> Another heuristic we can play with is to also pass through the vmscan scan
> count. If for couple of consecutive iterations, we continue to see low reclaim
> efficiency, go for OOM. Also maybe compare the scan count with the watermark as
> I expect we don't see much difference scan count for consecutive reclaim
> iteration, so, it is a good representative of reclaimable memory.
> 
> The reclaim efficiency heuristic should handle the swap-on-zram or
> incomp-zswap-with-no-writeback. Treating scan count as proxy for reclaimable
> memory should handle the overcommitted memory.min case.
 
Nice. I'll take a look at this.

Re: [PATCH] mm: Require LRU reclaim progress before retrying direct reclaim

Posted by Pedro Falcato 2 months ago

On Fri, Apr 10, 2026 at 11:15:49AM +0100, Matt Fleming wrote:
> From: Matt Fleming <mfleming@cloudflare.com>
> 
> should_reclaim_retry() uses zone_reclaimable_pages() to estimate whether
> retrying reclaim could eventually satisfy an allocation. It's possible
> for reclaim to make minimal or no progress on an LRU type despite having
> ample reclaimable pages, e.g. anonymous pages when the only swap is
> RAM-backed (zram). This can cause the reclaim path to loop indefinitely.
> 
> Track LRU reclaim progress (anon vs file) through a new struct
> reclaim_progress passed out of try_to_free_pages(), and only count a
> type's reclaimable pages if at least reclaim_progress_pct% was actually
> reclaimed in the last cycle.

I think there is at least one problem with this heuristic: you are counting
everything that hasn't made progress as "we cannot reclaim it", when in reality
you can simply fail to make progress on any given folio because, e.g., it's
referenced and we want to give it another spin in the LRU.

My theory (from merely reading the patch, maybe I missed something) is that
a pathological case for this is a lot of folios added to the LRU in a row,
that are set referenced (or dirty). Say SWAP_CLUSTER_MAX * MAX_RECLAIM_RETRIES
- it will simply OOM too early.

The other question is whether this effectively solves reclaim problems - some
hard numbers would be great.

-- 
Pedro

Re: [PATCH] mm: Require LRU reclaim progress before retrying direct reclaim

Posted by Matt Fleming 1 month, 4 weeks ago

On Wed, Apr 15, 2026 at 03:57:25PM +0100, Pedro Falcato wrote:
> On Fri, Apr 10, 2026 at 11:15:49AM +0100, Matt Fleming wrote:
> > From: Matt Fleming <mfleming@cloudflare.com>
> > 
> > should_reclaim_retry() uses zone_reclaimable_pages() to estimate whether
> > retrying reclaim could eventually satisfy an allocation. It's possible
> > for reclaim to make minimal or no progress on an LRU type despite having
> > ample reclaimable pages, e.g. anonymous pages when the only swap is
> > RAM-backed (zram). This can cause the reclaim path to loop indefinitely.
> > 
> > Track LRU reclaim progress (anon vs file) through a new struct
> > reclaim_progress passed out of try_to_free_pages(), and only count a
> > type's reclaimable pages if at least reclaim_progress_pct% was actually
> > reclaimed in the last cycle.
> 
> I think there is at least one problem with this heuristic: you are counting
> everything that hasn't made progress as "we cannot reclaim it". When in reality
> you can simply fail to make progress on any given folio as e.g it's referenced
> and we want to give it another spin in the LRU.
 
The intention was that the percentage threshold would avoid giving up
on reclaim as long as "sufficient" progress was made. This should allow
for some folios to need another trip through the LRU but...

> My theory (from merely reading the patch, maybe I missed something) is that
> a pathological case for this is a lot of folios added to the LRU in a row,
> that are set referenced (or dirty). Say SWAP_CLUSTER_MAX * MAX_RECLAIM_RETRIES
> - it will simply OOM too early.
 
OK yeah I think I see the problem now: this heuristic applies the
threshold against all reclaimable pages but that falls apart when doing
SWAP_CLUSTER_MAX chunks of reclaim.

> The other question is whether this effectively solves reclaim problems - some
> hard numbers would be great.

I shared some numbers in my reply to Vlastimil, but if there are other
cases you'd like measured I'm happy to run them.

Re: [PATCH] mm: Require LRU reclaim progress before retrying direct reclaim

Posted by Shakeel Butt 1 month, 4 weeks ago

On Thu, Apr 16, 2026 at 03:51:04PM +0100, Matt Fleming wrote:
> On Wed, Apr 15, 2026 at 03:57:25PM +0100, Pedro Falcato wrote:
[...]
> 
> > My theory (from merely reading the patch, maybe I missed something) is that
> > a pathological case for this is a lot of folios added to the LRU in a row,
> > that are set referenced (or dirty). Say SWAP_CLUSTER_MAX * MAX_RECLAIM_RETRIES
> > - it will simply OOM too early.
>  
> OK yeah I think I see the problem now: this heuristic applies the
> threshold against all reclaimable pages but that falls apart when doing
> SWAP_CLUSTER_MAX chunks of reclaim.

I am not sure I understand the pathological case. Yes, SWAP_CLUSTER_MAX is the
requested number of pages to reclaim, but the kernel can potentially scan full
memory twice to reclaim that amount. Those reclaimed pages can get
stolen, but that can still happen today, before this patch.

Re: [PATCH] mm: Require LRU reclaim progress before retrying direct reclaim

Posted by Pedro Falcato 1 month, 4 weeks ago

On Thu, Apr 16, 2026 at 02:49:28PM -0700, Shakeel Butt wrote:
> On Thu, Apr 16, 2026 at 03:51:04PM +0100, Matt Fleming wrote:
> > On Wed, Apr 15, 2026 at 03:57:25PM +0100, Pedro Falcato wrote:
> [...]
> > 
> > > My theory (from merely reading the patch, maybe I missed something) is that
> > > a pathological case for this is a lot of folios added to the LRU in a row,
> > > that are set referenced (or dirty). Say SWAP_CLUSTER_MAX * MAX_RECLAIM_RETRIES
> > > - it will simply OOM too early.
> >  
> > OK yeah I think I see the problem now: this heuristic applies the
> > threshold against all reclaimable pages but that falls apart when doing
> > SWAP_CLUSTER_MAX chunks of reclaim.
> 
> I am not sure I understand the pathological case. Yes SWAP_CLUSTER_MAX is
> requested amount of pages to reclaim but the kernel can potentially scan full
> memory twice to reclaim that much amount. Though those reclaimed pages can get
> stolen but that can still happen today before this patch.

I see, yes, you are totally correct. Had a look at the vmscan code again and just
realized I had missed some details.

Matt, please disregard :)

-- 
Pedro

Re: [PATCH] mm: Require LRU reclaim progress before retrying direct reclaim

Posted by Vlastimil Babka (SUSE) 2 months ago

On 4/10/26 12:15, Matt Fleming wrote:
> From: Matt Fleming <mfleming@cloudflare.com>
> 
> should_reclaim_retry() uses zone_reclaimable_pages() to estimate whether
> retrying reclaim could eventually satisfy an allocation. It's possible
> for reclaim to make minimal or no progress on an LRU type despite having
> ample reclaimable pages, e.g. anonymous pages when the only swap is
> RAM-backed (zram). This can cause the reclaim path to loop indefinitely.
> 
> Track LRU reclaim progress (anon vs file) through a new struct
> reclaim_progress passed out of try_to_free_pages(), and only count a
> type's reclaimable pages if at least reclaim_progress_pct% was actually
> reclaimed in the last cycle.
> 
> The threshold is exposed as /proc/sys/vm/reclaim_progress_pct (default
> 1, range 0-100). Setting 0 disables the gate and restores the previous
> behaviour. Environments with only RAM-backed swap (zram) and small
> memory may need a higher value to prevent futile anon LRU churn from
> keeping the allocator spinning.
> 
> Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Matt Fleming <mfleming@cloudflare.com>

Hi Matt,

so have you tested it for your usecase with zram and have any observations
how it helped, what values did you set etc?

Vlastimil

Re: [PATCH] mm: Require LRU reclaim progress before retrying direct reclaim

Posted by Matt Fleming 2 months ago

On Mon, Apr 13, 2026 at 05:38:19PM +0200, Vlastimil Babka (SUSE) wrote:
> 
> Hi Matt,
> 
> so have you tested it for your usecase with zram and have any observations
> how it helped, what values did you set etc?

Hey Vlastimil,

Yeah I've tested this out. So far, results have been positive -- I see
system-wide OOM kills when memory is low and direct reclaim occurs, but
not so many OOM kills that the SRE folks have started screaming at me.

I've only run with the proposed 1% value so far. I also ran a bunch of
benchmarks alongside a memory hogging app that periodically touches
anonymous memory.

Workload                     rpp=0              rpp=1               Notes
----------------------------------------------------------------------------------------------
Kernel compile + anon hog    Completed, no OOM  Completed,          Global OOM confirmed from
                                                Global OOM fired    __alloc_pages_slowpath

Memcached + anon hog         282k / 2.30M ops/s 562k / 3.53M ops/s  Global OOM killed hog,
                             No OOM             Global OOM fired    then benchmark ran faster

Pure fio (5 reruns each)     median 3710 MiB/s  median 3702 MiB/s   No reproducible regression
Mixed fio + anon hog         2747 MiB/s         2915 MiB/s          Global OOM killed
                                                                    unrelated services

reclaim_progress_pct=1 seems to help in these memory exhausted
situations, and doesn't appear to cause a regression for the pure file
workload case.

If you have any suggestions for other tests or benchmarks to run I'd be
happy to do that.

Thanks,
Matt

Re: [PATCH] mm: Require LRU reclaim progress before retrying direct reclaim

Posted by Vlastimil Babka (SUSE) 1 month, 3 weeks ago

On 4/15/26 11:11, Matt Fleming wrote:
> On Mon, Apr 13, 2026 at 05:38:19PM +0200, Vlastimil Babka (SUSE) wrote:
>> 
>> Hi Matt,
>> 
>> so have you tested it for your usecase with zram and have any observations
>> how it helped, what values did you set etc?
> 
> Hey Vlastimil,
> 
> Yeah I've tested this out. So far, results have been positive -- I see
> system-wide OOM kills when memory is low and direct reclaim occurs, but
> not so many OOM kills that the SRE folks have started screaming at me.

Hmm...

> I've only run with the proposed 1% value so far. I also ran a bunch of
> benchmarks alongside a memory hogging app that periodically touches
> anonymous memory.
> 
> Workload                     rpp=0              rpp=1               Notes
> ----------------------------------------------------------------------------------------------
> Kernel compile + anon hog    Completed, no OOM  Completed,          Global OOM confirmed from
>                                                 Global OOM fired    __alloc_pages_slowpath

Completed in both cases... but was it faster? Also what got OOM killed, the hog?

> 
> Memcached + anon hog         282k / 2.30M ops/s 562k / 3.53M ops/s  Global OOM killed hog,
>                              No OOM             Global OOM fired    then benchmark ran faster

The improvement is nice. However, even in the rpp=0 case there didn't seem to
have been thrashing so bad that the system wouldn't recover.

I think this is at minimum an argument against having it enabled by default,
as by default we don't want to cause premature OOMs if the system is still
working (and yes, we do have problems recognizing when it's not working
and actually triggering OOM). Tradeoffs like killing something to get
better throughput on something else are good for certain kinds of
servers/workloads, but not as a default.

And once you go that way, you might be better off looking at the PSI
metrics, which would be more holistic than this heuristic?

> Pure fio (5 reruns each)     median 3710 MiB/s  median 3702 MiB/s   No reproducible regression
> Mixed fio + anon hog         2747 MiB/s         2915 MiB/s          Global OOM killed
>                                                                     unrelated services
> 
> reclaim_progress_pct=1 seems to help in these memory exhausted
> situations, and doesn't appear to cause a regression for the pure file
> workload case.
> 
> If you have any suggestions for other tests or benchmarks to run I'd be
> happy to do that.
> 
> Thanks,
> Matt